diff --git a/.gitignore b/.gitignore
index 4b53cfd8591686..17dd1720235e20 100644
--- a/.gitignore
+++ b/.gitignore
@@ -53,6 +53,7 @@ model_test
 Testing
 tools/__pycache__
+tools/nvcc_lazy
 
 # This file is automatically generated.
 # TODO(zhiqiang) Move this file to build directory.
@@ -70,7 +71,9 @@ paddle/fluid/pybind/eager_op_function.cc
 
 # these files (directories) are generated before build system generation
 paddle/fluid/operators/generated_op.cc
+paddle/fluid/operators/generated_sparse_op.cc
 paddle/phi/ops/compat/generated_sig.cc
+paddle/phi/ops/compat/generated_sparse_sig.cc
 paddle/phi/api/yaml/parsed_apis/
 python/paddle/utils/code_gen/
 paddle/fluid/pybind/tmp_eager_op_function_impl.h
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f4c0f1d97cbdd2..5782dfa14fe7d0 100755
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -4,10 +4,10 @@ repos:
     hooks:
     -   id: remove-crlf
         files: (?!.*third_party)^.*$ | (?!.*book)^.*$
--   repo: https://github.com/google/yapf
-    rev: v0.32.0
+-   repo: https://github.com/psf/black.git
+    rev: 22.8.0
     hooks:
-    -   id: yapf
+    -   id: black
         files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$
         exclude: |
             (?x)^(
@@ -24,7 +24,7 @@ repos:
         files: (?!.*third_party)^.*$ | (?!.*book)^.*$
     -   id: end-of-file-fixer
     -   id: sort-simple-yaml
-        files: (api|backward|api_[a-z_]+)\.yaml$
+        files: (op|backward|op_[a-z_]+)\.yaml$
-   repo: local
     hooks:
     -   id: clang-format
diff --git a/.style.yapf b/.style.yapf
deleted file mode 100644
index 4741fb4f3bbc66..00000000000000
--- a/.style.yapf
+++ /dev/null
@@ -1,3 +0,0 @@
-[style]
-based_on_style = pep8
-column_limit = 80
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 49ccb815c995d6..f4b2ee08d943cb 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -249,7 +249,7 @@ option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
 option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
 option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
 option(ON_INFER "Turn on inference optimization and inference-lib generation"
-       OFF)
+       ON)
 ################################ Internal Configurations #######################################
 option(WITH_NV_JETSON "Compile PaddlePaddle with NV JETSON" OFF)
 option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools"
@@ -336,7 +336,7 @@ endif()
 if(LINUX
    AND NOT WITH_CUSTOM_DEVICE
-   AND NOT ON_INFER)
+   AND WITH_PYTHON)
   set(WITH_CUSTOM_DEVICE
       ON
       CACHE BOOL "Enable Custom Device when compiling for Linux" FORCE)
diff --git a/README.md b/README.md
index 44370457212874..869813529a4554 100644
--- a/README.md
+++ b/README.md
@@ -15,12 +15,12 @@ English | [简体中文](./README_cn.md)
 
 Welcome to the PaddlePaddle GitHub.
 
 PaddlePaddle, as the first independent R&D deep learning platform in China, has been officially open-sourced to professional communities since 2016. It is an industrial platform with advanced technologies and rich features that cover core deep learning frameworks, basic model libraries, end-to-end development kits, tools & components as well as service platforms.
 
-PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 4.7 million developers, 180,000 companies and generating 560,000 models. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI.
+PaddlePaddle is originated from industrial practices with dedication and commitments to industrialization. It has been widely adopted by a wide range of sectors including manufacturing, agriculture, enterprise service, and so on while serving more than 5.35 million developers, 200,000 companies and generating 670,000 models. With such advantages, PaddlePaddle has helped an increasing number of partners commercialize AI. ## Installation -### Latest PaddlePaddle Release: [v2.3](https://github.com/PaddlePaddle/Paddle/tree/release/2.3) +### Latest PaddlePaddle Release: [v2.4](https://github.com/PaddlePaddle/Paddle/tree/release/2.4) Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest features of PaddlePaddle. @@ -89,8 +89,8 @@ We provide [English](https://www.paddlepaddle.org.cn/documentation/docs/en/guide ## Courses -- [Server Deployments](https://aistudio.baidu.com/aistudio/course/introduce/19084): Courses intorducing high performance server deployments via local and remote services. -- [Edge Deployments](https://aistudio.baidu.com/aistudio/course/introduce/22690): Courses intorducing edge deployments from mobile, IoT to web and applets. +- [Server Deployments](https://aistudio.baidu.com/aistudio/course/introduce/19084): Courses introducing high performance server deployments via local and remote services. +- [Edge Deployments](https://aistudio.baidu.com/aistudio/course/introduce/22690): Courses introducing edge deployments from mobile, IoT to web and applets. ## Copyright and License PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). diff --git a/README_cn.md b/README_cn.md index f4cb6f4fff78eb..bef0ca59225dc2 100644 --- a/README_cn.md +++ b/README_cn.md @@ -15,11 +15,11 @@ 欢迎来到 PaddlePaddle GitHub -飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者477万,服务企业18万家,基于飞桨开源深度学习平台产生了56万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 +飞桨(PaddlePaddle)以百度多年的深度学习技术研究和业务应用为基础,是中国首个自主研发、功能完备、 开源开放的产业级深度学习平台,集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体。目前,飞桨累计开发者535万,服务企业20万家,基于飞桨开源深度学习平台产生了67万个模型。飞桨助力开发者快速实现AI想法,快速上线AI业务。帮助越来越多的行业完成AI赋能,实现产业智能化升级。 ## 安装 -### PaddlePaddle最新版本: [v2.3](https://github.com/PaddlePaddle/Paddle/tree/release/2.3) +### PaddlePaddle最新版本: [v2.4](https://github.com/PaddlePaddle/Paddle/tree/release/2.4) 跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) @@ -63,32 +63,25 @@ PaddlePaddle用户可领取**免费Tesla V100在线算力资源**,训练模型 我们提供 [英文](https://www.paddlepaddle.org.cn/documentation/docs/en/guides/index_en.html) 和 [中文](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html) 文档 -- [使用指南](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html) +- [使用指南](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/index_cn.html):或许您想从深度学习基础开始学习飞桨 - 或许您想从深度学习基础开始学习飞桨 - -- [应用实践](https://www.paddlepaddle.org.cn/documentation/docs/zh/tutorial/index_cn.html) +- [应用实践](https://www.paddlepaddle.org.cn/documentation/docs/zh/tutorial/index_cn.html):使用飞桨搭建您的模型,更高效的完成深度学习任务 - -- [API Reference](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/index_cn.html) +- [API 文档](https://www.paddlepaddle.org.cn/documentation/docs/zh/api/index_cn.html):新的 API 支持代码更少更简洁的程序 - 新的API支持代码更少更简洁的程序 - -- 
[贡献方式](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/08_contribution/index_cn.html) - - 欢迎您的贡献! +- [贡献方式](https://www.paddlepaddle.org.cn/documentation/docs/zh/guides/08_contribution/index_cn.html):欢迎您的贡献! ## 交流与反馈 - 欢迎您通过[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)来提交问题、报告与建议 - QQ群: 441226485 (PaddlePaddle) -- [论坛](https://aistudio.baidu.com/paddle/forum): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验, 营造良好的论坛氛围 - +- [论坛](https://aistudio.baidu.com/paddle/forum): 欢迎大家在PaddlePaddle论坛分享在使用PaddlePaddle中遇到的问题和经验,营造良好的论坛氛围 + ## 课程 - [服务器部署](https://aistudio.baidu.com/aistudio/course/introduce/19084): 详细介绍高性能服务器端部署实操,包含本地端及服务化Serving部署等 -- [端侧部署](https://aistudio.baidu.com/aistudio/course/introduce/22690): 详细介绍端侧多场景部署实操,从移端端设备、IoT、网页到小程序部署 +- [端侧部署](https://aistudio.baidu.com/aistudio/course/introduce/22690): 详细介绍端侧多场景部署实操,从移动端设备、IoT、网页到小程序部署 ## 版权和许可证 PaddlePaddle由[Apache-2.0 license](LICENSE)提供 diff --git a/SECURITY.md b/SECURITY.md index 97b092d6dfc018..073a27c61ee219 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -48,7 +48,7 @@ We will indicate the bug fix in the release of PaddlePaddle, and publish the vul ### What is a vulnerability? -In the process of computation graphs in PaddlePaddle, models can perform arbitrary computations , including reading and writing files, communicating with the network, etc. It may cause memory exhaustion, deadlock, etc., which will lead to unexpected behavior of PaddlePaddle. We consider these behavior to be security vulnerabilities only if they are out of the intention of the operation involved. +In the process of computation graphs in PaddlePaddle, models can perform arbitrary computations , including reading and writing files, communicating with the network, etc. It may cause memory exhaustion, deadlock, etc., which will lead to unexpected behavior of PaddlePaddle. We consider these behavior to be security vulnerabilities only if they are out of the intention of the operation involved. 
@@ -60,4 +60,4 @@ If malicious input can trigger memory corruption or non-clean exit, such bug is
 
 
 
-[security advisories](https://github.com/PaddlePaddle/Paddle/blob/develop/security/README.md)
+[security advisories](./security/README.md)
diff --git a/SECURITY_cn.md b/SECURITY_cn.md
index cd2b4b450b46e4..e22340349c4feb 100644
--- a/SECURITY_cn.md
+++ b/SECURITY_cn.md
@@ -46,4 +46,4 @@
 如果输入非预期的参数后,对飞桨代码造成了内存破坏,或者非干净退出,这类行为被认定为存在安全问题。
 
 
-### [安全公告](https://github.com/PaddlePaddle/Paddle/blob/develop/security/README_cn.md)
+### [安全公告](./security/README_cn.md)
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
index 87b943abd0106d..ab48a16529e635 100644
--- a/cmake/cuda.cmake
+++ b/cmake/cuda.cmake
@@ -6,6 +6,7 @@ if(WITH_NV_JETSON)
   add_definitions(-DWITH_NV_JETSON)
   set(paddle_known_gpu_archs "53 62 72")
   set(paddle_known_gpu_archs10 "53 62 72")
+  set(paddle_known_gpu_archs11 "53 62 72 87")
 elseif(NEW_RELEASE_ALL)
   message("Using New Release Strategy - All Arches Packge")
   add_definitions(-DNEW_RELEASE_ALL)
@@ -165,10 +166,14 @@ function(select_nvcc_arch_flags out_variable)
   elseif(${CUDA_ARCH_NAME} STREQUAL "Turing")
     set(cuda_arch_bin "75")
   elseif(${CUDA_ARCH_NAME} STREQUAL "Ampere")
-    if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
-      set(cuda_arch_bin "80")
-    elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.1+
-      set(cuda_arch_bin "80 86")
+    if(WITH_NV_JETSON)
+      set(cuda_arch_bin "87")
+    else()
+      if(${CMAKE_CUDA_COMPILER_VERSION} LESS 11.1) # CUDA 11.0
+        set(cuda_arch_bin "80")
+      elseif(${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.1+
+        set(cuda_arch_bin "80 86")
+      endif()
     endif()
   elseif(${CUDA_ARCH_NAME} STREQUAL "All")
     set(cuda_arch_bin ${paddle_known_gpu_archs})
diff --git a/cmake/experiments/cuda_module_loading_lazy.cmake b/cmake/experiments/cuda_module_loading_lazy.cmake
index bcbfaacad1240f..d1e07f57cb045e 100644
--- a/cmake/experiments/cuda_module_loading_lazy.cmake
+++ b/cmake/experiments/cuda_module_loading_lazy.cmake
@@ -1,4 +1,4 @@
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,16 +16,15 @@
 # cuda moduel lazy loading is supported by CUDA 11.7+
 # this experiment option makes Paddle supports lazy loading before CUDA 11.7.
 
-option(EXP_CUDA_MODULE_LOADING_LAZY "enable lazy cuda module loading" OFF)
-if(${EXP_CUDA_MODULE_LOADING_LAZY})
-  if(NOT ${ON_INFER} OR NOT ${LINUX})
+if(LINUX)
+  if(NOT ON_INFER)
     message(
       "EXP_CUDA_MODULE_LOADING_LAZY only works with ON_INFER=ON on Linux platforms"
     )
     return()
   endif()
-  if(NOT ${CUDA_FOUND})
-    message("EXP_CUDA_MODULE_LOADING_LAZY only works with CUDA")
+  if(NOT WITH_GPU)
+    message("EXP_CUDA_MODULE_LOADING_LAZY only works with GPU")
     return()
   endif()
   if(${CUDA_VERSION} VERSION_GREATER_EQUAL "11.7")
@@ -41,6 +40,12 @@ if(${EXP_CUDA_MODULE_LOADING_LAZY})
       CACHE BOOL "" FORCE)
   set(CMAKE_CUDA_FLAGS "--cudart shared")
   enable_language(CUDA)
+  execute_process(
+    COMMAND "rm" "-rf" "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy"
+    COMMAND "chmod" "755" "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy.sh"
+    COMMAND "bash" "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy.sh"
+            "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy" "${CUDA_TOOLKIT_ROOT_DIR}")
+  execute_process(COMMAND "chmod" "755" "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy")
   set(CUDA_NVCC_EXECUTABLE
       "${CMAKE_SOURCE_DIR}/tools/nvcc_lazy"
       CACHE FILEPATH "" FORCE)
diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake
old mode 100644
new mode 100755
index cd7b254892ed10..a0fc013a130a15
--- a/cmake/external/gloo.cmake
+++ b/cmake/external/gloo.cmake
@@ -25,8 +25,8 @@ set(GLOO_LIBRARY_DIR
     "${GLOO_INSTALL_DIR}/lib"
     CACHE PATH "gloo library directory." FORCE)
 # As we add extra features for gloo, we use the non-official repo
-set(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git)
-set(GLOO_TAG v0.0.2)
+set(GLOO_REPOSITORY ${GIT_URL}/ziyoujiyi/gloo.git)
+set(GLOO_TAG v0.0.3)
 set(GLOO_LIBRARIES
     "${GLOO_INSTALL_DIR}/lib/libgloo.a"
     CACHE FILEPATH "gloo library." FORCE)
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index 6f9078c8eeecd2..5903edebae825f 100755
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -268,8 +268,6 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST)
       DOWNLOAD_DIR ${PROTOBUF_SOURCE_DIR}
       DOWNLOAD_COMMAND rm -rf arm_protobuf.tar.gz && wget --no-check-certificate
                        ${ARM_PROTOBUF_URL} && tar zxvf arm_protobuf.tar.gz
-      #DOWNLOAD_COMMAND cp /home/wangbin44/Paddle/build/arm_protobuf.tar.gz .
- # && tar zxvf arm_protobuf.tar.gz UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} diff --git a/cmake/external/rocksdb.cmake b/cmake/external/rocksdb.cmake index 41a1916dc33083..673b143aba8536 100644 --- a/cmake/external/rocksdb.cmake +++ b/cmake/external/rocksdb.cmake @@ -35,6 +35,7 @@ ExternalProject_Add( CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DWITH_BZ2=OFF + -DPORTABLE=1 -DWITH_GFLAGS=OFF -DCMAKE_CXX_FLAGS=${ROCKSDB_CMAKE_CXX_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 677fed84dcb4c3..9c69a5db93b962 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -9,8 +9,8 @@ set(XPU_RT_LIB_NAME "libxpurt.so") if(NOT DEFINED XPU_BASE_URL) set(XPU_BASE_URL_WITHOUT_DATE - "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220907") + "https://baidu-kunlun-product.su.bcebos.com/KL-SDK/klsdk-dev") + set(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220919") else() set(XPU_BASE_URL "${XPU_BASE_URL}") endif() @@ -19,7 +19,7 @@ endif() if(NOT DEFINED XPU_XDNN_BASE_URL) set(XPU_XDNN_BASE_URL_WITHOUT_DATE "https://klx-sdk-release-public.su.bcebos.com/xdnn/dev") - set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220907") + set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL_WITHOUT_DATE}/20220919") else() set(XPU_XDNN_BASE_URL "${XPU_XDNN_BASE_URL}") endif() diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index 6e685bbde402e8..648a060f9b7e80 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -97,5 +97,4 @@ endif() add_library(xxhash STATIC IMPORTED GLOBAL) set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES}) -include_directories(${XXHASH_INCLUDE_DIR}) add_dependencies(xxhash extern_xxhash) diff --git a/cmake/neuware.cmake b/cmake/neuware.cmake index 16dbf16899b5d9..8c873f35b7f2d1 100644 --- a/cmake/neuware.cmake +++ b/cmake/neuware.cmake @@ -15,12 +15,14 @@ set(NEUWARE_LIB_DIR ${NEUWARE_HOME}/lib64) include_directories(${NEUWARE_INCLUDE_DIR}) set(CNNL_LIB ${NEUWARE_LIB_DIR}/libcnnl.so) +set(MLUOP_LIB ${NEUWARE_LIB_DIR}/libmluops.so) set(CNRT_LIB ${NEUWARE_LIB_DIR}/libcnrt.so) set(CNDRV_LIB ${NEUWARE_LIB_DIR}/libcndrv.so) set(CNPAPI_LIB ${NEUWARE_LIB_DIR}/libcnpapi.so) generate_dummy_static_lib(LIB_NAME "neuware_lib" GENERATOR "neuware.cmake") -set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${CNRT_LIB} ${CNDRV_LIB} ${CNPAPI_LIB}) +set(NEUWARE_LIB_DEPS ${CNNL_LIB} ${MLUOP_LIB} ${CNRT_LIB} ${CNDRV_LIB} + ${CNPAPI_LIB}) if(WITH_CNCL) message(STATUS "Compile with CNCL!") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index c560dddfef5e79..bb5722ae82e5cb 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -510,7 +510,7 @@ function(op_library TARGET) if(WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) # Append first implemented MKLDNN activation operator if(${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(gelu, MKLDNN);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(softplus, MKLDNN);\n") elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") @@ -611,3 +611,58 @@ function(register_operators) endif() endif() endfunction() + +function(prune_pybind_h) + set(op_list ${OP_LIST}) + + list(APPEND op_list 
"load_combine") + list(APPEND op_list "tensorrt_engine") + + # add fused_op in op_list + list(APPEND op_list "fc") + list(APPEND op_list "conv2d_fusion") + list(APPEND op_list "fusion_seqconv_eltadd_relu") + list(APPEND op_list "fusion_seqpool_cvm_concat") + list(APPEND op_list "fusion_gru") + list(APPEND op_list "fusion_seqexpand_concat_fc") + list(APPEND op_list "fusion_repeated_fc_relu") + list(APPEND op_list "fusion_squared_mat_sub") + + # add plugin_op in op_list + list(APPEND op_list "anchor_generator") + + file(STRINGS ${pybind_file} op_registry_list) + + file(WRITE ${pybind_file_prune} "") + file( + APPEND ${pybind_file_prune} + "// Generated by the paddle/fluid/operators/CMakeLists.txt. DO NOT EDIT!\n" + ) + + # add USE_OP_ITSELF for all op in op_list + foreach(op_name IN LISTS op_list) + file(APPEND ${pybind_file_prune} "USE_OP_ITSELF(${op_name});\n") + endforeach() + + foreach(op_registry IN LISTS op_registry_list) + if(NOT ${op_registry} EQUAL "") + foreach(op_name IN LISTS op_list) + string(FIND ${op_registry} "(${op_name})" index1) + string(FIND ${op_registry} "(${op_name}," index2) + string(FIND ${op_registry} "USE_OP_ITSELF" index3) + if(((NOT ${index1} EQUAL "-1") OR (NOT ${index2} EQUAL "-1")) + AND (${index3} EQUAL "-1")) + file(APPEND ${pybind_file_prune} "${op_registry}\n") + endif() + endforeach() + endif() + endforeach() + + file(WRITE ${pybind_file} "") + file(STRINGS ${pybind_file_prune} op_registry_list_tmp) + foreach(op_name IN LISTS op_registry_list_tmp) + if(NOT ${op_name} EQUAL "") + file(APPEND ${pybind_file} "${op_name}\n") + endif() + endforeach() +endfunction() diff --git a/cmake/phi.cmake b/cmake/phi.cmake index d50b2ea101d1df..04e16b9a4f5f21 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -78,7 +78,7 @@ function(kernel_declare TARGET_LIST) string( REGEX MATCH - "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[ \t\r\n\/]*[a-z0-9_]*" + "(PD_REGISTER_KERNEL|PD_REGISTER_GENERAL_KERNEL)\\([ \t\r\n]*[a-z0-9_]*,[[ \\\t\r\n\/]*[a-z0-9_]*]?[ \\\t\r\n]*[a-zA-Z]*,[ \\\t\r\n]*[A-Z_]*" first_registry "${kernel_impl}") if(NOT first_registry STREQUAL "") @@ -89,38 +89,23 @@ function(kernel_declare TARGET_LIST) continue() endif() endif() - # parse the first kernel name - string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_name "${first_registry}") - string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_name - "${kernel_name}") - string(REPLACE "," "" kernel_name "${kernel_name}") - string(REGEX REPLACE "[ \t\r\n]+" "" kernel_name "${kernel_name}") - string(REGEX REPLACE "//cuda_only" "" kernel_name "${kernel_name}") + # parse the registerd kernel message + string(REPLACE "PD_REGISTER_KERNEL(" "" kernel_msg "${first_registry}") + string(REPLACE "PD_REGISTER_GENERAL_KERNEL(" "" kernel_msg + "${kernel_msg}") + string(REPLACE "," ";" kernel_msg "${kernel_msg}") + string(REGEX REPLACE "[ \\\t\r\n]+" "" kernel_msg "${kernel_msg}") + string(REGEX REPLACE "//cuda_only" "" kernel_msg "${kernel_msg}") + + list(GET kernel_msg 0 kernel_name) + list(GET kernel_msg 1 kernel_backend) + list(GET kernel_msg 2 kernel_layout) + # append kernel declare into declarations.h - # TODO(chenweihang): default declare ALL_LAYOUT for each kernel - if(${kernel_path} MATCHES "./cpu\/") - file(APPEND ${kernel_declare_file} - "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") - elseif(${kernel_path} MATCHES "./gpu\/") - file(APPEND ${kernel_declare_file} - "PD_DECLARE_KERNEL(${kernel_name}, GPU, ALL_LAYOUT);\n") - elseif(${kernel_path} MATCHES "./xpu\/") - file(APPEND 
${kernel_declare_file} - "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") - elseif(${kernel_path} MATCHES "./gpudnn\/") - file(APPEND ${kernel_declare_file} - "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n") - elseif(${kernel_path} MATCHES "./kps\/") - file(APPEND ${kernel_declare_file} - "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n") - elseif(${kernel_path} MATCHES "./onednn\/") - file(APPEND ${kernel_declare_file} - "PD_DECLARE_KERNEL(${kernel_name}, OneDNN, ALL_LAYOUT);\n") - else() - # deal with device independent kernel, now we use CPU temporaary - file(APPEND ${kernel_declare_file} - "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") - endif() + file( + APPEND ${kernel_declare_file} + "PD_DECLARE_KERNEL(${kernel_name}, ${kernel_backend}, ${kernel_layout});\n" + ) endif() endforeach() endfunction() @@ -163,3 +148,35 @@ function(register_op_utils TARGET_NAME) SRCS ${utils_srcs} DEPS ${register_op_utils_DEPS}) endfunction() + +function(prune_declaration_h) + set(kernel_list ${KERNEL_LIST}) + file(STRINGS ${kernel_declare_file} kernel_registry_list) + + file(WRITE ${kernel_declare_file_prune} "") + file(APPEND ${kernel_declare_file_prune} + "// Generated by the paddle/phi/kernels/CMakeLists.txt. DO NOT EDIT!\n") + file(APPEND ${kernel_declare_file_prune} "#pragma once\n") + file(APPEND ${kernel_declare_file_prune} + "#include \"paddle/phi/core/kernel_registry.h\"\n") + + foreach(kernel_registry IN LISTS kernel_registry_list) + if(NOT ${kernel_registry} EQUAL "") + foreach(kernel_name IN LISTS kernel_list) + string(FIND ${kernel_registry} "(${kernel_name})" index1) + string(FIND ${kernel_registry} "(${kernel_name}," index2) + if((NOT ${index1} EQUAL "-1") OR (NOT ${index2} EQUAL "-1")) + file(APPEND ${kernel_declare_file_prune} "${kernel_registry}\n") + endif() + endforeach() + endif() + endforeach() + + file(WRITE ${kernel_declare_file} "") + file(STRINGS ${kernel_declare_file_prune} kernel_registry_list_tmp) + foreach(kernel_registry IN LISTS kernel_registry_list_tmp) + if(NOT ${kernel_registry} EQUAL "") + file(APPEND ${kernel_declare_file} "${kernel_registry}\n") + endif() + endforeach() +endfunction() diff --git a/cmake/third_party.cmake b/cmake/third_party.cmake index 94fb1b4d838f9a..96a78b527ac970 100755 --- a/cmake/third_party.cmake +++ b/cmake/third_party.cmake @@ -236,7 +236,7 @@ endif() if(WIN32 OR APPLE OR NOT WITH_GPU - OR ON_INFER) + OR (ON_INFER AND NOT WITH_PYTHON)) set(WITH_DGC OFF) endif() @@ -424,6 +424,19 @@ if(WITH_PSCORE) list(APPEND third_party_deps extern_rocksdb) endif() +if(WITH_DISTRIBUTE + AND NOT WITH_PSLIB + AND NOT WITH_PSCORE + AND NOT WITH_RPC) + include(external/snappy) + list(APPEND third_party_deps extern_snappy) + + include(external/leveldb) + list(APPEND third_party_deps extern_leveldb) + include(external/brpc) + list(APPEND third_party_deps extern_brpc) +endif() + if(WITH_XBYAK) include(external/xbyak) # download, build, install xbyak list(APPEND third_party_deps extern_xbyak) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index fc5aa111483594..5771a0abd75b64 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -1,3 +1,33 @@ paddle.fluid.optimizer.PipelineOptimizer (paddle.fluid.optimizer.PipelineOptimizer, ('document', '2e55a29dbeb874934f7a1a1af3a22b8c')) paddle.fluid.optimizer.PipelineOptimizer.__init__ (ArgSpec(args=['self', 'optimizer', 'num_microbatches', 'start_cpu_core_id'], varargs=None, keywords=None, defaults=(1, 0)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) 
paddle.fluid.optimizer.PipelineOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.audio.features (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) +paddle.audio.features.layers.LogMelSpectrogram (ArgSpec(), ('document', 'c38b53606aa89215c4f00d3833e158b8')) +paddle.audio.features.layers.LogMelSpectrogram.forward (ArgSpec(args=['self', 'x'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'x': }), ('document', '6c14f6f78dc697a6981cf90412e2f1ea')) +paddle.audio.features.layers.LogMelSpectrogram.load_dict (ArgSpec(args=[], varargs='args', varkw='kwargs', defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={}), ('document', '01221a60445ee437f439a8cbe293f759')) +paddle.audio.features.layers.LogMelSpectrogram.state_dict (ArgSpec(args=['self', 'destination', 'include_sublayers', 'structured_name_prefix', 'use_hook'], varargs=None, varkw=None, defaults=(None, True, '', True), kwonlyargs=[], kwonlydefaults=None, annotations={}), ('document', '0c01cb0c12220c9426ae49549b145b0b')) +paddle.audio.features.layers.MFCC (ArgSpec(), ('document', 'bcbe6499830d9228a4f746ddd63b6c0f')) +paddle.audio.features.layers.MFCC.forward (ArgSpec(args=['self', 'x'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'x': }), ('document', 'd86bcaa345f26851089bfdb3efecd9e7')) +paddle.audio.features.layers.MelSpectrogram (ArgSpec(), ('document', 'adf4012310984568ae9da6170aa89f91')) +paddle.audio.features.layers.MelSpectrogram.forward (ArgSpec(args=['self', 'x'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'x': }), ('document', '458e9d454c8773091567c6b400f48cf5')) +paddle.audio.features.layers.Spectrogram (ArgSpec(), ('document', '83811af6da032099bf147e3e01a458e1')) +paddle.audio.features.layers.Spectrogram.forward (ArgSpec(args=['self', 'x'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'x': }), ('document', 'ab11e318fca1410f743b5432394dea35')) +paddle.audio.functional (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) +paddle.audio.functional.functional.compute_fbank_matrix (ArgSpec(args=['sr', 'n_fft', 'n_mels', 'f_min', 'f_max', 'htk', 'norm', 'dtype'], varargs=None, varkw=None, defaults=(64, 0.0, None, False, 'slaney', 'float32'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'sr': , 'n_fft': , 'n_mels': , 'f_min': , 'f_max': typing.Union[float, NoneType], 'htk': , 'norm': typing.Union[str, float], 'dtype': }), ('document', '3c5411caa6baedb68860b09c81e0147c')) +paddle.audio.functional.functional.create_dct (ArgSpec(args=['n_mfcc', 'n_mels', 'norm', 'dtype'], varargs=None, varkw=None, defaults=('ortho', 'float32'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'n_mfcc': , 'n_mels': , 'norm': typing.Union[str, NoneType], 'dtype': }), ('document', 'c9c57550671f9725b053769411d2f65a')) +paddle.audio.functional.functional.fft_frequencies (ArgSpec(args=['sr', 'n_fft', 'dtype'], varargs=None, varkw=None, defaults=('float32',), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'sr': , 'n_fft': , 'dtype': }), ('document', '057b990e79c9c780622407267c0a43c6')) +paddle.audio.functional.functional.hz_to_mel (ArgSpec(args=['freq', 'htk'], varargs=None, varkw=None, 
defaults=(False,), kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.Union[paddle.Tensor, float], 'freq': typing.Union[paddle.Tensor, float], 'htk': }), ('document', '7ca01521dd0bf26cd3f72c67f7168dc4')) +paddle.audio.functional.functional.mel_frequencies (ArgSpec(args=['n_mels', 'f_min', 'f_max', 'htk', 'dtype'], varargs=None, varkw=None, defaults=(64, 0.0, 11025.0, False, 'float32'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'n_mels': , 'f_min': , 'f_max': , 'htk': , 'dtype': }), ('document', '2af3cf997ed1274214ec240b2b59a98d')) +paddle.audio.functional.functional.mel_to_hz (ArgSpec(args=['mel', 'htk'], varargs=None, varkw=None, defaults=(False,), kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.Union[float, paddle.Tensor], 'mel': typing.Union[float, paddle.Tensor], 'htk': }), ('document', 'e93b432d382f98c60d7c7599489e7072')) +paddle.audio.functional.functional.power_to_db (ArgSpec(args=['spect', 'ref_value', 'amin', 'top_db'], varargs=None, varkw=None, defaults=(1.0, 1e-10, 80.0), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'spect': , 'ref_value': , 'amin': , 'top_db': typing.Union[float, NoneType]}), ('document', '28bbb1973e8399e856bfaea0415cecb9')) +paddle.audio.functional.window.get_window (ArgSpec(args=['window', 'win_length', 'fftbins', 'dtype'], varargs=None, varkw=None, defaults=(True, 'float64'), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'window': typing.Union[str, typing.Tuple[str, float]], 'win_length': , 'fftbins': , 'dtype': }), ('document', '2418d63da10c0cd5da9ecf0a88ddf783')) +paddle.audio.backends (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) +paddle.audio.backends.init_backend.get_current_audio_backend (ArgSpec(args=[], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': }), ('document', '3ff9fd62e8be1f3dc7e34afaf50e1645')) +paddle.audio.backends.init_backend.list_available_backends (ArgSpec(args=[], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.List[str]}), ('document', '8eba49f1b69f7ec7fa139a0714a2724e')) +paddle.audio.backends.init_backend.set_backend (ArgSpec(args=['backend_name'], varargs=None, varkw=None, defaults=None, kwonlyargs=[], kwonlydefaults=None, annotations={'backend_name': }), ('document', '9680247dd97274d345dee415e2787527')) +paddle.audio.backends.wave_backend.info (ArgSpec(args=['filepath', 'format'], varargs=None, varkw=None, defaults=(None,), kwonlyargs=[], kwonlydefaults=None, annotations={'return': , 'filepath': , 'format': typing.Union[str, NoneType]}), ('document', 'e0ffd3accd942a9b0a4c08463a9f60f6')) +paddle.audio.backends.wave_backend.load (ArgSpec(args=['filepath', 'frame_offset', 'num_frames', 'normalize', 'channels_first', 'format'], varargs=None, varkw=None, defaults=(0, -1, True, True, None), kwonlyargs=[], kwonlydefaults=None, annotations={'return': typing.Tuple[paddle.Tensor, int], 'filepath': typing.Union[str, pathlib.Path], 'frame_offset': , 'num_frames': , 'normalize': , 'channels_first': , 'format': typing.Union[str, NoneType]}), ('document', '4de50575ca516b4b7c7c82c7fdec808f')) +paddle.audio.backends.wave_backend.save (ArgSpec(args=['filepath', 'src', 'sample_rate', 'channels_first', 'compression', 'format', 'encoding', 'bits_per_sample'], varargs=None, varkw=None, defaults=(True, None, None, None, None), kwonlyargs=[], kwonlydefaults=None, annotations={'filepath': , 'src': , 'sample_rate': , 'channels_first': , 
'compression': typing.Union[float, NoneType], 'format': typing.Union[str, NoneType], 'encoding': typing.Union[str, NoneType], 'bits_per_sample': typing.Union[int, NoneType]}), ('document', '4c85cfcd29a0dcdfc32e74db8c0c3961')) +paddle.audio.datasets (ArgSpec(), ('document', 'd41d8cd98f00b204e9800998ecf8427e')) +paddle.audio.datasets.TESS (ArgSpec(), ('document', '3605f3aa2191ede7ddbe594cd27bb067')) +paddle.audio.datasets.TESS.meta_info (ArgSpec(), ('document', '60d548a6f71629c3b69bcda3a30d4819')) diff --git a/paddle/fluid/distributed/CMakeLists.txt b/paddle/fluid/distributed/CMakeLists.txt index 0201d1131eb4a2..d60c7329c93edf 100755 --- a/paddle/fluid/distributed/CMakeLists.txt +++ b/paddle/fluid/distributed/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(auto_parallel) add_subdirectory(collective) add_subdirectory(store) +add_subdirectory(fleet_executor) if(WITH_PYTHON) py_proto_compile(ps_py_proto SRCS the_one_ps.proto) add_custom_target( @@ -29,7 +30,6 @@ if(WITH_PYTHON) endif() if(NOT WITH_PSCORE) - add_subdirectory(fleet_executor) return() endif() @@ -47,4 +47,3 @@ add_subdirectory(common) add_subdirectory(ps) add_subdirectory(test) add_subdirectory(index_dataset) -add_subdirectory(fleet_executor) diff --git a/paddle/fluid/distributed/collective/ProcessGroup.h b/paddle/fluid/distributed/collective/ProcessGroup.h index 10b1686ddb85fe..fb4832442a4699 100644 --- a/paddle/fluid/distributed/collective/ProcessGroup.h +++ b/paddle/fluid/distributed/collective/ProcessGroup.h @@ -122,6 +122,16 @@ class ProcessGroup { "ProcessGroup%s does not support broadcast", GetBackendName())); } + virtual std::shared_ptr Broadcast( + std::vector& /* input tensors */, // NOLINT + std::vector& /* output tensors */, // NOLINT + const BroadcastOptions&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support broadcast with sync_op flag", + GetBackendName())); + } + virtual std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) { PADDLE_THROW(platform::errors::InvalidArgument( @@ -134,38 +144,89 @@ class ProcessGroup { "ProcessGroup%s does not support send", GetBackendName())); } + virtual std::shared_ptr Send( + std::vector&, int, bool) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send with sync_op flag", + GetBackendName())); + } + virtual std::shared_ptr Recv( - std::vector& tensors, int) { // NOLINT + std::vector&, int) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support receive", GetBackendName())); + "ProcessGroup%s does not support recv", GetBackendName())); } - virtual std::shared_ptr Send_Partial(phi::DenseTensor&, - int, - int, - int) { // NOLINT + virtual std::shared_ptr Recv( + std::vector&, int, bool) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support send", GetBackendName())); + "ProcessGroup%s does not support recv with sync_op flag", + GetBackendName())); + } + + virtual std::shared_ptr Send_Partial( + phi::DenseTensor&, // NOLINT + int, + int64_t, + int64_t) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send_partial", GetBackendName())); + } + + virtual std::shared_ptr Send_Partial( + phi::DenseTensor&, int, int64_t, int64_t, bool) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support send_partial with sync_op flag", + GetBackendName())); } virtual std::shared_ptr Recv_Partial( - phi::DenseTensor& tensors, int, int, int) { // 
NOLINT + phi::DenseTensor&, // NOLINT + int, + int64_t, + int64_t) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support receive", GetBackendName())); + "ProcessGroup%s does not support recv_partial", GetBackendName())); + } + + virtual std::shared_ptr Recv_Partial( + phi::DenseTensor&, int, int64_t, int64_t, bool) { // NOLINT + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support recv_partial with sync_op flag", + GetBackendName())); } virtual std::shared_ptr AllGather( std::vector&, // NOLINT std::vector&) { // NOLINT PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support AllGather", GetBackendName())); + "ProcessGroup%s does not support all_gather", GetBackendName())); + } + + virtual std::shared_ptr AllGather( + std::vector&, // NOLINT + std::vector&, // NOLINT + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support all_gather with sync_op flag", + GetBackendName())); } virtual std::shared_ptr AllGather_Partial( std::vector& in_tensors, // NOLINT std::vector& out_tensors, // NOLINT - int offset, - int length) { // NOLINT + int64_t offset, + int64_t length) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support AllGather_Partial", GetBackendName())); + } + + virtual std::shared_ptr AllGather_Partial( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + int64_t offset, + int64_t length, + bool) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support AllGather_Partial", GetBackendName())); } @@ -177,6 +238,14 @@ class ProcessGroup { "ProcessGroup%s does not support AllToAll", GetBackendName())); } + virtual std::shared_ptr AllToAll( + std::vector&, // NOLINT + std::vector&, // NOLINT + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support alltoall", GetBackendName())); + } + virtual std::shared_ptr AllToAll_Single( std::vector&, // NOLINT std::vector&, // NOLINT @@ -186,26 +255,66 @@ class ProcessGroup { "ProcessGroup%s does not support AllToAll_Single", GetBackendName())); } + virtual std::shared_ptr AllToAllSingle( + std::vector&, // NOLINT + std::vector&, // NOLINT + std::vector&, + std::vector&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support alltoall_single", GetBackendName())); + } + virtual std::shared_ptr Reduce( std::vector&, // NOLINT std::vector&, // NOLINT const ReduceOptions& opts) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support Reduce", GetBackendName())); + "ProcessGroup%s does not support reduce", GetBackendName())); + } + + virtual std::shared_ptr Reduce( + std::vector& /* input tensors */, // NOLINT + std::vector& /* output tensors */, // NOLINT + const ReduceOptions&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support reduce with sync_op flag", + GetBackendName())); + } + + virtual std::shared_ptr Scatter( + std::vector&, // NOLINT + std::vector&, // NOLINT + const ScatterOptions&) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support scatter", GetBackendName())); } virtual std::shared_ptr Scatter( std::vector&, // NOLINT std::vector&, // NOLINT - const ScatterOptions&) { // NOLINT + const ScatterOptions&, + bool) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support scatter with sync_op flag", + GetBackendName())); + } + + virtual 
std::shared_ptr ReduceScatter( + std::vector&, // NOLINT + std::vector&, // NOLINT + const ReduceScatterOptions&, + bool) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support Scatter", GetBackendName())); + "ProcessGroup%s does not support reduce_scatter with sync_op flag", + GetBackendName())); } virtual std::shared_ptr _ReduceScatterBase( - phi::DenseTensor&, // NOLINT - phi::DenseTensor&, // NOLINT - const ReduceScatterOptions&) { // NOLINT + phi::DenseTensor&, // NOLINT + phi::DenseTensor&, // NOLINT + const ReduceScatterOptions&) { PADDLE_THROW(platform::errors::InvalidArgument( "ProcessGroup%s does not support ReduceScatter", GetBackendName())); } diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc index ad9356b368ea26..f18765a05f6190 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.cc @@ -267,8 +267,8 @@ void* XcclGetPointerByOffset(void* raw_pointer, std::shared_ptr ProcessGroupCustom::AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, - int offset, - int length) { + int64_t offset, + int64_t length) { PADDLE_ENFORCE_EQ( CheckTensorsInCustomPlace(in_tensors, device_type_), true, diff --git a/paddle/fluid/distributed/collective/ProcessGroupCustom.h b/paddle/fluid/distributed/collective/ProcessGroupCustom.h index ccce66603afe69..ce3532bbb6f0e2 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupCustom.h +++ b/paddle/fluid/distributed/collective/ProcessGroupCustom.h @@ -80,8 +80,8 @@ class ProcessGroupCustom : public ProcessGroup { std::shared_ptr AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, - int offset, - int length) override; + int64_t offset, + int64_t length) override; std::shared_ptr AllReduce( std::vector& in_tensors, @@ -117,8 +117,8 @@ class ProcessGroupCustom : public ProcessGroup { std::set used_place_ids_; private: - void BcastCustomId(std::vector& ccl_ids, - int root, // NOLINT + void BcastCustomId(std::vector& ccl_ids, // NOLINT + int root, int server_fd); void BroadcastUniqueCustomID( diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc index b23942b114f3be..07065ac908e4e3 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.cc @@ -88,6 +88,9 @@ namespace distributed { case experimental::DataType::BOOL: \ func(args); \ break; \ + case experimental::DataType::BFLOAT16: \ + func(args); \ + break; \ default: \ VLOG(0) << "Error: Unknown DataType."; \ exit(-1); \ @@ -293,6 +296,14 @@ std::shared_ptr ProcessGroupGloo::AllReduce( std::vector& inputs, std::vector& outputs, const AllreduceOptions& opts) { + return AllReduce(inputs, outputs, opts, true); +} + +std::shared_ptr ProcessGroupGloo::AllReduce( + std::vector& inputs, + std::vector& outputs, + const AllreduceOptions& opts, + bool sync_op) { auto tag = next_tag(); std::shared_ptr task; auto context = get_context(); diff --git a/paddle/fluid/distributed/collective/ProcessGroupGloo.h b/paddle/fluid/distributed/collective/ProcessGroupGloo.h index 95ce18c1d8217e..d911da91eb1a32 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupGloo.h +++ b/paddle/fluid/distributed/collective/ProcessGroupGloo.h @@ -120,6 +120,12 @@ class ProcessGroupGloo : public ProcessGroup { std::vector& outputs, const AllreduceOptions& opts = 
AllreduceOptions()) override; + std::shared_ptr AllReduce( + std::vector& inputs, + std::vector& outputs, + const AllreduceOptions& opts, + bool sync_op) override; + std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc index 239114ae6188ce..f9429e1efa774d 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.cc @@ -51,6 +51,17 @@ std::shared_ptr ProcessGroupNCCL::CreateTask( places, rank, comm_type, inputs); } +std::shared_ptr ProcessGroupNCCL::CreateTask( + const std::vector& places, + int rank, + CommType comm_type, + const std::vector& inputs, + bool is_sync, + bool use_calc_stream) { + return std::make_shared( + places, rank, comm_type, inputs, is_sync, use_calc_stream); +} + ProcessGroupNCCL::NCCLTask::NCCLTask( const std::vector& places, int rank, @@ -264,10 +275,12 @@ std::shared_ptr ProcessGroupNCCL::Collective( auto& nccl_comms = places_to_ncclcomm_[key]; - SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + if (!use_calc_stream) { + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + } - auto task = std::make_shared( - places, rank_, comm_type, inputs, sync_op, use_calc_stream); + auto task = + CreateTask(places, rank_, comm_type, inputs, sync_op, use_calc_stream); platform::CUDADeviceGuard cuda_guard; @@ -406,6 +419,78 @@ void ProcessGroupNCCL::Collective(const phi::DenseTensor* in, cuda_guard.SetDevice(places[0]); } +template +std::shared_ptr ProcessGroupNCCL::PointToPoint( + std::vector& tensors, + Fn fn, + int dst_rank, + CommType op_type, + bool sync_op, + bool use_calc_stream) { + const auto& places = GetPlaceList(tensors); + const auto& key = GetKeyFromPlaces(places); + + { + std::lock_guard lock(mutex_); + if (places_to_ncclcomm_.find(key) == places_to_ncclcomm_.end()) { + CreateNCCLManagerCache(key, places); + } + } + + auto& nccl_comms = places_to_ncclcomm_[key]; + + if (!use_calc_stream) { + SyncDefaultStream(places, places_to_events_[key], places_to_ctx_[key]); + } + + auto task = + CreateTask(places, rank_, op_type, tensors, sync_op, use_calc_stream); + + platform::CUDADeviceGuard cuda_guard; + + { + platform::NCCLGroupGuard nccl_guard; + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + gpuStream_t nccl_stream; + if (use_calc_stream) { + nccl_stream = + static_cast( + platform::DeviceContextPool::Instance().Get(places[i])) + ->stream(); + } else { + nccl_stream = places_to_ctx_[key][i]->stream(); + } + fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank); + } + } + + if (FLAGS_use_stream_safe_cuda_allocator) { + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + gpuStream_t nccl_stream; + if (use_calc_stream) { + nccl_stream = + static_cast( + platform::DeviceContextPool::Instance().Get(places[i])) + ->stream(); + } else { + nccl_stream = places_to_ctx_[key][i]->stream(); + } + memory::RecordStream(tensors[i].Holder(), nccl_stream); + } + } + + if (!use_calc_stream) { + for (size_t i = 0; i < tensors.size(); ++i) { + cuda_guard.SetDevice(places[i]); + task->control_events_[i].Record(*places_to_ctx_[key][i]); + } + } + + return task; +} + template std::shared_ptr ProcessGroupNCCL::PointToPoint( std::vector& tensors, @@ -431,20 +516,20 @@ std::shared_ptr ProcessGroupNCCL::PointToPoint( // construct uninitialize guard for 
device platform::CUDADeviceGuard cuda_guard; - if (FLAGS_use_stream_safe_cuda_allocator) { + { + platform::NCCLGroupGuard nccl_guard; for (size_t i = 0; i < tensors.size(); ++i) { cuda_guard.SetDevice(places[i]); - memory::RecordStream(tensors[i].Holder(), - places_to_ctx_[key][i]->stream()); + const auto& nccl_stream = places_to_ctx_[key][i]->stream(); + fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank); } } - { - platform::NCCLGroupGuard nccl_guard; + if (FLAGS_use_stream_safe_cuda_allocator) { for (size_t i = 0; i < tensors.size(); ++i) { cuda_guard.SetDevice(places[i]); - const auto& nccl_stream = places_to_ctx_[key][i]->stream(); - fn(tensors[i], nccl_comms[i]->GetNcclComm(), nccl_stream, dst_rank); + memory::RecordStream(tensors[i].Holder(), + places_to_ctx_[key][i]->stream()); } } @@ -543,6 +628,40 @@ std::shared_ptr ProcessGroupNCCL::Broadcast( CommType::BROADCAST); } +std::shared_ptr ProcessGroupNCCL::Broadcast( + std::vector& in_tensors, + std::vector& out_tensors, + const BroadcastOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + const auto root = + opts.source_rank * in_tensors.size() + opts.source_root; + return platform::dynload::ncclBroadcast( + input.data(), + output.data(), + input.numel(), + platform::ToNCCLDataType(input.type()), + root, + comm, + stream); + }, + CommType::BROADCAST, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::Barrier( const BarrierOptions& opts) { // Only support single card single process @@ -617,6 +736,34 @@ std::shared_ptr ProcessGroupNCCL::Send( return task; } +std::shared_ptr ProcessGroupNCCL::Send( + std::vector& tensors, + int dst_rank, + bool sync_op, + bool use_calc_stream) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](phi::DenseTensor& input, + ncclComm_t comm, + const gpuStream_t& stream, + int dst_rank) { + return platform::dynload::ncclSend( + input.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + dst_rank, + comm, + stream); + }, + dst_rank, + CommType::SEND, + sync_op, + use_calc_stream); + return task; +} + std::shared_ptr ProcessGroupNCCL::Recv( std::vector& tensors, int src_rank) { CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); @@ -640,17 +787,43 @@ std::shared_ptr ProcessGroupNCCL::Recv( return task; } +std::shared_ptr ProcessGroupNCCL::Recv( + std::vector& tensors, + int src_rank, + bool sync_op, + bool use_calc_stream) { + CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); + + auto task = PointToPoint( + tensors, + [&](phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream, + int src_rank) { + return platform::dynload::ncclRecv( + output.data(), + output.numel(), + platform::ToNCCLDataType(output.dtype()), + src_rank, + comm, + stream); + }, + src_rank, + CommType::RECV, + sync_op, + use_calc_stream); + return task; +} + std::shared_ptr ProcessGroupNCCL::Send_Partial( - phi::DenseTensor& tensors, int dst_rank, int offset, int length) { + phi::DenseTensor& tensors, int dst_rank, int64_t offset, int64_t length) { // CheckTensorsInDifferentDevices(tensors, static_cast(GetSize())); phi::DenseTensor flatten_tensor; 
flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); - phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length); - - std::vector shared_tensors; - shared_tensors.push_back(shared_input); + std::vector shared_tensors{ + flatten_tensor.Slice(offset, offset + length)}; auto task = PointToPoint( shared_tensors, @@ -671,16 +844,49 @@ std::shared_ptr ProcessGroupNCCL::Send_Partial( return task; } +std::shared_ptr ProcessGroupNCCL::Send_Partial( + phi::DenseTensor& tensors, + int dst_rank, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) { + phi::DenseTensor flatten_tensor; + flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); + + std::vector shared_tensors{ + flatten_tensor.Slice(offset, offset + length)}; + + auto task = PointToPoint( + shared_tensors, + [&](phi::DenseTensor& input, + ncclComm_t comm, + const gpuStream_t& stream, + int dst_rank) { + return platform::dynload::ncclSend( + input.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + dst_rank, + comm, + stream); + }, + dst_rank, + CommType::SEND, + sync_op, + use_calc_stream); + return task; +} + std::shared_ptr ProcessGroupNCCL::Recv_Partial( - phi::DenseTensor& tensors, int src_rank, int offset, int length) { + phi::DenseTensor& tensors, int src_rank, int64_t offset, int64_t length) { // phi::DenseTensor shared_input = tensors.Slice(offset, offset+length); phi::DenseTensor flatten_tensor; flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); - phi::DenseTensor shared_input = flatten_tensor.Slice(offset, offset + length); - std::vector shared_tensors; - shared_tensors.push_back(shared_input); + std::vector shared_tensors{ + flatten_tensor.Slice(offset, offset + length)}; auto task = PointToPoint( shared_tensors, @@ -701,6 +907,40 @@ std::shared_ptr ProcessGroupNCCL::Recv_Partial( return task; } +std::shared_ptr ProcessGroupNCCL::Recv_Partial( + phi::DenseTensor& tensors, + int src_rank, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) { + phi::DenseTensor flatten_tensor; + flatten_tensor.ShareDataWith(tensors).Resize({tensors.numel()}); + + std::vector shared_tensors{ + flatten_tensor.Slice(offset, offset + length)}; + + auto task = PointToPoint( + shared_tensors, + [&](phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream, + int src_rank) { + return platform::dynload::ncclRecv( + output.data(), + output.numel(), + platform::ToNCCLDataType(output.dtype()), + src_rank, + comm, + stream); + }, + src_rank, + CommType::RECV, + sync_op, + use_calc_stream); + return task; +} + std::shared_ptr ProcessGroupNCCL::AllGather( std::vector& in_tensors, std::vector& out_tensors) { @@ -730,33 +970,69 @@ std::shared_ptr ProcessGroupNCCL::AllGather( CommType::ALLGATHER); } -void* GetPointerByOffset(void* raw_pointer, - size_t offset, - experimental::DataType type) { - if (type == experimental::DataType::FLOAT32) { - return reinterpret_cast(reinterpret_cast(raw_pointer) + - offset); - } else if (type == experimental::DataType::FLOAT64) { - return reinterpret_cast(reinterpret_cast(raw_pointer) + - offset); - } else if (type == experimental::DataType::FLOAT16) { - return reinterpret_cast(reinterpret_cast(raw_pointer) + - offset); - } else if (type == experimental::DataType::INT32) { - return reinterpret_cast(reinterpret_cast(raw_pointer) + - offset); - } else if (type == experimental::DataType::INT64) { - return reinterpret_cast(reinterpret_cast(raw_pointer) + - offset); - } else if (type == 
experimental::DataType::INT8) { - return reinterpret_cast(reinterpret_cast(raw_pointer) + - offset); - } else if (type == experimental::DataType::UINT8) { - return reinterpret_cast(reinterpret_cast(raw_pointer) + +std::shared_ptr ProcessGroupNCCL::AllGather( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](const phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + return platform::dynload::ncclAllGather( + input.data(), + output.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + comm, + stream); + }, + CommType::ALLGATHER, + sync_op, + use_calc_stream); +} + +void* GetPointerByOffset(void* raw_pointer, + size_t offset, + experimental::DataType type) { + if (type == experimental::DataType::FLOAT32) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::FLOAT64) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::FLOAT16) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::INT32) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::INT64) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::INT8) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); + } else if (type == experimental::DataType::UINT8) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + offset); } else if (type == experimental::DataType::BOOL) { return reinterpret_cast(reinterpret_cast(raw_pointer) + offset); + } else if (type == experimental::DataType::BFLOAT16) { + return reinterpret_cast(reinterpret_cast(raw_pointer) + + offset); } else { PADDLE_THROW(platform::errors::Unimplemented( "This datatype in nccl is not supported.")); @@ -767,8 +1043,8 @@ void* GetPointerByOffset(void* raw_pointer, std::shared_ptr ProcessGroupNCCL::AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, - int offset, - int length) { + int64_t offset, + int64_t length) { PADDLE_ENFORCE_EQ( CheckTensorsInCudaPlace(in_tensors), true, @@ -795,6 +1071,41 @@ std::shared_ptr ProcessGroupNCCL::AllGather_Partial( CommType::ALLGATHER); } +std::shared_ptr ProcessGroupNCCL::AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All outputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + return platform::dynload::ncclAllGather( + GetPointerByOffset(input.data(), offset, input.dtype()), + output.data(), + length, + platform::ToNCCLDataType(input.dtype()), + comm, + stream); + }, 
+ CommType::ALLGATHER, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::AllToAll( std::vector& in_tensors, std::vector& out_tensors) { @@ -837,6 +1148,52 @@ std::shared_ptr ProcessGroupNCCL::AllToAll( CommType::ALLTOALL); } +std::shared_ptr ProcessGroupNCCL::AllToAll( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + size_t offset = 0; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input.data(), offset, input.dtype()), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + GetPointerByOffset(output.data(), offset, input.dtype()), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + offset += input.numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + }, + CommType::ALLTOALL, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::AllToAll_Single( std::vector& in_tensors, std::vector& out_tensors, @@ -899,6 +1256,72 @@ std::shared_ptr ProcessGroupNCCL::AllToAll_Single( CommType::ALLTOALL_SINGLE); } +std::shared_ptr ProcessGroupNCCL::AllToAllSingle( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ(input.dtype() == output.dtype(), + true, + platform::errors::InvalidArgument( + "The dtypes of input and output must be equal.")); + + std::vector in_dims = phi::vectorize(input.dims()); + std::vector out_dims = phi::vectorize(output.dims()); + CheckSplitSizes(&in_sizes, in_dims); + CheckSplitSizes(&out_sizes, out_dims); + + size_t in_offset = 0, out_offset = 0; + size_t in_length = 0, out_length = 0; + size_t in_row_size = input.numel() / in_dims[0]; + size_t out_row_size = output.numel() / out_dims[0]; + + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + in_length = in_sizes[i] * in_row_size; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input.data(), in_offset, input.dtype()), + in_length, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + in_offset += in_length; + + out_length = out_sizes[i] * out_row_size; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + GetPointerByOffset(output.data(), out_offset, input.dtype()), + out_length, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); 
+ out_offset += out_length; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + }, + CommType::ALLTOALL_SINGLE, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::Reduce( std::vector& in_tensors, std::vector& out_tensors, @@ -927,6 +1350,70 @@ std::shared_ptr ProcessGroupNCCL::Reduce( CommType::REDUCE); } +std::shared_ptr ProcessGroupNCCL::Reduce( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](const phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduce( + input.data(), + output.data(), + input.numel(), + platform::ToNCCLDataType(input.dtype()), + ToNCCLRedType(opts.reduce_op), + opts.root_rank, + comm, + stream)); + }, + CommType::REDUCE, + sync_op, + use_calc_stream); +} + +std::shared_ptr ProcessGroupNCCL::ReduceScatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + if (FLAGS_use_stream_safe_cuda_allocator) { + platform::CUDADeviceGuard cuda_guard; + cuda_guard.SetDevice(output.place()); + memory::RecordStream(output.Holder(), stream); + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclReduceScatter( + input.data(), + output.data(), + output.numel(), + platform::ToNCCLDataType(input.dtype()), + ToNCCLRedType(opts.reduce_op), + comm, + stream)); + }, + CommType::REDUCE_SCATTER, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::Scatter( std::vector& in_tensors, std::vector& out_tensors, @@ -980,6 +1467,68 @@ std::shared_ptr ProcessGroupNCCL::Scatter( CommType::SCATTER); } +std::shared_ptr ProcessGroupNCCL::Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(in_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + PADDLE_ENFORCE_EQ( + CheckTensorsInCudaPlace(out_tensors), + true, + platform::errors::InvalidArgument("All inputs should be in CudaPlace.")); + return Collective( + in_tensors, + out_tensors, + [&](phi::DenseTensor& input, + phi::DenseTensor& output, + ncclComm_t comm, + const gpuStream_t& stream) { + PADDLE_ENFORCE_EQ( + output.numel(), + input.numel() / size_, + platform::errors::InvalidArgument( + "Input and output tensors should have the same shape.")); + size_t offset = 0; + if (rank_ == opts.root_rank) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupStart()); + for (auto i = 0; i < size_; i++) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( + GetPointerByOffset(input.data(), offset, input.dtype()), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + i, + comm, + stream)); + offset += input.numel() / size_; + } + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output.data(), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + opts.root_rank, + comm, + stream)); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclGroupEnd()); + } else { + 
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( + output.data(), + input.numel() / size_, + platform::ToNCCLDataType(input.dtype()), + opts.root_rank, + comm, + stream)); + } + }, + CommType::SCATTER, + sync_op, + use_calc_stream); +} + std::shared_ptr ProcessGroupNCCL::_ReduceScatterBase( phi::DenseTensor& out_tensor, phi::DenseTensor& in_tensor, @@ -1044,13 +1593,22 @@ ncclComm_t ProcessGroupNCCL::NCCLComm(const Place& place) const { phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( const Place& place) const { - std::vector places = {place}; - const auto& iter = places_to_ctx_.find(GetKeyFromPlaces(places)); - PADDLE_ENFORCE_NE(iter, - places_to_ctx_.end(), - platform::errors::InvalidArgument( - "Cannot find device context in process group.")); - return iter->second[0].get(); + return GetDeviceContext(place, /*use_calc_stream*/ false); +} + +phi::DeviceContext* ProcessGroupNCCL::GetDeviceContext( + const Place& place, bool use_calc_stream) const { + if (use_calc_stream) { + return platform::DeviceContextPool::Instance().Get(place); + } else { + std::vector places = {place}; + const auto& iter = places_to_ctx_.find(GetKeyFromPlaces(places)); + PADDLE_ENFORCE_NE(iter, + places_to_ctx_.end(), + platform::errors::InvalidArgument( + "Cannot find device context in process group.")); + return iter->second[0].get(); + } } } // namespace distributed diff --git a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h index e0e298e9113e9e..6427e9e3e2ab1c 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupNCCL.h +++ b/paddle/fluid/distributed/collective/ProcessGroupNCCL.h @@ -60,7 +60,7 @@ class ProcessGroupNCCL : public ProcessGroupStream { int rank, CommType comm_type, const std::vector& inputs, - bool is_sync, + bool sync_op, bool use_calc_stream); bool IsCompleted(); @@ -98,6 +98,9 @@ class ProcessGroupNCCL : public ProcessGroupStream { phi::DeviceContext* GetDeviceContext(const Place& place) const override; + phi::DeviceContext* GetDeviceContext(const Place& place, + bool use_calc_stream) const override; + std::shared_ptr AllReduce( std::vector& in_tensors, // NOLINT std::vector& out_tensors, // NOLINT @@ -116,38 +119,93 @@ class ProcessGroupNCCL : public ProcessGroupStream { std::vector& out_tensors, const BroadcastOptions& = BroadcastOptions()) override; + std::shared_ptr Broadcast( + std::vector& in_tensors, + std::vector& out_tensors, + const BroadcastOptions& opts, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Barrier( const BarrierOptions& = BarrierOptions()) override; std::shared_ptr Send( std::vector& tensors, int dst_rank) override; + std::shared_ptr Send( + std::vector& tensors, + int dst_rank, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Recv( std::vector& tensors, int src_rank) override; + std::shared_ptr Recv( + std::vector& tensors, + int src_rank, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Send_Partial(phi::DenseTensor& tensors, int dst_rank, - int offset, - int length) override; + int64_t offset, + int64_t length) override; + + std::shared_ptr Send_Partial( + phi::DenseTensor& tensors, + int dst_rank, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) override; std::shared_ptr Recv_Partial(phi::DenseTensor& tensors, int src_rank, - int offset, - int length) override; + int64_t offset, + int64_t length) override; + + std::shared_ptr Recv_Partial( + phi::DenseTensor& tensors, + int src_rank, + int64_t 
offset, + int64_t length, + bool sync_op, + bool use_calc_stream) override; std::shared_ptr AllGather( std::vector& in_tensors, std::vector& out_tensors) override; + std::shared_ptr AllGather( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) override; + + std::shared_ptr AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length) override; + std::shared_ptr AllGather_Partial( std::vector& in_tensors, std::vector& out_tensors, - int offset, - int length) override; + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) override; std::shared_ptr AllToAll( - std::vector& in, - std::vector& out) override; + std::vector& in_tensors, + std::vector& out_tensors) override; + + std::shared_ptr AllToAll( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) override; std::shared_ptr AllToAll_Single( std::vector& in, @@ -155,15 +213,44 @@ class ProcessGroupNCCL : public ProcessGroupStream { std::vector& in_sizes, std::vector& out_sizes) override; + std::shared_ptr AllToAllSingle( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes, + bool sync_op, + bool use_calc_stream) override; + std::shared_ptr Reduce( std::vector& tensors, std::vector& out_tensors, const ReduceOptions& opts) override; + std::shared_ptr Reduce( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) override; + + std::shared_ptr ReduceScatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream) override; + + std::shared_ptr Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions& opts) override; + std::shared_ptr Scatter( std::vector& in_tensors, std::vector& out_tensors, - const ScatterOptions&) override; + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream) override; std::shared_ptr _ReduceScatterBase( phi::DenseTensor&, // NOLINT @@ -180,9 +267,17 @@ class ProcessGroupNCCL : public ProcessGroupStream { virtual std::shared_ptr CreateTask( std::vector places, int rank, - CommType opType, + CommType op_type, const std::vector& inputs); + virtual std::shared_ptr CreateTask( + const std::vector& places, + int rank, + CommType op_type, + const std::vector& inputs, + bool sync_op, + bool use_calc_stream); + protected: std::shared_ptr store_; std::shared_ptr nccl_comm_; @@ -233,6 +328,15 @@ class ProcessGroupNCCL : public ProcessGroupStream { int dst_rank, CommType op_type); + template + std::shared_ptr PointToPoint( + std::vector& tensors, // NOLINT + Fn fn, + int dst_rank, + CommType op_type, + bool sync_op, + bool use_calc_stream); + void CreateNCCLManagerCache(const std::string& places_key, const std::vector& places); diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.cc b/paddle/fluid/distributed/collective/ProcessGroupStream.cc index 9a20b8e6eaf79f..b2cfae088b2271 100644 --- a/paddle/fluid/distributed/collective/ProcessGroupStream.cc +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.cc @@ -23,6 +23,31 @@ ProcessGroupStream::ProcessGroupStream(int rank, int gid) : ProcessGroup(rank, size, place, gid) {} +phi::DeviceContext* ProcessGroupStream::GetDeviceContext( + const Place& place, bool use_calc_stream) const { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support get 
device_context.", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::AllGather( + std::vector& input_tensors, // NOLINT + std::vector& output_tensors, // NOLINT + bool sync_op) { + return AllGather(input_tensors, + output_tensors, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::AllGather( + std::vector& input_tensors, // NOLINT + std::vector& output_tensors, // NOLINT + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do all_gather", GetBackendName())); +} + std::shared_ptr ProcessGroupStream::AllReduce( std::vector& input_tensors, // NOLINT std::vector& output_tensors, // NOLINT @@ -42,7 +67,248 @@ std::shared_ptr ProcessGroupStream::AllReduce( bool sync_op, bool use_calc_stream) { PADDLE_THROW(platform::errors::InvalidArgument( - "ProcessGroup%s does not support do allreduce", GetBackendName())); + "ProcessGroup%s does not support do all_reduce", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::AllToAll( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op) { + return AllToAll(in_tensors, + out_tensors, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::AllToAll( + std::vector& in_tensors, + std::vector& out_tensors, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do alltoall", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::AllToAllSingle( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes, + bool sync_op) { + return AllToAllSingle(in_tensors, + out_tensors, + in_sizes, + out_sizes, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::AllToAllSingle( + std::vector& in_tensors, + std::vector& out_tensors, + std::vector& in_sizes, + std::vector& out_sizes, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do alltoall_single", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Broadcast( + std::vector& in_tensors, + std::vector& out_tensors, + const BroadcastOptions& opts, + bool sync_op) { + return Broadcast(in_tensors, + out_tensors, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Broadcast( + std::vector& in_tensors, + std::vector& out_tensors, + const BroadcastOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do broadcast", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Reduce( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts, + bool sync_op) { + return Reduce(in_tensors, + out_tensors, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Reduce( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do reduce", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::ReduceScatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ReduceScatterOptions& opts, + bool sync_op) { + return ReduceScatter(in_tensors, + out_tensors, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::ReduceScatter( + std::vector& in_tensors, 
+ std::vector& out_tensors, + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do reduce_scatter", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions& opts, + bool sync_op) { + return Scatter(in_tensors, + out_tensors, + opts, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Scatter( + std::vector& in_tensors, + std::vector& out_tensors, + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do scatter", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Send( + std::vector& tensors, int dst_rank, bool sync_op) { + return Send(tensors, + dst_rank, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Send( + std::vector& tensors, + int dst_rank, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do send", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Send_Partial( + phi::DenseTensor& tensors, + int dst_rank, + int64_t offset, + int64_t length, + bool sync_op) { + return Send_Partial(tensors, + dst_rank, + offset, + length, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Send_Partial( + phi::DenseTensor& tensors, + int dst_rank, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do send_partial", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Recv( + std::vector& tensors, int src_rank, bool sync_op) { + return Recv(tensors, + src_rank, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Recv( + std::vector& tensors, + int src_rank, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do recv", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::Recv_Partial( + phi::DenseTensor& tensors, + int src_rank, + int64_t offset, + int64_t length, + bool sync_op) { + return Recv_Partial(tensors, + src_rank, + offset, + length, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::Recv_Partial( + phi::DenseTensor& tensors, + int src_rank, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do recv_partial", GetBackendName())); +} + +std::shared_ptr ProcessGroupStream::AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, + bool sync_op) { + return AllGather_Partial(in_tensors, + out_tensors, + offset, + length, + sync_op, + /*use_calc_stream*/ false); +} + +std::shared_ptr ProcessGroupStream::AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream) { + PADDLE_THROW(platform::errors::InvalidArgument( + "ProcessGroup%s does not support do recv_partial", GetBackendName())); } } // namespace distributed diff --git a/paddle/fluid/distributed/collective/ProcessGroupStream.h b/paddle/fluid/distributed/collective/ProcessGroupStream.h index 81a05ee2416e04..2f0aa139104e92 100644 --- 
a/paddle/fluid/distributed/collective/ProcessGroupStream.h +++ b/paddle/fluid/distributed/collective/ProcessGroupStream.h @@ -54,6 +54,20 @@ class ProcessGroupStream : public ProcessGroup { ProcessGroupStream(int rank, int size, const platform::Place& place, int gid); virtual ~ProcessGroupStream() = default; + virtual phi::DeviceContext* GetDeviceContext(const Place& place, + bool use_calc_stream) const; + + std::shared_ptr AllGather( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + bool sync_op) override; + + virtual std::shared_ptr AllGather( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + bool sync_op, + bool use_calc_stream); + std::shared_ptr AllReduce( std::vector& input_tensors, // NOLINT std::vector& output_tensors, // NOLINT @@ -66,6 +80,151 @@ class ProcessGroupStream : public ProcessGroup { const AllreduceOptions& options, bool sync_op, bool use_calc_stream); + + std::shared_ptr AllToAll( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + bool sync_op) override; + + virtual std::shared_ptr AllToAll( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + bool sync_op, + bool use_calc_stream); + + std::shared_ptr AllToAllSingle( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + std::vector& in_sizes, // NOLINT + std::vector& out_sizes, // NOLINT + bool sync_op) override; + + virtual std::shared_ptr AllToAllSingle( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + std::vector& in_sizes, // NOLINT + std::vector& out_sizes, // NOLINT + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Broadcast( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const BroadcastOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr Broadcast( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const BroadcastOptions& opts, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Reduce( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ReduceOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr Reduce( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ReduceOptions& opts, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr ReduceScatter( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ReduceScatterOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr ReduceScatter( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ReduceScatterOptions& opts, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Scatter( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ScatterOptions& opts, + bool sync_op) override; + + virtual std::shared_ptr Scatter( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + const ScatterOptions& opts, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Send( + std::vector& tensors, // NOLINT + int dst_rank, + bool sync_op) override; + + virtual std::shared_ptr Send( + std::vector& tensors, // NOLINT + int dst_rank, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Send_Partial( + phi::DenseTensor& tensors, // NOLINT + int dst_rank, + int64_t offset, + int64_t length, + bool sync_op) override; + + virtual std::shared_ptr Send_Partial( + phi::DenseTensor& tensors, // NOLINT + int dst_rank, + int64_t 
offset, + int64_t length, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Recv( + std::vector& tensors, // NOLINT + int src_rank, + bool sync_op) override; + + virtual std::shared_ptr Recv( + std::vector& tensors, // NOLINT + int src_rank, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr Recv_Partial( + phi::DenseTensor& tensors, // NOLINT + int src_rank, + int64_t offset, + int64_t length, + bool sync_op) override; + + virtual std::shared_ptr Recv_Partial( + phi::DenseTensor& tensors, // NOLINT + int src_rank, + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream); + + std::shared_ptr AllGather_Partial( + std::vector& in_tensors, + std::vector& out_tensors, + int64_t offset, + int64_t length, + bool sync_op) override; + + virtual std::shared_ptr AllGather_Partial( + std::vector& in_tensors, // NOLINT + std::vector& out_tensors, // NOLINT + int64_t offset, + int64_t length, + bool sync_op, + bool use_calc_stream); }; } // namespace distributed diff --git a/paddle/fluid/distributed/collective/Utils.h b/paddle/fluid/distributed/collective/Utils.h new file mode 100644 index 00000000000000..c06c0345163ed7 --- /dev/null +++ b/paddle/fluid/distributed/collective/Utils.h @@ -0,0 +1,253 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
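Every collective touched in ProcessGroupStream above follows the same shape: a plain overload taking only sync_op forwards to a virtual overload with an explicit use_calc_stream flag (defaulting it to false), and the base-class virtual simply throws until a concrete backend such as ProcessGroupNCCL overrides it. The following is a minimal sketch of that dispatch only; the class and method names (ProcessGroupStreamSketch, Foo) are hypothetical stand-ins, and the real methods take DenseTensor vectors and return Task pointers.

#include <iostream>
#include <memory>
#include <stdexcept>

class ProcessGroupStreamSketch {
 public:
  virtual ~ProcessGroupStreamSketch() = default;

  // Caller-facing overload: always defers to the communication-stream path.
  std::shared_ptr<int> Foo(bool sync_op) {
    return Foo(sync_op, /*use_calc_stream*/ false);
  }

  // Overload a concrete backend must override; the base only reports
  // missing support, mirroring the PADDLE_THROW fallbacks in the patch.
  virtual std::shared_ptr<int> Foo(bool sync_op, bool use_calc_stream) {
    throw std::runtime_error("this backend does not support foo");
  }
};

class BackendSketch : public ProcessGroupStreamSketch {
 public:
  using ProcessGroupStreamSketch::Foo;  // keep the forwarding overload visible
  std::shared_ptr<int> Foo(bool sync_op, bool use_calc_stream) override {
    // A real backend would launch the collective on either the calculation
    // stream or the dedicated communication stream at this point.
    return std::make_shared<int>(use_calc_stream ? 1 : 0);
  }
};

int main() {
  BackendSketch pg;
  std::cout << *pg.Foo(/*sync_op*/ true) << "\n";  // 0: communication-stream path
}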
+ +#pragma once + +#include "paddle/fluid/platform/device_context.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/backends/device_guard.h" +#include "paddle/phi/backends/device_manager.h" +#include "paddle/phi/kernels/funcs/concat_and_split_functor.h" + +namespace paddle { +namespace distributed { + +template +struct ConcatDenseTensor { + void operator()(const DeviceContext *context, + const std::vector &in, + phi::DenseTensor *out, + int axis = 0) { + phi::funcs::ConcatFunctor concat_functor; + concat_functor(*context, in, axis, out); + } +}; + +template +struct SplitDenseTensor { + void operator()(const DeviceContext *context, + const phi::DenseTensor &in, + std::vector *out, + int axis = 0) { + std::vector shape_refer; + shape_refer.reserve(out->size()); + for (auto *p_tensor : *out) { + shape_refer.emplace_back(p_tensor); + } + phi::funcs::SplitFunctor split_functor; + split_functor(*context, in, shape_refer, axis, out); + } +}; + +#ifdef PADDLE_WITH_CUSTOM_DEVICE +template +struct ConcatDenseTensor { + void operator()(const platform::CustomDeviceContext *context, + const std::vector &in, + phi::DenseTensor *out, + int axis = 0) { + auto *out_data = out->data(); + auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace()); + size_t offset = 0; + for (const auto &tensor : in) { + const auto *in_data = tensor.data(); + auto sz = tensor.numel() * sizeof(T); + device->MemoryCopyD2D(out_data + offset, in_data, sz, nullptr); + offset += sz; + } + } +}; + +template +struct SplitDenseTensor { + void operator()(const platform::CustomDeviceContext *context, + const phi::DenseTensor &in, + std::vector *out, + int axis = 0) { + auto *in_data = in.data(); + auto *device = phi::DeviceManager::GetDeviceWithPlace(context->GetPlace()); + size_t offset = 0; + for (auto *p_tensor : *out) { + auto *out_data = p_tensor->data(); + auto sz = p_tensor->numel() * sizeof(T); + device->MemoryCopyD2D(out_data, in_data + offset, sz, nullptr); + offset += sz; + } + } +}; +#endif + +template +void ConcatDenseTensorWithType(const DeviceContext *dev_ctx, + const std::vector &t_list, + phi::DenseTensor *p_out, + phi::DataType type) { + switch (type) { + case phi::DataType::BOOL: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::UINT8: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::INT8: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::INT32: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::INT64: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::FLOAT16: + ConcatDenseTensor()( + dev_ctx, t_list, p_out); + break; + case phi::DataType::FLOAT32: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + case phi::DataType::FLOAT64: + ConcatDenseTensor()(dev_ctx, t_list, p_out); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it concats tensors.", type)); + } +} + +template +void SplitDenseTensorWithType(const DeviceContext *dev_ctx, + const phi::DenseTensor &t_in, + std::vector *p_list, + phi::DataType type) { + switch (type) { + case phi::DataType::BOOL: + SplitDenseTensor()(dev_ctx, t_in, p_list); + break; + case phi::DataType::UINT8: + SplitDenseTensor()(dev_ctx, t_in, p_list); + break; + case phi::DataType::INT8: + SplitDenseTensor()(dev_ctx, t_in, p_list); + break; + case phi::DataType::INT32: + SplitDenseTensor()(dev_ctx, t_in, p_list); + break; + case phi::DataType::INT64: + 
SplitDenseTensor()(dev_ctx, t_in, p_list); + break; + case phi::DataType::FLOAT16: + SplitDenseTensor()( + dev_ctx, t_in, p_list); + break; + case phi::DataType::FLOAT32: + SplitDenseTensor()(dev_ctx, t_in, p_list); + break; + case phi::DataType::FLOAT64: + SplitDenseTensor()(dev_ctx, t_in, p_list); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Data type (%s) is not supported when it splits tensors.", type)); + } +} + +void ConcatTensor(const phi::DeviceContext *dev_ctx, + const std::vector &tensor_list, + const experimental::Tensor *tensor) { + auto *dense_tensor = + std::dynamic_pointer_cast(tensor->impl()).get(); + + const auto &place = dev_ctx->GetPlace(); + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + ConcatDenseTensorWithType(static_cast(dev_ctx), + tensor_list, + dense_tensor, + tensor->dtype()); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat tensor since it's not support GPU, please " + "recompile or reinstall Paddle with GPU support.")); +#endif + } else if (platform::is_custom_place(place)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + ConcatDenseTensorWithType( + static_cast(dev_ctx), + tensor_list, + dense_tensor, + tensor->dtype()); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't concat tensor since it's not compiled with " + "CUSTOM_DEVICE, please recompile or reinstall Paddle with " + "CUSTOM_DEVICE support.")); +#endif + } else if (platform::is_cpu_place(place)) { + ConcatDenseTensorWithType(static_cast(dev_ctx), + tensor_list, + dense_tensor, + tensor->dtype()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Concat tensor not supported on place (%s)", place)); + } +} + +void SplitTensor(const phi::DeviceContext *dev_ctx, + const phi::DenseTensor &tensor, + const std::vector *tensor_list) { + std::vector dense_list; + for (auto &tensor : *tensor_list) { + auto p_tensor = + std::dynamic_pointer_cast(tensor.impl()).get(); + dense_list.emplace_back(p_tensor); + } + + const auto &place = dev_ctx->GetPlace(); + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + SplitDenseTensorWithType(static_cast(dev_ctx), + tensor, + &dense_list, + tensor.dtype()); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split tensor since it's not support GPU, please " + "recompile or reinstall Paddle with GPU support.")); +#endif + } else if (platform::is_custom_place(place)) { +#ifdef PADDLE_WITH_CUSTOM_DEVICE + SplitDenseTensorWithType( + static_cast(dev_ctx), + tensor, + &dense_list, + tensor.dtype()); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't split tensor since it's not compiled with CUSTOM_DEVICE, " + "please recompile or reinstall Paddle with CUSTOM_DEVICE support.")); +#endif + } else if (platform::is_cpu_place(place)) { + SplitDenseTensorWithType(static_cast(dev_ctx), + tensor, + &dense_list, + tensor.dtype()); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Split tensor not supported on place (%s)", place)); + } +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc index 75a16bac371307..0d46425b2e8327 100644 --- a/paddle/fluid/distributed/collective/reducer.cc +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -254,6 +254,10 @@ static void ConcatTensorsWithType( ConcatTensorsForAllReduce()( context, dense_tensors_, 
p_dense_contents); break; + case phi::DataType::BFLOAT16: + ConcatTensorsForAllReduce()( + context, dense_tensors_, p_dense_contents); + break; default: PADDLE_THROW(platform::errors::Unimplemented( "Data type (%s) is not supported when it concats tensors for " @@ -281,6 +285,10 @@ static void SplitTensorsWithType(const DeviceContext &context, SplitTensorsForAllReduce()( context, p_dense_contents, p_dense_tensors); break; + case phi::DataType::BFLOAT16: + SplitTensorsForAllReduce()( + context, p_dense_contents, p_dense_tensors); + break; default: PADDLE_THROW(platform::errors::Unimplemented( "Data type (%s) is not supported when it splits tensors for " diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt index 3cafb0bdb5f927..ff8ed811ee6f84 100755 --- a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt @@ -6,7 +6,7 @@ proto_library(interceptor_message_proto SRCS interceptor_message.proto) if(WITH_ARM_BRPC) set(BRPC_DEPS arm_brpc snappy gflags glog) -elseif(WITH_DISTRIBUTE AND WITH_PSCORE) +elseif(WITH_DISTRIBUTE) set(BRPC_DEPS brpc ssl @@ -36,6 +36,8 @@ cc_library( interceptor.cc compute_interceptor.cc amplifier_interceptor.cc + cond_interceptor.cc + start_interceptor.cc source_interceptor.cc sink_interceptor.cc message_service.cc @@ -66,6 +68,10 @@ if(WITH_DISTRIBUTE) set_source_files_properties( amplifier_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + cond_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties( + start_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties( source_interceptor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties( diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc index 72c689732b5b7d..a166ff0b6dfa2f 100644 --- a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.cc @@ -33,7 +33,7 @@ void AmplifierInterceptor::RunOps() { // run_per_steps_, run_at_offset_ // 4, 0 --> run at step 0, 4, 8, 12 // 4, 3 --> run at step 3, 7, 11, 15 - if ((step_ % run_per_steps_) == run_at_offset_) { + if ((cur_scope_id_ % run_per_steps_) == run_at_offset_) { ComputeInterceptor::RunOps(); } } @@ -41,7 +41,7 @@ void AmplifierInterceptor::RunOps() { void AmplifierInterceptor::SendDataReadyToDownStream() { // run multi times, send ready one times to downstream, that is // input multi times, output one times - if (step_ % send_down_per_steps_ == 0) { + if (cur_scope_id_ % send_down_per_steps_ == 0) { ComputeInterceptor::SendDataReadyToDownStream(); } } @@ -49,7 +49,7 @@ void AmplifierInterceptor::SendDataReadyToDownStream() { void AmplifierInterceptor::ReplyCompletedToUpStream() { // run multi times, reply one times to upstream, that is // input one times, output multi times - if (step_ % reply_up_per_steps_ == 0) { + if (cur_scope_id_ % reply_up_per_steps_ == 0) { ComputeInterceptor::ReplyCompletedToUpStream(); } } diff --git a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h index 776aa8d3e88db1..93e8ffa1d75aec 100644 --- a/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h +++ 
b/paddle/fluid/distributed/fleet_executor/amplifier_interceptor.h @@ -21,7 +21,7 @@ namespace paddle { namespace distributed { -class AmplifierInterceptor : public ComputeInterceptor { +class AmplifierInterceptor final : public ComputeInterceptor { public: AmplifierInterceptor(int64_t interceptor_id, TaskNode* node); diff --git a/paddle/fluid/distributed/fleet_executor/carrier.cc b/paddle/fluid/distributed/fleet_executor/carrier.cc index 6fb0d55a4859ef..9b023e12a8893c 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.cc +++ b/paddle/fluid/distributed/fleet_executor/carrier.cc @@ -15,6 +15,7 @@ #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include +#include #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/interceptor.h" @@ -24,6 +25,7 @@ #include "paddle/fluid/framework/garbage_collector.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/framework/variable_helper.h" namespace paddle { @@ -33,6 +35,8 @@ USE_INTERCEPTOR(Source); USE_INTERCEPTOR(Compute); USE_INTERCEPTOR(Amplifier); USE_INTERCEPTOR(Sink); +USE_INTERCEPTOR(Cond); +USE_INTERCEPTOR(Start); void Carrier::Init( int64_t rank, @@ -54,24 +58,38 @@ void Carrier::Init( framework::Scope* scope, int64_t num_micro_batches, const platform::Place& place, - const std::vector& inference_root_scope_vars) { + const std::vector& inference_root_scope_vars, + const std::vector& micro_scope_list) { rank_ = rank; interceptor_id_to_rank_ = interceptor_id_to_rank; interceptor_id_to_node_ = interceptor_id_to_node; place_ = place; root_scope_ = scope; dev_ctx_ = platform::DeviceContextPool::Instance().Get(place_); + bool need_create_scope = micro_scope_list.empty(); PADDLE_ENFORCE_NOT_NULL( root_scope_, platform::errors::InvalidArgument("root_scope can not be nullptr")); - minibatch_scope_ = &root_scope_->NewScope(); - microbatch_scopes_.resize(num_micro_batches); - for (int i = 0; i < num_micro_batches; ++i) { - microbatch_scopes_[i] = &minibatch_scope_->NewScope(); - CopyParameters(i, program, inference_root_scope_vars); + + if (need_create_scope) { + minibatch_scope_ = &root_scope_->NewScope(); + microbatch_scopes_.resize(num_micro_batches); + for (int i = 0; i < num_micro_batches; ++i) { + microbatch_scopes_[i] = &minibatch_scope_->NewScope(); + CopyParameters(i, program, inference_root_scope_vars); + } + } else { + microbatch_scopes_ = micro_scope_list; + for (int i = 0; i < num_micro_batches; ++i) { + CopyParameters(i, program, inference_root_scope_vars); + } } + // Add source and sink interceptor id to rank + interceptor_id_to_rank_.emplace(SOURCE_ID, rank); + interceptor_id_to_rank_.emplace(SINK_ID, rank); + // TODO(fleet_exe dev): thread pool thread_num_ = 1; thread_pool_.SetThreadNum(thread_num_); @@ -93,29 +111,30 @@ void Carrier::CopyParameters( int microbatch_id, const framework::ProgramDesc& program, const std::vector& inference_root_scope_vars) { - auto& global_block = program.Block(0); - std::map inference_root_scope_var_map; for (auto var_name : inference_root_scope_vars) { inference_root_scope_var_map.insert({var_name, 1}); } - for (auto& var : global_block.AllVars()) { - std::string var_name = var->Name(); - bool force_root = inference_root_scope_var_map.find(var_name) != - inference_root_scope_var_map.end(); - if (force_root) { - VLOG(4) << var_name << " will be forced to be created in the root scope."; - } - if ((var->Persistable() || 
force_root) && microbatch_id == 0) { - auto* ptr = root_scope_->Var(var->Name()); - InitializeVariable(ptr, var->GetType()); - VLOG(5) << "Create persistable var: " << var->Name() - << ", which pointer is " << ptr; - } else if (!var->Persistable()) { - auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name()); - VLOG(5) << "Create variable " << var->Name() << " for microbatch " - << microbatch_id << ", which pointer is " << ptr << "."; - InitializeVariable(ptr, var->GetType()); + for (size_t i = 0; i < program.Size(); ++i) { + for (auto& var : program.Block(i).AllVars()) { + std::string var_name = var->Name(); + bool force_root = inference_root_scope_var_map.find(var_name) != + inference_root_scope_var_map.end(); + if (force_root) { + VLOG(4) << var_name + << " will be forced to be created in the root scope."; + } + if ((var->Persistable() || force_root) && microbatch_id == 0) { + auto* ptr = root_scope_->Var(var->Name()); + InitializeVariable(ptr, var->GetType()); + VLOG(5) << "Create persistable var: " << var->Name() + << ", which pointer is " << ptr; + } else if (!var->Persistable()) { + auto* ptr = microbatch_scopes_[microbatch_id]->Var(var->Name()); + VLOG(5) << "Create variable " << var->Name() << " for microbatch " + << microbatch_id << ", which pointer is " << ptr << "."; + InitializeVariable(ptr, var->GetType()); + } } } } @@ -159,16 +178,11 @@ void Carrier::Start() { true, platform::errors::PreconditionNotMet( "Using carrier before initialized.")); - for (int64_t id : source_interceptor_ids_) { - VLOG(3) << "Carrier Start is sending start to source interceptor " << id - << "."; - InterceptorMessage start_msg; - // source node data_is_ready is send by carrier, so set src_id=-1 - start_msg.set_src_id(-1); - start_msg.set_dst_id(id); - start_msg.set_message_type(DATA_IS_READY); - Send(start_msg); - } + InterceptorMessage start_msg; + start_msg.set_src_id(SOURCE_ID); + start_msg.set_dst_id(SOURCE_ID); + start_msg.set_message_type(START); + Send(start_msg); // TODO(wangxi): async step Wait(); dev_ctx_->Wait(); @@ -270,6 +284,38 @@ void Carrier::CreateInterceptors() { auto gc = GetGC(place_); + // create source and sink task node + auto max_run_times = microbatch_scopes_.size(); + TaskNode* source = new TaskNode( + rank_, SOURCE_ID, max_run_times); // rank, task_id, max_run_times + TaskNode* sink = new TaskNode(rank_, SINK_ID, max_run_times); + // find nodes without upstreams or without downstreams + std::vector origin_sources, origin_sinks; + for (const auto& item : interceptor_id_to_node_) { + TaskNode* task_node = item.second; + if (task_node->upstream().empty()) { + origin_sources.emplace_back(task_node); + } + if (task_node->downstream().empty()) { + origin_sinks.emplace_back(task_node); + } + } + // link source node with origin source + for (const auto& node : origin_sources) { + source->AddDownstreamTask(node->task_id(), + std::numeric_limits::max()); + node->AddUpstreamTask(SOURCE_ID, std::numeric_limits::max()); + } + // link sink node with origin sink + for (const auto& node : origin_sinks) { + sink->AddUpstreamTask(node->task_id(), std::numeric_limits::max()); + node->AddDownstreamTask(SINK_ID, std::numeric_limits::max()); + } + // create source and sink interceptor + SetInterceptor(SOURCE_ID, + InterceptorFactory::Create("Source", SOURCE_ID, source)); + SetInterceptor(SINK_ID, InterceptorFactory::Create("Sink", SINK_ID, sink)); + // create each Interceptor // no auto init since there is no config for (const auto& item : interceptor_id_to_node_) { @@ -303,9 +349,15 @@ 
void Carrier::CreateInterceptors() { VLOG(3) << "Create Interceptor with interceptor id: " << interceptor_id << " with type: " << task_node->type() << "."; - if (task_node->upstream().empty()) { - source_interceptor_ids_.emplace_back(interceptor_id); - } + PADDLE_ENFORCE_EQ( + task_node->upstream().empty(), + false, + platform::errors::PreconditionNotMet( + "There should not have normal nodes as source nodes")); + PADDLE_ENFORCE_EQ(task_node->downstream().empty(), + false, + platform::errors::PreconditionNotMet( + "There should not have normal nodes as sink nodes")); } } diff --git a/paddle/fluid/distributed/fleet_executor/carrier.h b/paddle/fluid/distributed/fleet_executor/carrier.h index fe3d4926766558..8e7fad3e892d87 100644 --- a/paddle/fluid/distributed/fleet_executor/carrier.h +++ b/paddle/fluid/distributed/fleet_executor/carrier.h @@ -25,6 +25,7 @@ #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/interceptor_message.pb.h" #include "paddle/fluid/distributed/fleet_executor/task_loop_thread_pool.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/errors.h" @@ -60,7 +61,8 @@ class Carrier final { framework::Scope* scope, int64_t num_micro_batches, const platform::Place& place, - const std::vector& inference_root_scope_vars = {}); + const std::vector& inference_root_scope_vars = {}, + const std::vector& micro_scope_list = {}); void CopyParameters( int microbatch_id, @@ -100,8 +102,6 @@ class Carrier final { std::unordered_map> interceptor_idx_to_interceptor_; - std::vector source_interceptor_ids_; - bool is_init_{false}; std::mutex running_mutex_; diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc index 5b96ee76e71446..d9c5863d603c6f 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.cc @@ -18,10 +18,85 @@ #include "paddle/fluid/distributed/fleet_executor/task_node.h" #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/core/errors.h" +#include "paddle/phi/core/serialization.h" +#include "paddle/phi/core/utils/dim.h" namespace paddle { namespace distributed { +namespace { + +template +void SetVarResult(const std::string& name, + T value, + int64_t scope_id, + framework::Scope* scope, + const platform::Place& place, + const std::vector& dim_vec) { + auto* var = scope->FindVar(name); + auto* tensor = var->GetMutable(); + if (!var) { + VLOG(3) << "Create var and memory for var " << name; + var = scope->Var(name); + phi::DDim dims = phi::make_ddim(dim_vec); + tensor->Resize(dims); + tensor->mutable_data(dims, place); + } + + PADDLE_ENFORCE_EQ( + tensor->dims().size(), + 1, + platform::errors::OutOfRange("Only support transfer size 1 value.")); + PADDLE_ENFORCE_EQ( + tensor->dims().at(0), + 1, + platform::errors::OutOfRange("Only support transfer size 1 value.")); + if (platform::is_gpu_place(tensor->place())) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + phi::DenseTensor cpu_tensor; + auto dim = phi::make_ddim({1}); + cpu_tensor.mutable_data(dim, platform::CPUPlace()); + auto* 
cpu_tensor_ptr = cpu_tensor.data(); + cpu_tensor_ptr[0] = value; + framework::TensorCopySync(cpu_tensor, tensor->place(), tensor); +#endif + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport device for cond interceptor.")); + } +} + +template +T GetVarResult(const std::string& name, + int64_t scope_id, + framework::Scope* scope) { + auto* var = scope->FindVar(name); + PADDLE_ENFORCE(var, + platform::errors::NotFound( + "Variable %s not exists in scope %ld", name, scope_id)); + const auto& tensor = var->Get(); + T res; + if (platform::is_gpu_place(tensor.place())) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + phi::DenseTensor cpu_tensor; + framework::TensorCopySync(tensor, platform::CPUPlace(), &cpu_tensor); + res = cpu_tensor.data()[0]; +#endif + } else if (platform::is_cpu_place(tensor.place())) { + res = tensor.data()[0]; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport device for cond interceptor.")); + } + return res; +} +} // namespace + ComputeInterceptor::ComputeInterceptor(int64_t interceptor_id, TaskNode* node) : Interceptor(interceptor_id, node) { PrepareDeps(); @@ -33,57 +108,49 @@ void ComputeInterceptor::PrepareDeps() { auto& downstream = node_->downstream(); for (auto up : upstream) { - in_readys_.emplace(up.first, std::make_pair(up.second, 0)); - in_stops_.emplace(up.first, false); + std::map ready_size_map; + for (int64_t i = 0; i < node_->max_run_times(); ++i) { + ready_size_map.emplace(i, 0); + } + in_readys_.emplace(up.first, std::make_pair(up.second, ready_size_map)); } for (auto down : downstream) { out_buffs_.emplace(down.first, std::make_pair(down.second, 0)); } - - // source compute node, should we add a new SourceInterceptor? - if (upstream.empty()) { - is_source_ = true; - PADDLE_ENFORCE_GT(node_->max_run_times(), - 0, - platform::errors::InvalidArgument( - "Source ComputeInterceptor must run at least one " - "times, but now max_run_times=%ld", - node_->max_run_times())); - in_readys_.emplace(-1, - std::make_pair(std::numeric_limits::max(), 0)); - } - - // If there is no downstream or every downstream is in different rank, - // then this interceptor is the last one for current rank. - // This can be get during init, can be cached for later use. 
- is_last_ = downstream.empty(); } -void ComputeInterceptor::IncreaseReady(int64_t up_id) { +void ComputeInterceptor::IncreaseReady(int64_t up_id, int64_t scope_id) { auto it = in_readys_.find(up_id); PADDLE_ENFORCE_NE(it, in_readys_.end(), platform::errors::NotFound( "Cannot find upstream=%lld in in_readys.", up_id)); - // source node has no upstream, data_is_ready is send by carrier or others - if (is_source_ && up_id == -1) { - it->second.second += GetTaskNode()->max_run_times(); - return; - } - auto max_ready_size = it->second.first; - auto ready_size = it->second.second; - ready_size += 1; - PADDLE_ENFORCE_LE(ready_size, - max_ready_size, - platform::errors::OutOfRange( - "upstream=%lld ready_size must <= max_ready_size, but " - "now ready_size=%lld, max_ready_size=%lld", - up_id, - ready_size, - max_ready_size)); - it->second.second = ready_size; + const auto& ready_scope_map = it->second.second; + int64_t ready_size = 0; + for (auto& scope_iter : ready_scope_map) { + ready_size += scope_iter.second; + } + if (max_ready_size != INFINITE_BUFFER_SIZE) { + PADDLE_ENFORCE_LE( + ready_size, + max_ready_size, + platform::errors::OutOfRange( + "upstream=%lld ready_size must <= max_ready_size, but " + "now ready_size=%lld, max_ready_size=%lld", + up_id, + ready_size, + max_ready_size)); + } + PADDLE_ENFORCE_NE( + it->second.second.find(scope_id), + it->second.second.end(), + platform::errors::OutOfRange( + "Interceptor %lld can not find scope %lld in upstream ready map", + interceptor_id_, + scope_id)); + it->second.second.at(scope_id) = ready_scope_map.at(scope_id) + 1; } void ComputeInterceptor::DecreaseBuff(int64_t down_id) { @@ -105,22 +172,30 @@ void ComputeInterceptor::DecreaseBuff(int64_t down_id) { } bool ComputeInterceptor::IsInputReady() { - for (auto& ins : in_readys_) { - auto ready_size = ins.second.second; - // not ready, return false - if (ready_size == 0) { - VLOG(3) << "Interceptor " << GetInterceptorId() + for (int64_t i = 0; i < node_->max_run_times(); ++i) { + bool flag = true; + for (auto& ins : in_readys_) { + auto ready_size_map = ins.second.second; + flag = flag && (ready_size_map.at(i) != 0); + } + if (flag) { + cur_scope_id_ = i; + return true; + } else { + VLOG(3) << "Interceptor " << GetInterceptorId() << " in scope " << i << "'s upstreams aren't all ready."; - return false; } } - return true; + return false; } bool ComputeInterceptor::CanWriteOutput() { for (auto& outs : out_buffs_) { auto max_buffer_size = outs.second.first; auto used_size = outs.second.second; + if (max_buffer_size == INFINITE_BUFFER_SIZE) { + continue; + } // full, return false if (used_size == max_buffer_size) { VLOG(3) << "Interceptor " << GetInterceptorId() @@ -137,30 +212,76 @@ void ComputeInterceptor::SendDataReadyToDownStream() { auto max_buff_size = outs.second.first; auto used_size = outs.second.second; used_size += 1; - PADDLE_ENFORCE_LE( - used_size, - max_buff_size, - platform::errors::OutOfRange("downstream=%lld used buff size must <= " - "max_buff_size, but now used_size=%lld, " - "max_buff_size=%lld", - down_id, - used_size, - max_buff_size)); + if (max_buff_size != INFINITE_BUFFER_SIZE) { + PADDLE_ENFORCE_LE( + used_size, + max_buff_size, + platform::errors::OutOfRange("downstream=%lld used buff size must <= " + "max_buff_size, but now used_size=%lld, " + "max_buff_size=%lld", + down_id, + used_size, + max_buff_size)); + } outs.second.second = used_size; - InterceptorMessage ready_msg; - ready_msg.set_message_type(DATA_IS_READY); - VLOG(3) << "ComputeInterceptor " << 
interceptor_id_ - << " Send data_is_ready msg to " << down_id - << " for step: " << step_; - Send(down_id, ready_msg); + bool need_send_vars = !(node_->vars_to_dtype().empty()); + if (need_send_vars) { + InterceptorMessage ready_msg = PrepareVarsMsg(); + VLOG(3) << "ComputeInterceptor " << interceptor_id_ + << " Send data_with_vars msg to " << down_id + << " in scope: " << cur_scope_id_; + Send(down_id, ready_msg); + } else { + InterceptorMessage ready_msg; + ready_msg.set_message_type(DATA_IS_READY); + ready_msg.set_scope_idx(cur_scope_id_); + VLOG(3) << "ComputeInterceptor " << interceptor_id_ + << " Send data_is_ready msg to " << down_id + << " in scope: " << cur_scope_id_; + Send(down_id, ready_msg); + } } } +InterceptorMessage ComputeInterceptor::PrepareVarsMsg() { + PADDLE_ENFORCE_LT(cur_scope_id_, + microbatch_scopes_.size(), + platform::errors::InvalidArgument( + "Step out of range. There are %ld " + "microbatch_scopes, but recevice scope index %ld", + microbatch_scopes_.size(), + cur_scope_id_)); + auto* scope = microbatch_scopes_[cur_scope_id_]; + + InterceptorMessage ready_msg; + ready_msg.set_message_type(DATA_WITH_VARS); + ready_msg.set_scope_idx(cur_scope_id_); + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + for (auto iter : node_->vars_to_dtype()) { + VarList* vars = ready_msg.add_vars_list(); + const auto& var_name = iter.first; + vars->set_name(var_name); + std::ostringstream ss; + auto& dev_ctx = *pool.Get(place_); + auto* var = scope->FindVar(var_name); + PADDLE_ENFORCE( + var, + platform::errors::NotFound( + "Variable %s not exists in scope %ld", var_name, cur_scope_id_)); + const auto& tensor = var->Get(); + SerializeToStream(ss, tensor, dev_ctx); + vars->set_stensor(ss.str()); + VLOG(3) << "Prepare vars msg " << var_name << " with dimension " + << tensor.dims() << " dtype " << tensor.dtype(); + } + return ready_msg; +} + void ComputeInterceptor::ReplyCompletedToUpStream() { for (auto& ins : in_readys_) { auto up_id = ins.first; - auto ready_size = ins.second.second; + auto ready_size = ins.second.second.at(cur_scope_id_); ready_size -= 1; PADDLE_ENFORCE_GE( ready_size, @@ -169,109 +290,98 @@ void ComputeInterceptor::ReplyCompletedToUpStream() { "upstream=%lld ready_size must >= 0, but now got %lld", up_id, ready_size)); - ins.second.second = ready_size; + ins.second.second[cur_scope_id_] = ready_size; VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " Reply data_is_useless msg to " << up_id - << " for step: " << step_; - if (is_source_ && up_id == -1) return; + << " in scope: " << cur_scope_id_; InterceptorMessage reply_msg; reply_msg.set_message_type(DATA_IS_USELESS); + reply_msg.set_scope_idx(cur_scope_id_); Send(up_id, reply_msg); } } void ComputeInterceptor::RunOps() { - VLOG(3) << "ComputeInterceptor " << interceptor_id_ << " running ops for the " - << step_ + 1 << " time."; for (auto op : node_->ops()) { - op->Run(*microbatch_scopes_[step_ % node_->max_run_times()], place_); + PADDLE_ENFORCE_LT(cur_scope_id_, + microbatch_scopes_.size(), + platform::errors::InvalidArgument( + "Step out of range. 
There are %ld " + "microbatch_scopes, but recevice scope index %ld", + microbatch_scopes_.size(), + cur_scope_id_)); + op->Run(*microbatch_scopes_[cur_scope_id_], place_); if (gc_) { - framework::DeleteUnusedTensors( - *microbatch_scopes_[step_ % node_->max_run_times()], - op, - node_->unused_vars(), - gc_.get()); + framework::DeleteUnusedTensors(*microbatch_scopes_[cur_scope_id_], + op, + node_->unused_vars(), + gc_.get()); } } } void ComputeInterceptor::Run() { while (IsInputReady() && CanWriteOutput()) { - VLOG(3) << "id=" << GetInterceptorId() << " ComputeInterceptor running"; + VLOG(3) << "id=" << GetInterceptorId() + << " ComputeInterceptor running in scope " << cur_scope_id_; RunOps(); - ++step_; // send to downstream and increase buff used SendDataReadyToDownStream(); // reply to upstream and decrease ready data ReplyCompletedToUpStream(); - // Try to stop Carrier - if (is_last_ && (step_ % node_->max_run_times() == 0)) { - VLOG(3) << "Interceptor " << GetInterceptorId() - << " is stopping carrier."; - // FIXME(wangxi): with multi sink interceptor - StopCarrier(); - } } } -void ComputeInterceptor::ReceivedStop(int64_t up_id) { - received_stop_ = true; - - // source node has no upstream, stop is send by carrier or others - if (is_source_ && up_id == -1) return; - - auto it = in_stops_.find(up_id); - PADDLE_ENFORCE_NE(it, - in_stops_.end(), - platform::errors::NotFound( - "Cannot find upstream=%lld in in_stops.", up_id)); - PADDLE_ENFORCE_EQ( - it->second, - false, - platform::errors::AlreadyExists("Already received stop from %lld, stop " - "cannot be send more than once.")); - it->second = true; -} - -void ComputeInterceptor::TryStop() { - if (!received_stop_) return; - - // can stop only when all upstream is stop and - // downstream complete - for (auto& in_stop : in_stops_) { - if (!in_stop.second) return; - } - for (auto& out_buff : out_buffs_) { - auto used_size = out_buff.second.second; - if (used_size != 0) return; - } - - // send stop to downstream - for (auto& out : out_buffs_) { - auto down_id = out.first; - InterceptorMessage stop; - stop.set_message_type(STOP); - Send(down_id, stop); +void ComputeInterceptor::DecodeMsgVars(const InterceptorMessage& msg) { + int64_t scope_id = msg.scope_idx(); + PADDLE_ENFORCE_LT(scope_id, + microbatch_scopes_.size(), + platform::errors::InvalidArgument( + "Step out of range. 
There are %ld " + "microbatch_scopes, but recevice scope index %ld", + microbatch_scopes_.size(), + scope_id)); + auto* scope = microbatch_scopes_[scope_id]; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + for (const auto& var_iter : msg.vars_list()) { + const std::string& name = var_iter.name(); + auto& dev_ctx = *pool.Get(place_); + std::istringstream ss(var_iter.stensor()); + auto* var = scope->Var(name); + auto* tensor = var->GetMutable(); + DeserializeFromStream(ss, tensor, dev_ctx); + + VLOG(3) << "Set vars " << name << " with value in scope " << scope_id + << " with dims " << tensor->dims() << " with dtype " + << tensor->dtype(); } - stop_ = true; } void ComputeInterceptor::Compute(const InterceptorMessage& msg) { if (msg.message_type() == DATA_IS_READY) { - IncreaseReady(msg.src_id()); + VLOG(3) << "Compute interceptor " << interceptor_id_ + << " receive data_is_ready " << msg.src_id() << " " + << msg.scope_idx() << " "; + IncreaseReady(msg.src_id(), msg.scope_idx()); Run(); } else if (msg.message_type() == DATA_IS_USELESS) { + VLOG(3) << "Compute interceptor " << interceptor_id_ + << " receive data_is_useless " << msg.src_id() << " " + << msg.scope_idx() << " "; DecreaseBuff(msg.src_id()); Run(); - } else if (msg.message_type() == STOP) { - ReceivedStop(msg.src_id()); + } else if (msg.message_type() == DATA_WITH_VARS) { + VLOG(3) << "Compute interceptor " << interceptor_id_ + << " receive data_with_vars " << msg.src_id() << " " + << msg.scope_idx() << " "; + DecodeMsgVars(msg); + IncreaseReady(msg.src_id(), msg.scope_idx()); + Run(); } - - TryStop(); } REGISTER_INTERCEPTOR(Compute, ComputeInterceptor); diff --git a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h index fb82ce76c7bdb8..453576f0f2e908 100644 --- a/paddle/fluid/distributed/fleet_executor/compute_interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/compute_interceptor.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/distributed/fleet_executor/interceptor.h" @@ -21,6 +22,8 @@ namespace paddle { namespace distributed { +const int64_t INFINITE_BUFFER_SIZE = -1; + class ComputeInterceptor : public Interceptor { public: ComputeInterceptor(int64_t interceptor_id, TaskNode* node); @@ -29,33 +32,26 @@ class ComputeInterceptor : public Interceptor { virtual void RunOps(); virtual void SendDataReadyToDownStream(); virtual void ReplyCompletedToUpStream(); + virtual void Compute(const InterceptorMessage& msg); + void Run(); + void IncreaseReady(int64_t up_id, int64_t scope_id); + void DecreaseBuff(int64_t down_id); + + int64_t cur_scope_id_; - int64_t step_{0}; + // upstream_id-->(max_ready_size, scope-->ready_size) + std::map>> + in_readys_{}; + // downstream_id-->(max_buffer_size, used_size) + std::map> out_buffs_{}; private: void PrepareDeps(); + InterceptorMessage PrepareVarsMsg(); + void DecodeMsgVars(const InterceptorMessage& msg); - void IncreaseReady(int64_t up_id); - void DecreaseBuff(int64_t down_id); bool IsInputReady(); bool CanWriteOutput(); - - void Run(); - void Compute(const InterceptorMessage& msg); - - void ReceivedStop(int64_t up_id); - void TryStop(); - - bool is_source_{false}; - bool is_last_{false}; - - // upstream_id-->(max_ready_size, ready_size) - std::map> in_readys_{}; - // downstream_id-->(max_buffer_size, used_size) - std::map> out_buffs_{}; - - bool received_stop_{false}; - std::map in_stops_{}; }; } // namespace distributed diff --git 
a/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc new file mode 100644 index 00000000000000..9a9f6b3f174efa --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/cond_interceptor.cc @@ -0,0 +1,140 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/fleet_executor/cond_interceptor.h" +#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/executor_gc_helper.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/platform/errors.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/errors.h" + +namespace paddle { +namespace distributed { + +CondInterceptor::CondInterceptor(int64_t interceptor_id, TaskNode* node) + : Interceptor(interceptor_id, node) { + PrepareDeps(); + RegisterMsgHandle([this](const InterceptorMessage& msg) { Run(msg); }); +} + +void CondInterceptor::PrepareDeps() { + auto& upstream = node_->upstream(); + auto& downstream = node_->downstream(); + auto& id_to_dep_type = node_->id_to_dep_type(); + + for (const auto& up : upstream) { + if (id_to_dep_type.at(up.first) == DependType::NORMAL) { + normal_in_id_.insert(up.first); + } + } + + for (const auto& down : downstream) { + if (id_to_dep_type.at(down.first) == DependType::NORMAL) { + normal_out_id_.insert(down.first); + } else if (id_to_dep_type.at(down.first) == DependType::STOP_LOOP) { + stop_loop_id_ = down.first; + } + } +} + +bool CondInterceptor::GetCondResult() { + PADDLE_ENFORCE_LT(cur_scope_id_, + microbatch_scopes_.size(), + platform::errors::InvalidArgument( + "Step out of range. 
There are %ld " + "microbatch_scopes, but recevice scope index %ld", + microbatch_scopes_.size(), + cur_scope_id_)); + auto* cond_var = + microbatch_scopes_[cur_scope_id_]->FindVar(node_->cond_var()); + PADDLE_ENFORCE(cond_var, + platform::errors::NotFound( + "Condition variable %s not exists in scope %ld", + node_->cond_var(), + cur_scope_id_)); + const auto& cond_tensor = cond_var->Get(); + bool res = false; + if (platform::is_gpu_place(cond_tensor.place())) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + phi::DenseTensor cpu_tensor; + framework::TensorCopy(cond_tensor, platform::CPUPlace(), &cpu_tensor); + platform::DeviceContextPool::Instance().Get(cond_tensor.place())->Wait(); + res = cpu_tensor.data()[0]; +#endif + } else if (platform::is_cpu_place(cond_tensor.place())) { + res = cond_tensor.data()[0]; + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupport device for cond interceptor.")); + } + return res; +} + +void CondInterceptor::SendDataReady(int64_t down_id) { + InterceptorMessage ready_msg; + ready_msg.set_message_type(DATA_IS_READY); + ready_msg.set_scope_idx(cur_scope_id_); + Send(down_id, ready_msg); +} + +void CondInterceptor::ReplyDataIsUseless(int64_t up_id) { + InterceptorMessage ready_msg; + ready_msg.set_message_type(DATA_IS_USELESS); + ready_msg.set_scope_idx(cur_scope_id_); + Send(up_id, ready_msg); +} + +void CondInterceptor::Compute() { + bool cond = GetCondResult(); + VLOG(3) << "Cond interceptor get condition var " << node_->cond_var() + << " with value " << cond; + if (cond) { + VLOG(3) << "Loop again in scope " << cur_scope_id_; + for (auto& down_id : normal_out_id_) { + SendDataReady(down_id); + } + } else { + VLOG(3) << "Finish loop in scope " << cur_scope_id_; + SendDataReady(stop_loop_id_); + } +} + +void CondInterceptor::Run(const InterceptorMessage& msg) { + if (msg.message_type() == DATA_IS_READY || + msg.message_type() == DATA_WITH_VARS) { + cur_scope_id_ = msg.scope_idx(); + Compute(); + } else if (msg.message_type() == DATA_IS_USELESS) { + if (node_->id_to_dep_type().at(msg.src_id()) == DependType::STOP_LOOP) { + for (auto& up_id : normal_in_id_) { + ReplyDataIsUseless(up_id); + } + // Gc the variable in while block + int64_t scope_id = msg.scope_idx(); + if (gc_) { + VLOG(3) << "Release vars in while block in scope " << scope_id; + framework::DeleteUnusedTensors(*microbatch_scopes_[scope_id], + node_->while_block_vars(), + gc_.get()); + } + } + } +} + +REGISTER_INTERCEPTOR(Cond, CondInterceptor); + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/cond_interceptor.h b/paddle/fluid/distributed/fleet_executor/cond_interceptor.h new file mode 100644 index 00000000000000..8ea2d4b370cd9a --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/cond_interceptor.h @@ -0,0 +1,50 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include "paddle/fluid/distributed/fleet_executor/interceptor.h" + +namespace paddle { +namespace distributed { + +/* Condition Interceptor + * This is a special interceptor and only one condition op in the task node. + * This interceptor has two downstreams, + * 1. If the program result is true, select one of the downstreams, otherwise + * select another. + * 2. Used to implement while op in program. + */ +class CondInterceptor final : public Interceptor { + public: + CondInterceptor(int64_t interceptor_id, TaskNode* node); + + private: + void PrepareDeps(); + void Run(const InterceptorMessage& msg); + void Compute(); + bool GetCondResult(); + void SendDataReady(int64_t down_id); + void ReplyDataIsUseless(int64_t up_id); + + int64_t cur_scope_id_; + + std::set normal_in_id_; + std::set normal_out_id_; + int64_t stop_loop_id_; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc index a2d2ecd9bbf106..915b1f82804085 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.cc +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/distributed/fleet_executor/fleet_executor.h" #include +#include +#include #include "paddle/fluid/distributed/fleet_executor/global.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" @@ -24,6 +26,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/variable.h" namespace paddle { namespace distributed { @@ -51,40 +54,40 @@ FleetExecutor::~FleetExecutor() { } } -void FleetExecutor::Init( - const std::string& carrier_id, - const framework::ProgramDesc& program_desc, - framework::Scope* scope, - const platform::Place& place, - int64_t num_micro_batches, - const std::vector& task_nodes, - const std::unordered_map& task_id_to_rank, - const std::vector& inference_root_scope_vars) { - PADDLE_ENFORCE_GT(task_nodes.size(), - 0, - platform::errors::InvalidArgument( - "Fleet executor is inited with empty task node")); - // TODO(fleet_exe devs): the unused_vars should be got from run time graph - std::vector> ops; - for (auto task_node : task_nodes) { - for (auto op : task_node->ops()) { - ops.emplace_back(std::unique_ptr(op)); +namespace { +void GetSubBlockTask(const std::vector& tasks, + TaskNode* cur_task, + std::set* sub_block_task) { + auto& downstream = cur_task->downstream(); + auto& id_to_dep_type = cur_task->id_to_dep_type(); + for (auto& down : downstream) { + int64_t task_id = down.first; + if (id_to_dep_type.at(task_id) == DependType::NORMAL) { + for (const auto& task : tasks) { + if (task->task_id() == task_id) { + sub_block_task->emplace(task); + GetSubBlockTask(tasks, task, sub_block_task); + } + } } } - auto unused_vars = framework::GetUnusedVars(program_desc.Block(0), ops, {}); - // NOTE: For inference, the vars in inference_root_scope_vars - // shouldn't be deleted during inf, for that they may be the result of the - // inf. If they are GCed, it will cause error during ZeroCopy the result. 
+} + +void PreventVarsDelete( + std::unordered_map>* unused_vars, + const std::vector& vars_not_gc) { std::vector changed_ops; - for (auto pair : unused_vars) { + + for (const auto& pair : *unused_vars) { const framework::OperatorBase* op = pair.first; - std::vector unused = pair.second; - for (auto name : inference_root_scope_vars) { - auto iter = std::find(unused.begin(), unused.end(), name); - if (iter != unused.end()) { + std::vector cur_unused = pair.second; + for (auto name : vars_not_gc) { + auto iter = std::find(cur_unused.begin(), cur_unused.end(), name); + if (iter != cur_unused.end()) { VLOG(3) << "Removing var: [" << name << "] from the unused vars list of op: [" << op->Type() << "]"; - unused.erase(iter); + cur_unused.erase(iter); if (std::find(changed_ops.begin(), changed_ops.end(), op) == changed_ops.end()) { // record the op whose unused vars have been updated @@ -93,28 +96,120 @@ void FleetExecutor::Init( } } // update the unused vars list in the map - unused_vars[op] = unused; + unused_vars->at(op) = cur_unused; } for (auto op : changed_ops) { - auto iter = unused_vars.find(op); + const auto& iter = unused_vars->find(op); if (iter->second.empty()) { // remove those ops in the map that have empty unused vars list VLOG(3) << "Removing op: [" << op->Type() << "] from unused_vars map."; - unused_vars.erase(iter); + unused_vars->erase(iter); + } + } +} + +std::vector GetUnusedVarsAfterWhile( + const framework::ProgramDesc& program_desc, + TaskNode* cond_task, + const std::vector& vars_not_gc) { + // NOTE: Since while op won't appear in task node, in order to analyze + // the vars which should be free after calling while op, we rebuild the + // whole program and get the unused vars after calling while op. + // The vars in while block should not be free until the while op is finished. + // In a word, the vars need to be free after while op is: + // 1. Vars in parent block and being used in while block. + // 2. Local vars only defined in while block. + // The unused vars above will be free in cond interceptor. 
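The analysis described in the note above reduces to two list operations: strip every name in vars_not_gc from each op's unused-var list (PreventVarsDelete), then collect the while op's remaining unused vars plus all vars declared in block 1 as while_block_vars. A standalone sketch of the filtering step, using std::string keys as a stand-in for the OperatorBase* keys of the real map:

#include <algorithm>
#include <iterator>
#include <map>
#include <string>
#include <vector>

using UnusedVarMap = std::map<std::string, std::vector<std::string>>;

// Remove every name in vars_not_gc from each op's unused-var list and drop
// ops whose list becomes empty, mirroring PreventVarsDelete above.
void FilterUnusedVars(UnusedVarMap* unused,
                      const std::vector<std::string>& vars_not_gc) {
  for (auto it = unused->begin(); it != unused->end();) {
    auto& names = it->second;
    for (const auto& keep : vars_not_gc) {
      names.erase(std::remove(names.begin(), names.end(), keep), names.end());
    }
    it = names.empty() ? unused->erase(it) : std::next(it);
  }
}

int main() {
  UnusedVarMap unused{{"while", {"x", "tmp"}}, {"matmul", {"x"}}};
  FilterUnusedVars(&unused, {"x"});  // "x" is kept alive; "matmul" drops out entirely
}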
+ std::vector while_block_vars; + std::vector> ops; + for (const auto& desc : program_desc.Block(0).AllOps()) { + ops.emplace_back(framework::OpRegistry::CreateOp(*desc)); + } + auto unused_vars = framework::GetUnusedVars(program_desc.Block(0), ops, {}); + PreventVarsDelete(&unused_vars, vars_not_gc); + for (const auto& pair : unused_vars) { + if (pair.first->Type() == "while") { + for (const auto& var_name : pair.second) { + while_block_vars.emplace_back(var_name); + } + for (auto& var : program_desc.Block(1).AllVars()) { + while_block_vars.emplace_back(var->Name()); + } + } + } + return while_block_vars; +} + +} // namespace + +void FleetExecutor::Init( + const std::string& carrier_id, + const framework::ProgramDesc& program_desc, + framework::Scope* scope, + const platform::Place& place, + int64_t num_micro_batches, + const std::vector& task_nodes, + const std::unordered_map& task_id_to_rank, + const std::vector& inference_root_scope_vars, + const std::vector& micro_scope_list) { + PADDLE_ENFORCE_GT(task_nodes.size(), + 0, + platform::errors::InvalidArgument( + "Fleet executor is inited with empty task node")); + // Set the unused var after running while op + std::set sub_block_tasks; + std::vector while_block_vars; + for (const auto& task_node : task_nodes) { + if (task_node->type() == "Cond") { + GetSubBlockTask(task_nodes, task_node, &sub_block_tasks); + while_block_vars = GetUnusedVarsAfterWhile( + program_desc, task_node, inference_root_scope_vars); + VLOG(3) << "Vars will be gced after while op"; + for (auto var : while_block_vars) { + VLOG(3) << var; + } + task_node->SetWhileBlockVars(while_block_vars); + } + } + std::vector sub_block_ops; + for (const auto& task_node : sub_block_tasks) { + for (const auto& op : task_node->ops()) { + sub_block_ops.emplace_back(op); } } + // Analyse the unused vars in block 0. The operators in block 1 + // should be passed in first for prevent vars been released but removed soon. + // Since the unused vars in block 1 need to analyse separately. + std::vector> ops; + for (const auto& task_node : task_nodes) { + for (const auto& op : task_node->ops()) { + ops.emplace_back(std::unique_ptr(op)); + } + } + auto global_unused_vars = + framework::GetUnusedVars(program_desc.Block(0), ops, {}); + + for (auto& unique_op : ops) { + unique_op.release(); + } + + // NOTE: For inference, the vars in inference_root_scope_vars + // shouldn't be deleted during inf, for that they may be the result of the + // inf. If they are GCed, it will cause error during ZeroCopy the result. 
+ PreventVarsDelete(&global_unused_vars, inference_root_scope_vars); + runtime_graph_ = std::make_shared(); std::unordered_map interceptor_id_to_task; for (auto task_node : task_nodes) { - task_node->SetUnusedVars(unused_vars); + if (sub_block_tasks.find(task_node) == sub_block_tasks.end()) { + task_node->SetUnusedVars(global_unused_vars); + } int64_t interceptor_id = task_node->task_id(); interceptor_id_to_task.emplace(interceptor_id, task_node); } runtime_graph_->SetInterceptorIdToRank(task_id_to_rank); runtime_graph_->SetInterceptorIdToNode(interceptor_id_to_task); - for (auto& unique_op : ops) { - unique_op.release(); - } + VLOG(5) << runtime_graph_->DebugString(); Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); @@ -126,7 +221,8 @@ void FleetExecutor::Init( place, num_micro_batches, program_desc, - inference_root_scope_vars); + inference_root_scope_vars, + micro_scope_list); GlobalVal::Get()->Barrier(); } @@ -136,7 +232,8 @@ void FleetExecutor::InitCarrier( const platform::Place& place, int64_t num_micro_batches, const framework::ProgramDesc& program_desc, - const std::vector& inference_root_scope_vars) { + const std::vector& inference_root_scope_vars, + const std::vector& micro_scope_list) { carrier->Init(exe_desc_.cur_rank(), runtime_graph_->interceptor_id_to_rank(), runtime_graph_->interceptor_id_to_node(), @@ -144,7 +241,8 @@ void FleetExecutor::InitCarrier( scope, num_micro_batches, place, - inference_root_scope_vars); + inference_root_scope_vars, + micro_scope_list); } void FleetExecutor::InitMessageBus() { diff --git a/paddle/fluid/distributed/fleet_executor/fleet_executor.h b/paddle/fluid/distributed/fleet_executor/fleet_executor.h index f633dbbc3600f6..e8123bea1e19f7 100644 --- a/paddle/fluid/distributed/fleet_executor/fleet_executor.h +++ b/paddle/fluid/distributed/fleet_executor/fleet_executor.h @@ -18,6 +18,7 @@ #include "paddle/fluid/distributed/fleet_executor/carrier.h" #include "paddle/fluid/distributed/fleet_executor/fleet_executor_desc.pb.h" +#include "paddle/fluid/framework/variable.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/place.h" @@ -45,7 +46,8 @@ class FleetExecutor final { int64_t num_micro_batches, const std::vector& task_nodes, const std::unordered_map& task_id_to_rank, - const std::vector& inference_root_scope_vars = {}); + const std::vector& inference_root_scope_vars = {}, + const std::vector& micro_scope_list = {}); void Run(const std::string& carrier_id); private: @@ -57,7 +59,8 @@ class FleetExecutor final { const platform::Place& place, int64_t num_micro_batches, const framework::ProgramDesc& program_desc, - const std::vector& inference_root_scope_vars = {}); + const std::vector& inference_root_scope_vars = {}, + const std::vector& micro_scope_list = {}); FleetExecutorDesc exe_desc_; std::shared_ptr runtime_graph_; std::unordered_set carrier_ids_; diff --git a/paddle/fluid/distributed/fleet_executor/interceptor.h b/paddle/fluid/distributed/fleet_executor/interceptor.h index 6a761072027a92..2c20e1ad6113ec 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/interceptor.h @@ -93,7 +93,6 @@ class Interceptor { TaskNode* node_; // for stop - bool stop_{false}; void StopCarrier(); // for runtime @@ -114,9 +113,6 @@ class Interceptor { std::mutex mutex_; std::deque messages_; - - int64_t already_run_times_{0}; - int64_t used_slot_nums_{0}; }; class InterceptorFactory { diff --git a/paddle/fluid/distributed/fleet_executor/interceptor_message.proto 
b/paddle/fluid/distributed/fleet_executor/interceptor_message.proto index 8508bc35f29bef..fadcf7edc5ae7c 100644 --- a/paddle/fluid/distributed/fleet_executor/interceptor_message.proto +++ b/paddle/fluid/distributed/fleet_executor/interceptor_message.proto @@ -24,6 +24,20 @@ enum MessageType { ERR = 4; // current Interceptor encounters error RESET = 5; // reset the status START = 6; + DATA_WITH_VARS = 7; +} + +enum ValueType { + INT3 = 0; + INT6 = 1; + FLOAT = 2; + DOUBLE = 3; + BOOL = 4; +} + +message VarList { + required string name = 1; + required string stensor = 2; } message InterceptorMessage { @@ -32,6 +46,7 @@ message InterceptorMessage { optional MessageType message_type = 3 [ default = RESET ]; optional bool ctrl_message = 4 [ default = false ]; optional int64 scope_idx = 5 [ default = 0 ]; + repeated VarList vars_list = 6; } message InterceptorResponse { optional bool rst = 1 [ default = false ]; } diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.cc b/paddle/fluid/distributed/fleet_executor/message_bus.cc index b73ee060a1719e..d1a23cc5752966 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.cc +++ b/paddle/fluid/distributed/fleet_executor/message_bus.cc @@ -73,7 +73,7 @@ bool MessageBus::IsInit() const { return is_init_; } MessageBus::~MessageBus() { VLOG(3) << "Message bus releases resource."; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) +#if defined(PADDLE_WITH_DISTRIBUTE) server_.Stop(1000); server_.Join(); #endif @@ -94,7 +94,7 @@ bool MessageBus::Send(int64_t dst_rank, true, platform::errors::PreconditionNotMet( "Using message bus since it has not been initialized.")); -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) +#if defined(PADDLE_WITH_DISTRIBUTE) int retry_time = 0; // message bus will retry sending for 10 times while (retry_time < 10) { ++retry_time; @@ -179,7 +179,7 @@ void MessageBus::ListenPort() { LOG(INFO) << "No need listen to port since training on single card."; return; } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) +#if defined(PADDLE_WITH_DISTRIBUTE) // function keep listen the port and handle the message PADDLE_ENFORCE_EQ( server_.AddService(&message_service_, brpc::SERVER_DOESNT_OWN_SERVICE), @@ -209,7 +209,7 @@ void MessageBus::ListenPort() { #endif } -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) +#if defined(PADDLE_WITH_DISTRIBUTE) bool MessageBus::SendInterRank(int64_t dst_rank, const InterceptorMessage& interceptor_message) { const auto& dst_addr = GetAddr(dst_rank); diff --git a/paddle/fluid/distributed/fleet_executor/message_bus.h b/paddle/fluid/distributed/fleet_executor/message_bus.h index dfd65fdbc00d44..481a64b71c7dd9 100644 --- a/paddle/fluid/distributed/fleet_executor/message_bus.h +++ b/paddle/fluid/distributed/fleet_executor/message_bus.h @@ -20,7 +20,7 @@ #include #include -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) +#if defined(PADDLE_WITH_DISTRIBUTE) #include "brpc/channel.h" #include "brpc/server.h" #include "paddle/fluid/distributed/fleet_executor/message_service.h" @@ -63,7 +63,7 @@ class MessageBus final { const std::string& GetAddr(int64_t rank) const; -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) +#if defined(PADDLE_WITH_DISTRIBUTE) // send the message inter rank (dst is different rank with src) bool SendInterRank(int64_t dst_rank, const InterceptorMessage& interceptor_message); @@ -79,7 +79,7 @@ class MessageBus final { // the ip needs to be listened std::string addr_; 
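The DATA_WITH_VARS message type and the VarList entries added to the proto above let one interceptor ship named tensors to a peer; DecodeMsgVars earlier in this diff rebuilds them with DeserializeFromStream. The encoding counterpart, PrepareVarsMsg, is only declared in the header, so the following is a hedged sketch of what such an encoder could look like, assuming framework::SerializeToStream is the symmetric helper of the DeserializeFromStream call shown above and that the generated protobuf accessors (add_vars_list, set_stensor) behave as usual:

// Sketch only, not the actual PrepareVarsMsg. Assumes the usual headers
// (<sstream>, scope.h, lod_tensor.h) and that SerializeToStream(std::ostream&,
// const phi::DenseTensor&, const platform::DeviceContext&) is available.
InterceptorMessage EncodeVarsSketch(const std::vector<std::string>& names,
                                    framework::Scope* scope,
                                    const platform::DeviceContext& dev_ctx,
                                    int64_t scope_id) {
  InterceptorMessage msg;
  msg.set_message_type(DATA_WITH_VARS);
  msg.set_scope_idx(scope_id);
  for (const auto& name : names) {
    VarList* entry = msg.add_vars_list();  // generated accessor for the repeated field
    entry->set_name(name);
    std::ostringstream ss;
    // Assumes the variable exists in the scope and holds a DenseTensor.
    framework::SerializeToStream(
        ss, scope->FindVar(name)->Get<phi::DenseTensor>(), dev_ctx);
    entry->set_stensor(ss.str());
  }
  return msg;
}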
-#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) +#if defined(PADDLE_WITH_DISTRIBUTE) MessageServiceImpl message_service_; // brpc server brpc::Server server_; diff --git a/paddle/fluid/distributed/fleet_executor/message_service.cc b/paddle/fluid/distributed/fleet_executor/message_service.cc index 390024b67ab6bf..5a1f3bf34d9fb8 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.cc +++ b/paddle/fluid/distributed/fleet_executor/message_service.cc @@ -11,7 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) +#if defined(PADDLE_WITH_DISTRIBUTE) #include "paddle/fluid/distributed/fleet_executor/message_service.h" #include "brpc/server.h" diff --git a/paddle/fluid/distributed/fleet_executor/message_service.h b/paddle/fluid/distributed/fleet_executor/message_service.h index 54ce0b6c1c4165..115732ea08f124 100644 --- a/paddle/fluid/distributed/fleet_executor/message_service.h +++ b/paddle/fluid/distributed/fleet_executor/message_service.h @@ -11,7 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. -#if defined(PADDLE_WITH_DISTRIBUTE) && defined(PADDLE_WITH_PSCORE) +#if defined(PADDLE_WITH_DISTRIBUTE) #pragma once #include "brpc/server.h" diff --git a/paddle/fluid/distributed/fleet_executor/sink_interceptor.h b/paddle/fluid/distributed/fleet_executor/sink_interceptor.h index cb1d698a78526f..1abb7a641e23a5 100644 --- a/paddle/fluid/distributed/fleet_executor/sink_interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/sink_interceptor.h @@ -25,7 +25,7 @@ namespace distributed { * 1. record the num of micro-step * 2. check whether to notify carrier the current step is finished */ -class SinkInterceptor : public Interceptor { +class SinkInterceptor final : public Interceptor { public: SinkInterceptor(int64_t interceptor_id, TaskNode* node); diff --git a/paddle/fluid/distributed/fleet_executor/source_interceptor.h b/paddle/fluid/distributed/fleet_executor/source_interceptor.h index f8b18fb1848645..95e8c1b3b03781 100644 --- a/paddle/fluid/distributed/fleet_executor/source_interceptor.h +++ b/paddle/fluid/distributed/fleet_executor/source_interceptor.h @@ -25,7 +25,7 @@ namespace distributed { * 1. receive `start` message from carrier * 2. send num_of_steps `data_is_ready` message to downstream */ -class SourceInterceptor : public Interceptor { +class SourceInterceptor final : public Interceptor { public: SourceInterceptor(int64_t interceptor_id, TaskNode* node); diff --git a/paddle/fluid/distributed/fleet_executor/start_interceptor.cc b/paddle/fluid/distributed/fleet_executor/start_interceptor.cc new file mode 100644 index 00000000000000..b9ce4fabed4ad6 --- /dev/null +++ b/paddle/fluid/distributed/fleet_executor/start_interceptor.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/fleet_executor/start_interceptor.h" + +#include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/errors.h" + +namespace paddle { +namespace distributed { + +StartInterceptor::StartInterceptor(int64_t interceptor_id, TaskNode* node) + : ComputeInterceptor(interceptor_id, node) { + auto& downstream = node_->downstream(); + PADDLE_ENFORCE_EQ( + downstream.size(), + 1, + platform::errors::OutOfRange( + "The downstream for StartInterceptor only support 1 for now.")); + for (auto down : downstream) { + batch_size_ = down.second; + } + bool evenly_divisible = ((node_->max_run_times() % batch_size_) == 0); + PADDLE_ENFORCE( + evenly_divisible, + platform::errors::Fatal( + "Wrong config: Num of step should be divided by batch_size," + "num_step=%lld, batch_size=%lld", + node_->max_run_times(), + batch_size_)); +} + +void StartInterceptor::RunOps() { + finish_count_++; + ComputeInterceptor::RunOps(); +} + +void StartInterceptor::SendDataReadyToDownStream() { + for (auto& outs : out_buffs_) { + auto down_id = outs.first; + auto max_buff_size = outs.second.first; + auto used_size = outs.second.second; + used_size += 1; + if (max_buff_size != INFINITE_BUFFER_SIZE) { + PADDLE_ENFORCE_LE( + used_size, + max_buff_size, + platform::errors::OutOfRange("downstream=%lld used buff size must <= " + "max_buff_size, but now used_size=%lld, " + "max_buff_size=%lld", + down_id, + used_size, + max_buff_size)); + } + outs.second.second = used_size; + } + if (finish_count_ == batch_size_) { + for (int64_t i = 0; i < batch_size_; ++i) { + int64_t scope_id = step_ % node_->max_run_times(); + for (auto& outs : out_buffs_) { + auto down_id = outs.first; + InterceptorMessage ready_msg; + ready_msg.set_message_type(DATA_IS_READY); + ready_msg.set_scope_idx(scope_id); + VLOG(3) << "StartInterceptor " << interceptor_id_ + << " Send data_is_ready msg to " << down_id + << " in scope: " << scope_id; + Send(down_id, ready_msg); + } + step_++; + } + } +} + +void StartInterceptor::Compute(const InterceptorMessage& msg) { + if (msg.message_type() == DATA_IS_READY) { + VLOG(3) << "Start interceptor " << interceptor_id_ + << " receive data_is_ready " << msg.src_id() << " " + << msg.scope_idx() << " "; + IncreaseReady(msg.src_id(), msg.scope_idx()); + Run(); + } else if (msg.message_type() == DATA_IS_USELESS) { + VLOG(3) << "Start interceptor receive data_is_useless " << msg.src_id() + << " " << finish_count_; + finish_count_--; + if (finish_count_ == 0) { + for (int64_t i = 0; i < batch_size_; ++i) { + for (auto& outs : out_buffs_) { + auto down_id = outs.first; + DecreaseBuff(down_id); + } + } + for (int64_t i = 0; i < batch_size_; ++i) { + Run(); + } + } + } +} + +REGISTER_INTERCEPTOR(Start, StartInterceptor); + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/start_interceptor.h b/paddle/fluid/distributed/fleet_executor/start_interceptor.h new file mode 100644 index 00000000000000..f082c48922bdfa --- /dev/null +++ 
b/paddle/fluid/distributed/fleet_executor/start_interceptor.h @@ -0,0 +1,39 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/distributed/fleet_executor/compute_interceptor.h" + +namespace paddle { +namespace distributed { + +class StartInterceptor final : public ComputeInterceptor { + public: + StartInterceptor(int64_t interceptor_id, TaskNode* node); + + private: + void SendDataReadyToDownStream() override; + void RunOps() override; + void Compute(const InterceptorMessage& msg) override; + + int64_t batch_size_{0}; + int64_t finish_count_{0}; + int64_t step_{0}; +}; + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/fleet_executor/task_node.cc b/paddle/fluid/distributed/fleet_executor/task_node.cc index 341ffe290a5205..60d219865808a5 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.cc +++ b/paddle/fluid/distributed/fleet_executor/task_node.cc @@ -24,33 +24,14 @@ namespace { using OperatorBase = TaskNode::OperatorBase; } -TaskNode::TaskNode(paddle::framework::ProgramDesc* program, - int64_t rank, - int64_t max_run_times, - int64_t max_slot_nums) - : program_(program), - rank_(rank), - max_run_times_(max_run_times), - max_slot_nums_(max_slot_nums) { - // Should be serially invoked, not thread-safe - // NOTE: when instantiate TaskNode with program, won't init task node - // immediately, since the provided program may be updated later (with - // high probability) by adding_feed_fetch_ops or by RuntimeGraph. - // So, delay the init part to the Init() function. - static int64_t task_node_cnt = 0; - task_id_ = task_node_cnt++; -} - TaskNode::TaskNode(paddle::framework::ProgramDesc* program, int64_t rank, int64_t task_id, - int64_t max_run_times, - int64_t max_slot_nums) + int64_t max_run_times) : program_(program), rank_(rank), task_id_(task_id), - max_run_times_(max_run_times), - max_slot_nums_(max_slot_nums) { + max_run_times_(max_run_times) { // TODO(liyurui): Will be removed when execute program is supported. Init(); } @@ -58,7 +39,6 @@ TaskNode::TaskNode(paddle::framework::ProgramDesc* program, TaskNode::TaskNode(paddle::framework::ProgramDesc* program, int64_t rank) : program_(program), rank_(rank), task_id_(rank) { max_run_times_ = 1; - max_slot_nums_ = 1; LOG(INFO) << "Constructing TaskNode for DistModelInf. 
The TaskNode's id is: " << rank @@ -69,6 +49,16 @@ void TaskNode::SetProgram(paddle::framework::ProgramDesc* program) { program_ = program; } +void TaskNode::SetVarsToDtype( + const std::map& vars_to_dtype) { + vars_to_dtype_ = vars_to_dtype; +} + +void TaskNode::SetVarsToShape( + const std::map>& vars_to_shape) { + vars_to_shape_ = vars_to_shape; +} + void TaskNode::Init(bool use_feed_fetch_ops) { if (!use_feed_fetch_ops) { VLOG(3) << "TaskNode will be inited without feed and fetch ops"; @@ -98,13 +88,11 @@ TaskNode::TaskNode(int32_t role, const std::vector& op_descs, int64_t rank, int64_t task_id, - int64_t max_run_times, - int64_t max_slot_nums) + int64_t max_run_times) : role_(role), rank_(rank), task_id_(task_id), - max_run_times_(max_run_times), - max_slot_nums_(max_slot_nums) { + max_run_times_(max_run_times) { if (op_descs.empty()) { return; } @@ -121,33 +109,35 @@ TaskNode::TaskNode(int32_t role, const std::vector& ops, int64_t rank, int64_t task_id, - int64_t max_run_times, - int64_t max_slot_nums) + int64_t max_run_times) : ops_(ops), role_(role), rank_(rank), task_id_(task_id), - max_run_times_(max_run_times), - max_slot_nums_(max_slot_nums) {} + max_run_times_(max_run_times) {} TaskNode::TaskNode(int32_t role, int64_t rank, int64_t task_id, - int64_t max_run_times, - int64_t max_slot_nums) + int64_t max_run_times) : role_(role), rank_(rank), task_id_(task_id), - max_run_times_(max_run_times), - max_slot_nums_(max_slot_nums) {} + max_run_times_(max_run_times) {} -bool TaskNode::AddUpstreamTask(int64_t task_id, int64_t buff_size) { +bool TaskNode::AddUpstreamTask(int64_t task_id, + int64_t buff_size, + DependType type) { const auto& ret = upstream_.emplace(task_id, buff_size); + id_to_dep_type_.emplace(task_id, type); return ret.second; } -bool TaskNode::AddDownstreamTask(int64_t task_id, int64_t buff_size) { +bool TaskNode::AddDownstreamTask(int64_t task_id, + int64_t buff_size, + DependType type) { const auto& ret = downstream_.emplace(task_id, buff_size); + id_to_dep_type_.emplace(task_id, type); return ret.second; } diff --git a/paddle/fluid/distributed/fleet_executor/task_node.h b/paddle/fluid/distributed/fleet_executor/task_node.h index 8538ac9ff81fac..181ab96c242240 100644 --- a/paddle/fluid/distributed/fleet_executor/task_node.h +++ b/paddle/fluid/distributed/fleet_executor/task_node.h @@ -14,8 +14,10 @@ #pragma once #include +#include #include #include +#include #include #include @@ -29,38 +31,30 @@ class OpDesc; } // namespace framework namespace distributed { +enum class DependType { NORMAL, LOOP, STOP_LOOP }; + class TaskNode final { public: using OperatorBase = paddle::framework::OperatorBase; TaskNode(int64_t rank, int64_t task_id, int64_t max_run_times); - TaskNode(int32_t role, - int64_t rank, - int64_t task_id, - int64_t max_run_times, - int64_t max_slot_nums); + TaskNode(int32_t role, int64_t rank, int64_t task_id, int64_t max_run_times); TaskNode(int32_t role, const std::vector& op_descs, int64_t rank, int64_t task_id, - int64_t max_run_times, - int64_t max_slot_nums); + int64_t max_run_times); TaskNode(int32_t role, const std::vector& ops, int64_t rank, int64_t task_id, - int64_t max_run_times, - int64_t max_slot_nums); - TaskNode(paddle::framework::ProgramDesc* program, - int64_t rank, - int64_t max_run_times, - int64_t max_slot_nums); + int64_t max_run_times); TaskNode(paddle::framework::ProgramDesc* program, int64_t rank); // TODO(liyurui): This will be the only constructor for task node TaskNode(paddle::framework::ProgramDesc* program, int64_t task_id, 
int64_t rank, - int64_t max_run_times, - int64_t max_slot_nums); + int64_t max_run_times); + ~TaskNode() = default; void SetProgram(paddle::framework::ProgramDesc* program); @@ -69,11 +63,11 @@ class TaskNode final { int64_t task_id() const { return task_id_; } int32_t role() const { return role_; } int64_t max_run_times() const { return max_run_times_; } - int64_t max_slot_nums() const { return max_slot_nums_; } int64_t run_per_steps() const { return run_per_steps_; } int64_t run_at_offset() const { return run_at_offset_; } int64_t reply_up_per_steps() const { return reply_up_per_steps_; } int64_t send_down_per_steps() const { return send_down_per_steps_; } + const std::string& cond_var() const { return cond_var_; } const std::unordered_map& upstream() const { return upstream_; } @@ -86,11 +80,20 @@ class TaskNode final { const std::vector>& unique_ops() const { return ops_vec_; } + const std::unordered_map id_to_dep_type() const { + return id_to_dep_type_; + } const std::unordered_map>& unused_vars() const { return unused_vars_; } + const std::vector while_block_vars() const { + return while_block_vars_; + } + void SetCondVarName(const std::string& cond_var_name) { + cond_var_ = cond_var_name; + } void SetRunPerSteps(int64_t value); void SetRunAtOffset(int64_t value); void SetReplyUpPerSteps(int64_t value); @@ -101,11 +104,27 @@ class TaskNode final { unused_vars) { unused_vars_ = unused_vars; } + void SetWhileBlockVars(const std::vector& vars) { + while_block_vars_ = vars; + } // upstream need buffs? - bool AddUpstreamTask(int64_t task_id, int64_t buff_size = 1); - bool AddDownstreamTask(int64_t task_id, int64_t buff_size = 1); + bool AddUpstreamTask(int64_t task_id, + int64_t buff_size = 1, + DependType type = DependType::NORMAL); + bool AddDownstreamTask(int64_t task_id, + int64_t buff_size = 1, + DependType type = DependType::NORMAL); std::string DebugString() const; + const std::map& vars_to_dtype() const { + return vars_to_dtype_; + } + void SetVarsToDtype(const std::map& vars_to_dtype); + const std::map>& vars_to_shape() const { + return vars_to_shape_; + } + void SetVarsToShape( + const std::map>& vars_to_shape); private: DISABLE_COPY_AND_ASSIGN(TaskNode); @@ -115,16 +134,22 @@ class TaskNode final { // task_id-->buff_size std::unordered_map upstream_; std::unordered_map downstream_; + // task_id-->type + std::unordered_map id_to_dep_type_; + framework::ProgramDesc* program_; + std::string cond_var_; std::vector> ops_vec_; std::unordered_map> unused_vars_; + std::vector while_block_vars_; + std::map vars_to_dtype_; + std::map> vars_to_shape_; int32_t role_; int64_t rank_; int64_t task_id_; int64_t max_run_times_; - int64_t max_slot_nums_; int64_t run_per_steps_{1}; int64_t run_at_offset_{0}; diff --git a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt index 0cd39b3aad6e60..5b7f95d6ac99a7 100644 --- a/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt +++ b/paddle/fluid/distributed/fleet_executor/test/CMakeLists.txt @@ -59,9 +59,7 @@ cc_test( scope device_context) -if(WITH_DISTRIBUTE - AND WITH_PSCORE - AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) +if(WITH_DISTRIBUTE AND NOT (WITH_ASCEND OR WITH_ASCEND_CL)) set_source_files_properties( interceptor_ping_pong_with_brpc_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc 
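With max_slot_nums removed, TaskNode edges are now tagged with a DependType (NORMAL, LOOP, STOP_LOOP) through the extra parameter on AddUpstreamTask/AddDownstreamTask, and id_to_dep_type() is how interceptors such as CondInterceptor distinguish loop-body edges from the edge that leaves the loop. A standalone sketch of that tagged adjacency bookkeeping, with hypothetical names:

#include <cstdint>
#include <unordered_map>

enum class DepType { Normal, Loop, StopLoop };

struct MiniTaskNode {
  std::unordered_map<int64_t, int64_t> downstream;  // task_id -> buff_size
  std::unordered_map<int64_t, DepType> dep_type;    // task_id -> edge type

  bool AddDownstream(int64_t task_id, int64_t buff_size, DepType type) {
    dep_type.emplace(task_id, type);
    return downstream.emplace(task_id, buff_size).second;
  }
};

int main() {
  MiniTaskNode cond;
  cond.AddDownstream(/*loop body*/ 3, /*buff_size=*/1, DepType::Normal);
  cond.AddDownstream(/*after loop*/ 8, /*buff_size=*/1, DepType::StopLoop);
  // An interceptor picks which edge to follow by consulting dep_type.
}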
index 86d0609ce09ccb..ace89d63c5e437 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_run_op_test.cc @@ -77,9 +77,8 @@ TEST(ComputeInterceptor, Compute) { // FIXME: don't delete, otherwise interceptor will use undefined node TaskNode* source = new TaskNode(0, SOURCE_ID, 2); // rank, task_id, max_run_times - TaskNode* node_a = - new TaskNode(0, ops, 0, 0, 2, 0); // role, ops, rank, task_id - TaskNode* node_b = new TaskNode(0, 0, 1, 2, 0); + TaskNode* node_a = new TaskNode(0, ops, 0, 0, 2); // role, ops, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 2); TaskNode* sink = new TaskNode(0, SINK_ID, 2); // source->a->b->sink diff --git a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc index 4992a8b34c9da1..1a4f3f2ce9a139 100644 --- a/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/compute_interceptor_test.cc @@ -21,61 +21,49 @@ limitations under the License. */ #include "paddle/fluid/distributed/fleet_executor/interceptor.h" #include "paddle/fluid/distributed/fleet_executor/message_bus.h" #include "paddle/fluid/distributed/fleet_executor/task_node.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/phi/core/kernel_registry.h" namespace paddle { namespace distributed { -class StartInterceptor : public Interceptor { - public: - StartInterceptor(int64_t interceptor_id, TaskNode* node) - : Interceptor(interceptor_id, node) { - RegisterMsgHandle([this](const InterceptorMessage& msg) { NOP(msg); }); - } - - void NOP(const InterceptorMessage& msg) { - if (msg.message_type() == STOP) { - stop_ = true; - InterceptorMessage stop; - stop.set_message_type(STOP); - Send(1, stop); // stop 1, compute - return; - } - std::cout << GetInterceptorId() << " recv msg from " << msg.src_id() - << std::endl; - } -}; - TEST(ComputeInterceptor, Compute) { std::string carrier_id = "0"; Carrier* carrier = GlobalMap::Create(carrier_id, carrier_id); - carrier->Init(0, {{0, 0}, {1, 0}, {2, 0}}); + carrier->Init(0, {{SOURCE_ID, 0}, {0, 0}, {1, 0}, {SINK_ID, 0}}); MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(0, {{0, "127.0.0.0:0"}}, ""); // NOTE: don't delete, otherwise interceptor will use undefined node - TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id - TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0); - TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0); - - // a->b->c + TaskNode* source = + new TaskNode(0, SOURCE_ID, 3); // rank, task_id, max_run_times + TaskNode* node_a = new TaskNode(0, 0, 0, 3); + TaskNode* node_b = new TaskNode(0, 0, 1, 3); + TaskNode* sink = new TaskNode(0, SINK_ID, 3); + + // source->a->b->sink + source->AddDownstreamTask(0); + node_a->AddUpstreamTask(SOURCE_ID); node_a->AddDownstreamTask(1, 3); node_b->AddUpstreamTask(0, 3); - node_b->AddDownstreamTask(2); - node_c->AddUpstreamTask(1); + node_b->AddDownstreamTask(SINK_ID); + sink->AddUpstreamTask(1); - Interceptor* a = - carrier->SetInterceptor(0, std::make_unique(0, node_a)); + carrier->SetInterceptor( + SOURCE_ID, InterceptorFactory::Create("Source", SOURCE_ID, source)); + carrier->SetInterceptor(0, InterceptorFactory::Create("Compute", 0, node_a)); carrier->SetInterceptor(1, InterceptorFactory::Create("Compute", 1, node_b)); - carrier->SetInterceptor(2, 
InterceptorFactory::Create("Compute", 2, node_c)); + carrier->SetInterceptor(SINK_ID, + InterceptorFactory::Create("Sink", SINK_ID, sink)); + // start InterceptorMessage msg; - msg.set_message_type(DATA_IS_READY); - // test run three times - a->Send(1, msg); - a->Send(1, msg); - a->Send(1, msg); + msg.set_message_type(START); + msg.set_dst_id(SOURCE_ID); + carrier->EnqueueInterceptorMessage(msg); carrier->Wait(); carrier->Release(); diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc index 54adf06fb67ddf..f43f3860199fb7 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_test.cc @@ -33,7 +33,6 @@ class PingPongInterceptor : public Interceptor { void PingPong(const InterceptorMessage& msg) { if (msg.message_type() == STOP) { - stop_ = true; return; } std::cout << GetInterceptorId() << " recv msg, count=" << count_ diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc index 3828c4478cbe6e..62c23068d7d4a9 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_ping_pong_with_brpc_test.cc @@ -36,7 +36,6 @@ class PingPongInterceptor : public Interceptor { void PingPong(const InterceptorMessage& msg) { if (msg.message_type() == STOP) { - stop_ = true; StopCarrier(); return; } diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc index 3415e377478d48..12fc77a271711b 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_long_path_test.cc @@ -66,17 +66,17 @@ TEST(AmplifierInterceptor, Amplifier) { MessageBus* msg_bus = GlobalVal::Create(); msg_bus->Init(0, {{0, "127.0.0.0:0"}}, "127.0.0.0:0"); - int64_t micro_steps = 3; + int64_t micro_steps = 1; // NOTE: don't delete, otherwise interceptor will use undefined node TaskNode* source = new TaskNode(0, SOURCE_ID, micro_steps); // rank, task_id, max_run_times - TaskNode* node_a = new TaskNode(0, 0, 0, 1, 0); // role, rank, task_id - TaskNode* node_b = new TaskNode(0, 0, 1, 1, 0); - TaskNode* node_c = new TaskNode(0, 0, 2, 1, 0); - TaskNode* node_d = new TaskNode(0, 0, 3, 1, 0); - TaskNode* node_e = new TaskNode(0, 0, 4, 1, 0); - TaskNode* node_f = new TaskNode(0, 0, 5, 1, 0); + TaskNode* node_a = new TaskNode(0, 0, 0, 1); // role, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, 1); + TaskNode* node_c = new TaskNode(0, 0, 2, 1); + TaskNode* node_d = new TaskNode(0, 0, 3, 1); + TaskNode* node_e = new TaskNode(0, 0, 4, 1); + TaskNode* node_f = new TaskNode(0, 0, 5, 1); TaskNode* sink = new TaskNode(0, SINK_ID, micro_steps); // source->a->b->c->d->e->f->sink diff --git a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc index fdee01fed1a05c..4a29f07db5b268 100644 --- a/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc +++ 
b/paddle/fluid/distributed/fleet_executor/test/interceptor_pipeline_short_path_test.cc @@ -83,11 +83,10 @@ TEST(AmplifierInterceptor, Amplifier) { // NOTE: don't delete, otherwise interceptor will use undefined node TaskNode* source = new TaskNode(0, SOURCE_ID, micro_steps); // rank, task_id, max_run_times - TaskNode* node_a = - new TaskNode(0, 0, 0, micro_steps, 0); // role, rank, task_id - TaskNode* node_b = new TaskNode(0, 0, 1, 3, 0); - TaskNode* node_c = new TaskNode(0, 0, 2, 3, 0); - TaskNode* node_d = new TaskNode(0, 0, 3, micro_steps, 0); + TaskNode* node_a = new TaskNode(0, 0, 0, micro_steps); // role, rank, task_id + TaskNode* node_b = new TaskNode(0, 0, 1, micro_steps); + TaskNode* node_c = new TaskNode(0, 0, 2, micro_steps); + TaskNode* node_d = new TaskNode(0, 0, 3, micro_steps); TaskNode* sink = new TaskNode(0, SINK_ID, micro_steps); // source->a->b->c->d->sink diff --git a/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc index 879d7e9b029418..b2b1d06634bd82 100644 --- a/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/sink_interceptor_test.cc @@ -62,10 +62,9 @@ TEST(SourceInterceptor, Source) { msg_bus->Init(0, {{0, "127.0.0.0:0"}}, ""); // NOTE: don't delete, otherwise interceptor will use undefined node - TaskNode* source = - new TaskNode(0, SOURCE_ID, 0, 3, 0); // role, rank, task_id - TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id - TaskNode* sink = new TaskNode(0, SINK_ID, 0, 3, 0); // role, rank, task_id + TaskNode* source = new TaskNode(0, SOURCE_ID, 0, 3); // role, rank, task_id + TaskNode* node_a = new TaskNode(0, 0, 0, 3); // role, rank, task_id + TaskNode* sink = new TaskNode(0, SINK_ID, 0, 3); // role, rank, task_id source->AddDownstreamTask(0, 1); node_a->AddUpstreamTask(SOURCE_ID, 1); diff --git a/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc b/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc index 21a1b4accc9f1e..a707650dfbc492 100644 --- a/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc +++ b/paddle/fluid/distributed/fleet_executor/test/source_interceptor_test.cc @@ -61,9 +61,8 @@ TEST(SourceInterceptor, Source) { msg_bus->Init(0, {{0, "127.0.0.0:0"}}, ""); // NOTE: don't delete, otherwise interceptor will use undefined node - TaskNode* source = - new TaskNode(0, SOURCE_ID, 0, 3, 0); // role, rank, task_id - TaskNode* node_a = new TaskNode(0, 0, 0, 3, 0); // role, rank, task_id + TaskNode* source = new TaskNode(0, SOURCE_ID, 0, 3); // role, rank, task_id + TaskNode* node_a = new TaskNode(0, 0, 0, 3); // role, rank, task_id source->AddDownstreamTask(0, 1); node_a->AddUpstreamTask(SOURCE_ID, 1); diff --git a/paddle/fluid/eager/accumulation/accumulation_node.cc b/paddle/fluid/eager/accumulation/accumulation_node.cc index 0017dba79742e7..12bbfbbb25d2b3 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.cc +++ b/paddle/fluid/eager/accumulation/accumulation_node.cc @@ -16,6 +16,7 @@ #include "glog/logging.h" #include "paddle/fluid/eager/eager_tensor.h" +#include "paddle/fluid/eager/utils.h" #include "paddle/fluid/imperative/gradient_accumulator.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -89,7 +90,7 @@ GradNodeAccumulation::operator()( kSlotSmallVectorSize>& grads, // NOLINT bool create_graph, bool is_new_grad) { - VLOG(3) << "Running Eager 
Backward Node: GradNodeAccumulation"; + VLOG(3) << "Running AD API Grad: GradNodeAccumulation"; PADDLE_ENFORCE(grads.size() == 1, paddle::platform::errors::Fatal( "GradNodeAccumulation should take exactly 1 grad tensor" @@ -122,7 +123,22 @@ GradNodeAccumulation::operator()( if (ReduceHooksRegistered()) { ApplyReduceHooks(); } + VLOG(3) << "Finish AD API Grad: GradNodeAccumulation"; + if (VLOG_IS_ON(4)) { + const char* INPUT_PRINT_TEMPLATE = "{ Input: [%s], Output: [%s] } "; + std::string input_str = ""; + std::string output_str = ""; + const char* TENSOR_OUT_GRAD_TEMPLATE = "(grads[0][0], [%s]), "; + std::string input_out_grad_str = paddle::string::Sprintf( + TENSOR_OUT_GRAD_TEMPLATE, egr::EagerUtils::TensorStr(grads[0][0])); + const char* TENSOR_X_GRAD_TEMPLATE = "(grad_out, [%s]), "; + std::string output_x_grad_str = paddle::string::Sprintf( + TENSOR_X_GRAD_TEMPLATE, egr::EagerUtils::TensorStr(grad_out)); + output_str += output_x_grad_str; + VLOG(4) << paddle::string::Sprintf( + INPUT_PRINT_TEMPLATE, input_str, output_str); + } return {{grad_out}}; } diff --git a/paddle/fluid/eager/accumulation/accumulation_node.h b/paddle/fluid/eager/accumulation/accumulation_node.h index 8dbc2872ca2bbb..f8e2c4327e1420 100644 --- a/paddle/fluid/eager/accumulation/accumulation_node.h +++ b/paddle/fluid/eager/accumulation/accumulation_node.h @@ -24,7 +24,7 @@ class GradNodeAccumulation : public GradNodeBase { public: // Constructor: configure fwd input tensors to grad node explicit GradNodeAccumulation(AutogradMeta* meta) : GradNodeBase(1, 1) { - VLOG(6) << "Construct GradNodeAccumulation"; + VLOG(5) << "Construct GradNodeAccumulation"; if (meta) { weak_grad_ = meta->WeakGrad(); } @@ -33,7 +33,7 @@ class GradNodeAccumulation : public GradNodeBase { } ~GradNodeAccumulation() override { - VLOG(6) << "Destruct GradNodeAccumulation"; + VLOG(5) << "Destruct GradNodeAccumulation"; } // Functor: perform backward computations @@ -44,7 +44,7 @@ class GradNodeAccumulation : public GradNodeBase { bool create_graph = false, bool is_new_grad = false) override; - void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } + void ClearTensorWrappers() override { VLOG(5) << "Do nothing here now"; } std::string name() { return "GradNodeAccumulation"; } diff --git a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h index 49d401b92303ec..bc970f4e2d8594 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h +++ b/paddle/fluid/eager/api/manual/eager_manual/dygraph_forward_api.h @@ -16,10 +16,10 @@ #include "paddle/phi/api/include/tensor.h" -paddle::experimental::Tensor add_n_dygraph_function( +paddle::experimental::Tensor add_n_ad_func( const std::vector& x); -paddle::experimental::Tensor conv2d_dygraph_function( +paddle::experimental::Tensor conv2d_ad_func( const paddle::experimental::Tensor& input, const paddle::experimental::Tensor& filter, std::vector strides, diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc index 3081eaf3584f65..fc423402113103 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/add_n_fwd_func.cc @@ -23,7 +23,7 @@ #pragma GCC diagnostic ignored "-Wunused-variable" DECLARE_bool(check_nan_inf); -paddle::experimental::Tensor add_n_dygraph_function( +paddle::experimental::Tensor add_n_ad_func( const 
std::vector& x) { // Dygraph Record Event paddle::platform::RecordEvent dygraph_entrance_record_event( @@ -46,7 +46,7 @@ paddle::experimental::Tensor add_n_dygraph_function( paddle::imperative::AutoCastGuard guard( egr::Controller::Instance().GetCurrentTracer(), paddle::imperative::AmpLevel::O0); - return add_n_dygraph_function(NEW_x); + return add_n_ad_func(NEW_x); } } @@ -56,7 +56,7 @@ paddle::experimental::Tensor add_n_dygraph_function( std::vector* x_autograd_meta = &x_autograd_meta_vec; // Forward API Call VLOG(3) << "Final State Running: " - << "add_n_dygraph_function"; + << "add_n_ad_func"; auto api_result = paddle::experimental::add_n(x); // Check NaN and Inf if needed if (FLAGS_check_nan_inf) { diff --git a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc index 3e2e67297834d9..d5f15883e0e193 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/forwards/conv2d_fwd_function.cc @@ -24,7 +24,7 @@ #pragma GCC diagnostic ignored "-Wunused-variable" DECLARE_bool(check_nan_inf); -paddle::experimental::Tensor conv2d_dygraph_function( +paddle::experimental::Tensor conv2d_ad_func( const paddle::experimental::Tensor& input, const paddle::experimental::Tensor& filter, std::vector strides, @@ -51,32 +51,32 @@ paddle::experimental::Tensor conv2d_dygraph_function( auto amp_dst_dtype = egr::GetAmpDestDtype(op_name, amp_tensors_vector); - auto NEW_input = + auto new_input = egr::EagerAmpAutoCast("input", input, amp_dst_dtype, op_name); - auto NEW_filter = + auto new_filter = egr::EagerAmpAutoCast("filter", filter, amp_dst_dtype, op_name); { paddle::imperative::AutoCastGuard guard( egr::Controller::Instance().GetCurrentTracer(), paddle::imperative::AmpLevel::O0); - return conv2d_dygraph_function(NEW_input, - NEW_filter, - strides, - paddings, - paddding_algorithm, - groups, - dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search); + return conv2d_ad_func(new_input, + new_filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search); } } // Layout autotune - if (paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune()) { + if (egr::Controller::Instance().UseLayoutAutoTune()) { VLOG(5) << "Check and Prepare For LAYOUT"; paddle::small_vector, egr::kSlotSmallVectorSize> @@ -85,24 +85,23 @@ paddle::experimental::Tensor conv2d_dygraph_function( auto op_name = phi::TransToFluidOpName("conv2d"); auto transformer = egr::EagerLayoutAutotune( op_name, tensors_vector, &data_format); - auto NEW_input = transformer->TransInTensor("input", input); - bool is_enable_tune = - paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune(); - paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune(); - auto out = conv2d_dygraph_function(NEW_input, - filter, - strides, - paddings, - paddding_algorithm, - groups, - dilations, - data_format, - use_addto, - workspace_size_MB, - exhaustive_search); + auto new_input = transformer->TransInTensor("input", input); + bool need_tune = egr::Controller::Instance().UseLayoutAutoTune(); + egr::Controller::Instance().DisableLayoutAutoTune(); + auto out = conv2d_ad_func(new_input, + filter, + strides, + paddings, + paddding_algorithm, + groups, + dilations, + data_format, + use_addto, + workspace_size_MB, + exhaustive_search); transformer->SetOutTensorLayout(&out); - 
if (is_enable_tune) { - paddle::imperative::LayoutAutoTune::Instance().EnableLayoutAutoTune(); + if (need_tune) { + egr::Controller::Instance().EnableLayoutAutoTune(); } // Returns return out; @@ -115,7 +114,7 @@ paddle::experimental::Tensor conv2d_dygraph_function( egr::EagerUtils::nullable_autograd_meta(filter); // Forward API Call VLOG(3) << "Final State Running: " - << "conv2d_dygraph_function"; + << "conv2d_ad_func"; auto api_result = paddle::experimental::conv2d(input, filter, strides, diff --git a/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc b/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc index b0dc4f59ffda5e..6f7a34094b19d6 100644 --- a/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc +++ b/paddle/fluid/eager/api/manual/eager_manual/nodes/add_n_node.cc @@ -64,8 +64,7 @@ AddNGradNodeFinal::operator()( // dygraph function for (size_t i = 0; i < returns[0].size(); i++) { - returns[0][i] = - ::scale_dygraph_function(out_grad, phi::Scalar(1.0), 0.0, true); + returns[0][i] = ::scale_ad_func(out_grad, phi::Scalar(1.0), 0.0, true); } // Check NaN and Inf id needed diff --git a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc index ea1bc2271c1948..d733dbf8b7c288 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc +++ b/paddle/fluid/eager/api/manual/fluid_manual/forwards/fused_attention_fwd_func.cc @@ -531,7 +531,6 @@ fused_attention_dygraph_function( egr::EagerUtils::SetHistory(p_autograd_Y, grad_node); grad_node->SetGradInMeta(Y, 19); egr::EagerUtils::CheckAndRetainGrad(Y); - auto QKVOut_accumulation_node = std::make_shared(p_autograd_QKVOut); egr::EagerUtils::SetOutRankWithSlot(p_autograd_QKVOut, 0); diff --git a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h index 32389e553d03c4..7e0d679689c4a2 100644 --- a/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h +++ b/paddle/fluid/eager/api/manual/fluid_manual/nodes/nodes.h @@ -17,6 +17,23 @@ #include "paddle/fluid/eager/tensor_wrapper.h" #include "paddle/fluid/imperative/tracer.h" +template +const T& GetAttrWithDefault( + const paddle::framework::AttributeMap& attrs, + const paddle::framework::AttributeMap& default_attrs, + const std::string& name) { + auto iter1 = attrs.find(name); + if (iter1 != attrs.end()) { + return PADDLE_GET_CONST(T, iter1->second); + } + auto iter2 = default_attrs.find(name); + if (iter2 != default_attrs.end()) { + return PADDLE_GET_CONST(T, iter2->second); + } + PADDLE_THROW( + phi::errors::InvalidArgument("Attribute(%s) cannot be found.", name)); +} + class fused_gate_attentionGradNodeCompat : public egr::GradNodeBase { public: fused_gate_attentionGradNodeCompat() : egr::GradNodeBase() { @@ -240,7 +257,9 @@ class fused_feedforwardGradNodeCompat : public egr::GradNodeBase { } void SetTensorWrapperDropout2Out( const paddle::experimental::Tensor& Dropout2Out) { - Dropout2Out_ = egr::TensorWrapper(Dropout2Out, false); + auto pre_layer_norm = GetAttrWithDefault( + attr_map_, default_attr_map_, "pre_layer_norm"); + Dropout2Out_ = egr::TensorWrapper(Dropout2Out, pre_layer_norm); } void SetTensorWrapperLinear1Bias( const paddle::experimental::Tensor& Linear1Bias) { @@ -427,27 +446,27 @@ class fused_attentionGradNodeCompat : public egr::GradNodeBase { } void SetTensorWrapperOutLinearOut( const paddle::experimental::Tensor& OutLinearOut) { - 
OutLinearOut_ = egr::TensorWrapper(OutLinearOut, false); + OutLinearOut_ = egr::TensorWrapper(OutLinearOut, true); } void SetTensorWrapperOutLinearW( const paddle::experimental::Tensor& OutLinearW) { OutLinearW_ = egr::TensorWrapper(OutLinearW, false); } void SetTensorWrapperQKOut(const paddle::experimental::Tensor& QKOut) { - QKOut_ = egr::TensorWrapper(QKOut, false); + QKOut_ = egr::TensorWrapper(QKOut, true); } void SetTensorWrapperQKTVOut(const paddle::experimental::Tensor& QKTVOut) { - QKTVOut_ = egr::TensorWrapper(QKTVOut, false); + QKTVOut_ = egr::TensorWrapper(QKTVOut, true); } void SetTensorWrapperQKVBias(const paddle::experimental::Tensor& QKVBias) { QKVBias_ = egr::TensorWrapper(QKVBias, false); } void SetTensorWrapperQKVBiasOut( const paddle::experimental::Tensor& QKVBiasOut) { - QKVBiasOut_ = egr::TensorWrapper(QKVBiasOut, false); + QKVBiasOut_ = egr::TensorWrapper(QKVBiasOut, true); } void SetTensorWrapperQKVOut(const paddle::experimental::Tensor& QKVOut) { - QKVOut_ = egr::TensorWrapper(QKVOut, false); + QKVOut_ = egr::TensorWrapper(QKVOut, true); } void SetTensorWrapperQKVW(const paddle::experimental::Tensor& QKVW) { QKVW_ = egr::TensorWrapper(QKVW, false); diff --git a/paddle/fluid/eager/api/utils/global_utils.h b/paddle/fluid/eager/api/utils/global_utils.h index 93149feeae3116..7defffa18e0f77 100644 --- a/paddle/fluid/eager/api/utils/global_utils.h +++ b/paddle/fluid/eager/api/utils/global_utils.h @@ -55,6 +55,23 @@ class Controller { paddle::imperative::AmpLevel GetAMPLevel() const { return tracer_->GetAmpLevel(); } + + bool UseLayoutAutoTune() { + bool use_autotune = false; +#if defined(PADDLE_WITH_CUDA) + auto place = tracer_->ExpectedPlace(); + bool is_gpu_place = paddle::platform::is_gpu_place(place); + if (is_gpu_place) { + use_autotune = tracer_->UseLayoutAutoTune(); + } +#endif + return use_autotune; + } + + void DisableLayoutAutoTune() { tracer_->DisableLayoutAutoTune(); } + + void EnableLayoutAutoTune() { tracer_->EnableLayoutAutoTune(); } + bool HasGrad() const { return tracer_->HasGrad(); } void SetHasGrad(bool has_grad) { tracer_->SetHasGrad(has_grad); } std::string GenerateUniqueName(std::string key = "eager_in_tmp") { diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc index 3f6af507aed2fb..efad9f61ee3f90 100644 --- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc +++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc @@ -55,7 +55,9 @@ static std::unordered_set black_ops_list = {"run_program", "fused_gate_attention", "fused_feedforward", "fused_attention", - "fused_gemm_epilogue"}; + "fused_gemm_epilogue", + "sparse_divide_scalar", + "sparse_scale"}; static std::string LegalizeVariableName(const std::string& var_name) { std::string ret = var_name; @@ -1797,6 +1799,15 @@ static std::pair GenerateForwardFunctionContents( generated_function_body += amp_context; generated_function_body += "\n"; } + + if (!forward_inplace_map.empty()) { + generated_function_body += + " auto current_level = egr::Controller::Instance().GetAMPLevel();\n"; + generated_function_body += + " " + "egr::Controller::Instance().SetAMPLevel(paddle::imperative::AmpLevel::" + "O0);\n"; + } // forward ins insert const char* FWD_INS_MAP_TEMPLATE = " std::map GenerateForwardFunctionContents( } trace_op_body_str += out_tensor_str; } + if (!forward_inplace_map.empty()) { + trace_op_body_str += + " egr::Controller::Instance().SetAMPLevel(current_level);\n"; + } trace_op_body_str += "\n"; VLOG(6) << 
"Converted Output VarBase to EagerVariable(s)"; /* ------ END Generate TraceOp ----- */ @@ -3148,6 +3163,12 @@ static void DygraphCodeGeneration(const std::string& output_dir, continue; } + // Skip the sparse op + if (op_type.compare(0, 7, "sparse_") == 0 && op_type != "sparse_momentum" && + op_type != "sparse_attention") { + continue; + } + GradNodeGenerationInfo bwd_info; bool is_available = CollectGradInformationFromOpInfo(op_info, &bwd_info); diff --git a/paddle/fluid/eager/auto_code_generator/generator/CMakeLists.txt b/paddle/fluid/eager/auto_code_generator/generator/CMakeLists.txt index aeceb50573e9b8..f82e28de1f35a2 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/CMakeLists.txt +++ b/paddle/fluid/eager/auto_code_generator/generator/CMakeLists.txt @@ -1,8 +1,8 @@ set(api_yaml_path - "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/api.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_api.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_api.yaml" + "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/ops.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_ops.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_ops.yaml" ) set(backward_yaml_path - "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/backward.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_backward.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_bw_api.yaml" + "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/backward.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_backward.yaml,${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_backward.yaml" ) set(tmp_forwards_cc_path "${PADDLE_SOURCE_DIR}/paddle/fluid/eager/api/generated/eager_generated/forwards/tmp_dygraph_functions.cc" @@ -30,7 +30,7 @@ set(nodes_h_path ) # StringTensor only needs forward api set(fwd_api_yaml_path - "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/strings_api.yaml") + "${PADDLE_SOURCE_DIR}/paddle/phi/api/yaml/strings_ops.yaml") message("Final State Eager CodeGen") add_custom_target( diff --git a/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py index 45895791128df1..2dc62ff349a73a 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/generator/codegen_utils.py @@ -83,10 +83,10 @@ def ReadBwdFile(filepath): ret = {} if contents is not None: for content in contents: - assert 'backward_api' in content.keys(), AssertMessage( - 'backward_api', content.keys()) - if 'backward_api' in content.keys(): - api_name = content['backward_api'] + assert 'backward_op' in content.keys(), AssertMessage( + 'backward_op', content.keys()) + if 'backward_op' in content.keys(): + api_name = content['backward_op'] ret[api_name] = content f.close() @@ -161,11 +161,24 @@ def str2Hump(text): string = str2Hump(string) if string.rfind("Grad") == (len(string) - 4): string = string[:-4] - return f"{string}GradNodeFinal" + return f"{string}GradNode" def GetDygraphForwardFunctionName(string): - return f"{string}_dygraph_function" + return f"{string}_ad_func" + + +def GetDygraphLogName(string): + + def str2Hump(text): + arr = filter(None, text.split('_')) + res = '' + for i in arr: + res = res + i.lower() + return res + + string = str2Hump(string) + return string def GetIntermediateAPIFunctionName(string): @@ -198,7 +211,7 @@ def GetInplacedFunctionName(function_name): def GetForwardFunctionName(string): - return f"{string}_dygraph_function" + return f"{string}_ad_func" def GetIndent(num): @@ -418,12 +431,12 @@ def ParseIntermediate(self): 
def CollectOriginalForwardInfo(self): forward_api_contents = self.forward_api_contents - self.forward_api_name = forward_api_contents['api'] + self.forward_api_name = forward_api_contents['op'] forward_args_str = forward_api_contents['args'] forward_returns_str = forward_api_contents['output'] - assert 'api' in forward_api_contents.keys( - ), "Unable to find \"api\" in forward_api_contents keys" + assert 'op' in forward_api_contents.keys( + ), "Unable to find \"op\" in forward_api_contents keys" assert 'args' in forward_api_contents.keys( ), "Unable to find \"args\" in forward_api_contents keys" assert 'output' in forward_api_contents.keys( diff --git a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py index 41af2c3f1506b8..eb42189d13be53 100644 --- a/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py +++ b/paddle/fluid/eager/auto_code_generator/generator/eager_gen.py @@ -23,7 +23,7 @@ from codegen_utils import FindGradName, FindForwardName, GetSavedName, GetGradNodeName from codegen_utils import IsPlainTensorType, IsVectorTensorType from codegen_utils import GetConstReference, RemoveConstAndReference -from codegen_utils import GetDygraphForwardFunctionName, GetIntermediateAPIFunctionName +from codegen_utils import GetDygraphForwardFunctionName, GetIntermediateAPIFunctionName, GetDygraphLogName from codegen_utils import GetAutoGradMetaName, GetAutoGradMetaVectorName from codegen_utils import RemoveSpecialSymbolsInName, RecoverBaseNameOfInplaceFunction from codegen_utils import GetInplacedFunctionName @@ -150,6 +150,7 @@ class {} : public egr::GradNodeBase {{ GRAD_FUNCTION_TEMPLATE = \ """ paddle::small_vector, egr::kSlotSmallVectorSize> {}::operator()(paddle::small_vector, egr::kSlotSmallVectorSize>& grads, bool create_graph, bool is_new_grad) {{ + VLOG(3) << \"Running AD API GRAD: \" << \"{}\"; // Fill Zero For GradIn Tensors {} // Apply Gradient Hooks @@ -165,8 +166,11 @@ class {} : public egr::GradNodeBase {{ {} // Inplace Strategy {} + + VLOG(5) << \"Running C++ API: \" << \"{}\"; + // Before log info +{} // Call grad_api function - VLOG(3) << \"Final State Running: {}\"; {} // Check NaN and Inf id needed {} @@ -174,6 +178,9 @@ class {} : public egr::GradNodeBase {{ {} // Create Grad Node {} + VLOG(4) << \"Finish AD API GRAD: {}"; + // LOG IF DEBUG + {} // Return {} }} @@ -182,6 +189,7 @@ class {} : public egr::GradNodeBase {{ FORWARD_FUNCTION_TEMPLATE = \ """ {} {}({}) {{ + VLOG(3) << \"Running AD API: \" << \"{}\"; // Dygraph Record Event {} // AMP Logic @@ -190,8 +198,11 @@ class {} : public egr::GradNodeBase {{ {} // Get Input AutoGradMeta {} - // Forward API Call - VLOG(3) << \"Final State Running: \" << \"{}\"; + + VLOG(5) << \"Running C++ API: \" << \"{}\"; + // Before log info +{} + // Forward API Call {} // Check NaN and Inf if needed {} @@ -206,27 +217,53 @@ class {} : public egr::GradNodeBase {{ {}{} // Node Creation {} + + VLOG(4) << \"Finish AD API: {}"; + // LOG IF DEBUG + {} // Returns return {}; }} """ +AFTER_LOG_PRINT_TEMPLATE = \ +""" + if(VLOG_IS_ON(4)){{ + const char* INPUT_PRINT_TEMPLATE = \"{{ Input: [%s], Output: [%s] }} \"; + {} + VLOG(4) << paddle::string::Sprintf(INPUT_PRINT_TEMPLATE, input_str, output_str); + }} +""" + +BEFORE_LOG_PRINT_TEMPLATE = \ +""" + if(VLOG_IS_ON(3)){{ + const char* INPUT_PRINT_TEMPLATE = \"{{ Input: [%s]}} \"; + {} + VLOG(3) << paddle::string::Sprintf(INPUT_PRINT_TEMPLATE, input_str); + }} +""" FORWARD_ONLY_FUNCTION_TEMPLATE = \ """ {} {}({}) {{ + 
VLOG(3) << \"Running AD API: \" << \"{}\"; // Dygraph Record Event {} // AMP Logic {} // Layout autotune +{} + VLOG(5) << \"Running C++ API: \" << \"{}\"; + // Before log info {} // Forward API Call - VLOG(3) << \"Final State Running: \" << \"{}\"; {} // Get Outputs {} - + VLOG(4) << \"Finish AD API: {}"; + // LOG IF DEBUG + {} // Returns return {}; }} @@ -400,15 +437,14 @@ class {} : public egr::GradNodeBase {{ """ LAYOUT_LOGIC_TEMPLATE=\ """ - if (paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune()) {{ - VLOG(5) << "Check and Prepare For LAYOUT"; + if (egr::Controller::Instance().UseLayoutAutoTune()) {{ paddle::small_vector, egr::kSlotSmallVectorSize> tensors_vector = {}; {} {} - paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune(); + VLOG(5) << "Check and Prepare For LAYOUT "<< op_name; + paddle::imperative::LayoutAutotuneGuard guard(egr::Controller::Instance().GetCurrentTracer(), false); {} {} - paddle::imperative::LayoutAutoTune::Instance().EnableLayoutAutoTune(); // Returns return {}; }} @@ -569,16 +605,16 @@ def DygraphYamlValidationCheck(self): forward_api_contents = self.forward_api_contents grad_api_contents = self.grad_api_contents - assert 'api' in forward_api_contents.keys( - ), "Unable to find \"api\" in api.yaml" + assert 'op' in forward_api_contents.keys( + ), "Unable to find \"op\" in ops.yaml" assert 'args' in forward_api_contents.keys( - ), "Unable to find \"args\" in api.yaml" + ), "Unable to find \"args\" in ops.yaml" assert 'output' in forward_api_contents.keys( - ), "Unable to find \"output\" in api.yaml" + ), "Unable to find \"output\" in ops.yaml" if grad_api_contents is not None: assert 'backward' in forward_api_contents.keys( - ), "Unable to find \"backward\" in api.yaml" + ), "Unable to find \"backward\" in ops.yaml" assert 'args' in grad_api_contents.keys( ), "Unable to find \"args\" in backward.yaml" assert 'output' in grad_api_contents.keys( @@ -867,7 +903,7 @@ def GenerateNodeCreationCodes(self, for_backward=False): set_grad_out_meta_list.append(set_grad_out_meta) set_grad_out_meta_str = "\n".join(set_grad_out_meta_list) - # SetOutRank & SetHistory & SetGradInMeta & CheckAndRetainGrad + # SetOutRank & SetHistory & SetGradInMeta set_out_rank_list = [] set_history_list = [] set_grad_in_meta_list = [] @@ -978,6 +1014,98 @@ def __init__(self, forward_api_contents, grad_api_contents, self.forward_definition_str = "" self.forward_declaration_str = "" + def GenerateForwardLayoutAutotune(self, forward_api_name, + amp_tensors_vector_list, + layout_tensors_vector_optional_list, + layout_autotune_list_str, + returns_type_str, returns_str, + amp_inputs_call_args_str): + intermediate_outputs = self.intermediate_outputs + forward_attrs_list = self.forward_attrs_list + forward_outputs_position_map = self.forward_outputs_position_map + num_outputs = len( + forward_outputs_position_map.keys()) - len(intermediate_outputs) + # for layout autotune attr + lightly_sensitive_attr = [ + 'axis', 'axes', 'dim', 'dims', 'start', 'end', 'stop' + ] + heavily_sensitive_attr = ['data_format', 'data_layout'] + layout_autotune_attr = [] + layout_autotune_attr_code_list = [] + layout_autotune_attr_type_list = [] + layout_autotune_attr_code_list.append( + f"auto op_name = phi::TransToFluidOpName(\"{forward_api_name}\");\n" + ) + + lightly_flag = False + heavily_flag = False + for name, atype, default_val, pos in forward_attrs_list: + for attr_name in lightly_sensitive_attr: + if name.find(attr_name) != -1 and (name + not in layout_autotune_attr): + 
lightly_flag = True + layout_autotune_attr.append(name) + layout_autotune_attr_type_list.append(atype) + if lightly_flag is False: + for attr_name in heavily_sensitive_attr: + if name.find(attr_name) != -1 and ( + name not in layout_autotune_attr): + layout_autotune_attr.append(name) + layout_autotune_attr_type_list.append(atype) + heavily_flag = True + if len(layout_autotune_attr) == 0: + layout_autotune_attr_code_list.append( + f"auto transformer = egr::EagerLayoutAutotune(op_name, tensors_vector);\n" + ) + elif len(layout_autotune_attr) == 1: + layout_autotune_attr_code_list.append( + f"auto transformer = egr::EagerLayoutAutotune<{layout_autotune_attr_type_list[0]}>(op_name, tensors_vector, &{layout_autotune_attr[0]});\n" + ) + elif len(layout_autotune_attr) == 2: + layout_autotune_attr_code_list.append( + f"auto transformer = egr::EagerLayoutAutotune<{layout_autotune_attr_type_list[0]}, {layout_autotune_attr_type_list[1]}>(op_name, tensors_vector, &{layout_autotune_attr[0]}, &{layout_autotune_attr[1]});\n" + ) + else: + layout_autotune_attr_code_list.append( + f"auto transformer = egr::EagerLayoutAutotune<{layout_autotune_attr_type_list[0]}>(op_name, tensors_vector,&{layout_autotune_attr[0]});\n" + ) + # Out tensor + layout_inputs_call_args_str = amp_inputs_call_args_str + forward_function_name = GetDygraphForwardFunctionName(forward_api_name) + layout_tmp_result_list = [] + layout_autotune_outs_list = [] + result_name = "api_result" + if num_outputs == 1: + result_name = returns_str + layout_autotune_outs_list.append( + f"transformer -> SetOutTensorLayout(&{returns_str});\n") + else: + for name, (rtype, pos) in forward_outputs_position_map.items(): + if name in intermediate_outputs: + continue + layout_autotune_outs_list.append( + f" auto& {name} = std::get<{len(layout_tmp_result_list)}>(api_result);\n" + ) + layout_autotune_outs_list.append( + f" transformer -> SetOutTensorLayout(&{name});\n") + layout_tmp_result_list.append(f"{name}") + + tensors_vector_list_str = "{ " + ",".join( + amp_tensors_vector_list) + " }" + + if len(amp_tensors_vector_list) == 0: + layout_logic_str = "" + else: + after_call_str = f"{returns_type_str} {result_name} = {forward_function_name}({layout_inputs_call_args_str});\n" + layout_logic_str = LAYOUT_LOGIC_TEMPLATE.format( + tensors_vector_list_str, + " ".join(layout_tensors_vector_optional_list), + " ".join(layout_autotune_attr_code_list) + " " + + layout_autotune_list_str, after_call_str, + " ".join(layout_autotune_outs_list), returns_str) + + return layout_logic_str + def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): namespace = self.namespace if self.forward_api_name[-1] == '_' and not is_inplaced: @@ -1013,7 +1141,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): layout_tensors_vector_optional_list = [] for name, (ttype, pos) in forward_inputs_position_map.items(): inputs_call_list[pos] = f"{name}" - amp_inputs_call_list[pos] = f"NEW_{name}" + amp_inputs_call_list[pos] = f"new_{name}" is_optional = (name in optional_inputs) if IsPlainTensorType(ttype): if is_optional: @@ -1026,13 +1154,13 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): f"if ({name}) amp_tensors_vector.push_back({{ *{name} }});\n" ) amp_autocast_optional_list.append( - f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" + f"auto new_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" ) layout_tensors_vector_optional_list.append( f"if ({name}) tensors_vector.push_back({{ 
*{name} }});\n" ) layout_autotune_optional_list.append( - f"auto NEW_{name} = transformer->TransInTensor(\"{name}\", {name});\n" + f"auto new_{name} = transformer->TransInTensor(\"{name}\", {name});\n" ) else: if is_inplaced and forward_inplace_map and name in forward_inplace_map.keys( @@ -1040,16 +1168,16 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): arg_str = f"paddle::experimental::Tensor& {name}" amp_tensors_vector_list.append(f"{{{name}}}") amp_autocast_list.append( - f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" + f"auto new_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" ) else: arg_str = f"const paddle::experimental::Tensor& {name}" amp_tensors_vector_list.append(f"{{{name}}}") amp_autocast_list.append( - f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" + f"auto new_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n" ) layout_autotune_list.append( - f"auto NEW_{name} = transformer->TransInTensor(\"{name}\", {name});\n" + f"auto new_{name} = transformer->TransInTensor(\"{name}\", {name});\n" ) else: assert IsVectorTensorType(ttype) @@ -1063,10 +1191,10 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): f"if ({name}) amp_tensors_vector.push_back( *{name} );\n" ) amp_autocast_optional_list.append( - f"auto NEW_{name} = egr::EagerAmpAutoCasts(\"{name}\", {name}, amp_dst_dtype, op_name);\n" + f"auto new_{name} = egr::EagerAmpAutoCasts(\"{name}\", {name}, amp_dst_dtype, op_name);\n" ) layout_autotune_optional_list.append( - f"auto NEW_{name} = transformer->TransInTensor(\"{name}\", {name});\n" + f"auto new_{name} = transformer->TransInTensors(\"{name}\", {name});\n" ) else: if is_inplaced and forward_inplace_map and name in forward_inplace_map.keys( @@ -1076,60 +1204,15 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): arg_str = f"const std::vector& {name}" amp_tensors_vector_list.append(f"{name}") amp_autocast_list.append( - f"auto NEW_{name} = egr::EagerAmpAutoCasts(\"{name}\", {name}, amp_dst_dtype, op_name);\n" + f"auto new_{name} = egr::EagerAmpAutoCasts(\"{name}\", {name}, amp_dst_dtype, op_name);\n" ) layout_autotune_list.append( - f"auto NEW_{name} = transformer->TransInTensor(\"{name}\", {name});\n" + f"auto new_{name} = transformer->TransInTensors(\"{name}\", {name});\n" ) inputs_args_definition_list[pos] = arg_str inputs_args_declaration_list[pos] = arg_str - # for layout autotune attr - lightly_sensitive_attr = [ - 'axis', 'axes', 'dim', 'dims', 'start', 'end', 'stop' - ] - heavily_sensitive_attr = ['data_format', 'data_layout'] - layout_autotune_attr = [] - layout_autotune_attr_code_list = [] - layout_autotune_attr_type_list = [] - layout_autotune_attr_code_list.append( - f"auto op_name = phi::TransToFluidOpName(\"{forward_api_name}\");\n" - ) - - lightly_flag = False - heavily_flag = False - for name, atype, default_val, pos in forward_attrs_list: - for attr_name in lightly_sensitive_attr: - if name.find( - attr_name) != -1 and name not in layout_autotune_attr: - lightly_flag = True - layout_autotune_attr.append(name) - layout_autotune_attr_type_list.append(atype) - if lightly_flag is False: - for attr_name in heavily_sensitive_attr: - if name.find(attr_name - ) != -1 and name not in layout_autotune_attr: - layout_autotune_attr.append(name) - layout_autotune_attr_type_list.append(atype) - heavily_flag = True - if len(layout_autotune_attr) == 0: - 
layout_autotune_attr_code_list.append( - f"auto transformer = egr::EagerLayoutAutotune(op_name, tensors_vector);\n" - ) - elif len(layout_autotune_attr) == 1: - layout_autotune_attr_code_list.append( - f"auto transformer = egr::EagerLayoutAutotune<{layout_autotune_attr_type_list[0]}>(op_name, tensors_vector, &{layout_autotune_attr[0]});\n" - ) - elif len(layout_autotune_attr) == 2: - layout_autotune_attr_code_list.append( - f"auto transformer = egr::EagerLayoutAutotune<{layout_autotune_attr_type_list[0]}, {layout_autotune_attr_type_list[1]}>(op_name, tensors_vector, &{layout_autotune_attr[0]}, &{layout_autotune_attr[1]});\n" - ) - else: - layout_autotune_attr_code_list.append( - f"auto transformer = egr::EagerLayoutAutotune(op_name, tensors_vector, {len(layout_autotune_attr)});\n" - ) - # forward attrs for name, atype, default_val, pos in forward_attrs_list: inputs_call_list[pos] = name @@ -1219,6 +1302,7 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): returns_str = f"{returns_type_str}{{{returns_str}}}" # Node Creation Pre-Processing + inputs_names = [] if not self.is_forward_only: # 1. Get Input AutoGradMeta inputs_autograd_meta_list = [] @@ -1294,7 +1378,8 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): node_creation_str = self.node_creation_str dygraph_event_str = f"{indent}paddle::platform::RecordEvent dygraph_entrance_record_event(\"{forward_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);\n" - forward_function_name = GetDygraphForwardFunctionName(forward_api_name) + forward_ad_function_name = GetDygraphForwardFunctionName( + forward_api_name) # Forward amp logic kernel_trans2_op_name_str = f"auto op_name = phi::TransToFluidOpName(\"{forward_api_name}\");" @@ -1307,9 +1392,10 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): amp_autocast_list) + " " + " ".join( amp_autocast_optional_list) amp_inputs_call_args_str = ", ".join(amp_inputs_call_list) - amp_call_str = f"return {forward_function_name}({amp_inputs_call_args_str});" + amp_call_str = f"return {forward_ad_function_name}({amp_inputs_call_args_str});" if is_inplaced or (forward_api_name == "cast"): - amp_logic_str = "" + amp_logic_str = "\n VLOG(5) << \" No AMP for {} because it is a inplace or cast api. 
\"; ".format( + forward_ad_function_name) else: amp_logic_str = AMP_LOGIC_TEMPLATE.format( kernel_trans2_op_name_str, amp_tensors_vector_list_str, @@ -1317,54 +1403,52 @@ def GenerateForwardDefinitionAndDeclaration(self, is_inplaced): amp_autocast_list_str, amp_call_str) # Forward layout autotune - layout_inputs_call_args_str = amp_inputs_call_args_str - layout_tmp_result_list = [] - layout_autotune_outs_list = "" - if num_outputs == 1: - layout_autotune_outs_list += f"{indent}auto {returns_str} = api_result;\n" - layout_autotune_outs_list += f"{indent}transformer -> SetOutTensorLayout(&{returns_str});\n" - else: - for name, (rtype, pos) in forward_outputs_position_map.items(): - if name in intermediate_outputs: - continue - layout_autotune_outs_list += f"{indent}auto& {name} = std::get<{len(layout_tmp_result_list)}>(api_result);\n" - layout_autotune_outs_list += f"{indent}transformer -> SetOutTensorLayout(&{name});\n" - layout_tmp_result_list.append(f"{name}") + layout_autotune_list_str = " ".join( + layout_autotune_list) + " ".join(layout_autotune_optional_list) + layout_logic_str = self.GenerateForwardLayoutAutotune( + forward_api_name, amp_tensors_vector_list, + layout_tensors_vector_optional_list, layout_autotune_list_str, + returns_type_str, returns_str, amp_inputs_call_args_str) + + # For inputs outputs prepare for logging + var_str = f"\n{indent} std::string input_str = \"\";" + var_str += f"\n{indent} std::string output_str = \"\";" + for name, (ttype, pos) in forward_inputs_position_map.items(): + var_str += f"\n{indent} const char* TENSOR_{name.upper()}_TEMPLATE = \"({name}, [%s]), \";" + var_str += f"\n{indent} std::string input_{name}_str = paddle::string::Sprintf(TENSOR_{name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({name}));" + var_str += f"\n{indent} input_str += input_{name}_str; " - if returns_type_str == "paddle::experimental::Tensor&" or forward_api_name == "slice" or forward_api_name == "strided_slice" or len( - layout_autotune_attr) == 0: - layout_logic_str = "" - else: - # after_call_str = f"return {forward_function_name}({layout_inputs_call_args_str});\n" - after_call_str = f"auto api_result = {forward_function_name}({layout_inputs_call_args_str});\n" - layout_logic_str = LAYOUT_LOGIC_TEMPLATE.format( - amp_tensors_vector_list_str, - " ".join(layout_tensors_vector_optional_list), - " ".join(layout_autotune_attr_code_list) + " " + - " ".join(layout_autotune_list) + - " ".join(layout_autotune_optional_list), after_call_str, - layout_autotune_outs_list, returns_str) + before_log_str = BEFORE_LOG_PRINT_TEMPLATE.format(var_str) + for name, (ttype, pos) in forward_outputs_position_map.items(): + var_str += f"\n{indent} const char* TENSOR_{name.upper()}_TEMPLATE = \"({name}, [%s]), \";" + var_str += f"\n{indent} std::string output_{name}_str = paddle::string::Sprintf(TENSOR_{name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({name}));" + var_str += f"\n{indent} output_str += output_{name}_str; " + + log_str = AFTER_LOG_PRINT_TEMPLATE.format(var_str) # Generate forward_definition_str and forward_declaration_str if self.is_forward_only: if len(amp_tensors_vector_list) == 0: - amp_logic_str = "" + amp_logic_str = "\n VLOG(7) << \" No AMP for {} because it has no input. 
\"; ".format( + forward_ad_function_name) self.forward_definition_str += FORWARD_ONLY_FUNCTION_TEMPLATE.format( - returns_type_str, forward_function_name, - inputs_args_definition_str, dygraph_event_str, amp_logic_str, - layout_logic_str, forward_function_name, forward_call_str, - get_outputs_str, returns_str) + returns_type_str, forward_ad_function_name, + inputs_args_definition_str, forward_api_name, dygraph_event_str, + amp_logic_str, layout_logic_str, forward_api_name, + before_log_str, forward_call_str, get_outputs_str, + forward_api_name, log_str, returns_str) else: self.forward_definition_str += FORWARD_FUNCTION_TEMPLATE.format( - returns_type_str, forward_function_name, - inputs_args_definition_str, dygraph_event_str, amp_logic_str, - layout_logic_str, inputs_autograd_meta_str, - forward_function_name, forward_call_str, check_nan_inf_str, - get_outputs_str, outputs_autograd_meta_str, + returns_type_str, forward_ad_function_name, + inputs_args_definition_str, forward_api_name, dygraph_event_str, + amp_logic_str, layout_logic_str, inputs_autograd_meta_str, + forward_api_name, before_log_str, forward_call_str, + check_nan_inf_str, get_outputs_str, outputs_autograd_meta_str, compute_require_grad_args_str, check_inplace_str, - bump_inplace_version_str, node_creation_str, returns_str) + bump_inplace_version_str, node_creation_str, forward_api_name, + log_str, returns_str) - self.forward_declaration_str += f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});\n" + self.forward_declaration_str += f"{returns_type_str} {forward_ad_function_name}({inputs_args_declaration_str});\n" def GenerateInplacedForwardDygraphFunctions(self): # Inplaced Version Dygraph Function Generation @@ -1485,7 +1569,7 @@ def GenerateHigherOrderNodeCreationCode(self): if next_grad_api_contents: # Fake forward_api_contents and backward_api_contents forward_api_contents = grad_api_contents - forward_api_contents['api'] = forward_api_contents['backward_api'] + forward_api_contents['op'] = forward_api_contents['backward_op'] backward_api_contents = next_grad_api_contents next_node_generator = DygraphFunctionGeneratorBase( @@ -1770,7 +1854,8 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, forward_api_name = self.grad_api_contents['invoke'].split( '(')[0].strip() autograd_api = self.grad_api_contents['invoke'].replace( - forward_api_name, forward_api_name + '_dygraph_function', 1) + forward_api_name, + GetDygraphForwardFunctionName(forward_api_name), 1) grad_function_call_str = f""" if (trace_backward) {{ {indent}{autograd_api_out} api_output = {autograd_api}; @@ -1839,13 +1924,42 @@ def GenerateNodeDefinition(self, next_grad_node_creation_str, returns_str += f"{indent}return returns;\n" grad_node_name = GetGradNodeName(self.backward_api_name) + # For inputs outputs prepare for logging + var_str = f"\n{indent} std::string input_str = \"\";" + var_str += f"\n{indent} std::string output_str = \"\";" + for name, (ttype, fwd_position, + grad_api_position) in backward_grad_inputs_map.items(): + new_name = self.TransformToNextGradName(name) + var_str += f"\n{indent} const char* TENSOR_{new_name.upper()}_TEMPLATE = \"({new_name}, [%s]), \";" + var_str += f"\n{indent} std::string input_{new_name}_str = paddle::string::Sprintf(TENSOR_{new_name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({new_name}));" + var_str += f"\n{indent} input_str += input_{new_name}_str; " + + for name, (backward_input_type, is_fwd_input, + grad_api_position), in backward_forward_inputs_map.items(): + new_name = 
self.TransformToNextGradName(name) + var_str += f"\n{indent} const char* TENSOR_{new_name.upper()}_TEMPLATE = \"({new_name}, [%s]), \";" + var_str += f"\n{indent} std::string input_{new_name}_str = paddle::string::Sprintf(TENSOR_{new_name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({new_name}));" + var_str += f"\n{indent} input_str += input_{new_name}_str; " + + before_log_str = BEFORE_LOG_PRINT_TEMPLATE.format(var_str) + + for name, (ttype, fwd_position, + grad_api_position) in backward_grad_outputs_map.items(): + new_name = self.TransformToNextGradName(name) + var_str += f"\n{indent} const char* TENSOR_{new_name.upper()}_TEMPLATE = \"({new_name}, [%s]), \";" + var_str += f"\n{indent} std::string output_{new_name}_str = paddle::string::Sprintf(TENSOR_{new_name.upper()}_TEMPLATE, egr::EagerUtils::TensorStr({new_name}));" + var_str += f"\n{indent} output_str += output_{new_name}_str; " + + log_str = AFTER_LOG_PRINT_TEMPLATE.format(var_str) self.node_definition_str = GRAD_FUNCTION_TEMPLATE.format( - grad_node_name, fill_zero_str, get_grad_in_args_str, - grad_function_prepare_str, compute_require_next_grad_str, - inplace_check_str, inplace_for_grad_outs_str, grad_node_name, + grad_node_name, self.backward_api_name, fill_zero_str, + get_grad_in_args_str, grad_function_prepare_str, + compute_require_next_grad_str, inplace_check_str, + inplace_for_grad_outs_str, self.backward_api_name, before_log_str, grad_function_call_str, check_nan_inf_str, - outputs_autograd_meta_str, next_grad_node_creation_str, returns_str) + outputs_autograd_meta_str, next_grad_node_creation_str, + self.backward_api_name, log_str, returns_str) def run(self): super().run() @@ -1914,11 +2028,11 @@ def GenerateCode(self): grad_api_dict = self.grad_api_dict forward_apis_dict = {} for api_item in forward_api_list: - forward_apis_dict[api_item['api']] = api_item + forward_apis_dict[api_item['op']] = api_item namespace = self.namespace for forward_api_contents in forward_api_list: - if forward_api_contents['api'] in black_ops_list: continue + if forward_api_contents['op'] in black_ops_list: continue self.CollectIsForwardOnly(forward_api_contents) @@ -1959,8 +2073,7 @@ def GenerateCode(self): forward_api_contents = backward_api_contents # Fake forward_api_content - forward_api_contents['api'] = forward_api_contents[ - 'backward_api'] + forward_api_contents['op'] = forward_api_contents['backward_op'] backward_api_contents = next_grad_api_contents if len(namespace) > 0: @@ -2043,7 +2156,7 @@ def GenerateForwardHFile(filepath, forward_function_declaration_str): api_yaml_path = api_yaml_paths[i] # string api is forwrad only - if not api_yaml_path.endswith('strings_api.yaml'): + if not api_yaml_path.endswith('strings_ops.yaml'): backward_yaml_path = backward_yaml_paths[i] else: backward_yaml_path = None diff --git a/paddle/fluid/eager/backward.cc b/paddle/fluid/eager/backward.cc index b70ec78c7598cc..b80d0830660fcf 100644 --- a/paddle/fluid/eager/backward.cc +++ b/paddle/fluid/eager/backward.cc @@ -71,7 +71,6 @@ std::unordered_map getInDegreeMap( // Enforce GradNode has TensorWrappers as Input void EnforceGradNodeHasInput(GradNodeBase* node) { - VLOG(6) << "Running in EnforceGradNodeHasInput"; PADDLE_ENFORCE_NE( node->IsTensorWrappersCleared(), true, @@ -133,7 +132,7 @@ std::vector RunBackward( AutogradMeta* auto_grad_meta = EagerUtils::nullable_autograd_meta(tensor); if (auto_grad_meta == nullptr) { - VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + VLOG(5) << "Skip auto grad since there is no grad op for 
var or loss is " "stop_gradient=True: " << tensor.name(); continue; @@ -141,14 +140,14 @@ std::vector RunBackward( // Get grad input info from target tensors auto input_info = auto_grad_meta->OutRankInfo(); - VLOG(2) << "Out Rank of Tensor is slot: " << input_info.first + VLOG(5) << "Out Rank of Tensor is slot: " << input_info.first << ", rank: " << input_info.second; // Get target GradNodeBase from target tensors auto shared_grad_node = auto_grad_meta->GetMutableGradNode(); if (shared_grad_node == nullptr || shared_grad_node.get() == nullptr || auto_grad_meta->StopGradient()) { - VLOG(3) << "Skip auto grad since there is no grad op for var or loss is " + VLOG(5) << "Skip auto grad since there is no grad op for var or loss is " "stop_gradient=True: " << tensor.name(); continue; @@ -169,7 +168,7 @@ std::vector RunBackward( // Prepare GradTensorHolder if (!node_input_buffers_dict.count(grad_node)) { - VLOG(6) << "Create Value for grad input tensor " << i + VLOG(5) << "Create Value for grad input tensor " << i << " of grad node: " << grad_node->name(); node_input_buffers_dict[grad_node] = std::make_unique(grad_node->InputMeta()); @@ -184,13 +183,13 @@ std::vector RunBackward( "grad_tensors should either have " "size = 0 or same size as tensors.")); // Feed given tensor if it's provided - VLOG(6) << "Fill grad input tensor " << i << "with give grad tensor"; + VLOG(3) << "Fill grad input tensor " << i << "with give grad tensor"; // Deep copy node_input_buffers_dict[grad_node]->CopyValueFromTensor( input_info.first, input_info.second, grad_tensors[i]); } else { - VLOG(6) << "Fill grad input tensor " << i << " with 1.0"; + VLOG(3) << "Fill grad input tensor " << i << " with 1.0"; // Initialize tensor with 1.0 // Forward Tensor "tensor" is passed to indicate tensortype, datatype and // dims @@ -210,12 +209,12 @@ std::vector RunBackward( inputs, no_grad_vars, orig_queue, &queue, node_input_buffers_dict); } - VLOG(6) << "Update In degree Map for backward"; + VLOG(5) << "Update In degree Map for backward"; // 3. Compute in_degree for each node std::unordered_map node_in_degree_map = getInDegreeMap(queue); - VLOG(3) << "Startup_ops's size is " << queue.size(); + VLOG(5) << "Startup_ops's size is " << queue.size(); /* --- Topological Visit --- */ // 1. Pop queue @@ -224,11 +223,10 @@ std::vector RunBackward( // |- node(grads) // |- Prepare for next node // 3. 
Update queue - VLOG(3) << "Run Backward"; while (!queue.empty()) { GradNodeBase* node = queue.front(); - VLOG(3) << "Running GradNode:" << node->name() << " addr:" << node; - + VLOG(3) << "Preparing GradNode:" << node->name() << " addr:" << node; + VLOG(4) << EagerUtils::GradNodeStr(*node); paddle::platform::RecordEvent node_record_event( std::string((*node).name()), paddle::platform::TracerEventType::Operator, @@ -255,7 +253,7 @@ std::vector RunBackward( // Check input EnforceGradNodeHasInput(node); - VLOG(6) << "Run Backward Kernel with GradTensorHolder."; + VLOG(7) << "Run Backward Kernel with GradTensorHolder."; // Run Pre Backward Node and get outputs paddle::small_vector, kSlotSmallVectorSize> @@ -269,7 +267,7 @@ std::vector RunBackward( // retain_grad or not if (!retain_graph) { - VLOG(6) + VLOG(3) << "retain_graph is false, need to clear the TensorWrapper of nodes."; node->ClearTensorWrappers(); } @@ -322,11 +320,11 @@ std::vector RunBackward( if ((!grad_output_tensor.defined() || !grad_output_tensor.initialized())) { - VLOG(6) << "We get grad_output_tensor with slot: " << i + VLOG(7) << "We get grad_output_tensor with slot: " << i << ", rank: " << j << " as uninitialized or undefined tensor"; } - VLOG(6) << "Get Edge and grad_output_tensor with slot: " << i + VLOG(7) << "Get Edge and grad_output_tensor with slot: " << i << ", rank: " << j << " 's name is: " << grad_output_tensor.name(); @@ -335,12 +333,12 @@ std::vector RunBackward( const auto& input_meta = next_node->InputMeta(); auto grad_tensor_holder = std::make_unique(input_meta); - VLOG(6) << "Construct GradTensorHolder for grad node: " + VLOG(7) << "Construct GradTensorHolder for grad node: " << next_node->name(); node_input_buffers_dict[next_node] = std::move(grad_tensor_holder); } - VLOG(6) << "Sum grad inputs for edge slot: " << edge_rank.first + VLOG(3) << "Sum grad inputs for edge slot: " << edge_rank.first << ", rank: " << edge_rank.second; node_input_buffers_dict[next_node]->add(edge_rank.first, @@ -350,7 +348,7 @@ std::vector RunBackward( // Update queue node_in_degree_map[next_node]--; - VLOG(6) << next_node->name() + VLOG(7) << next_node->name() << " ref_cnt is: " << node_in_degree_map[next_node]; PADDLE_ENFORCE( @@ -382,7 +380,7 @@ std::vector RunBackward( } } - VLOG(6) << "Run Backward Final hook size: " + VLOG(7) << "Run Backward Final hook size: " << egr::Controller::Instance().FinalBackwardHooks().size(); for (auto& hook : egr::Controller::Instance().FinalBackwardHooks()) { (*hook)(); @@ -390,6 +388,7 @@ std::vector RunBackward( egr::Controller::Instance().ClearFinalBackwardHooks(); if (!is_general_grad) return {}; return GeneralGrad::Instance().GetResults(inputs, allow_unused, create_graph); + VLOG(3) << "Finish Backward"; } void Backward( diff --git a/paddle/fluid/eager/custom_operator/custom_operator_node.cc b/paddle/fluid/eager/custom_operator/custom_operator_node.cc index 1f0a055cbd3863..57932ec4c1e693 100644 --- a/paddle/fluid/eager/custom_operator/custom_operator_node.cc +++ b/paddle/fluid/eager/custom_operator/custom_operator_node.cc @@ -217,18 +217,20 @@ RunCustomOpNode::operator()( VLOG(6) << "Prepare Grad outputs for size: " << grad_outputs_names.size(); for (size_t i = 0; i < OutputMeta().size(); i++) { if (map[0][0].find(i) != map[0][0].end()) { + int grad_output_idx = map[0][0][i]; VLOG(7) << "Insert grad outputs: " << i - << " with size: " << OutputMeta()[i].size() - << " to tmp_outputs: " << map[0][0][i]; - for (size_t j = 0; j < OutputMeta()[i].size(); j++) { - outs[i].emplace_back(/* init it 
incase of copy nullptr of shared_ptr */ - std::make_shared( - phi::DataType::UNDEFINED), - egr::Controller::Instance().GenerateUniqueName( - "custom_tmp_grad")); - egr::EagerUtils::autograd_meta(&(outs[i][j])); + << " with size: " << OutputMeta()[grad_output_idx].size() + << " to tmp_outputs: " << grad_output_idx; + for (size_t j = 0; j < OutputMeta()[grad_output_idx].size(); j++) { + outs[grad_output_idx] + .emplace_back(/* init it incase of copy nullptr of shared_ptr */ + std::make_shared( + phi::DataType::UNDEFINED), + egr::Controller::Instance().GenerateUniqueName( + "custom_tmp_grad")); + egr::EagerUtils::autograd_meta(&(outs[grad_output_idx][j])); } - tmp_outs[map[0][0][i]] = outs[i]; + tmp_outs[grad_output_idx] = outs[grad_output_idx]; } } for (size_t i = 0; i < tmp_outs.size(); i++) { @@ -408,17 +410,19 @@ RunCustomOpDoubleGradNode::operator()( for (size_t i = 0; i < OutputMeta().size(); i++) { if (map[1][0].find(i) != map[1][0].end()) { + int grad_output_idx = map[1][0][i]; VLOG(7) << "Insert grad outputs: " << i - << " with size: " << OutputMeta()[i].size() - << " to tmp_outputs: " << map[1][0][i]; - for (size_t j = 0; j < OutputMeta()[i].size(); j++) { - outs[i].emplace_back(/* init it incase of copy nullptr of shared_ptr */ - std::make_shared( - phi::DataType::UNDEFINED), - egr::Controller::Instance().GenerateUniqueName( - "custom_tmp_grad")); + << " with size: " << OutputMeta()[grad_output_idx].size() + << " to tmp_outputs: " << grad_output_idx; + for (size_t j = 0; j < OutputMeta()[grad_output_idx].size(); j++) { + outs[grad_output_idx] + .emplace_back(/* init it incase of copy nullptr of shared_ptr */ + std::make_shared( + phi::DataType::UNDEFINED), + egr::Controller::Instance().GenerateUniqueName( + "custom_tmp_grad")); } - tmp_outs[map[1][0][i]] = outs[i]; + tmp_outs[grad_output_idx] = outs[grad_output_idx]; } } for (size_t i = 0; i < tmp_outs.size(); i++) { diff --git a/paddle/fluid/eager/eager_amp_auto_cast.h b/paddle/fluid/eager/eager_amp_auto_cast.h index 4ebc2860c59d9b..42961b84bcdb02 100644 --- a/paddle/fluid/eager/eager_amp_auto_cast.h +++ b/paddle/fluid/eager/eager_amp_auto_cast.h @@ -45,7 +45,7 @@ inline paddle::experimental::Tensor Cast( const bool trace_backward = true) { if (input.is_sparse_coo_tensor() || input.is_sparse_csr_tensor()) { if (trace_backward) { - return sparse::cast_dygraph_function( + return sparse::cast_ad_func( input, paddle::experimental::DataType::UNDEFINED, dst_dtype); } else { return paddle::experimental::sparse::cast( @@ -53,7 +53,7 @@ inline paddle::experimental::Tensor Cast( } } else { if (trace_backward) { - return cast_dygraph_function(input, dst_dtype); + return cast_ad_func(input, dst_dtype); } else { return paddle::experimental::cast(input, dst_dtype); } @@ -87,7 +87,7 @@ inline paddle::experimental::Tensor EagerAmpAutoCast( const std::string& op_name, bool trace_backward = true) { VLOG(6) << "AMP AmpAutoCasts:" - << " input(" << input_name << ") dst_dtype(" + << " input(" << egr::EagerUtils::TensorStr(input) << " to dst_dtype(" << paddle::framework::DataType2String(dst_dtype) << ")."; if (dst_dtype == paddle::experimental::DataType::FLOAT16) { if (op_name == "run_program") { @@ -107,6 +107,7 @@ inline paddle::experimental::Tensor EagerAmpAutoCast( } } if (NeedCast(input, dst_dtype)) { + VLOG(6) << "Input : " << input.impl() << "NeedCast"; return Cast(input, dst_dtype, trace_backward); } return input; diff --git a/paddle/fluid/eager/eager_layout_auto_tune.h b/paddle/fluid/eager/eager_layout_auto_tune.h index 
eebdd9caa6d5cf..7e0f916a7d3c02 100644 --- a/paddle/fluid/eager/eager_layout_auto_tune.h +++ b/paddle/fluid/eager/eager_layout_auto_tune.h @@ -19,43 +19,63 @@ #include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/phi/backends/gpu/gpu_info.h" namespace egr { +inline bool NeedTransLayout( + const paddle::small_vector, + kSlotSmallVectorSize>& tensors_vector, + const paddle::experimental::DataLayout& layout) { + for (size_t i = 0; i < tensors_vector.size(); i++) { + for (size_t idx = 0; idx < tensors_vector[0].size(); idx++) { + if (layout != tensors_vector[i][idx].layout()) { + return true; + } + } + } + return false; +} -// layout_agnostic_ops_ -// For agnostic op like add / relu inline std::shared_ptr EagerLayoutAutotune( const std::string& op_name, const paddle::small_vector, kSlotSmallVectorSize>& tensors_vector) { - VLOG(3) << " Optimze Layout agnostic op: " << op_name; - std::shared_ptr transposer = nullptr; - transposer = - std::make_shared(op_name, tensors_vector); - return transposer; + // For agnostic op like add, relu, exp + auto first_layout = tensors_vector[0][0].layout(); + auto desired_layout = DesiredLayout(); + bool is_started = + !(desired_layout == paddle::experimental::DataLayout::UNDEFINED); + if (is_started && NeedTransLayout(tensors_vector, first_layout)) { + bool need_trans_back = false; + for (size_t i = 0; i < tensors_vector.size(); i++) { + for (size_t idx = 0; idx < tensors_vector[0].size(); idx++) { + if (4 != tensors_vector[i][idx].shape().size()) { + need_trans_back = true; + } + } + } + auto final_layout = need_trans_back ? DefaultLayout() : desired_layout; + VLOG(4) << op_name << "'s has different layout, need trans to " + << final_layout; + return std::make_shared( + op_name, tensors_vector, final_layout); + } + return std::make_shared( + op_name, tensors_vector, first_layout); } -// For lightly op like reduce template inline std::shared_ptr EagerLayoutAutotune( const std::string& op_name, const paddle::small_vector, kSlotSmallVectorSize>& tensors_vector, T* attr) { - std::shared_ptr transposer = nullptr; - bool unstart = - (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() == - paddle::experimental::DataLayout::UNDEFINED); - if (unstart) { - VLOG(3) << "Optimze Layout was not started" << op_name; - transposer = - std::make_shared(op_name, tensors_vector); - return transposer; + // For lightly op like reduce + if (!(DesiredLayout() == paddle::experimental::DataLayout::UNDEFINED)) { + VLOG(4) << "LayoutAutotune was unstarted. 
Current op :" << op_name; + return std::make_shared( + op_name, tensors_vector, tensors_vector[0][0].layout()); } - transposer = - std::make_shared(op_name); - return transposer; + return std::make_shared(op_name); } -// For lightly op like argmax template inline std::shared_ptr EagerLayoutAutotune( const std::string& op_name, @@ -63,33 +83,25 @@ inline std::shared_ptr EagerLayoutAutotune( kSlotSmallVectorSize>& tensors_vector, T1* axis, T2* keep_dim) { + // For lightly op like argmax return EagerLayoutAutotune(op_name, tensors_vector, axis); } -// heavily string data_format data_layout template <> inline std::shared_ptr EagerLayoutAutotune( const std::string& op_name, const paddle::small_vector, kSlotSmallVectorSize>& tensors_vector, std::string* attr) { - VLOG(3) << " Optimze Layout heavily op: " << op_name; - auto transposer = - std::make_shared(op_name, tensors_vector); - if (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() == - paddle::experimental::DataLayout::UNDEFINED) { + // Heavily op with (string) data_format, data_layout + auto transposer = std::make_shared( + op_name, tensors_vector, tensors_vector[0][0].layout()); + if (DesiredLayout() == paddle::experimental::DataLayout::UNDEFINED) { // Layout autotune only supports model with convolutional layers - VLOG(3) << "Optimze Layout was not started" << op_name; if (op_name != "conv2d") { + VLOG(4) << "LayoutAutotune was unstarted. Current op :" << op_name; return transposer; } else { -#if defined(PADDLE_WITH_CUDA) - if (paddle::platform::is_gpu_place(tensors_vector[0][0].place()) && - !phi::backends::gpu::TensorCoreAvailable()) { - paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune(); - return transposer; - } -#endif auto data_type = tensors_vector[0][0].dtype(); bool is_tune_fp32 = (data_type == paddle::experimental::DataType::FLOAT32) && @@ -97,6 +109,8 @@ inline std::shared_ptr EagerLayoutAutotune( bool is_tune_fp16 = (data_type == paddle::experimental::DataType::FLOAT16) && (*attr == "NCHW"); + VLOG(4) << "LayoutAutoTune assert with dtype and layout, Current op : " + << op_name; if (is_tune_fp32) { paddle::imperative::LayoutAutoTune::Instance().SetDesiredLayout( paddle::experimental::DataLayout::NCHW); @@ -109,58 +123,45 @@ inline std::shared_ptr EagerLayoutAutotune( paddle::imperative::LayoutAutoTune::Instance().SetDefaultLayout( paddle::experimental::DataLayout::NCHW); } else { - paddle::imperative::LayoutAutoTune::Instance().DisableLayoutAutoTune(); + VLOG(4) << "DisableLayoutAutoTune accoding to Conv op" + << " dtype : " << data_type << " format : " << (*attr); + egr::Controller::Instance().DisableLayoutAutoTune(); return transposer; } - VLOG(3) << "Tune the layout from " << attr << " to " - << paddle::framework::DataLayoutToString( - paddle::imperative::LayoutAutoTune::Instance() - .GetDesiredLayout()); + VLOG(4) << "LayoutAutoTune from " << *attr << " to " << DesiredLayout(); } } if (paddle::imperative::LayoutAutoTune::Instance().IsHeavilyLayoutSensitive( op_name)) { - auto heavily_transposer = - std::make_shared(op_name, - attr); - return heavily_transposer; + return std::make_shared(op_name, + attr); } - VLOG(3) << op_name - << "'s LayoutTransformer is unimplemented. 
Use default " - "LayoutTransformer instead."; - return transposer; + return std::make_shared(op_name); } -// lightly transpose template <> inline std::shared_ptr EagerLayoutAutotune( const std::string& op_name, const paddle::small_vector, kSlotSmallVectorSize>& tensors_vector, std::vector* attr) { - std::shared_ptr transposer = nullptr; - if (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() == - paddle::experimental::DataLayout::UNDEFINED) { - VLOG(3) << " Optimze Layout Unstarted : " << op_name; - transposer = - std::make_shared(op_name, tensors_vector); - return transposer; + // lightly transpose + if (DesiredLayout() == paddle::experimental::DataLayout::UNDEFINED) { + VLOG(4) << "LayoutAutotune was unstarted. Current op :" << op_name; + return std::make_shared( + op_name, tensors_vector, tensors_vector[0][0].layout()); } - VLOG(3) << " Optimze Layout lightly op: " << op_name; - if (op_name == "transpose2") { + + if (op_name == "transpose2" && + (tensors_vector[0][0].layout() == DesiredLayout())) { auto trans = std::make_shared(op_name); - if (tensors_vector[0][0].layout() == - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout()) { - trans->SetAttr(attr, - tensors_vector[0][0].layout() == - paddle::experimental::DataLayout::NHWC); - return trans; - } + trans->SetAttr(attr, + tensors_vector[0][0].layout() == + paddle::experimental::DataLayout::NHWC); + return trans; } - transposer = - std::make_shared(op_name); - return transposer; + return std::make_shared(op_name); } // lightly int argmax @@ -172,33 +173,24 @@ EagerLayoutAutotune( kSlotSmallVectorSize>& tensors_vector, paddle::experimental::Scalar* axis, bool* keep_dim) { - std::shared_ptr transposer = nullptr; - if (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() == - paddle::experimental::DataLayout::UNDEFINED) { - VLOG(3) << " Optimze Layout Unstarted : " << op_name; - transposer = - std::make_shared(op_name, tensors_vector); - return transposer; + if (DesiredLayout() == paddle::experimental::DataLayout::UNDEFINED) { + VLOG(4) << "LayoutAutotune was unstarted. 
Current op :" << op_name; + return std::make_shared( + op_name, tensors_vector, tensors_vector[0][0].layout()); } - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - if (op_name == "argmax") { + + if (op_name == "argmax" && + (tensors_vector[0][0].layout() == DesiredLayout()) && (*keep_dim)) { std::shared_ptr argmax_transform = nullptr; argmax_transform = std::make_shared(op_name); - if ((tensors_vector[0][0].layout() == desired_layout) && (*keep_dim)) { - argmax_transform->SetAttr(axis, - tensors_vector[0][0].layout() == - paddle::experimental::DataLayout::NHWC); - return argmax_transform; - } + argmax_transform->SetAttr(axis, + tensors_vector[0][0].layout() == + paddle::experimental::DataLayout::NHWC); + return argmax_transform; } - VLOG(3) << " Optimze Layout lightly op: " << op_name; - transposer = - std::make_shared(op_name); - return transposer; + return std::make_shared(op_name); } -// lightly int flatten template <> inline std::shared_ptr EagerLayoutAutotune( const std::string& op_name, @@ -206,71 +198,49 @@ inline std::shared_ptr EagerLayoutAutotune( kSlotSmallVectorSize>& tensors_vector, int* start_axis, int* stop_axis) { - std::shared_ptr transposer = nullptr; - if (paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout() == - paddle::experimental::DataLayout::UNDEFINED) { - VLOG(3) << " Optimze Layout Unstarted : " << op_name; - transposer = - std::make_shared(op_name, tensors_vector); - return transposer; + if (DesiredLayout() == paddle::experimental::DataLayout::UNDEFINED) { + VLOG(4) << "Optimze Layout was not started" << op_name; + return std::make_shared( + op_name, tensors_vector, tensors_vector[0][0].layout()); } - bool no_tranpose = - tensors_vector[0][0].layout() == - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); + + bool no_tranpose = tensors_vector[0][0].layout() == DesiredLayout(); bool is_valid = ((*start_axis) == 1 && (*stop_axis) == 3); if (op_name == "flatten" || op_name == "flatten_contiguous_range") { if (no_tranpose && is_valid) { - std::shared_ptr flatten_transform = nullptr; - flatten_transform = std::make_shared(op_name); - return flatten_transform; + return std::make_shared(op_name); } } - - VLOG(3) << " Optimze Layout lightly op: " << op_name; - transposer = - std::make_shared(op_name); - return transposer; + return std::make_shared(op_name); } -// lightly int Concat -// lightly T can be int vector vector IntArray -template <> // default int +template <> inline std::shared_ptr EagerLayoutAutotune( const std::string& op_name, const paddle::small_vector, kSlotSmallVectorSize>& tensors_vector, paddle::experimental::Scalar* axis) { - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - std::shared_ptr transposer = nullptr; - if (desired_layout == paddle::experimental::DataLayout::UNDEFINED) { - VLOG(3) << " Optimze Layout Unstarted : " << op_name; - transposer = - std::make_shared(op_name, tensors_vector); - return transposer; + if (DesiredLayout() == paddle::experimental::DataLayout::UNDEFINED) { + VLOG(4) << "Optimze Layout was not started" << op_name; + return std::make_shared( + op_name, tensors_vector, tensors_vector[0][0].layout()); } - bool need_transpose = false; - for (size_t i = 0; i < tensors_vector.size(); i++) { - for (size_t idx = 0; idx < tensors_vector[0].size(); idx++) { - if (desired_layout != tensors_vector[i][idx].layout()) { - need_transpose = true; - } - } + auto desired_layout = DesiredLayout(); + if 
(NeedTransLayout(tensors_vector, desired_layout)) { + VLOG(4) << op_name << "'s has different layout"; + return std::make_shared(op_name); } - - if (need_transpose) { - VLOG(3) << "Concat need transpose to NCHW " << op_name; - transposer = - std::make_shared(op_name); - return transposer; - } else { - VLOG(3) << " Optimze Layout lightly op: " << op_name; - auto trans = std::make_shared(op_name); - trans->SetAttr(axis, desired_layout); - return trans; + if (op_name == "Concat") { + if (desired_layout == tensors_vector[0][0].layout() && + tensors_vector[0][0].shape().size() == 4) { + auto trans = std::make_shared(op_name); + trans->SetAttr(axis, desired_layout); + return trans; + } } + return std::make_shared(op_name); } } // namespace egr diff --git a/paddle/fluid/eager/eager_layout_transformer.h b/paddle/fluid/eager/eager_layout_transformer.h index 3f2717be6bef54..4f161d3aa378b0 100644 --- a/paddle/fluid/eager/eager_layout_transformer.h +++ b/paddle/fluid/eager/eager_layout_transformer.h @@ -22,9 +22,9 @@ namespace egr { inline paddle::experimental::Tensor EagerTraceTransposeOp( const paddle::experimental::DataLayout layout, const paddle::experimental::Tensor& in) { + VLOG(4) << "AutoTune Transpose from " << in.layout() << " to " << layout + << ", tensor's dim size is " << in.shape().size(); if (in.shape().size() != 4) { - VLOG(4) << "Shape is " << in.shape().size() << " can't transpose to" - << paddle::framework::DataLayoutToString(layout); return in; } std::vector axis; @@ -35,137 +35,181 @@ inline paddle::experimental::Tensor EagerTraceTransposeOp( } else { axis = {0, 1, 2, 3}; } - auto out_tensor = transpose_dygraph_function(in, axis); - VLOG(4) << "AutoTune Transpose from " - << paddle::framework::DataLayoutToString(in.layout()) << " to " - << paddle::framework::DataLayoutToString(layout); + auto out_tensor = transpose_ad_func(in, axis); + VLOG(4) << "AutoTune Transpose from " << in.layout() << " to " << layout; return out_tensor; } +inline paddle::experimental::DataLayout DesiredLayout() { + return paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); +} + +inline paddle::experimental::DataLayout DefaultLayout() { + return paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); +} + +inline void UpdateLayout(paddle::experimental::Tensor* out_tensor, + const paddle::experimental::DataLayout layout) { + if (out_tensor->layout() != layout) { + VLOG(4) << "Update out_tensor's layout from " << out_tensor->layout() + << " to " << layout; + phi::DenseTensorUtils::GetMutableMeta( + static_cast(out_tensor->impl().get())) + ->layout = layout; + } +} + +inline void DealWithShapeOp(paddle::experimental::Tensor* out_tensor, + const paddle::experimental::DataLayout layout, + int dim_size) { + auto des_layout = DesiredLayout(); + auto def_layout = DefaultLayout(); + int32_t* value = + static_cast(out_tensor->impl().get())->data(); + bool change_dim = + (des_layout != def_layout && layout == des_layout && dim_size == 4); + VLOG(6) << "'Shape OP', layout autotune: True" + << " desired_layout: " << des_layout + << " default_layout: " << def_layout + << " tensor layout: " << out_tensor->layout() + << " tensor's shape size is : " << dim_size; + // It's means input tensor has been autotune and tensor's layout is + // desired_layout + std::vector dims; + dims.resize(dim_size); + for (int i = 0; i < dim_size; i++) { + dims[i] = value[i]; + } + auto des_str = paddle::framework::DataLayoutToString(des_layout); + if (change_dim && des_str == "NCHW") { + // NCHW -> NHWC + VLOG(6) << 
"layout autotune get Shape from NCHW -> NHWC " << value[0] << " " + << value[1] << " " << value[2] << " " << value[3] << " to " + << dims[0] << " " << dims[2] << " " << dims[3] << " " << dims[1]; + value[0] = dims[0]; + value[1] = dims[2]; + value[2] = dims[3]; + value[3] = dims[1]; + } else if (change_dim && des_str == "NHWC") { + // NHWC -> NCHW + VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " << value[0] << " " + << value[1] << " " << value[2] << " " << value[3] << " to " + << dims[0] << " " << dims[3] << " " << dims[1] << " " << dims[2]; + value[0] = dims[0]; + value[1] = dims[3]; + value[2] = dims[1]; + value[3] = dims[2]; + } +} + // agnostic op class EagerLayoutTransformer { + using Layout = paddle::experimental::DataLayout; + public: - EagerLayoutTransformer() : op_name_("") {} - explicit EagerLayoutTransformer( - const std::string& op_name, - const paddle::small_vector, - kSlotSmallVectorSize>& tensors_vector) - : op_name_(op_name) { - final_layout_ = "UNDEFINED"; - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - for (size_t i = 0; i < tensors_vector.size(); i++) { - for (size_t idx = 0; idx < tensors_vector[0].size(); idx++) { - if (final_layout_ == "UNDEFINED") { - final_layout_ = paddle::framework::DataLayoutToString( - tensors_vector[0][0].layout()); - } else if (tensors_vector[i][idx].layout() == desired_layout) { - final_layout_ = paddle::framework::DataLayoutToString(desired_layout); - break; - } - } - } - VLOG(4) << op_name_ << "final_layout_ is " << final_layout_; - } + EagerLayoutTransformer() : op_name_(""), final_layout_(Layout::UNDEFINED) {} EagerLayoutTransformer(const EagerLayoutTransformer&) = delete; EagerLayoutTransformer& operator=(const EagerLayoutTransformer&) = delete; + explicit EagerLayoutTransformer( + const std::string& op_name, + const paddle::small_vector, + kSlotSmallVectorSize>& tensors_vector, + const Layout final_layout = Layout::UNDEFINED) + : op_name_(op_name), final_layout_(final_layout), dim_size_(1) { + VLOG(4) << "Agnostic op : " << op_name_ << "'s layout is " << final_layout_; + } + virtual ~EagerLayoutTransformer() {} - virtual paddle::optional TransInTensor( - const std::string& in_name, - const paddle::optional& in) { - VLOG(4) << op_name_ << "is is agnostic, final_layout_ is " << final_layout_; + virtual paddle::experimental::Tensor TransInTensor( + const std::string& in_name, const paddle::experimental::Tensor& in) { + // update in shape size + dim_size_ = in.shape().size(); + bool need_trans = + !(final_layout_ == Layout::UNDEFINED || final_layout_ == in.layout()); + // This is for Agnostic op when layout is differnet + if (need_trans) { + auto out_tensor = EagerTraceTransposeOp(final_layout_, in); + phi::DenseTensorUtils::GetMutableMeta( + static_cast(out_tensor.impl().get())) + ->layout = final_layout_; + return out_tensor; + } return in; } - virtual paddle::optional> - TransInTensor( + virtual paddle::optional TransInTensor( const std::string& in_name, - const paddle::optional>& in) { - return in; + const paddle::optional& in) { + return in ? 
TransInTensor(in_name, *in) : in; } - virtual std::vector TransInTensor( + virtual std::vector TransInTensors( const std::string& in_name, const std::vector& in) { return in; } - virtual paddle::experimental::Tensor TransInTensor( - const std::string& in_name, const paddle::experimental::Tensor& in) { - return in; - } - - virtual void SetOutTensorLayout(paddle::experimental::Tensor* out_tensor) { - bool use_default = (final_layout_ == "Undefined(AnyLayout)" || - final_layout_ == ("UNDEFINED")); - auto layout = paddle::framework::StringToDataLayout(final_layout_); - if (!use_default) { - phi::DenseTensorUtils::GetMutableMeta( - static_cast(out_tensor->impl().get())) - ->layout = layout; - } - VLOG(4) << op_name_ << "is is agnostic, use_default " << use_default; + virtual paddle::optional> + TransInTensors( + const std::string& in_name, + const paddle::optional>& in) { + return (in ? TransInTensors(in_name, *in) : in); } virtual void SetOutTensorLayout( std::vector* out_tensor) { - bool use_default = (final_layout_ == "Undefined(AnyLayout)" || - final_layout_ == ("UNDEFINED")); - if (!use_default) { + bool update_layout = !(final_layout_ == Layout::UNDEFINED); + if (update_layout) { for (size_t i = 0; i < out_tensor->size(); i++) { phi::DenseTensorUtils::GetMutableMeta( static_cast((*out_tensor)[i].impl().get())) - ->layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); + ->layout = DesiredLayout(); } } - VLOG(4) << op_name_ << "is is agnostic, use_default " << use_default; + } + + virtual void SetOutTensorLayout( + paddle::optional* out_tensor) { + VLOG(4) << "AutoTune out tensor is optional"; + } + + virtual void SetOutTensorLayout( + paddle::optional>* out_tensor) { + VLOG(4) << "AutoTune out tensor is optional"; + } + + virtual void SetOutTensorLayout(paddle::experimental::Tensor* out_tensor) { + if (op_name_ == "shape") { + return DealWithShapeOp(out_tensor, final_layout_, dim_size_); + } + bool need_update = !(final_layout_ == Layout::UNDEFINED); + if (need_update) { + UpdateLayout(out_tensor, final_layout_); + } } protected: std::string op_name_; - std::string final_layout_; + const Layout final_layout_; + int dim_size_; }; class EagerHeavilyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { public: explicit EagerHeavilyLayoutSensitiveOpTransformer(const std::string& op_name, std::string* layout) - : op_name_(op_name), - desired_layout_( - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout()) { - VLOG(3) << "Optimze Layout heavily op: " << op_name; - final_layout_ = paddle::framework::DataLayoutToString(desired_layout_); - if ((*layout) != final_layout_) { - *layout = final_layout_; - } - } - - virtual paddle::optional> - TransInTensor( - const std::string& in_name, - const paddle::optional>& in) { - VLOG(4) << op_name_ << "is is heavily"; - return in; - } - - virtual paddle::optional TransInTensor( - const std::string& in_name, - const paddle::optional& in) { - VLOG(4) << op_name_ << "is is heavily"; - return in; + : op_name_(op_name), desired_layout_(DesiredLayout()) { + VLOG(4) << "Heavily op: " << op_name; + *layout = paddle::framework::DataLayoutToString(DesiredLayout()); } paddle::experimental::Tensor TransInTensor( const std::string& in_name, const paddle::experimental::Tensor& in) { if (heavily_input_.count(in_name) != 0 && in.layout() != desired_layout_) { - VLOG(4) << op_name_ << "'s " << in_name << " need transpose from " - << paddle::framework::DataLayoutToString(in.layout()) << " to " - << final_layout_; auto out_tensor = 
EagerTraceTransposeOp(desired_layout_, in); return out_tensor; } @@ -173,14 +217,7 @@ class EagerHeavilyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { } void SetOutTensorLayout(paddle::experimental::Tensor* out_tensor) { - if (out_tensor->layout() != desired_layout_) { - VLOG(4) << " Set Out_tensor's layout from " - << paddle::framework::DataLayoutToString(out_tensor->layout()) - << " to " << final_layout_; - phi::DenseTensorUtils::GetMutableMeta( - static_cast(out_tensor->impl().get())) - ->layout = desired_layout_; - } + UpdateLayout(out_tensor, desired_layout_); } void SetOutTensorLayout( @@ -194,10 +231,8 @@ class EagerHeavilyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { std::vector* out_tensor) { for (size_t i = 0; i < out_tensor->size(); i++) { if ((*out_tensor)[i].layout() != desired_layout_) { - VLOG(4) << " Set Out_tensor's layout from " - << paddle::framework::DataLayoutToString( - (*out_tensor)[i].layout()) - << " to " << final_layout_; + VLOG(4) << "Update out_tensor's layout from " + << (*out_tensor)[i].layout() << " to " << desired_layout_; phi::DenseTensorUtils::GetMutableMeta( static_cast((*out_tensor)[i].impl().get())) ->layout = desired_layout_; @@ -207,7 +242,6 @@ class EagerHeavilyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { protected: std::string op_name_; - std::string final_layout_; const paddle::experimental::DataLayout desired_layout_; std::unordered_set heavily_input_{"x", "y", "input"}; }; @@ -215,11 +249,10 @@ class EagerHeavilyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { class EagerLightlyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { public: EagerLightlyLayoutSensitiveOpTransformer() {} - explicit EagerLightlyLayoutSensitiveOpTransformer(const std::string& op_name) - : op_name_(op_name) { - VLOG(3) << "Optimze Layout lightly " << op_name; - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); + explicit EagerLightlyLayoutSensitiveOpTransformer( + const std::string& op_name) { + VLOG(4) << "Lightly op : " << op_name; + auto desired_layout = DesiredLayout(); final_layout_ = paddle::framework::DataLayoutToString(desired_layout); } @@ -228,12 +261,8 @@ class EagerLightlyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { const std::string& in_name, const paddle::experimental::Tensor& in) { std::string input_layout = paddle::framework::DataLayoutToString(in.layout()); - auto default_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); - + auto default_layout = DefaultLayout(); if (final_layout_ == input_layout && in.shape().size() == 4) { - VLOG(4) << op_name_ << "'s " << in_name << " need transpose from " - << input_layout << " to default_layout"; auto out_tensor = EagerTraceTransposeOp( paddle::experimental::DataLayout::UNDEFINED, in); phi::DenseTensorUtils::GetMutableMeta( @@ -241,23 +270,18 @@ class EagerLightlyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { ->layout = default_layout; return out_tensor; } - VLOG(4) << in_name << "'s layout is " << input_layout; return in; } - virtual std::vector TransInTensor( + virtual std::vector TransInTensors( const std::string& in_name, const std::vector& in) { std::vector result; - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - auto default_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); + auto desired_layout = DesiredLayout(); + auto default_layout = DefaultLayout(); for 
(size_t i = 0; i < in.size(); i++) { auto in_tensor = in[i]; if (in_tensor.layout() == desired_layout) { - VLOG(4) << op_name_ << "'s " << in_name << " need transpose from " - << final_layout_ << " to default_layout"; auto out_tensor = EagerTraceTransposeOp( paddle::experimental::DataLayout::UNDEFINED, in_tensor); phi::DenseTensorUtils::GetMutableMeta( @@ -272,33 +296,20 @@ class EagerLightlyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { } void SetOutTensorLayout(paddle::experimental::Tensor* out_tensor) { - auto out_layout = out_tensor->layout(); - auto default_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); - if (out_layout != default_layout) { - VLOG(4) << op_name_ << "'s out need transpose to default_layout"; - phi::DenseTensorUtils::GetMutableMeta( - static_cast(out_tensor->impl().get())) - ->layout = default_layout; - } + UpdateLayout(out_tensor, DefaultLayout()); } void SetOutTensorLayout( std::vector* out_tensor) { for (size_t i = 0; i < out_tensor->size(); i++) { - VLOG(4) << "out layout is" - << paddle::framework::DataLayoutToString( - (*out_tensor)[i]->layout()); SetOutTensorLayout((*out_tensor)[i]); } } void SetOutTensorLayout( std::vector* out_tensor) { - auto default_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); + auto default_layout = DefaultLayout(); for (size_t i = 0; i < out_tensor->size(); i++) { - VLOG(4) << " out_tensor layout trans to default "; phi::DenseTensorUtils::GetMutableMeta( static_cast((*out_tensor)[i].impl().get())) ->layout = default_layout; @@ -306,7 +317,6 @@ class EagerLightlyLayoutSensitiveOpTransformer : public EagerLayoutTransformer { } protected: - std::string op_name_; std::string final_layout_; std::unordered_set heavily_input_{"x", "y", "input"}; }; @@ -315,18 +325,11 @@ class EagerTransposeOpTransformer : public EagerLightlyLayoutSensitiveOpTransformer { public: EagerTransposeOpTransformer() {} - explicit EagerTransposeOpTransformer(const std::string& op_name) - : op_name_(op_name) { - VLOG(3) << "Optimze Layout TransposeOpTransformer " << op_name; - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - std::string desired_layout_str = - paddle::framework::DataLayoutToString(desired_layout); - final_layout_ = desired_layout_str; + explicit EagerTransposeOpTransformer(const std::string& op_name) { + VLOG(4) << "AutoTuneTransformer op: " << op_name; } void SetAttr(std::vector* axis, bool is_nhwc) { - // input's layout is nhwc and input's layout === desired_layout std::vector perm_nchw = {0, 2, 3, 1}; std::vector perm_nhwc = {0, 3, 1, 2}; auto perm = is_nhwc ? 
perm_nhwc : perm_nchw; @@ -334,44 +337,24 @@ class EagerTransposeOpTransformer (*axis)[1] = perm[(*axis)[1]]; (*axis)[2] = perm[(*axis)[2]]; (*axis)[3] = perm[(*axis)[3]]; - VLOG(4) << " EagerTransposeOpTransformer " << op_name_ - << "'s layout is equal to desire: " << is_nhwc; } paddle::experimental::Tensor TransInTensor( const std::string& in_name, const paddle::experimental::Tensor& in) { - VLOG(4) << "with no transpose: EagerTransposeOpTransformer " << in_name - << "'s layout is " - << paddle::framework::DataLayoutToString(in.layout()); return in; } void SetOutTensorLayout(paddle::experimental::Tensor* out_tensor) { - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - if (out_tensor->layout() != desired_layout) { - VLOG(4) << " Set Out_tensor's layout from " - << paddle::framework::DataLayoutToString(out_tensor->layout()) - << " to " << final_layout_; - phi::DenseTensorUtils::GetMutableMeta( - static_cast(out_tensor->impl().get())) - ->layout = desired_layout; - } + UpdateLayout(out_tensor, DefaultLayout()); } - - protected: - std::string op_name_; - std::string final_layout_; - std::unordered_set heavily_input_{"x", "y", "input"}; }; class EagerArgmaxOpTransformer : public EagerLightlyLayoutSensitiveOpTransformer { public: EagerArgmaxOpTransformer() {} - explicit EagerArgmaxOpTransformer(const std::string& op_name) - : op_name_(op_name) { - VLOG(3) << "Optimze Layout lightly " << op_name; + explicit EagerArgmaxOpTransformer(const std::string& op_name) { + VLOG(4) << "AutoTuneTransformer op: " << op_name; } void SetAttr(paddle::experimental::Scalar* axis, bool is_nhwc) { @@ -383,38 +366,16 @@ class EagerArgmaxOpTransformer } void SetOutTensorLayout(paddle::experimental::Tensor* out_tensor) { - VLOG(4) << "EagerArgmaxOpTransformer's out layout is" - << paddle::framework::DataLayoutToString(out_tensor->layout()); - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - if (desired_layout != out_tensor->layout()) { - VLOG(4) << "Change layout from " - << paddle::framework::DataLayoutToString(out_tensor->layout()) - << " to " << final_layout_; - phi::DenseTensorUtils::GetMutableMeta( - static_cast(out_tensor->impl().get())) - ->layout = desired_layout; - } + UpdateLayout(out_tensor, DesiredLayout()); } - - protected: - std::string op_name_; - std::string final_layout_; - std::unordered_set heavily_input_{"x", "y", "input"}; }; class EagerFlattenOpTransformer : public EagerLightlyLayoutSensitiveOpTransformer { public: EagerFlattenOpTransformer() {} - explicit EagerFlattenOpTransformer(const std::string& op_name) - : op_name_(op_name) { - VLOG(3) << "Optimze Layout lightly " << op_name; - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - std::string desired_layout_str = - paddle::framework::DataLayoutToString(desired_layout); - final_layout_ = desired_layout_str; + explicit EagerFlattenOpTransformer(const std::string& op_name) { + VLOG(4) << "AutoTuneTransformer op: " << op_name; } // transpose from NHWC to NCHW @@ -424,37 +385,16 @@ class EagerFlattenOpTransformer } void SetOutTensorLayout(paddle::experimental::Tensor* out_tensor) { - VLOG(4) << "EagerArgmaxOpTransformer's out layout is" - << paddle::framework::DataLayoutToString(out_tensor->layout()); - auto layout = paddle::framework::StringToDataLayout(final_layout_); - if (layout != out_tensor->layout()) { - VLOG(4) << "Change layout from " - << paddle::framework::DataLayoutToString(out_tensor->layout()) - << " to " 
<< final_layout_; - phi::DenseTensorUtils::GetMutableMeta( - static_cast(out_tensor->impl().get())) - ->layout = layout; - } + UpdateLayout(out_tensor, DefaultLayout()); } - - protected: - std::string op_name_; - std::string final_layout_; - std::unordered_set heavily_input_{"x", "y", "input"}; }; class EagerConcatOpTransformer : public EagerLightlyLayoutSensitiveOpTransformer { public: EagerConcatOpTransformer() {} - explicit EagerConcatOpTransformer(const std::string& op_name) - : op_name_(op_name) { - VLOG(3) << "Optimze Layout lightly " << op_name; - auto desired_layout = - paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); - std::string desired_layout_str = - paddle::framework::DataLayoutToString(desired_layout); - final_layout_ = desired_layout_str; + explicit EagerConcatOpTransformer(const std::string& op_name) { + VLOG(4) << "AutoTuneTransformer op : " << op_name; } void SetAttr(paddle::experimental::Scalar* axis, @@ -462,32 +402,20 @@ class EagerConcatOpTransformer std::vector perm_nhwc = {0, 3, 1, 2}; std::vector perm_nchw = {0, 2, 3, 1}; int axes = axis->to(); + axes = axes < 0 ? axes + 4 : axes; auto perm = (paddle::framework::DataLayout::NHWC == layout) ? perm_nhwc : perm_nchw; (*axis) = static_cast(perm[axes]); } - virtual std::vector TransInTensor( + virtual std::vector TransInTensors( const std::string& in_name, const std::vector& in) { return in; } void SetOutTensorLayout(paddle::experimental::Tensor* out_tensor) { - auto layout = paddle::framework::StringToDataLayout(final_layout_); - if (layout != out_tensor->layout()) { - VLOG(4) << "Change layout from " - << paddle::framework::DataLayoutToString(out_tensor->layout()) - << " to " << final_layout_; - phi::DenseTensorUtils::GetMutableMeta( - static_cast(out_tensor->impl().get())) - ->layout = layout; - } + UpdateLayout(out_tensor, DesiredLayout()); } - - protected: - std::string op_name_; - std::string final_layout_; - std::unordered_set heavily_input_{"x", "y", "input"}; }; } // namespace egr diff --git a/paddle/fluid/eager/grad_node_info.cc b/paddle/fluid/eager/grad_node_info.cc index 0e102d0d1bc606..afa8a6f205259e 100644 --- a/paddle/fluid/eager/grad_node_info.cc +++ b/paddle/fluid/eager/grad_node_info.cc @@ -41,7 +41,7 @@ static void CheckTensor(const paddle::experimental::Tensor& pre, "The tensor in before and after hook are not consistent")); } if (pre.initialized() && post.initialized()) { - VLOG(4) << paddle::framework::DataType2String(pre.dtype()) << " " + VLOG(7) << paddle::framework::DataType2String(pre.dtype()) << " " << paddle::framework::DataType2String(post.dtype()); PADDLE_ENFORCE_EQ( pre.dtype(), @@ -62,7 +62,7 @@ static void CheckTensor(const paddle::experimental::Tensor& pre, } GradNodeBase::GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num) { - VLOG(6) << "Construct GradNodeBase"; + VLOG(7) << "Construct GradNodeBase"; bwd_in_meta_.resize(bwd_in_slot_num); bwd_out_meta_.resize(bwd_out_slot_num); } @@ -84,7 +84,7 @@ GradNodeBase::MutableOutputMeta() { void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, size_t slot_rank) { - VLOG(6) << "Set GradSlotMeta for Grad Inputs"; + VLOG(7) << "Set GradSlotMeta for Grad Inputs"; auto* fwd_out_meta = egr::EagerUtils::nullable_autograd_meta(fwd_out); PADDLE_ENFORCE_LE( slot_rank, @@ -104,7 +104,7 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, } if (!fwd_out.initialized()) { - VLOG(6) + VLOG(7) << "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor"; return; } @@ 
-123,7 +123,7 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, static_cast(fwd_out.impl().get()); dense_tensor = csr_tensor->mutable_non_zero_elements(); } else { - VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + VLOG(7) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " "non-DenseTensor argument."; } PADDLE_ENFORCE_NE( @@ -145,7 +145,7 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out, void GradNodeBase::SetGradInMeta( const std::vector& fwd_out, size_t slot_rank) { - VLOG(6) << "Set GradSlotMeta for Grad Inputs"; + VLOG(7) << "Set GradSlotMeta for Grad Inputs"; size_t slot_size = fwd_out.size(); PADDLE_ENFORCE_LE( slot_rank, @@ -177,7 +177,7 @@ void GradNodeBase::SetGradInMeta( } if (!fwd_out_tensor.initialized()) { - VLOG(6) + VLOG(7) << "Skip Configuring GradSlotMeta for uninitialized GradInput Tensor"; return; } @@ -202,7 +202,7 @@ void GradNodeBase::SetGradInMeta( need_complex_to_real_ = true; } } else { - VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " + VLOG(7) << "Unable to initialize the DenseTensorMeta of GradSlotMeta " "with non-DenseTensor argument."; } } @@ -260,7 +260,7 @@ void GradNodeBase::SetGradOutMeta(const paddle::experimental::Tensor& fwd_in, meta.SetPlace(fwd_in.place()); } } else { - VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " + VLOG(7) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " "non-DenseTensor argument."; } } @@ -319,7 +319,7 @@ void GradNodeBase::SetGradOutMeta( meta.SetPlace(fwd_in_tensor.place()); } } else { - VLOG(6) + VLOG(7) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with " "non-DenseTensor argument."; } diff --git a/paddle/fluid/eager/grad_node_info.h b/paddle/fluid/eager/grad_node_info.h index a65a044895a4ff..650446401468f5 100644 --- a/paddle/fluid/eager/grad_node_info.h +++ b/paddle/fluid/eager/grad_node_info.h @@ -74,7 +74,7 @@ class Edge { } void SetGradNode(const std::shared_ptr& node) { - VLOG(6) << "Reseting Edge's Grad Node"; + VLOG(7) << "Reseting Edge's Grad Node"; grad_node_ = node; } @@ -167,10 +167,10 @@ class GradSlotMeta { class GradNodeBase { public: - GradNodeBase() { VLOG(6) << "Construct GradNodeBase"; } + GradNodeBase() { VLOG(7) << "Construct GradNodeBase"; } GradNodeBase(size_t bwd_in_slot_num, size_t bwd_out_slot_num); // TODO(jiabin): Should we have other constructor here? 
- virtual ~GradNodeBase() { VLOG(6) << "Destruct GradNodeBase"; } + virtual ~GradNodeBase() { VLOG(7) << "Destruct GradNodeBase"; } /** * operator() designed to contian the real backward execution logic, it should @@ -255,14 +255,14 @@ class GradNodeBase { std::map>> GetGradientHookFuntions() { - VLOG(6) << "GetGradientHookFuntions "; + VLOG(7) << "GetGradientHookFuntions "; return gradient_hooks_; } void SetGradientHookFuntions( std::map>> hooks) { - VLOG(6) << "SetGradientHookFuntions "; + VLOG(7) << "SetGradientHookFuntions "; gradient_hooks_ = hooks; } @@ -302,7 +302,7 @@ class GradNodeBase { // Gradient Hooks // Customer may register a list of hooks which will be called in order during // backward - // Each entry consists one pair of + // Each entry consists of one pair of // >> std::map( t, &buffer_tensor); @@ -170,7 +170,7 @@ void GradTensorHolder::add(size_t slot_id, std::make_shared( buffer_sparse->non_zero_elements())); if (create_graph || t.is_custom_device()) { - buffer_values = add_dygraph_function(t_values, buffer_values); + buffer_values = add_ad_func(t_values, buffer_values); } else { paddle::imperative::TensorAdd( t_values, &buffer_values); diff --git a/paddle/fluid/eager/pylayer/py_layer_node.h b/paddle/fluid/eager/pylayer/py_layer_node.h index f1166c362e147b..18c48b62c4f524 100644 --- a/paddle/fluid/eager/pylayer/py_layer_node.h +++ b/paddle/fluid/eager/pylayer/py_layer_node.h @@ -45,7 +45,7 @@ class GradNodePyLayer : public GradNodeBase { void ClearTensorWrappers() override { VLOG(6) << "Do nothing here now"; } - std::string name() { + std::string name() override { return "GradNodePyLayer_" + std::string(Py_TYPE(ctx_)->tp_name); } diff --git a/paddle/fluid/eager/tensor_wrapper.h b/paddle/fluid/eager/tensor_wrapper.h index a6fd57ac6a4bc2..e7994e388298d5 100644 --- a/paddle/fluid/eager/tensor_wrapper.h +++ b/paddle/fluid/eager/tensor_wrapper.h @@ -100,10 +100,10 @@ class TensorWrapper { std::shared_ptr new_grad_node = weak_grad_node_.lock(); if (new_grad_node) { - VLOG(3) << "Recovered TensorWrapper with GradNode " + VLOG(7) << "Recovered TensorWrapper with GradNode " << new_grad_node->name() << " addr: " << new_grad_node.get(); } else { - VLOG(3) << "Recovered TensorWrapper with Empty GradNode"; + VLOG(7) << "Recovered TensorWrapper with Empty GradNode"; } auto* intermediate_autograd_meta = EagerUtils::nullable_autograd_meta(intermidiate_tensor_); @@ -129,7 +129,7 @@ class TensorWrapper { private: void check_inplace_version() { if (no_need_buffer_) { - VLOG(6) << "There's no need to check inplace_version because " + VLOG(7) << "There's no need to check inplace_version because " "no_need_buffer_ is true."; return; } @@ -154,10 +154,10 @@ class TensorWrapper { intermidiate_tensor_.name(), tensor_version, wrapper_version_snapshot)); - VLOG(6) << " The wrapper_version_snapshot of Tensor '" + VLOG(7) << " The wrapper_version_snapshot of Tensor '" << intermidiate_tensor_.name() << "' is [ " << wrapper_version_snapshot << " ]"; - VLOG(6) << " The tensor_version of Tensor '" + VLOG(7) << " The tensor_version of Tensor '" << intermidiate_tensor_.name() << "' is [ " << tensor_version << " ]"; } diff --git a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc index 144ceab1e49831..515def46b6413d 100644 --- a/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc +++ b/paddle/fluid/eager/tests/performance_tests/benchmark_utils.cc @@ -77,7 +77,7 @@ void benchmark_eager_matmul(const 
paddle::experimental::Tensor& X, size_t max_num_runs = accuracy_check ? 2 : max_num_benchmark_runs; for (size_t i = 0; i < max_num_runs; i++) { - input_tensor0 = matmul_dygraph_function(input_tensor0, Y, false, false); + input_tensor0 = matmul_ad_func(input_tensor0, Y, false, false); } std::vector target_tensors = {input_tensor0}; diff --git a/paddle/fluid/eager/to_static/run_program_op_func.h b/paddle/fluid/eager/to_static/run_program_op_func.h index 33da489fd47b16..23ba88c8898c1f 100644 --- a/paddle/fluid/eager/to_static/run_program_op_func.h +++ b/paddle/fluid/eager/to_static/run_program_op_func.h @@ -54,7 +54,7 @@ static void clear_no_grad_edges_with_partial_block( } } -inline void run_program_dygraph_function( +inline void run_program_ad_func( const std::vector& x, const std::vector& params, std::vector& out, // NOLINT diff --git a/paddle/fluid/eager/to_static/run_program_op_node.h b/paddle/fluid/eager/to_static/run_program_op_node.h index 93e957e20fa881..a22df2a1b1528b 100644 --- a/paddle/fluid/eager/to_static/run_program_op_node.h +++ b/paddle/fluid/eager/to_static/run_program_op_node.h @@ -246,6 +246,34 @@ static void BuildScopeByBlock( } } +static void GcScope(paddle::framework::Scope *scope) { + std::deque> *garbages = + new std::deque>(); + + for (auto &var : scope->LocalVars()) { + if (var != nullptr) { + if (var->IsType()) { + garbages->emplace_back(var->GetMutable() + ->MoveMemoryHolder()); + } + if (var->IsType()) { + garbages->emplace_back(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); + } + if (var->IsType()) { + auto *lod_tensor_arr = + var->GetMutable(); + for (auto &t : *lod_tensor_arr) { + garbages->emplace_back(t.MoveMemoryHolder()); + } + lod_tensor_arr->clear(); + } + } + } + delete garbages; // free mem +} + } // namespace details inline void RunProgramAPI( @@ -274,22 +302,14 @@ inline void RunProgramAPI( 1, paddle::platform::errors::InvalidArgument( "The OutScope of RunProgramGradOp should only hold one scope.")); - // Step 2. prepare executor and init persistable variables - // NOTE(Aurelius84): While training some models, forward can be called many - // times and then apply backpropagation all at once, such as Reinforcement - // Learning. Tensor data in multi-step training should be saved into single - // scope separately. Otherwise, the gradients can be miscalculated because - // always using the Tensor data of the last step in forward. - paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); - VLOG(2) << "The number of sub scopes before forward: " - << out_scope_vec->front()->kids().size(); - paddle::framework::Scope &scope = global_inner_scope->NewScope(); bool use_interpretorcore = PADDLE_GET_CONST(bool, attrs.at("use_interpretorcore")); if (use_interpretorcore) { - VLOG(0) << "RunProgramOp use interpretercore to execute program."; + VLOG(2) << "RunProgramOp use interpretercore to execute program."; + + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); auto input_names = details::GetTensorsName(x); auto output_names = details::GetTensorsName(out); @@ -308,12 +328,16 @@ inline void RunProgramAPI( if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/false)) { VLOG(2) << "No interpretercore cahce, so create a new interpretercore"; // Step 1. share input_vars & parameters into scope - details::ShareTensorsIntoScope(x, &scope); - details::ShareTensorsIntoScope(params, &scope); + details::ShareTensorsIntoScope(x, global_inner_scope); + details::ShareTensorsIntoScope(params, global_inner_scope); // Step 2. 
create new interpretercore auto interpreter_core = paddle::framework::CreateInterpreterCoreInfoToCache( - *forward_program, place, /*is_grad=*/false, program_id, &scope); + *forward_program, + place, + /*is_grad=*/false, + program_id, + global_inner_scope); // Step 3. get all eager gc vars std::set skip_eager_delete_vars = paddle::framework::details::ParseSafeEagerDeletionSkipVarsSet( @@ -331,10 +355,14 @@ inline void RunProgramAPI( interpreter_core->Run({}); } // Step 5. Get Output - details::ShareTensorsFromScopeWithPartialBlock( - out, *forward_global_block, *backward_global_block, &scope); - details::ShareTensorsFromScopeWithPartialBlock( - dout, *forward_global_block, *backward_global_block, &scope); + details::ShareTensorsFromScopeWithPartialBlock(out, + *forward_global_block, + *backward_global_block, + global_inner_scope); + details::ShareTensorsFromScopeWithPartialBlock(dout, + *forward_global_block, + *backward_global_block, + global_inner_scope); } else { VLOG(2) << "Get interpretercore cahce by program:" << program_id; // Step 1. get cache interpretercore @@ -342,34 +370,55 @@ inline void RunProgramAPI( interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/false); auto &interpreter_core = cached_value.core_; // Step 2. update scope for cache interpretercore - details::ShareTensorsIntoScope(x, &scope); - details::ShareTensorsIntoScope(params, &scope); - details::BuildScopeByBlock( - *interpreter_core.get(), *forward_global_block, &scope); - interpreter_core->reset_scope(&scope); + details::ShareTensorsIntoScope(x, global_inner_scope); + details::ShareTensorsIntoScope(params, global_inner_scope); + if (interpreter_core->GetVariableScope()->GetMutableScope() != + global_inner_scope) { + details::BuildScopeByBlock( + *interpreter_core.get(), *forward_global_block, global_inner_scope); + interpreter_core->reset_scope(global_inner_scope); + } // Step 3. interpretercore run if (forward_global_block->OpSize() > 0) { interpreter_core->Run({}); } // Step 4. Get Output - details::ShareTensorsFromScopeWithPartialBlock( - out, *forward_global_block, *backward_global_block, &scope); - details::ShareTensorsFromScopeWithPartialBlock( - dout, *forward_global_block, *backward_global_block, &scope); + details::ShareTensorsFromScopeWithPartialBlock(out, + *forward_global_block, + *backward_global_block, + global_inner_scope); + details::ShareTensorsFromScopeWithPartialBlock(dout, + *forward_global_block, + *backward_global_block, + global_inner_scope); } VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); - if (is_test) { - VLOG(1) << "is test, after forward, drop kids"; - out_scope_vec->front()->DropKids(); + if (is_test || !egr::Controller::Instance().HasGrad()) { + VLOG(4) << "is test, set this scope can reused"; + global_inner_scope->SetCanReuesd(true); + details::GcScope(global_inner_scope); + } else { + VLOG(4) << "not test, set this scope can not reused"; + global_inner_scope->SetCanReuesd(false); } - VLOG(2) << "The number of sub scopes after forward: " - << out_scope_vec->front()->kids().size(); #ifdef PADDLE_WITH_MKLDNN if (FLAGS_use_mkldnn) paddle::platform::DontClearMKLDNNCache(place); #endif } else { VLOG(2) << "RunProgramOp execute with parallel_executor."; + + // Step 2. prepare executor and init persistable variables + // NOTE(Aurelius84): While training some models, forward can be called many + // times and then apply backpropagation all at once, such as Reinforcement + // Learning. 
Tensor data in multi-step training should be saved into single + // scope separately. Otherwise, the gradients can be miscalculated because + // always using the Tensor data of the last step in forward. + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + VLOG(2) << "The number of sub scopes before forward: " + << out_scope_vec->front()->kids().size(); + paddle::framework::Scope &scope = global_inner_scope->NewScope(); + // share input_vars & parameters into scope details::ShareTensorsIntoScope(x, &scope); details::ShareTensorsIntoScope(params, &scope); @@ -421,7 +470,7 @@ inline void RunProgramAPI( // Debug info: scope info when run end VLOG(3) << paddle::framework::GenScopeTreeDebugInfo(out_scope_vec->front()); // Step 5. Drop all children scopes while testing. - if (is_test) { + if (is_test || !egr::Controller::Instance().HasGrad()) { out_scope_vec->front()->DropKids(); } VLOG(2) << "The number of sub scopes after forward: " @@ -454,20 +503,13 @@ inline void RunProgramGradAPI( 1, paddle::platform::errors::InvalidArgument( "The OutScope of RunProgramGradOp should only hold one scope.")); - paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); - auto sub_scope_num = global_inner_scope->kids().size(); - VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; - PADDLE_ENFORCE_GT(sub_scope_num, - 0, - paddle::platform::errors::InvalidArgument( - "The OutScope of RunProgramGradOp should hold at " - "least one sub scope.")); - auto &scope = *(global_inner_scope->kids().front()); auto place = egr::Controller::Instance().GetExpectedPlace(); if (use_interpretorcore) { - VLOG(0) << "RunProgramGradOp use interpretercore to execute program."; + VLOG(2) << "RunProgramGradOp use interpretercore to execute program."; + + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); auto *forward_global_block = PADDLE_GET_CONST( paddle::framework::BlockDesc *, attrs.at("forward_global_block")); @@ -490,10 +532,14 @@ inline void RunProgramGradAPI( paddle::framework::InterpreterCoreInfoCache::Instance(); if (!interpretercore_info_cache.Has(program_id, /*is_grad=*/true)) { VLOG(2) << "No interpretercore cahce, so create a new interpretercore"; - details::ShareTensorsIntoScope(out_grad, &scope); + details::ShareTensorsIntoScope(out_grad, global_inner_scope); auto interpreter_core = paddle::framework::CreateInterpreterCoreInfoToCache( - *backward_program, place, /*is_grad=*/true, program_id, &scope); + *backward_program, + place, + /*is_grad=*/true, + program_id, + global_inner_scope); // get all eager gc vars std::set skip_eager_delete_vars; @@ -518,10 +564,14 @@ inline void RunProgramGradAPI( interpretercore_info_cache.GetMutable(program_id, /*is_grad=*/true); auto &interpreter_core = cached_value.core_; // update scope - details::ShareTensorsIntoScope(out_grad, &scope); - details::BuildScopeByBlock( - *interpreter_core.get(), *backward_global_block, &scope); - interpreter_core->reset_scope(&scope); + details::ShareTensorsIntoScope(out_grad, global_inner_scope); + if (interpreter_core->GetVariableScope()->GetMutableScope() != + global_inner_scope) { + details::BuildScopeByBlock(*interpreter_core.get(), + *backward_global_block, + global_inner_scope); + interpreter_core->reset_scope(global_inner_scope); + } if (backward_global_block->OpSize() > 0) { // Debug info: scope info when run end @@ -531,16 +581,31 @@ inline void RunProgramGradAPI( } } // Step 4. 
get outputs - details::ShareTensorsFromScopeWithPartialBlock( - x_grad, *forward_global_block, *backward_global_block, &scope); - details::ShareTensorsFromScopeWithPartialBlock( - params_grad, *forward_global_block, *backward_global_block, &scope); - - // Step5. drop current scope - global_inner_scope->DeleteScope(&scope); - VLOG(2) << "The number of sub scopes after backward: " - << global_inner_scope->kids().size(); + details::ShareTensorsFromScopeWithPartialBlock(x_grad, + *forward_global_block, + *backward_global_block, + global_inner_scope); + details::ShareTensorsFromScopeWithPartialBlock(params_grad, + *forward_global_block, + *backward_global_block, + global_inner_scope); + VLOG(4) << "after backward gc all vars"; + global_inner_scope->SetCanReuesd(true); + details::GcScope(global_inner_scope); } else { + VLOG(2) << "RunProgramGradOp use pe to execute program."; + + paddle::framework::Scope *global_inner_scope = out_scope_vec->front(); + auto sub_scope_num = global_inner_scope->kids().size(); + VLOG(2) << "The number of sub scopes before backward: " << sub_scope_num; + PADDLE_ENFORCE_GT(sub_scope_num, + 0, + paddle::platform::errors::InvalidArgument( + "The OutScope of RunProgramGradOp should hold at " + "least one sub scope.")); + + auto &scope = *(global_inner_scope->kids().front()); + auto *global_block = PADDLE_GET_CONST(paddle::framework::BlockDesc *, attrs.at("global_block")); auto orig_end_op_index = diff --git a/paddle/fluid/eager/utils.cc b/paddle/fluid/eager/utils.cc index 33e2c84099e030..777929bbc75368 100644 --- a/paddle/fluid/eager/utils.cc +++ b/paddle/fluid/eager/utils.cc @@ -296,7 +296,7 @@ void EagerUtils::HandleViewBetweenInputAndOutput( view_output_dense_tensor->ShareInplaceVersionCounterWith( *input_dense_tensor); - VLOG(3) << "Perform View between Output Tensor(" + VLOG(4) << "Perform View between Output Tensor(" << view_output_tensor->name() << ") and Input Tensor(" << input_tensor.name() << "), share allocation and inplace version."; @@ -409,7 +409,7 @@ std::vector EagerUtils::RecoverTensorWrapper( } return ret; } - +// TODO(jiabin): remove all this when we fix all test using tmp grad void EagerUtils::CheckAndRetainGrad( const paddle::experimental::Tensor& tensor) { VLOG(6) << "Check RetainGradForTensor: " << tensor.name(); diff --git a/paddle/fluid/eager/utils.h b/paddle/fluid/eager/utils.h index a42b118771830c..291d96ff0809f2 100644 --- a/paddle/fluid/eager/utils.h +++ b/paddle/fluid/eager/utils.h @@ -223,13 +223,14 @@ class EagerUtils { const std::vector& out_var, std::vector* result); - // end Intermidate needed + // end Intermidate needed. 
static void CheckAndRetainGrad(const paddle::experimental::Tensor& tensor); static void CheckAndRetainGrad( const std::vector& tensors); static void CheckAndRetainGrad( const std::vector& tensors); + static std::shared_ptr GetGradAccumulationNode( const paddle::experimental::Tensor& tensor); @@ -246,6 +247,183 @@ class EagerUtils { static void FillZeroForEmptyGradInput( std::vector* in_grads, const std::vector& grad_in_metas); + /** + * Print Input Output (level 0 means least info, level 2 means most info) + * **/ + static const std::string TensorStr(const paddle::experimental::Tensor& t) { + std::string tensor_name_str = ""; + if (t.name() == "") { + tensor_name_str = "None"; + } else { + tensor_name_str = t.name(); + } + const char* TENSOR_INFO_TEMPLATE = "Type: %s, Dtype: %s, Place: %s"; + std::string tensor_info_str = ""; + if (t.defined()) { + if (t.initialized()) { + tensor_info_str += paddle::string::Sprintf(TENSOR_INFO_TEMPLATE, + t.impl()->type_info().name(), + t.dtype(), + t.place().DebugString()); + } else { + tensor_info_str += paddle::string::Sprintf(TENSOR_INFO_TEMPLATE, + t.impl()->type_info().name(), + "Unknown", + "Unknown"); + } + } else { + tensor_info_str += "Unknown"; + } + if (VLOG_IS_ON(6)) { + const char* TENSOR_PRINT_TEMPLATE = + "{Name: %s, Initialized: %d, Ptr: %d " + "TensorInfo: [ %s ], ADInfo:[ %s ]}"; + auto* ad_meta = nullable_autograd_meta(t); + if (ad_meta && (ad_meta->WeakGrad().lock().get())) { + std::string ad_info_str = ""; + const char* AD_INFO_TEMPLATE = + "Grad: [ %s ], GradNode: [ %s ], StopGradient: [ %d ]"; + ad_info_str += paddle::string::Sprintf(AD_INFO_TEMPLATE, + TensorStr(ad_meta->Grad()), + GradNodeStr(t), + ad_meta->StopGradient()); + return paddle::string::Sprintf(TENSOR_PRINT_TEMPLATE, + tensor_name_str, + t.initialized(), + t.impl(), + tensor_info_str, + ad_info_str); + } else { + return paddle::string::Sprintf(TENSOR_PRINT_TEMPLATE, + tensor_name_str, + t.initialized(), + t.impl(), + tensor_info_str, + "None"); + } + } else if (VLOG_IS_ON(5)) { + const char* TENSOR_PRINT_TEMPLATE = + "{Name: %s, Initialized: %d , Ptr: %d " + "TensorInfo: [ %s ]}"; + return paddle::string::Sprintf(TENSOR_PRINT_TEMPLATE, + tensor_name_str, + t.initialized(), + t.impl(), + tensor_info_str); + } else if (VLOG_IS_ON(4)) { + const char* TENSOR_PRINT_TEMPLATE = + "{ Name: %s, Initialized: %d, Ptr: %d }"; + return paddle::string::Sprintf( + TENSOR_PRINT_TEMPLATE, tensor_name_str, t.initialized(), t.impl()); + } else { + return "[ Not specified tensor log level ]"; + } + } + + static const std::string GradNodeStr(const egr::GradNodeBase& node) { + if (VLOG_IS_ON(6)) { + const char* GRAD_NODE_TEMPLATE = + "BackwardOutMeta: [ %s ], BackwardInMeta: [ %s ]"; + const char* GRAD_SLOT_META_TEMPLATE = " {SlotSize: [%d]: %s} "; + const char* SLOT_INFO_TEMPLATE = + "SlotID: %s, StopGradients: %s, Edges[ %s ]"; + auto out_metas = node.OutputMeta(); + auto in_metas = node.InputMeta(); + std::string out_slot_str = ""; + std::string in_slot_str = ""; + const char* EDGE_INFO_TEMPLATE = " { [%d, %d]: [%s, %s] }, "; + std::string slot_str = ""; + for (size_t i = 0; i < out_metas.size(); i++) { + std::string edges_str = ""; + std::string sg_str = ""; + for (const GradSlotMeta& meta : out_metas[i]) { + const egr::Edge& edge = meta.GetEdge(); + if (edge.IsInitialized()) { + edges_str += paddle::string::Sprintf(EDGE_INFO_TEMPLATE, + edge.GetEdgeRankInfo().first, + edge.GetEdgeRankInfo().second, + edge.GetGradNode(), + edge.GetGradNode()->name()); + } else { + edges_str += 
paddle::string::Sprintf("{ NULL Edge }"); + } + sg_str += meta.IsStopGradient() ? "1, " : "0, "; + } + out_slot_str += + paddle::string::Sprintf(SLOT_INFO_TEMPLATE, i, sg_str, edges_str); + } + std::string out_meta_str = paddle::string::Sprintf( + GRAD_SLOT_META_TEMPLATE, out_metas.size(), out_slot_str); + + for (size_t i = 0; i < in_metas.size(); i++) { + std::string edges_str = ""; + std::string sg_str = ""; + for (const GradSlotMeta& meta : in_metas[i]) { + edges_str += paddle::string::Sprintf("{ NULL Edge }"); + sg_str += meta.IsStopGradient() ? "1, " : "0, "; + } + in_slot_str += + paddle::string::Sprintf(SLOT_INFO_TEMPLATE, i, sg_str, edges_str); + } + std::string in_meta_str = + paddle::string::Sprintf(GRAD_SLOT_META_TEMPLATE, in_slot_str); + return paddle::string::Sprintf( + GRAD_NODE_TEMPLATE, out_meta_str, in_meta_str); + } else if (VLOG_IS_ON(5)) { + const char* GRAD_NODE_TEMPLATE = + "BackwardOutMeta: [ %s ], BackwardInMeta: [ %s ]"; + const char* GRAD_SLOT_META_TEMPLATE = "SlotSize: %d"; + std::string out_meta_str = paddle::string::Sprintf( + GRAD_SLOT_META_TEMPLATE, node.OutputMeta().size()); + std::string in_meta_str = paddle::string::Sprintf( + GRAD_SLOT_META_TEMPLATE, node.InputMeta().size()); + return paddle::string::Sprintf( + GRAD_NODE_TEMPLATE, out_meta_str, in_meta_str); + } else { + return "[ Not specified grad node log level. ] "; + } + } + + static const std::string GradNodeStr(const paddle::experimental::Tensor& t) { + auto* ad_meta = nullable_autograd_meta(t); + if (ad_meta && (ad_meta->GetMutableGradNode().get())) { + return GradNodeStr((*ad_meta->GetMutableGradNode().get())); + } else { + return "None"; + } + } + + static const std::string TensorStr( + const std::vector& tensors) { + std::string tensors_str = ""; + for (const auto& tensor : tensors) { + tensors_str += TensorStr(tensor) + ", "; + } + return "[ " + tensors_str + " ]"; + } + + static const std::string TensorStr( + const paddle::optional& t) { + if (!t.is_initialized()) { + return "{ UnDefinedTensor }"; + } else { + return TensorStr((*t.get_ptr())); + } + } + + static const std::string TensorStr( + const paddle::optional>& + tensors) { + std::string tensors_str = ""; + if (!tensors.is_initialized()) { + return "[ UnDefinedTensor List ]"; + } else { + for (const auto& tensor : (*tensors.get_ptr())) { + tensors_str += TensorStr(tensor) + ", "; + } + return "[ " + tensors_str + " ]"; + } + } }; } // namespace egr diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 85806014312e57..98877d7a3e36bc 100755 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -190,7 +190,7 @@ cc_test( cc_library( var_type_traits SRCS var_type_traits.cc - DEPS framework_proto scope tensor_array) + DEPS framework_proto scope tensor_array sparse_coo_tensor sparse_csr_tensor) if(WITH_GPU) target_link_libraries(var_type_traits dynload_cuda) endif() @@ -378,7 +378,7 @@ cc_library( cc_library( shape_inference SRCS shape_inference.cc - DEPS ddim attribute) + DEPS ddim attribute selected_rows_utils) # every source file that includes "dnnl.h" must depends on mkldnn # or, the first one should depends on mkldnn @@ -584,16 +584,18 @@ if(WITH_PYTHON) ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto COMMENT "Copy generated python proto into directory paddle/fluid/proto." 
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) - add_custom_target( - fleet_executor_proto_init ALL - DEPENDS fleet_proto_init fleet_executor_desc_py_proto - COMMAND - cp - ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/fleet_executor/fleet_executor_*.py - ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto - COMMENT - "Copy generated python proto into directory paddle/distributed/fleet/proto." - ) + if(NOT WITH_ROCM) + add_custom_target( + fleet_executor_proto_init ALL + DEPENDS fleet_proto_init fleet_executor_desc_py_proto + COMMAND + cp + ${PADDLE_BINARY_DIR}/paddle/fluid/distributed/fleet_executor/fleet_executor_*.py + ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto + COMMENT + "Copy generated python proto into directory paddle/distributed/fleet/proto." + ) + endif() else() string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/") @@ -1080,7 +1082,7 @@ cc_test( cc_library( selected_rows_utils SRCS selected_rows_utils.cc - DEPS selected_rows serialization) + DEPS selected_rows serialization device_context) cc_test( selected_rows_utils_test SRCS selected_rows_utils_test.cc @@ -1138,7 +1140,8 @@ cc_library( phi phi_api_utils op_info - shape_inference) + shape_inference + sparse_coo_tensor) cc_test( infershape_utils_test SRCS infershape_utils_test.cc diff --git a/paddle/fluid/framework/attribute_checker.h b/paddle/fluid/framework/attribute_checker.h index fbafe9c73a9cc6..24f3f0be96b6cb 100644 --- a/paddle/fluid/framework/attribute_checker.h +++ b/paddle/fluid/framework/attribute_checker.h @@ -342,13 +342,12 @@ class OpAttrChecker { AttributeMap default_attrs_; // in order to improve the efficiency of dynamic graph mode, - // we divede the attribute into explicit type and implicit type. + // we divide the attribute into explicit type and implicit type. // for explicit attribute, we mean the attribute added in the customized // op makers, usually it's defined in the overloaded Make method. // for implicit attribute, we mean the attribute added outside of the Make // method like "op_role", "op_role_var", and they are useless in dynamic - // graph - // mode + // graph mode size_t explicit_checker_num_; }; diff --git a/paddle/fluid/framework/conv_search_cache.h b/paddle/fluid/framework/conv_search_cache.h index 4da2aeb4d04722..1620c99ce8560d 100644 --- a/paddle/fluid/framework/conv_search_cache.h +++ b/paddle/fluid/framework/conv_search_cache.h @@ -55,7 +55,8 @@ class ConvSearchCache { AlgorithmsCache* GetBackwardFilter() { return &backward_filter_cache_; } - AlgorithmsCache* GetConvFusion() { + AlgorithmsCache>* + GetConvFusion() { return &fusion_forward_cache_; } #endif @@ -75,7 +76,8 @@ class ConvSearchCache { AlgorithmsCache forward_cache_; AlgorithmsCache backward_data_cache_; AlgorithmsCache backward_filter_cache_; - AlgorithmsCache fusion_forward_cache_; + AlgorithmsCache> + fusion_forward_cache_; #endif }; diff --git a/paddle/fluid/framework/convert_utils.h b/paddle/fluid/framework/convert_utils.h index da2af86c77c477..a11eafd8af2351 100644 --- a/paddle/fluid/framework/convert_utils.h +++ b/paddle/fluid/framework/convert_utils.h @@ -19,10 +19,7 @@ limitations under the License. */ #include "paddle/phi/common/layout.h" #include "paddle/phi/core/tensor_meta.h" -// See Note [ Why still include the fluid headers? 
] #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/platform/place.h" // TODO(chenweihang): this file may need to be removed diff --git a/paddle/fluid/framework/custom_operator.cc b/paddle/fluid/framework/custom_operator.cc index 8c8d702e28f42a..d3e0ed42935cfb 100644 --- a/paddle/fluid/framework/custom_operator.cc +++ b/paddle/fluid/framework/custom_operator.cc @@ -801,7 +801,7 @@ void RegisterOperatorWithMetaInfo(const std::vector& op_meta_infos, // Infer Dtype if (infer_dtype_func == nullptr) { - // use defalut InferDtype + // use default InferDtype info.infer_var_type_ = [op_inputs, op_outputs](InferVarTypeContext* ctx) { PADDLE_ENFORCE_EQ( op_inputs.size(), diff --git a/paddle/fluid/framework/data_device_transform.cc b/paddle/fluid/framework/data_device_transform.cc index 36e558c1d504d8..e65ecff60edd76 100644 --- a/paddle/fluid/framework/data_device_transform.cc +++ b/paddle/fluid/framework/data_device_transform.cc @@ -51,8 +51,7 @@ void TransDataDevice(const Tensor &in, // the elements of learning rate are one and it's CPU side. // One solution is to use a CUDA kernel to complete the copy operation when // the transforming is from CPU to GPU and the number of elements is little. - // But the embarrassment is that this solution this solution makes training - // slower. + // But the embarrassment is that this solution makes training slower. TensorCopySync(in, dst_place, out); } diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 4bf81b46b34560..3c6a89f2939a76 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -171,7 +171,9 @@ void innerTransDataLayoutFromMKLDNN(DataLayout in_layout, out->set_mem_desc(out_mem_desc); out->Resize(in.dims()); - if ((in.mem_desc() != out->mem_desc()) || always_copy) { + // Note(0x45f): Using initialized() to support slice Tensors + // with shapes like [0, 0, 0]. + if (in.initialized() && ((in.mem_desc() != out->mem_desc()) || always_copy)) { void* in_data = GetDataFromTensor(in, in_type); platform::ReorderMKLDNNHandler handler( diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index ab63b489a2edad..fd1c06fc6458e1 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -22,7 +22,6 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/float16.h" -#include "paddle/phi/kernels/funcs/eigen/extensions.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/details/nan_inf_utils_detail.cc b/paddle/fluid/framework/details/nan_inf_utils_detail.cc index ea292712610739..bce7b64e6d7354 100644 --- a/paddle/fluid/framework/details/nan_inf_utils_detail.cc +++ b/paddle/fluid/framework/details/nan_inf_utils_detail.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto index 3fd7a994a62fb4..e792d2a38dc7e0 100755 --- a/paddle/fluid/framework/distributed_strategy.proto +++ b/paddle/fluid/framework/distributed_strategy.proto @@ -123,6 +123,7 @@ message BuildStrategy { optional bool allow_cuda_graph_capture = 14 [ default = false ]; optional int32 reduce_strategy = 15 [ default = 0 ]; optional bool fuse_gemm_epilogue = 16 [ default = false ]; + optional string debug_graphviz_path = 17; } message ExecutionStrategy { @@ -177,6 +178,7 @@ message PipelineConfig { optional int32 accumulate_steps = 2 [ default = 1 ]; optional string schedule_mode = 3 [ default = '1F1B' ]; optional bool p2p_cache_shape = 4 [ default = true ]; + optional bool enable_partial_send_recv = 5 [ default = true ]; } message TensorParallelConfig { diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h index c383342ee3456e..22d3ac4333fb62 100644 --- a/paddle/fluid/framework/eigen.h +++ b/paddle/fluid/framework/eigen.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/tensor.h" +#include "paddle/phi/kernels/funcs/eigen/extensions.h" #include "unsupported/Eigen/CXX11/Tensor" namespace paddle { diff --git a/paddle/fluid/framework/executor_cache.cc b/paddle/fluid/framework/executor_cache.cc index 1ce9db6294050b..ae02f4fbfb822c 100644 --- a/paddle/fluid/framework/executor_cache.cc +++ b/paddle/fluid/framework/executor_cache.cc @@ -54,6 +54,10 @@ static ExecutionStrategy GetExecutionStrategy(const platform::Place &place) { execution_strategy.num_threads_ = 1; break; } + case platform::DeviceType::CUSTOM_DEVICE: { + execution_strategy.num_threads_ = 1; + break; + } default: PADDLE_THROW(platform::errors::Unavailable("Unsupported Device type %d.", device_type)); diff --git a/paddle/fluid/framework/feed_fetch_type.h b/paddle/fluid/framework/feed_fetch_type.h index 3fe545ec9c5699..a31435028dafb4 100644 --- a/paddle/fluid/framework/feed_fetch_type.h +++ b/paddle/fluid/framework/feed_fetch_type.h @@ -22,10 +22,11 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -using FeedType = paddle::variant; +using FeedType = paddle::variant; using FeedList = std::vector; -using FetchType = paddle::variant; +using FetchType = paddle:: + variant; using FetchList = std::vector; using FetchUnmergedList = std::vector>; @@ -52,6 +53,13 @@ inline bool data_is_string_tensor(const FeedType &data) { return false; } +inline bool data_is_sparse_coo_tensor(const FetchType &data) { + if (data.type() == typeid(phi::SparseCooTensor)) { + return true; + } + return false; +} + static const char kFeedOpType[] = "feed"; static const char kFetchOpType[] = "fetch"; diff --git a/paddle/fluid/framework/framework.proto b/paddle/fluid/framework/framework.proto index 2a56dc60335d90..e99316928aba60 100644 --- a/paddle/fluid/framework/framework.proto +++ b/paddle/fluid/framework/framework.proto @@ -154,6 +154,10 @@ message VarType { FEED_LIST = 28; // The data type of phi::StringTensor PSTRING = 29; + // the data type of phi::SparseCooTensor + SPARSE_COO = 30; + // the data type of phi::SparseCsrTensor + SPARSE_CSR = 31; } required Type type = 1; @@ -186,6 +190,8 @@ message VarType { optional TensorDesc string = 8; optional TensorDesc strings = 9; optional TensorDesc vocab = 10; + optional TensorDesc sparse_coo = 11; + optional TensorDesc sparse_csr = 12; } message VarDesc { diff --git a/paddle/fluid/framework/grad_op_desc_maker.h b/paddle/fluid/framework/grad_op_desc_maker.h index 25960383904b6a..bb36742d475ad6 100644 --- a/paddle/fluid/framework/grad_op_desc_maker.h +++ b/paddle/fluid/framework/grad_op_desc_maker.h @@ -181,6 +181,7 @@ class GradOpDescMakerBase { } std::string ForwardOpType() const { return this->fwd_op_.Type(); } + const BlockDesc* GetForwardOpBlock() const { return fwd_op_.Block(); } protected: bool HasInput(const std::string& name) const { @@ -212,8 +213,8 @@ class SingleGradOpMaker : public GradOpDescMakerBase { std::vector> retv; retv.emplace_back(new OpDesc()); try { - this->Apply(retv.front().get()); retv.front()->SetRuntimeAttrMap(this->RuntimeAttrs()); + this->Apply(retv.front().get()); } catch (platform::EnforceNotMet& exception) { framework::AppendErrorOpHint(retv.front().get()->Type(), &exception); throw std::move(exception); diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index f59bb2503a5709..e773f82beb6c9a 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -87,6 +87,15 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { }); } + bool IsSelectedRowsInputs(const std::string& name) const override { + auto var_types = ctx_.GetInputsVarType(name); + return std::all_of(var_types.begin(), + var_types.end(), + [](const proto::VarType::Type& type) { + return type == proto::VarType::SELECTED_ROWS; + }); + } + bool IsSelectedRowsInput(const std::string& name) const override { auto var_type = ctx_.GetInputVarType(name); return var_type == proto::VarType::SELECTED_ROWS; @@ -101,6 +110,16 @@ class InferShapeArgumentMappingContext : public phi::ArgumentMappingContext { }); } + bool IsSparseCooTensorInput(const std::string& name) const override { + auto var_type = ctx_.GetInputVarType(name); + return var_type == proto::VarType::SPARSE_COO; + } + + bool IsSparseCsrTensorInput(const std::string& name) const override { + auto var_type = ctx_.GetInputVarType(name); + return var_type == proto::VarType::SPARSE_CSR; + } + bool IsDenseTensorOutput(const std::string& name) const override { auto 
var_types = ctx_.GetOutputsVarType(name); return std::all_of(var_types.begin(), @@ -145,6 +164,36 @@ int64_t CompatMetaTensor::numel() const { } } +bool CompatMetaTensor::is_selected_rows() const { + if (is_runtime_) { + auto* var = PADDLE_GET_CONST(Variable*, var_); + return var->IsType(); + } else { + auto* var = PADDLE_GET_CONST(VarDesc*, var_); + return var->GetType() == proto::VarType::SELECTED_ROWS; + } +} + +bool CompatMetaTensor::is_dense() const { + if (is_runtime_) { + auto* var = PADDLE_GET_CONST(Variable*, var_); + return var->IsType(); + } else { + auto* var = PADDLE_GET_CONST(VarDesc*, var_); + return var->GetType() == proto::VarType::LOD_TENSOR; + } +} + +bool CompatMetaTensor::is_tensor_array() const { + if (is_runtime_) { + auto* var = PADDLE_GET_CONST(Variable*, var_); + return var->IsType(); + } else { + auto* var = PADDLE_GET_CONST(VarDesc*, var_); + return var->GetType() == proto::VarType::LOD_TENSOR_ARRAY; + } +} + DDim CompatMetaTensor::dims() const { ValidCheck(*this); if (is_runtime_) { @@ -152,7 +201,9 @@ DDim CompatMetaTensor::dims() const { if (var->IsType()) { return var->Get().dims(); } else if (var->IsType()) { - return var->Get().dims(); + return var->Get().GetCompleteDims(); + } else if (var->IsType()) { + return var->Get().dims(); } else if (var->IsType()) { // use tensor array size as dims auto& tensor_array = var->Get(); @@ -178,6 +229,8 @@ phi::DataType CompatMetaTensor::dtype() const { return var->Get().dtype(); } else if (var->IsType()) { return var->Get().dtype(); + } else if (var->IsType()) { + return var->Get().dtype(); } else if (var->IsType()) { // NOTE(chenweihang): do nothing // Unsupported get dtype from LoDTensorArray now @@ -200,6 +253,8 @@ DataLayout CompatMetaTensor::layout() const { return var->Get().layout(); } else if (var->IsType()) { return var->Get().layout(); + } else if (var->IsType()) { + return var->Get().layout(); } else if (var->IsType()) { // NOTE(chenweihang): do nothing // Unsupported get layout from LoDTensorArray now @@ -224,7 +279,9 @@ void CompatMetaTensor::set_dims(const DDim& dims) { auto* tensor = var->GetMutable(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims; } else if (var->IsType()) { - auto* tensor = var->GetMutable()->mutable_value(); + var->GetMutable()->set_height(dims[0]); + } else if (var->IsType()) { + auto* tensor = var->GetMutable(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dims = dims; } else if (var->IsType()) { auto* tensor_array = var->GetMutable(); @@ -257,6 +314,9 @@ void CompatMetaTensor::set_dtype(phi::DataType dtype) { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype; + } else if (var->IsType()) { + auto* tensor = var->GetMutable(); + phi::DenseTensorUtils::GetMutableMeta(tensor)->dtype = dtype; } else if (var->IsType()) { // NOTE(chenweihang): do nothing // Unsupported set dtype for LoDTensorArray now @@ -280,6 +340,9 @@ void CompatMetaTensor::set_layout(DataLayout layout) { } else if (var->IsType()) { auto* tensor = var->GetMutable()->mutable_value(); phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; + } else if (var->IsType()) { + auto* tensor = var->GetMutable(); + phi::DenseTensorUtils::GetMutableMeta(tensor)->layout = layout; } else if (var->IsType()) { // NOTE(chenweihang): do nothing // Unsupported set dtype for LoDTensorArray now @@ -299,7 +362,7 @@ void CompatMetaTensor::share_lod(const MetaTensor& meta_tensor) { ValidCheck(meta_tensor); if (is_runtime_) { auto* 
var = PADDLE_GET(Variable*, var_); - if (var->IsType()) { + if (var->IsType() && meta_tensor.is_dense()) { auto* tensor = var->GetMutable(); phi::DenseTensorUtils::GetMutableMeta(tensor)->lod = static_cast(meta_tensor).GetRuntimeLoD(); @@ -309,6 +372,10 @@ void CompatMetaTensor::share_lod(const MetaTensor& meta_tensor) { } } else { auto* var = PADDLE_GET(VarDesc*, var_); + if (!meta_tensor.is_dense() && !meta_tensor.is_tensor_array()) { + VLOG(3) << "input metatensor is not LoDTensor or LoDTensorArray."; + return; + } var->SetLoDLevel( static_cast(meta_tensor).GetCompileTimeLoD()); } diff --git a/paddle/fluid/framework/infershape_utils.h b/paddle/fluid/framework/infershape_utils.h index 1f745e5bf9be06..13188f924d83af 100644 --- a/paddle/fluid/framework/infershape_utils.h +++ b/paddle/fluid/framework/infershape_utils.h @@ -59,6 +59,11 @@ class CompatMetaTensor : public phi::MetaTensor { bool initialized() const override { return initialized_; }; + bool is_selected_rows() const; + + bool is_tensor_array() const; + bool is_dense() const; + operator unspecified_bool_type() const override { return initialized_ ? unspecified_bool_true : 0; } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 3a2ae0ff21788d..99e136e8b64948 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -148,10 +148,14 @@ pass_library(delete_c_identity_op_pass inference) pass_library(preln_residual_bias_fuse_pass inference) pass_library(delete_fill_constant_op_pass inference) pass_library(constant_folding_pass inference) +pass_library(auto_mixed_precision_pass inference) pass_library(simplify_with_basic_ops_pass base) pass_library(fc_elementwise_layernorm_fuse_pass base) pass_library(skip_layernorm_fuse_pass base) pass_library(multihead_matmul_fuse_pass inference) +pass_library(fused_multi_transformer_encoder_pass inference) +pass_library(fused_multi_transformer_decoder_pass inference) +pass_library(fuse_multi_transformer_layer_pass inference) pass_library(adaptive_pool2d_convert_global_pass inference) pass_library(unsqueeze2_eltwise_fuse_pass inference) pass_library(yolo_box_fuse_pass inference) @@ -216,6 +220,10 @@ if(WITH_MKLDNN) pass_library(elt_act_mkldnn_fuse_pass inference DIR mkldnn) pass_library(matmul_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn) pass_library(matmul_activation_mkldnn_fuse_pass inference DIR mkldnn) + pass_library(operator_scale_onednn_fuse_pass inference DIR mkldnn) + pass_library(squeeze2_transpose2_onednn_fuse_pass inference DIR mkldnn) + pass_library(operator_unsqueeze2_onednn_fuse_pass inference DIR mkldnn) + pass_library(operator_reshape2_onednn_fuse_pass inference DIR mkldnn) pass_library(cpu_quantize_placement_pass base DIR mkldnn) pass_library(cpu_quantize_pass inference DIR mkldnn) pass_library(cpu_quantize_squash_pass inference DIR mkldnn) @@ -354,6 +362,18 @@ cc_test( test_multihead_matmul_fuse_pass SRCS multihead_matmul_fuse_pass_tester.cc DEPS multihead_matmul_fuse_pass) +cc_test( + test_fused_multi_transformer_encoder_pass + SRCS fused_multi_transformer_encoder_pass_tester.cc + DEPS fused_multi_transformer_encoder_pass) +cc_test( + test_fused_multi_transformer_decoder_pass + SRCS fused_multi_transformer_decoder_pass_tester.cc + DEPS fused_multi_transformer_decoder_pass) +cc_test( + test_fuse_multi_transformer_layer_pass + SRCS fuse_multi_transformer_layer_pass_tester.cc + DEPS fuse_multi_transformer_layer_pass) cc_test( test_conv_bn_fuse_pass_cc SRCS 
conv_bn_fuse_pass_tester.cc diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc new file mode 100644 index 00000000000000..3d66ed788c6a94 --- /dev/null +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -0,0 +1,824 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h" + +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/float16.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/errors.h" + +namespace paddle { +namespace framework { +namespace ir { + +namespace { + +using VarType = AutoMixedPrecisionPass::VarType; + +bool PhiKernelSupportPrecision( + const std::string& op_type, + phi::Backend backend, + phi::DataType data_type, + phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) { + const auto& kernels = phi::KernelFactory::Instance().kernels(); + if (kernels.count(op_type) == 0) { + return false; + } + phi::KernelKey kernel_key(backend, layout, data_type); + return phi::KernelFactory::Instance().HasKernel(op_type, kernel_key); +} + +bool GpuKernelSupportPrecision( + const std::string& op_type, + phi::DataType precision, + phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) { + auto phi_op_type = phi::TransToPhiKernelName(op_type); + bool support = PhiKernelSupportPrecision( + phi_op_type, phi::Backend::GPU, precision, layout); + support |= PhiKernelSupportPrecision( + phi_op_type, phi::Backend::GPUDNN, precision, layout); + + if (!support) { + const auto& all_kernels = framework::OperatorWithKernel::AllOpKernels(); + auto it = all_kernels.find(op_type); + if (it != all_kernels.end()) { + for (const auto& kern_pair : it->second) { + if (platform::is_gpu_place(kern_pair.first.place_) && + kern_pair.first.data_type_ == + framework::TransToProtoVarType(precision)) { + support = true; + break; + } + } + } + } + return support; +} + +inline bool VarNodeHasDtype(Node* var_node) { + auto type = var_node->Var()->GetType(); + return (type == VarType::SELECTED_ROWS) || (type == VarType::LOD_TENSOR) || + (type == VarType::LOD_TENSOR_ARRAY) || (type == VarType::STRINGS) || + (type == VarType::VOCAB); +} + +inline bool IsFP32AndFP64(VarType::Type type) { + return (type == VarType::FP64) || (type == VarType::FP32); +} + +inline bool IsFP16AndBFP16(VarType::Type type) { + return (type == VarType::FP16) || (type == VarType::BF16); +} + +}; // namespace + +void DoInsertCastOp(Graph* graph, + Node* var_node, + Node* op_node, + VarType::Type from_type, + VarType::Type to_type, + framework::BlockDesc* block_desc, + int* suffix, + std::unordered_map* cache) { + if (from_type == to_type) return; + + auto update_cast_desc = [&](framework::OpDesc& desc, + const std::string& 
x_name, + const std::string& out_name, + const int in_dtype, + const int out_dtype) { + desc.SetType("cast"); + desc.SetInput("X", {x_name}); + desc.SetOutput("Out", {out_name}); + desc.SetAttr("in_dtype", in_dtype); + desc.SetAttr("out_dtype", out_dtype); + desc.SetAttr("use_mkldnn", false); + desc.SetAttr("with_quant_attr", false); + desc.Flush(); + }; + + if (cache->count(var_node) == 0) { + // insert cast op between var_node and op_node + std::string cast_input_name = var_node->Var()->Name(); + std::string cast_output_name = + var_node->Var()->Name() + "_cast.tmp_" + std::to_string((*suffix)++); + framework::OpDesc cast_op_desc(block_desc); + update_cast_desc(cast_op_desc, + cast_input_name, + cast_output_name, + static_cast(from_type), + static_cast(to_type)); + auto* cast_op_node = graph->CreateOpNode(&cast_op_desc); + auto* cast_output_vardesc = block_desc->Var(cast_output_name); + cast_output_vardesc->SetPersistable(false); + cast_output_vardesc->SetDataType(to_type); + cast_output_vardesc->SetShape(var_node->Var()->GetShape()); + auto* cast_output_node = graph->CreateVarNode(cast_output_vardesc); + IR_NODE_LINK_TO(cast_op_node, cast_output_node); + (*cache)[var_node] = cast_output_node; + } + op_node->Op()->Rename(var_node->Name(), cache->at(var_node)->Name()); + IR_NODE_LINK_TO(var_node, cache->at(var_node)->inputs[0]); + IR_NODE_LINK_TO(cache->at(var_node), op_node); + + IR_NODE_UNLINK(var_node, op_node); +} + +bool OpSupportPrecision(const std::string& op_type, + phi::Backend backend, + phi::DataType precision, + const std::unordered_set& black_list) { + bool support = false; + if (black_list.count(op_type) == 0) { + if (backend == phi::Backend::GPU) { + support = GpuKernelSupportPrecision(op_type, precision); + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "Now, only support backend of GPU.")); + } + } + return support; +} + +// The set of ops that support fp16 calculation and are considered +// numerically-dangerous, slower and whose effects may also be observed in +// downstream ops. 
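For context, OpSupportPrecision above gates low precision on two conditions: the op must not be on the blacklist (populated just below by SetDefaultBlacklist), and the kernel registry must actually expose an fp16/bf16 kernel for the GPU backend. The following is a minimal, framework-independent sketch of that decision; the registry type and all names here are hypothetical stand-ins, not Paddle APIs.

#include <string>
#include <unordered_map>
#include <unordered_set>

// Hypothetical registry: op type -> dtypes it has GPU kernels for.
using KernelRegistry =
    std::unordered_map<std::string, std::unordered_set<std::string>>;

// Mirrors the blacklist-then-registry check performed by OpSupportPrecision.
bool op_supports_precision(const std::string& op_type,
                           const std::string& precision,  // "float16" or "bfloat16"
                           const KernelRegistry& registry,
                           const std::unordered_set<std::string>& black_list) {
  if (black_list.count(op_type)) return false;  // numerically risky or slow in fp16
  auto it = registry.find(op_type);
  if (it == registry.end()) return false;       // no GPU kernel at all
  return it->second.count(precision) > 0;       // a low-precision kernel exists
}

int main() {
  KernelRegistry registry{{"conv2d", {"float32", "float16"}},
                          {"reduce_sum", {"float32", "float16"}}};
  std::unordered_set<std::string> black_list{"reduce_sum"};  // overflow-prone sum
  bool conv_ok = op_supports_precision("conv2d", "float16", registry, black_list);
  bool sum_ok = op_supports_precision("reduce_sum", "float16", registry, black_list);
  return (conv_ok && !sum_ok) ? 0 : 1;
}

In other words, a blacklisted op is rejected even when a low-precision kernel is registered for it, which is exactly why reduce_sum stays in fp32 by default.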
+// ref to python/paddle/fluid/contrib/mixed_precision/fp16_lists.py +void AutoMixedPrecisionPass::SetDefaultBlacklist() const { + black_list_.insert({ + // numerically-dangerous + "exp", + "square", + "log", + "mean", + "sum", + "cos_sim", + "softmax_with_cross_entropy", + "sigmoid_cross_entropy_with_logits", + "c_softmax_with_cross_entropy", + "cross_entropy", + "cross_entropy2", + // slower than fp32 + "conv2d_transpose", + // default fp32 can avoid return inf when the sum value large than 65504 + "reduce_sum", + }); +} + +void AutoMixedPrecisionPass::Init(Graph* graph) const { + bool enable_gpu_mixed = Get("enable_gpu_mixed"); + if (enable_gpu_mixed) { + backend_ = phi::Backend::GPU; + } + + skip_pass_ = !enable_gpu_mixed; + + low_precision_ = static_cast(Get("mixed_precision_mode")); + + black_list_ = Get>("mixed_black_list"); + SetDefaultBlacklist(); + VLOG(4) << "black_list has "; + for (const auto& name : black_list_) { + VLOG(4) << " - " << name; + } + + keep_io_types_ = true; + if (Has("keep_io_types")) { + keep_io_types_ = Get("keep_io_types"); + } + + auto graph_size = graph->SubGraphsSize(); + VLOG(4) << "graph size: " << graph_size; + subgraphes_.resize(graph_size); + all_op_nodes_.resize(graph_size); + + for (size_t i = 0; i < graph_size; i++) { + subgraphes_[i] = graph->GetSubGraph(i); + all_op_nodes_[i] = TopologySortOperations(*subgraphes_[i]); + VLOG(4) << "subgraph " << i << " has " << all_op_nodes_[i].size() + << "op nodes"; + for (auto* var_node : subgraphes_[i]->Nodes()) { + if (!var_node->IsVar()) continue; + + auto var_name = var_node->Var()->Name(); + if (real_vars_.count(var_name) == 0) { + real_vars_[var_name] = var_node; + VLOG(4) << var_name << " is in graph " << i; + } + } + } +} + +void AutoMixedPrecisionPass::ApplyImpl(Graph* graph) const { + PADDLE_ENFORCE_NOT_NULL(graph, + platform::errors::PreconditionNotMet( + "During the auto_mixed_precision_pass, the graph " + "should not be nullptr.")); + PADDLE_ENFORCE_EQ(graph->IsMainGraph(), + true, + platform::errors::PreconditionNotMet( + "During the auto_mixed_precision_pass, the graph " + "should be main graph.")); + + FusePassBase::Init("auto_mixed_precision", graph); + + Init(graph); + VLOG(4) << "Init done"; + + if (skip_pass_) { + VLOG(3) << "Skip auto_mixed_precision_pass."; + return; + } + + SetOpUniqueType(); + VLOG(4) << "SetOpUniqueType done"; + GetOpPrecision(); + VLOG(4) << "GetOpPrecision done"; + UpdateOpPrecision(); + VLOG(4) << "UpdateOpPrecision done"; + SetVarPrecision(); + VLOG(4) << "SetVarPrecision done"; + ConvertWeightsData(); + VLOG(4) << "ConvertWeightsData done"; + ProcessOpWithDtypeAttr(); + VLOG(4) << "ProcessOpWithDtypeAttr done"; + InsertCastOp(); + VLOG(4) << "InsertCastOp done"; + RestoreOpOriginType(); + VLOG(4) << "RestoreOpOriginType done"; + LOG(INFO) << "The number of ops run at low precision [" + << op_run_low_precision_.size() << "/" << op_original_type_.size() + << "]"; +} + +void AutoMixedPrecisionPass::SetOpUniqueType() const { + int suffix = 0; + for (const auto& nodes : all_op_nodes_) { + for (auto* op_node : nodes) { + auto op_type = op_node->Op()->Type(); + + if (op_type == "feed" || op_type == "fetch") continue; + + std::string unique_type = op_type + "_" + std::to_string(suffix++); + op_original_type_[unique_type] = op_type; + op_node->Op()->SetType(unique_type); + op_node->Op()->Flush(); + VLOG(4) << "change op type: " << op_type << " ---> " << unique_type; + } + } +} + +void AutoMixedPrecisionPass::RestoreOpOriginType() const { + for (const auto& nodes : 
all_op_nodes_) { + for (auto* op_node : nodes) { + auto op_type = op_node->Op()->Type(); + op_node->Op()->SetType(GetOpOriginalType(op_type)); + op_node->Op()->Flush(); + VLOG(4) << "restore op type: " << op_type << " ---> " + << op_node->Op()->Type(); + } + } +} + +inline std::string AutoMixedPrecisionPass::GetOpOriginalType( + const std::string& op_type) const { + if (op_original_type_.count(op_type)) { + return op_original_type_.at(op_type); + } + return op_type; +} + +void AutoMixedPrecisionPass::ProcessOpWithDtypeAttr() const { + for (const auto& nodes : all_op_nodes_) { + for (auto* op_node : nodes) { + auto op_type = op_node->Op()->Type(); + + if (op_node->Op()->HasAttr("in_dtype")) { + auto* var_node = op_node->inputs[0]; + auto* real_var_node = real_vars_[var_node->Var()->Name()]; + if (IsFP16AndBFP16(real_var_node->Var()->GetDataType())) { + op_node->Op()->SetAttr( + "in_dtype", + static_cast(framework::TransToProtoVarType(low_precision_))); + op_node->Op()->Flush(); + VLOG(4) << "process op with in_dtype attr: " << op_type << " ( " + << static_cast(real_var_node->Var()->GetDataType()) + << " --->" << static_cast(low_precision_) << " )"; + } + } + + if (op_run_low_precision_.count(op_type) == 0) continue; + + if (op_node->Op()->HasAttr("dtype")) { + auto dtype = op_node->Op()->GetAttrIfExists("dtype"); + if (IsFP32AndFP64(static_cast(dtype))) { + op_node->Op()->SetAttr( + "dtype", + static_cast(framework::TransToProtoVarType(low_precision_))); + op_node->Op()->Flush(); + VLOG(4) << "process op with dtype attr: " << op_type << " ( " << dtype + << " --->" << static_cast(low_precision_) << " )"; + } + } else if (op_node->Op()->HasAttr("out_dtype")) { + auto out_dtype = op_node->Op()->GetAttrIfExists("out_dtype"); + if (IsFP32AndFP64(static_cast(out_dtype))) { + op_node->Op()->SetAttr( + "out_dtype", + static_cast(framework::TransToProtoVarType(low_precision_))); + op_node->Op()->Flush(); + VLOG(4) << "process op with out_dtype attr: " << op_type << " ( " + << out_dtype << " --->" << static_cast(low_precision_) + << " )"; + } + } + } + } +} + +void AutoMixedPrecisionPass::GetOpPrecision() const { + for (const auto& nodes : all_op_nodes_) { + for (auto* op_node : nodes) { + auto op_type = op_node->Op()->Type(); + bool support_low_precision = true; + if (GetOpOriginalType(op_type) == "feed" || + GetOpOriginalType(op_type) == "fetch") { + support_low_precision = !keep_io_types_; + } else { + support_low_precision = OpSupportPrecision( + GetOpOriginalType(op_type), backend_, low_precision_, black_list_); + } + + if (op_node->Op()->HasAttr("dtype")) { + auto dtype = op_node->Op()->GetAttrIfExists("dtype"); + support_low_precision = + support_low_precision && + IsFP32AndFP64(static_cast(dtype)); + } else if (op_node->Op()->HasAttr("out_dtype")) { + auto out_dtype = op_node->Op()->GetAttrIfExists("out_dtype"); + support_low_precision = + support_low_precision && + IsFP32AndFP64(static_cast(out_dtype)); + } + + // If scale op's "scale" and "bias" attr value exceed the range of fp16 + // and bf16, it cannot run at low precision. 
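The range guard referenced just above matters because fp16 only represents finite values up to 65504; a scale or bias literal beyond that would overflow to inf after the cast, which is what the code that follows rules out via phi::dtype::isfinite. Below is a conservative stand-alone version of the same check, written with plain float since the fp16 type itself is framework-specific.

#include <cmath>

// Largest finite value representable in IEEE binary16 (fp16).
constexpr float kFp16Max = 65504.0f;

// True if v would survive a cast to fp16 without overflowing to inf
// (conservative: exact rounding behaviour near the limit is ignored).
bool fits_in_fp16(float v) {
  return std::isfinite(v) && std::fabs(v) <= kFp16Max;
}

int main() {
  // A scale of 1e5 would overflow fp16, so such a scale op stays in fp32.
  return (fits_in_fp16(0.125f) && !fits_in_fp16(1e5f)) ? 0 : 1;
}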
+ if (GetOpOriginalType(op_node->Op()->Type()) == "scale") { + auto scale = op_node->Op()->GetAttrIfExists("scale"); + auto bias = op_node->Op()->GetAttrIfExists("bias"); + if (low_precision_ == phi::DataType::FLOAT16) { + support_low_precision = + support_low_precision && + phi::dtype::isfinite(static_cast(scale)) && + phi::dtype::isfinite(static_cast(bias)); + } else if (low_precision_ == phi::DataType::BFLOAT16) { + support_low_precision = + support_low_precision && + phi::dtype::isfinite(static_cast(scale)) && + phi::dtype::isfinite(static_cast(bias)); + } + } + + // if op's input var and output var is not dense tensor, the op should + // not run at low precision. + for (auto* in_var_node : op_node->inputs) { + CHECK_EQ(in_var_node->IsVar(), true); + auto* real_in_var_node = real_vars_[in_var_node->Var()->Name()]; + if (real_in_var_node->Var()->Persistable()) continue; + + support_low_precision = + support_low_precision && + (real_in_var_node->Var()->GetType() == VarType::LOD_TENSOR); + } + for (auto* out_var_node : op_node->outputs) { + CHECK_EQ(out_var_node->IsVar(), true); + auto* real_out_var_node = real_vars_[out_var_node->Var()->Name()]; + if (real_out_var_node->Var()->Persistable()) continue; + + support_low_precision = + support_low_precision && + (real_out_var_node->Var()->GetType() == VarType::LOD_TENSOR); + } + + if (support_low_precision) { + op_run_low_precision_.insert(op_type); + VLOG(4) << "support precision: " << op_type << " run at low precision"; + } else { + VLOG(4) << "support precision: " << op_type + << " not run at low precision"; + } + } + } +} + +void AutoMixedPrecisionPass::UpdateOpPrecision() const { + std::unordered_set vars_should_not_low_precision; + + // var -> the var's all input op + std::unordered_map> var_input_ops; + + auto GetVarInputOps = [&] { + for (const auto& nodes : all_op_nodes_) { + for (auto* op_node : nodes) { + auto op_type = op_node->Op()->Type(); + + if (GetOpOriginalType(op_type) == "fetch") continue; + if (op_node->Op()->HasAttr("sub_block")) continue; + + for (auto* var_node : op_node->outputs) { + CHECK_EQ(var_node->IsVar(), true); + if (var_node->Var()->Persistable()) continue; + if (!VarNodeHasDtype(var_node)) continue; + + var_input_ops[var_node->Var()->Name()].push_back(op_node); + VLOG(4) << "var input ops: " << var_node->Var()->Name() + << " is output of " << op_type; + } + + // the select_input op's input var should not convert to low precision. + // when op's output var is select_input op's input var, the op should + // not run at low precision. + if (GetOpOriginalType(op_node->Op()->Type()) == "select_input") { + for (auto* in_var_node : op_node->inputs) { + CHECK_EQ(in_var_node->IsVar(), true); + if (in_var_node->Var()->Persistable()) continue; + if (!VarNodeHasDtype(in_var_node)) continue; + + vars_should_not_low_precision.insert(in_var_node->Var()->Name()); + } + } + + // when op_1 only support cpu kernel. if op_2's intput var is op_1's + // output var, then op_2 should not run at low precision. 
+ if (GetOpOriginalType(op_type) != "feed" && + !GpuKernelSupportPrecision(GetOpOriginalType(op_type), + phi::DataType::FLOAT32)) { + for (auto* out_var_node : op_node->outputs) { + CHECK_EQ(out_var_node->IsVar(), true); + if (out_var_node->Var()->Persistable()) continue; + if (!VarNodeHasDtype(out_var_node)) continue; + + vars_should_not_low_precision.insert(out_var_node->Var()->Name()); + } + } + } + } + }; + GetVarInputOps(); + + bool precision_updated = false; + do { + precision_updated = false; + for (const auto& nodes : all_op_nodes_) { + for (auto* op_node : nodes) { + if (op_run_low_precision_.count(op_node->Op()->Type()) == 0) continue; + + for (auto* in_var_node : op_node->inputs) { + CHECK_EQ(in_var_node->IsVar(), true); + if (!VarNodeHasDtype(in_var_node)) continue; + + auto* real_in_var_node = real_vars_[in_var_node->Var()->Name()]; + if (real_in_var_node->Var()->Persistable()) continue; + + if (vars_should_not_low_precision.count( + real_in_var_node->Var()->Name())) { + op_run_low_precision_.erase(op_node->Op()->Type()); + precision_updated = true; + VLOG(4) << op_node->Op()->Type() + << " should not run at low precision."; + break; + } + } + + if (op_run_low_precision_.count(op_node->Op()->Type()) == 0) continue; + + for (auto* out_var_node : op_node->outputs) { + CHECK_EQ(out_var_node->IsVar(), true); + if (!VarNodeHasDtype(out_var_node)) continue; + + auto* real_out_var_node = real_vars_[out_var_node->Var()->Name()]; + if (real_out_var_node->Var()->Persistable()) continue; + + bool not_run_low_precision = false; + const auto& input_op_nodes = + var_input_ops[real_out_var_node->Var()->Name()]; + if (vars_should_not_low_precision.count( + real_out_var_node->Var()->Name())) { + not_run_low_precision = true; + } else { + for (auto* node : input_op_nodes) { + if (op_run_low_precision_.count(node->Op()->Type()) == 0) { + not_run_low_precision = true; + break; + } + } + } + if (not_run_low_precision) { + op_run_low_precision_.erase(op_node->Op()->Type()); + precision_updated = true; + VLOG(4) << op_node->Op()->Type() + << " should not run at low precision."; + break; + } + } + } + } + } while (precision_updated); +} + +// special ops, its weights should not be low precision. 
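UpdateOpPrecision above is a fixed-point iteration: whenever an op is dropped from the low-precision set (for example because one of its inputs must stay fp32), that decision can invalidate its neighbours, so the loop repeats until nothing changes. The following is a deliberately simplified sketch of that fixed-point idea over a toy producer/consumer table, not the pass's exact rule or data structures; the InputVarsNotConvert helper that follows then exempts specific weights (batch_norm statistics, the transformer LayerNorm scales and biases) from conversion even when their op does run at low precision.

#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

int main() {
  // op -> ops that consume one of its outputs (a toy dependency table).
  std::unordered_map<std::string, std::vector<std::string>> consumers{
      {"cpu_only_op", {"matmul"}}, {"matmul", {"softmax"}}};
  // Ops tentatively marked to run at low precision; cpu_only_op never was.
  std::unordered_set<std::string> low_precision{"matmul", "softmax"};

  bool updated = true;
  while (updated) {  // repeat until the set stops shrinking (fixed point)
    updated = false;
    for (const auto& producer_and_consumers : consumers) {
      bool producer_low = low_precision.count(producer_and_consumers.first) > 0;
      if (producer_low) continue;
      // A full-precision producer forces its consumers back to full precision,
      // which can in turn invalidate their consumers on the next sweep.
      for (const auto& consumer : producer_and_consumers.second) {
        if (low_precision.erase(consumer) > 0) updated = true;
      }
    }
  }
  return low_precision.empty() ? 0 : 1;  // both matmul and softmax fall back
}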
+bool AutoMixedPrecisionPass::InputVarsNotConvert( + Node* op_node, const std::string& var_name) const { + auto* op_desc = op_node->Op(); + if (GetOpOriginalType(op_desc->Type()) == "batch_norm") { + auto vecs = op_desc->Input("Bias"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + vecs = op_desc->Input("Mean"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + vecs = op_desc->Input("Scale"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + vecs = op_desc->Input("Variance"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + } else if (GetOpOriginalType(op_desc->Type()) == "fused_multi_transformer") { + auto vecs = op_desc->Input("LnScale"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + vecs = op_desc->Input("LnBias"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + vecs = op_desc->Input("FFNLnScale"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + vecs = op_desc->Input("FFNLnBias"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + } + return false; +} + +bool AutoMixedPrecisionPass::OutputVarsNotConvert( + Node* op_node, const std::string& var_name) const { + auto* op_desc = op_node->Op(); + // batch_norm's input and output (variance and mean) are the same. + if (GetOpOriginalType(op_desc->Type()) == "batch_norm") { + auto vecs = op_desc->Output("MeanOut"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + vecs = op_desc->Output("VarianceOut"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + vecs = op_desc->Output("SavedMean"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + vecs = op_desc->Output("SavedVariance"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + } + return false; +} + +void AutoMixedPrecisionPass::SetVarPrecision() const { + for (const auto& nodes : all_op_nodes_) { + for (auto* op_node : nodes) { + if (op_run_low_precision_.count(op_node->Op()->Type()) == 0) { + continue; + } + + if (GetOpOriginalType(op_node->Op()->Type()) != "feed") { + for (auto* in_var_node : op_node->inputs) { + CHECK_EQ(in_var_node->IsVar(), true); + + auto* real_in_var_node = real_vars_[in_var_node->Var()->Name()]; + auto in_var_name = real_in_var_node->Var()->Name(); + + if (!IsFP32AndFP64(real_in_var_node->Var()->GetDataType())) continue; + if (!VarNodeHasDtype(real_in_var_node)) continue; + if (InputVarsNotConvert(op_node, in_var_name)) continue; + + if (real_in_var_node->Var()->Persistable()) { + real_in_var_node->Var()->SetDataType( + framework::TransToProtoVarType(low_precision_)); + vars_convert_to_low_precision_.insert(in_var_name); + } + } + } + + if (GetOpOriginalType(op_node->Op()->Type()) != "fetch") { + for (auto* out_var_node : op_node->outputs) { + CHECK_EQ(out_var_node->IsVar(), true); + + auto* real_out_var_node = real_vars_[out_var_node->Var()->Name()]; + auto out_var_name = real_out_var_node->Var()->Name(); + + if (!IsFP32AndFP64(real_out_var_node->Var()->GetDataType())) continue; + if (!VarNodeHasDtype(real_out_var_node)) continue; + if (OutputVarsNotConvert(op_node, out_var_name)) continue; + + real_out_var_node->Var()->SetDataType( + framework::TransToProtoVarType(low_precision_)); + if 
(real_out_var_node->Var()->Persistable()) { + vars_convert_to_low_precision_.insert(out_var_name); + } + } + } + } + } + + // This code used to precess vars with the same name. Vars with the same + // name should have the same data type. + for (auto* subgraph : subgraphes_) { + for (auto* var_node : subgraph->Nodes()) { + if (!var_node->IsVar() || !var_node->Var()->Persistable()) continue; + if (!VarNodeHasDtype(var_node)) continue; + + auto var_name = var_node->Var()->Name(); + if (vars_convert_to_low_precision_.count(var_name)) { + var_node->Var()->SetDataType( + framework::TransToProtoVarType(low_precision_)); + } + } + } +} + +void AutoMixedPrecisionPass::ConvertWeightsData() const { + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL(scope, + platform::errors::PreconditionNotMet( + "During the auto_mixed_precision_pass, the scope " + "should not be null.")); + + auto var_names = scope->LocalVarNames(); + for (const auto& var_name : var_names) { + if (vars_convert_to_low_precision_.count(var_name)) { + VLOG(4) << var_name << "'s data type was convert to low precision"; + + auto* var = scope->FindLocalVar(var_name); + CHECK_EQ(var->IsType(), true); + + auto* origin_tensor = var->GetMutable(); + + phi::DenseTensor low_precision_tensor; + low_precision_tensor.Resize(origin_tensor->dims()); + low_precision_tensor.set_type(low_precision_); + + if (low_precision_ == phi::DataType::FLOAT16) { + auto* low_precision_data = + low_precision_tensor.mutable_data( + phi::CPUPlace{}); + for (int64_t i = 0; i < origin_tensor->numel(); i++) { + if (origin_tensor->dtype() == phi::DataType::FLOAT64) { + auto* origin_data = origin_tensor->data(); + low_precision_data[i] = + static_cast(origin_data[i]); + } else if (origin_tensor->dtype() == phi::DataType::FLOAT32) { + auto* origin_data = origin_tensor->data(); + low_precision_data[i] = + static_cast(origin_data[i]); + } + } + } else if (low_precision_ == phi::DataType::BFLOAT16) { + auto* low_precision_data = + low_precision_tensor.mutable_data( + phi::CPUPlace{}); + for (int64_t i = 0; i < origin_tensor->numel(); i++) { + if (origin_tensor->dtype() == phi::DataType::FLOAT64) { + auto* origin_data = origin_tensor->data(); + low_precision_data[i] = + static_cast(origin_data[i]); + } else if (origin_tensor->dtype() == phi::DataType::FLOAT32) { + auto* origin_data = origin_tensor->data(); + low_precision_data[i] = + static_cast(origin_data[i]); + } + } + } + origin_tensor->clear(); + paddle::framework::TensorCopySync( + low_precision_tensor, phi::CPUPlace{}, origin_tensor); + } + } +} + +void AutoMixedPrecisionPass::InsertCastOp() const { + int suffix = 0; + std::unordered_map cache; + + for (size_t i = 0; i < all_op_nodes_.size(); i++) { + auto* block_desc = all_op_nodes_[i][0]->Op()->Block(); + CHECK_NOTNULL(block_desc); + for (auto* op_node : all_op_nodes_[i]) { + auto op_type = op_node->Op()->Type(); + + if (GetOpOriginalType(op_type) == "feed") continue; + if (op_node->Op()->HasAttr("sub_block")) continue; + + VLOG(4) << "process op: " << op_type + << " run low precision: " << op_run_low_precision_.count(op_type); + + auto inputs = op_node->inputs; + for (auto* in_var_node : inputs) { + if (!in_var_node->IsVar()) continue; + if (!VarNodeHasDtype(in_var_node)) continue; + if (in_var_node->Var()->Persistable()) continue; + + auto* real_in_var_node = real_vars_[in_var_node->Var()->Name()]; + + auto in_var_type = real_in_var_node->Var()->GetDataType(); + + VLOG(4) << "process var: " << real_in_var_node->Var()->Name() + << " with type " << in_var_type; 
+ + if (IsFP32AndFP64(in_var_type) && + op_run_low_precision_.count(op_type)) { + auto to_type = framework::TransToProtoVarType(low_precision_); + auto* prev_op = + in_var_node->inputs.empty() ? nullptr : in_var_node->inputs[0]; + if (prev_op && GetOpOriginalType(prev_op->Op()->Type()) == "cast") { + in_var_node->Var()->SetDataType(to_type); + prev_op->Op()->SetAttr("out_dtype", static_cast(to_type)); + prev_op->Op()->Flush(); + } else { + DoInsertCastOp(subgraphes_[i], + in_var_node, + op_node, + in_var_type, + to_type, + block_desc, + &suffix, + &cache); + } + } else if (IsFP16AndBFP16(in_var_type) && + op_run_low_precision_.count(op_type) == 0) { + auto to_type = VarType::FP32; + auto* prev_op = + in_var_node->inputs.empty() ? nullptr : in_var_node->inputs[0]; + if (prev_op && GetOpOriginalType(prev_op->Op()->Type()) == "cast") { + in_var_node->Var()->SetDataType(to_type); + prev_op->Op()->SetAttr("out_dtype", static_cast(to_type)); + prev_op->Op()->Flush(); + } else { + DoInsertCastOp(subgraphes_[i], + in_var_node, + op_node, + in_var_type, + to_type, + block_desc, + &suffix, + &cache); + } + } + } + + // Special op. + // fused_multi_transformer's input(CacheKV) and output(CacheKVOut) vars + // have same name. + if (GetOpOriginalType(op_type) == "fused_multi_transformer") { + auto cache_kv_inputs = op_node->Op()->Input("CacheKV"); + auto cache_kv_outputs = op_node->Op()->Output("CacheKVOut"); + CHECK_EQ(cache_kv_inputs.size(), cache_kv_outputs.size()); + for (size_t i = 0; i < cache_kv_inputs.size(); ++i) { + op_node->Op()->RenameOutput(cache_kv_outputs[i], cache_kv_inputs[i]); + } + } + } + } + VLOG(4) << "insert number of cast op: " << cache.size(); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(auto_mixed_precision_pass, + paddle::framework::ir::AutoMixedPrecisionPass); diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.h b/paddle/fluid/framework/ir/auto_mixed_precision_pass.h new file mode 100644 index 00000000000000..578d47282b76d4 --- /dev/null +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.h @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
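ConvertWeightsData above rewrites every persistable fp32/fp64 weight element by element into the chosen low-precision type. For bfloat16 this is particularly simple, because bfloat16 keeps float32's 8-bit exponent and shortens the mantissa to 7 bits. The sketch below shows a minimal truncating conversion; a real conversion, such as the one behind phi::dtype::bfloat16, would typically round to nearest-even rather than truncate.

#include <cstdint>
#include <cstring>

// Truncate a float32 to bfloat16 storage by keeping its top 16 bits.
// This drops 16 mantissa bits; production code usually rounds instead.
static std::uint16_t float_to_bfloat16_trunc(float value) {
  std::uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));
  return static_cast<std::uint16_t>(bits >> 16);
}

// Expand bfloat16 storage back to float32 by zero-filling the low bits.
static float bfloat16_to_float(std::uint16_t storage) {
  std::uint32_t bits = static_cast<std::uint32_t>(storage) << 16;
  float value;
  std::memcpy(&value, &bits, sizeof(value));
  return value;
}

int main() {
  float w = 0.15625f;  // fits in 7 mantissa bits, so it round-trips exactly
  float back = bfloat16_to_float(float_to_bfloat16_trunc(w));
  return back == w ? 0 : 1;
}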
+ +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/node.h" +#include "paddle/phi/common/backend.h" +#include "paddle/phi/common/data_type.h" + +namespace paddle { +namespace framework { +namespace ir { + +class AutoMixedPrecisionPass : public FusePassBase { + public: + using VarType = framework::proto::VarType; + + public: + AutoMixedPrecisionPass() = default; + ~AutoMixedPrecisionPass() = default; + + protected: + void ApplyImpl(Graph* graph) const override; + + private: + void Init(Graph* graph) const; + + void SetDefaultBlacklist() const; + + void SetOpUniqueType() const; + + void RestoreOpOriginType() const; + + inline std::string GetOpOriginalType(const std::string& op_type) const; + + void GetOpPrecision() const; + + void UpdateOpPrecision() const; + + void InsertCastOp() const; + + void ProcessOpWithDtypeAttr() const; + + bool InputVarsNotConvert(Node* op_node, const std::string& var_name) const; + + bool OutputVarsNotConvert(Node* op_node, const std::string& var_name) const; + + void SetVarPrecision() const; + + void ConvertWeightsData() const; + + private: + mutable bool skip_pass_{false}; + + mutable bool keep_io_types_{false}; + // float16 or bfloat16 now + mutable phi::DataType low_precision_{phi::DataType::FLOAT16}; + + mutable phi::Backend backend_{phi::Backend::GPU}; + + mutable std::unordered_set black_list_; + + // subgraph id -> pointer to subgraph + mutable std::vector subgraphes_; + // var name -> real var node + mutable std::unordered_map real_vars_; + // subgraph id -> all op nodes in subgraph + mutable std::vector> all_op_nodes_; + // op's unique type -> the op's origin type + mutable std::unordered_map op_original_type_; + // op's unique type -> whether the op run at low precision + mutable std::unordered_set op_run_low_precision_; + + mutable std::unordered_set vars_convert_to_low_precision_; +}; + +bool OpSupportPrecision(const std::string& op_type, + phi::Backend backend, + phi::DataType precision, + const std::unordered_set& black_list); + +void DoInsertCastOp(Graph* graph, + Node* var_node, + Node* op_node, + proto::VarType::Type from_type, + proto::VarType::Type to_type, + framework::BlockDesc* block_desc, + int* suffix, + std::unordered_map* cache); + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc index 94d5c4bac58fda..3c7f77708cd0bd 100644 --- a/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bn_fuse_pass.cc @@ -17,6 +17,7 @@ #include #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc index cd5cbf150b3a33..582b9389e0ffcd 100644 --- a/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_fill_constant_op_pass.cc @@ -29,6 +29,11 @@ void FillConstData(LoDTensor* out_t, T value) { } void DeleteFillConstantOpPass::ApplyImpl(ir::Graph* graph) const { + bool with_dynamic_shape = Get("with_dynamic_shape"); + // Not support + if (with_dynamic_shape) { + return; + } FusePassBase::Init("delete_fill_constant_op_pass", graph); GraphPatternDetector detector; auto 
fill_constant_op = diff --git a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc index ee7a2a722331e0..e049d1e950a98d 100644 --- a/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc +++ b/paddle/fluid/framework/ir/delete_quant_dequant_linear_op_pass.cc @@ -111,9 +111,6 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { } */ std::unordered_set nodes2rm = {}; - int bit_length = - PADDLE_GET_CONST(int, quantize_linear_op->Op()->GetAttr("bit_length")); - int range = ((1 << (bit_length - 1)) - 1); // Get input scale from tensor const LoDTensor& input_scale_tensor = @@ -124,7 +121,7 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { platform::errors::InvalidArgument( "Input scale tensor's place should be CPU.")); const float* input_scale_data = input_scale_tensor.data(); - float input_scale = input_scale_data[0] / range; + float input_scale = input_scale_data[0]; int nums_any_ops = dequantize_linear_op_out->outputs.size(); for (int i = 0; i < nums_any_ops; ++i) { @@ -138,8 +135,9 @@ void DeleteQuantDequantLinearOpPass::ApplyImpl(ir::Graph* graph) const { IR_NODE_LINK_TO(quantize_linear_op_x, dequantize_linear_op_out->outputs[i]); } - - nodes2rm.insert(quantize_linear_op_scale); + // Forbid removing weight tensor when weight is shared between ops + if (quantize_linear_op_scale->outputs.size() <= 1UL) + nodes2rm.insert(quantize_linear_op_scale); nodes2rm.insert(quantize_linear_op); nodes2rm.insert(quantize_linear_op_out); nodes2rm.insert(dequantize_linear_op); diff --git a/paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass.cc b/paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass.cc new file mode 100644 index 00000000000000..4e2bca2ae2a970 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass.cc @@ -0,0 +1,325 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass.h" + +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +std::unordered_map +MultiTransformerLayerPattern::operator()(bool enable_int8, + int num_fused_op, + bool is_decoder) { + std::string fused_multi_transformer_name = + enable_int8 ? 
"fused_multi_transformer_int8" : "fused_multi_transformer"; + + std::unordered_map node_reprs; + + // x0 and src_mask is unqiue input of subgraph + auto* x0 = pattern->NewNode(x0_repr()); + x0->assert_is_op_input(fused_multi_transformer_name, "X")->AsInput(); + auto* src_mask = pattern->NewNode(src_mask_repr()); + src_mask->assert_is_op_input(fused_multi_transformer_name, "SrcMask") + ->AsInput(); + + for (int i = 0; i < num_fused_op; ++i) { + auto fuse_op_repr = + PDNodeName(name_scope_, repr_, id_, "fuse_op_" + std::to_string(i)); + node_reprs["fuse_op_" + std::to_string(i)] = fuse_op_repr; + auto* fused_multi_transformer = + pattern->NewNode(fuse_op_repr) + ->assert_is_op(fused_multi_transformer_name); + + auto out_repr = + PDNodeName(name_scope_, repr_, id_, "out_" + std::to_string(i)); + node_reprs["out_" + std::to_string(i)] = out_repr; + auto* out = pattern->NewNode(out_repr)->assert_is_op_output( + fused_multi_transformer_name, "Out"); + + if (is_decoder) { + auto shape_repr = + PDNodeName(name_scope_, repr_, id_, "shape_" + std::to_string(i)); + node_reprs["shape_" + std::to_string(i)] = shape_repr; + auto* shape = pattern->NewNode(shape_repr)->assert_is_op("shape"); + + auto shape_out_repr = + PDNodeName(name_scope_, repr_, id_, "shape_out_" + std::to_string(i)); + node_reprs["shape_out_" + std::to_string(i)] = shape_out_repr; + auto* shape_out = + pattern->NewNode(shape_out_repr)->assert_is_op_output("shape", "Out"); + + shape->LinksFrom({src_mask}).LinksTo({shape_out}); + + auto slice_repr = + PDNodeName(name_scope_, repr_, id_, "slice_" + std::to_string(i)); + node_reprs["slice_" + std::to_string(i)] = slice_repr; + auto* slice = pattern->NewNode(slice_repr)->assert_is_op("slice"); + + auto slice_out_repr = + PDNodeName(name_scope_, repr_, id_, "slice_out_" + std::to_string(i)); + node_reprs["slice_out_" + std::to_string(i)] = slice_out_repr; + auto* slice_out = + pattern->NewNode(slice_out_repr)->assert_is_op_output("slice", "Out"); + + slice->LinksFrom({shape_out}).LinksTo({slice_out}); + + fused_multi_transformer->LinksFrom({x0, src_mask, slice_out}) + .LinksTo({out}); + } else { + auto cache_kv_repr = + PDNodeName(name_scope_, repr_, id_, "cache_kv_" + std::to_string(i)); + node_reprs["cache_kv_" + std::to_string(i)] = cache_kv_repr; + auto* cache_kv = pattern->NewNode(cache_kv_repr); + cache_kv->assert_is_op_input(fused_multi_transformer_name, "CacheKV"); + cache_kv->AsInput(); + + auto fill_const_op_repr = + PDNodeName(name_scope_, repr_, id_, "fill_op_" + std::to_string(i)); + node_reprs["fill_op_" + std::to_string(i)] = fill_const_op_repr; + auto fill_const_op = pattern->NewNode(fill_const_op_repr) + ->assert_is_op("fill_constant_batch_size_like"); + + fused_multi_transformer->LinksFrom({x0, src_mask, cache_kv}) + .LinksTo({out}); + fill_const_op->LinksFrom({x0}).LinksTo({cache_kv}); + } + x0 = out; + } + x0->AsOutput(); + return node_reprs; +} +} // namespace patterns + +inline void MergeInput(OpDesc* op, + const std::vector& input_name_maps, + const std::string& input_name) { + std::vector tmp = input_name_maps[0].at(input_name); + for (size_t i = 1; i < input_name_maps.size(); ++i) { + tmp.insert(tmp.end(), + input_name_maps[i].at(input_name).begin(), + input_name_maps[i].at(input_name).end()); + } + op->SetInput(input_name, tmp); +} + +template +inline void MergeAttrs(const std::vector& ops, + const std::string& attr_name) { + std::vector res; + for (size_t i = 0; i < ops.size(); ++i) { + auto scale_vec = + PADDLE_GET_CONST(std::vector, ops[i]->GetAttr(attr_name)); 
+ res.insert(res.end(), scale_vec.begin(), scale_vec.end()); + } + ops[0]->SetAttr(attr_name, res); +} + +int FuseMultiTransformerLayerPass::BuildFusion(Graph* graph, + const std::string& name_scope, + Scope* scope) const { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // TODO(wufeisheng): Get enable_int8 attr from graph after + // fused_multi_transformer pass with int8 merged + bool enable_int8 = false; + + int num_fuse_op = 0; + bool is_decoder = false; + + if (graph->Has(kFusedMultiTransformerEncoderFusionCount)) { + num_fuse_op = graph->Get(kFusedMultiTransformerEncoderFusionCount); + is_decoder = false; + } else if (graph->Has(kFusedMultiTransformerDecoderFusionCount)) { + num_fuse_op = graph->Get(kFusedMultiTransformerDecoderFusionCount); + is_decoder = true; + } + if (num_fuse_op == 0) { + VLOG(4) << "fuse_multi_transformer_layer_pass will be skipped " + "cause num_fuse_op is not been set or set to 0"; + return 0; + } + if (!is_decoder) { + VLOG(4) << "fuse_multi_transformer_layer_pass will match encoder pattern"; + } else { + VLOG(4) << "fuse_multi_transformer_layer_pass will match decoder pattern"; + } + + patterns::MultiTransformerLayerPattern multi_layer_pattern(pattern, + name_scope); + auto node_reprs = multi_layer_pattern(enable_int8, num_fuse_op, is_decoder); + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + /////////////////// + //// Get nodes //// + /////////////////// + + GET_IR_NODE_FROM_SUBGRAPH(src_mask, src_mask, multi_layer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(x0, x0, multi_layer_pattern); + + std::vector fuse_op_nodes; + std::vector out_nodes; + + std::vector unused_node_prefixes = { + "shape_", "shape_out_", "slice_", "slice_out_"}; + std::vector unused_nodes; + + std::vector fuse_op_descs; + std::vector fuse_op_input_var_name_maps; + std::vector fuse_op_output_var_name_maps; + + for (int i = 0; i < num_fuse_op; ++i) { + PDNode* fuse_op_pdnode = + multi_layer_pattern.PatternBase::pattern->RetrieveNode( + node_reprs["fuse_op_" + std::to_string(i)]); + Node* fuse_op_node = subgraph.at(fuse_op_pdnode); + fuse_op_nodes.push_back(fuse_op_node); + fuse_op_descs.push_back(fuse_op_node->Op()); + fuse_op_input_var_name_maps.emplace_back(fuse_op_node->Op()->Inputs()); + fuse_op_output_var_name_maps.emplace_back(fuse_op_node->Op()->Outputs()); + + PDNode* out_pdnode = + multi_layer_pattern.PatternBase::pattern->RetrieveNode( + node_reprs["out_" + std::to_string(i)]); + out_nodes.push_back(subgraph.at(out_pdnode)); + + // fill_const op use x0 as input + if (!is_decoder && i != 0) { + PDNode* fill_op_pdnode = + multi_layer_pattern.PatternBase::pattern->RetrieveNode( + node_reprs["fill_op_" + std::to_string(i)]); + Node* fill_op_node = subgraph.at(fill_op_pdnode); + fill_op_node->Op()->SetInput("Input", {x0->Name()}); + IR_NODE_UNLINK(out_nodes[i - 1], fill_op_node); + IR_NODE_LINK_TO(x0, fill_op_node); + } else if (is_decoder && i != 0) { + for (const auto& unused_node_prefix : unused_node_prefixes) { + PDNode* unused_pdnode = + multi_layer_pattern.PatternBase::pattern->RetrieveNode( + node_reprs[unused_node_prefix + std::to_string(i)]); + Node* unused_node = subgraph.at(unused_pdnode); + unused_nodes.push_back(unused_node); + } + } + } + + /////////////// + //// Merge //// + /////////////// + + // Merge inputs + std::vector inputs_names = {"CacheKV", + "FFN1Bias", + "FFN1Weight", + "FFN2Bias", + "FFN2Weight", + "FFNLnBias", + "FFNLnScale", + "LnBias", + "LnScale", + "OutLinearBias", + 
"OutLinearW", + "QKVBias", + "QKVW"}; + + for (const auto& input_name : inputs_names) { + MergeInput(fuse_op_descs[0], fuse_op_input_var_name_maps, input_name); + } + + // Merge outputs + fuse_op_descs[0]->SetOutput( + "Out", fuse_op_output_var_name_maps[num_fuse_op - 1]["Out"]); + auto& merged_cache_kv_out_names = + fuse_op_output_var_name_maps[0]["CacheKVOut"]; + for (int i = 1; i < num_fuse_op; ++i) { + const auto& out_var_names = fuse_op_output_var_name_maps[i]["CacheKVOut"]; + merged_cache_kv_out_names.insert(merged_cache_kv_out_names.end(), + out_var_names.begin(), + out_var_names.end()); + } + fuse_op_descs[0]->SetOutput("CacheKVOut", merged_cache_kv_out_names); + + //////////////// + //// ReLink //// + //////////////// + // Before relink, out nodes (0 -> num_layer-1) should be removed + std::unordered_set marked_out_nodes(out_nodes.begin(), + out_nodes.end() - 1); + GraphSafeRemoveNodes(graph, marked_out_nodes); + + // Relink all input nodes of fused_multi_transformer ops to the first op + auto& merged_inputs = fuse_op_nodes[0]->inputs; + for (int i = 1; i < num_fuse_op; ++i) { + merged_inputs.insert(merged_inputs.end(), + fuse_op_nodes[i]->inputs.begin(), + fuse_op_nodes[i]->inputs.end()); + } + + // Relink fuse op -> out + IR_NODE_UNLINK(fuse_op_nodes[num_fuse_op - 1], out_nodes[num_fuse_op - 1]); + IR_NODE_LINK_TO(fuse_op_nodes[0], out_nodes[num_fuse_op - 1]); + + ///////////////////////////// + //// Delete unused nodes //// + ///////////////////////////// + // Delete fused_multi_transformer op expect for the first one + std::unordered_set marked_fuse_op_nodes( + fuse_op_nodes.begin() + 1, fuse_op_nodes.end()); + + if (is_decoder) { + marked_fuse_op_nodes.insert(unused_nodes.begin(), unused_nodes.end()); + } + + GraphSafeRemoveNodes(graph, marked_fuse_op_nodes); + ++fusion_count; + }; + + gpd(graph, handler); + return fusion_count; +} + +void FuseMultiTransformerLayerPass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal("During the fuse_multi_transformer_layer pass, " + "The scope should not be null.")); + int fusion_count = BuildFusion(graph, name_scope_, scope); + + AddStatis(fusion_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_multi_transformer_layer_pass, + paddle::framework::ir::FuseMultiTransformerLayerPass); diff --git a/paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass.h b/paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass.h new file mode 100644 index 00000000000000..339cc6815e2230 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass.h @@ -0,0 +1,60 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct MultiTransformerLayerPattern : public PatternBase { + MultiTransformerLayerPattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, "fuse_multi_transformer_layer") {} + + std::unordered_map operator()( + bool enable_int8, int num_fused_op = 1, bool is_decoder = false); + + PATTERN_DECL_NODE(src_mask); + PATTERN_DECL_NODE(x0); +}; + +} // namespace patterns + +class FuseMultiTransformerLayerPass : public FusePassBase { + public: + FuseMultiTransformerLayerPass() {} + virtual ~FuseMultiTransformerLayerPass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"fuse_multi_transformer_layer"}; + + private: + int BuildFusion(Graph* graph, + const std::string& name_scope, + Scope* scope) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass_tester.cc b/paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass_tester.cc new file mode 100644 index 00000000000000..72635d1c958555 --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass_tester.cc @@ -0,0 +1,175 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/ir/fuse_multi_transformer_layer_pass.h" +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" + +#define DEF_INPUT_DATA \ + Layers layers; \ + int num_layers = 3; \ + auto* x = layers.data("x", {1, 128, 1024}); \ + auto* src_mask = layers.data("src_mask", {1, 16, 128, 128}); \ + auto* ln_scale = layers.data("ln_scale", {1024}, true); \ + auto* ln_bias = layers.data("ln_bias", {1024}, true); \ + auto* ffn_ln_scale = layers.data("ffn_ln_scale", {1024}, true); \ + auto* ffn_ln_bias = layers.data("ffn_ln_bias", {1024}, true); \ + auto* qkv_w = layers.data("qkv_w", {3, 16, 64, 1024}, true); \ + auto* out_linear_w = layers.data("out_linear_w", {1024, 1024}, true); \ + auto* ffn1_w = layers.data("ffn1_w", {1024, 4096}, true); \ + auto* ffn2_w = layers.data("ffn2_w", {4096, 1024}, true); \ + auto* qkv_bias = layers.data("qkv_bias", {3072}, true); \ + auto* out_linear_bias = layers.data("out_linear_bias", {1024}, true); \ + auto* ffn1_bias = layers.data("ffn1_bias", {4096}, true); \ + auto* ffn2_bias = layers.data("ffn2_bias", {1024}, true); + +namespace paddle { +namespace framework { +namespace ir { + +void AddVarToScope(Scope* param_scope, + const std::string& name, + const DDim& dims) { + auto* tensor = param_scope->Var(name)->GetMutable(); + tensor->Resize(dims); + tensor->mutable_data(platform::CPUPlace()); +} + +Scope* CreateParamScope() { + auto param_scope = new Scope(); + AddVarToScope(param_scope, "ln_scale", {1024}); + AddVarToScope(param_scope, "ln_bias", {1024}); + AddVarToScope(param_scope, "ffn_ln_scale", {1024}); + AddVarToScope(param_scope, "ffn_ln_bias", {1024}); + + AddVarToScope(param_scope, "qkv_w", {3, 16, 64, 1024}); + AddVarToScope(param_scope, "out_linear_w", {1024, 1024}); + AddVarToScope(param_scope, "ffn1_w", {1024, 4096}); + AddVarToScope(param_scope, "ffn2_w", {4096, 1024}); + AddVarToScope(param_scope, "qkv_bias", {3072}); + AddVarToScope(param_scope, "out_linear_bias", {1024}); + AddVarToScope(param_scope, "ffn1_bias", {4096}); + AddVarToScope(param_scope, "ffn2_bias", {1024}); + + return param_scope; +} +TEST(FuseMultiTransformerLayerPass, encoder_fp) { + DEF_INPUT_DATA + + // Layers + for (int i = 0; i < num_layers; ++i) { + auto* cache_kv = layers.fill_constant_batch_size_like( + x, + static_cast(proto::VarType::FP32), + 0, + 1, + {2, -1, 16, 1024, 64}, + 0); + auto* out = layers.fused_multi_transformer(x, + cache_kv, + src_mask, + qkv_w, + qkv_bias, + out_linear_w, + out_linear_bias, + ffn1_w, + ffn1_bias, + ffn2_w, + ffn2_bias, + ln_scale, + ln_bias, + ffn_ln_scale, + ffn_ln_bias, + 0.1, + 1e-12); + + x = out; + } + std::unique_ptr graph(new ir::Graph(layers.main_program())); + graph->Set("__param_scope__", CreateParamScope()); + graph->Set(kFusedMultiTransformerEncoderFusionCount, new int(num_layers)); + + auto pass = PassRegistry::Instance().Get("fuse_multi_transformer_layer_pass"); + if (pass.get() == nullptr) + LOG(INFO) << "get fuse_multi_transformer_layer_pass failed"; + + graph.reset(pass->Apply(graph.release())); + int num_nodes_after = GetNumOpNodes(graph, "fused_multi_transformer"); + + PADDLE_ENFORCE_EQ( + num_nodes_after, + 1, + platform::errors::InvalidArgument( + "After the fuse_multi_transformer_layer_pass, " + "The node num in graph should be 1, but the result is %d", + num_nodes_after)); +} +TEST(FuseMultiTransformerLayerPass, decoder_fp) { + DEF_INPUT_DATA + + x = layers.data("x", {1, 1, 1024}); + auto* cache_kv = 
layers.data("cache_kv", {2, 1, 16, 1024, 64}, true); + src_mask = layers.data("src_mask", {1, 16, 1, 129}); + + // Layers + for (int i = 0; i < num_layers; ++i) { + auto* shape_out = layers.shape(src_mask); + auto* time_stamp = layers.slice(shape_out, {0}, {3}, {4}); + auto* out = layers.fused_multi_transformer(x, + cache_kv, + src_mask, + qkv_w, + qkv_bias, + out_linear_w, + out_linear_bias, + ffn1_w, + ffn1_bias, + ffn2_w, + ffn2_bias, + ln_scale, + ln_bias, + ffn_ln_scale, + ffn_ln_bias, + 0.1, + 1e-12, + time_stamp); + + x = out; + } + std::unique_ptr graph(new ir::Graph(layers.main_program())); + auto param_scope = CreateParamScope(); + AddVarToScope(param_scope, "cache_kv", {2, 1, 16, 1024, 64}); + graph->Set("__param_scope__", param_scope); + + graph->Set(kFusedMultiTransformerDecoderFusionCount, new int(num_layers)); + + auto pass = PassRegistry::Instance().Get("fuse_multi_transformer_layer_pass"); + if (pass.get() == nullptr) + LOG(INFO) << "get fuse_multi_transformer_layer_pass failed"; + + graph.reset(pass->Apply(graph.release())); + int num_nodes_after = GetNumOpNodes(graph, "fused_multi_transformer"); + + PADDLE_ENFORCE_EQ( + num_nodes_after, + 1, + platform::errors::InvalidArgument( + "After the fuse_multi_transformer_layer_pass, " + "The node num in graph should be 1, but the result is %d", + num_nodes_after)); +} +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(fuse_multi_transformer_layer_pass); diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc new file mode 100644 index 00000000000000..42c699195beb91 --- /dev/null +++ b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.cc @@ -0,0 +1,3041 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
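+//
+// This file adds fused_multi_transformer_decoder_pass. The three patterns
+// defined below (separate-QKV, fused-QKV, and the multi-devices fused-QKV
+// variant with c_identity / c_allreduce_sum) each match one decoder layer:
+// pre-LayerNorm -> QKV projection -> concat with cached K/V -> attention
+// -> out linear -> residual add -> pre-LayerNorm -> FFN (fc -> gelu -> fc)
+// -> residual add, and BuildFusion replaces every match with a single
+// fused_multi_transformer op.
+//
+// A minimal sketch of exercising the pass from C++, mirroring the tester
+// earlier in this diff (the registered pass name is assumed from the file
+// name, not confirmed here):
+//
+//   auto pass =
+//       PassRegistry::Instance().Get("fused_multi_transformer_decoder_pass");
+//   graph.reset(pass->Apply(graph.release()));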
+ +#include "paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.h" + +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +PDNode* FusedMultiTransformerDecoderPattern::operator()() { + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_op_input("layer_norm", "X"); + + // pre-LayerNorm + auto* layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto* layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Mean"); + auto* layer_norm_variance_var = + pattern->NewNode(layer_norm_variance_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Variance"); + auto* layer_norm_out_var = pattern->NewNode(layer_norm_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("matmul_v2", "X") + ->assert_more([](Node* x) { + if (x->outputs.size() == 3) { + return true; + } else { + return false; + } + }); + + layer_norm->LinksFrom({input0, layer_norm_bias_var, layer_norm_scale_var}) + .LinksTo( + {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var}); + + // Q path Nodes + auto* matmul0 = pattern->NewNode(matmul0_repr())->assert_is_op("matmul_v2"); + auto* matmul0_w_var = pattern->NewNode(matmul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul0_out_var = pattern->NewNode(matmul0_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd0 = + pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + auto* eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + auto* reshape2_0_out_var = pattern->NewNode(reshape2_0_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("transpose2"); + + auto* transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2") + ->AsIntermediate() + ->assert_is_op_input("matmul", "X"); + + // Q path Links + matmul0->LinksFrom({layer_norm_out_var, matmul0_w_var}) + .LinksTo({matmul0_out_var}); + eltadd0->LinksFrom({matmul0_out_var, eltadd0_b_var}) + .LinksTo({eltadd0_out_var}); + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + + // K path Nodes + auto* matmul1 = pattern->NewNode(matmul1_repr())->assert_is_op("matmul_v2"); + auto* matmul1_w_var = 
pattern->NewNode(matmul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul1_out_var = pattern->NewNode(matmul1_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd1 = + pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); + auto* eltadd1_b_var = pattern->NewNode(eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + auto* eltadd1_out_var = pattern->NewNode(eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("reshape2"); + + auto* reshape2_1 = + pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2"); + auto* reshape2_1_out_var = pattern->NewNode(reshape2_1_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("transpose2"); + + auto* transpose2_1 = + pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); + auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) + ->assert_is_op_output("transpose2") + ->AsIntermediate(); + auto* concat_0_in_var = pattern->NewNode(concat_0_in_repr())->AsInput(); + auto* concat_0 = pattern->NewNode(concat_0_repr())->assert_is_op("concat"); + auto* concat_0_out_var = pattern->NewNode(concat_0_out_repr()) + ->assert_is_op_output("concat") + ->AsIntermediate() + ->assert_is_op_input("matmul") + ->assert_is_op_input("assign"); + auto assign_0 = pattern->NewNode(assign_0_repr())->assert_is_op("assign"); + + // K path Links + matmul1->LinksFrom({layer_norm_out_var, matmul1_w_var}) + .LinksTo({matmul1_out_var}); + eltadd1->LinksFrom({matmul1_out_var, eltadd1_b_var}) + .LinksTo({eltadd1_out_var}); + reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var}); + transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var}); + concat_0->LinksFrom({transpose2_1_out_var, concat_0_in_var}) + .LinksTo({concat_0_out_var}); + assign_0->LinksFrom({concat_0_out_var}); + + // V path Nodes + auto* matmul2 = pattern->NewNode(matmul2_repr())->assert_is_op("matmul_v2"); + auto* matmul2_w_var = pattern->NewNode(matmul2_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul2_out_var = pattern->NewNode(matmul2_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd2 = + pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); + auto* eltadd2_b_var = pattern->NewNode(eltadd2_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd2_out_var = pattern->NewNode(eltadd2_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("reshape2"); + + auto* reshape2_2 = + pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2"); + auto* reshape2_2_out_var = pattern->NewNode(reshape2_2_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("transpose2"); + + auto* transpose2_2 = + pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); + auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) + ->assert_is_op_output("transpose2"); + auto* concat_1_in_var = pattern->NewNode(concat_1_in_repr()) + ->AsInput() + ->assert_is_op_input("concat"); + auto* concat_1 = pattern->NewNode(concat_1_repr())->assert_is_op("concat"); + auto* concat_1_out_var = pattern->NewNode(concat_1_out_repr()) + ->assert_is_op_output("concat") + ->assert_is_op_input("matmul_v2") + 
->assert_is_op_input("assign"); + auto assign_1 = pattern->NewNode(assign_1_repr())->assert_is_op("assign"); + + // V path Links + matmul2->LinksFrom({layer_norm_out_var, matmul2_w_var}) + .LinksTo({matmul2_out_var}); + eltadd2->LinksFrom({matmul2_out_var, eltadd2_b_var}) + .LinksTo({eltadd2_out_var}); + reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var}); + transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var}); + concat_1->LinksFrom({transpose2_2_out_var, concat_1_in_var}) + .LinksTo({concat_1_out_var}); + assign_1->LinksFrom({concat_1_out_var}); + + // QK path Nodes + auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("softmax"); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr()) + ->assert_is_op_output("softmax") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2", "X"); + + // QK path Linsk + matmul_qk->LinksFrom({transpose2_0_out_var, concat_0_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + + // QKV path Nodes + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_op("matmul_v2"); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_op_output("matmul_v2"); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = + pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2"); // -> out_linear + + auto* matmul_linear = + pattern->NewNode(matmul_linear_repr())->assert_is_op("matmul_v2"); + auto* matmul_linear_w_var = pattern->NewNode(matmul_linear_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul_linear_out_var = pattern->NewNode(matmul_linear_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd_linear = + pattern->NewNode(eltadd_linear_repr())->assert_is_op("elementwise_add"); + auto* eltadd_linear_b_var = pattern->NewNode(eltadd_linear_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_linear_out_var = pattern->NewNode(eltadd_linear_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd_out = + 
pattern->NewNode(eltadd_out_repr())->assert_is_op("elementwise_add"); + auto* attention_output = pattern->NewNode(attention_output_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + // QKV path Links + matmul_qkv->LinksFrom({softmax_qk_out_var, concat_1_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + matmul_linear->LinksFrom({reshape2_qkv_out_var, matmul_linear_w_var}) + .LinksTo({matmul_linear_out_var}); + eltadd_linear->LinksFrom({matmul_linear_out_var, eltadd_linear_b_var}) + .LinksTo({eltadd_linear_out_var}); + eltadd_out->LinksFrom({input0, eltadd_linear_out_var}) + .LinksTo({attention_output}); + + // Feed Forward LayerNorm Nodes + auto* ffn_layer_norm = + pattern->NewNode(ffn_layer_norm_repr())->assert_is_op("layer_norm"); + auto* ffn_layer_norm_scale_var = + pattern->NewNode(ffn_layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* ffn_layer_norm_bias_var = + pattern->NewNode(ffn_layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* ffn_layer_norm_mean_var = + pattern->NewNode(ffn_layer_norm_mean_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Mean"); + auto* ffn_layer_norm_variance_var = + pattern->NewNode(ffn_layer_norm_variance_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Variance"); + auto* ffn_layer_norm_out_var = pattern->NewNode(ffn_layer_norm_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("matmul_v2", "X"); + + ffn_layer_norm + ->LinksFrom( + {attention_output, ffn_layer_norm_bias_var, ffn_layer_norm_scale_var}) + .LinksTo({ffn_layer_norm_out_var, + ffn_layer_norm_mean_var, + ffn_layer_norm_variance_var}); + + // Feed Forward fc1 -> gelu -> fc2 + auto* ffn_matmul0 = + pattern->NewNode(ffn_matmul0_repr())->assert_is_op("matmul_v2"); + auto* ffn_matmul0_w_var = pattern->NewNode(ffn_matmul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* ffn_matmul0_out_var = pattern->NewNode(ffn_matmul0_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd0 = + pattern->NewNode(ffn_eltadd0_repr())->assert_is_op("elementwise_add"); + auto* ffn_eltadd0_b_var = pattern->NewNode(ffn_eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* ffn_eltadd0_out_var = pattern->NewNode(ffn_eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("gelu"); + + auto* ffn_gelu = pattern->NewNode(ffn_gelu_repr())->assert_is_op("gelu"); + auto* ffn_gelu_out_var = pattern->NewNode(ffn_gelu_out_repr()) + ->assert_is_op_output("gelu") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2"); + + auto* ffn_matmul1 = + pattern->NewNode(ffn_matmul1_repr())->assert_is_op("matmul_v2"); + auto* ffn_matmul1_w_var = pattern->NewNode(ffn_matmul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* ffn_matmul1_out_var = pattern->NewNode(ffn_matmul1_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd1 = + pattern->NewNode(ffn_eltadd1_repr())->assert_is_op("elementwise_add"); + auto* ffn_eltadd1_b_var = 
pattern->NewNode(ffn_eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* ffn_eltadd1_out_var = pattern->NewNode(ffn_eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd_out = + pattern->NewNode(ffn_eltadd_out_repr())->assert_is_op("elementwise_add"); + auto* ffn_output = pattern->NewNode(ffn_output_repr()) + ->assert_is_op_output("elementwise_add") + ->AsOutput(); + + ffn_matmul0->LinksFrom({ffn_layer_norm_out_var, ffn_matmul0_w_var}) + .LinksTo({ffn_matmul0_out_var}); + ffn_eltadd0->LinksFrom({ffn_matmul0_out_var, ffn_eltadd0_b_var}) + .LinksTo({ffn_eltadd0_out_var}); + ffn_gelu->LinksFrom({ffn_eltadd0_out_var}).LinksTo({ffn_gelu_out_var}); + ffn_matmul1->LinksFrom({ffn_gelu_out_var, ffn_matmul1_w_var}) + .LinksTo({ffn_matmul1_out_var}); + ffn_eltadd1->LinksFrom({ffn_matmul1_out_var, ffn_eltadd1_b_var}) + .LinksTo({ffn_eltadd1_out_var}); + + ffn_eltadd_out->LinksFrom({attention_output, ffn_eltadd1_out_var}) + .LinksTo({ffn_output}); + + return ffn_output; +} + +PDNode* FusedMultiTransformerDecoderFuseQKVPattern::operator()() { + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_op_input("layer_norm", "X"); + + // pre-LayerNorm + auto* layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto* layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Mean"); + auto* layer_norm_variance_var = + pattern->NewNode(layer_norm_variance_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Variance"); + auto* layer_norm_out_var = pattern->NewNode(layer_norm_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("matmul_v2", "X"); + + layer_norm->LinksFrom({input0, layer_norm_bias_var, layer_norm_scale_var}) + .LinksTo( + {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var}); + + // QKV fused path Nodes + auto* matmul0 = pattern->NewNode(matmul0_repr())->assert_is_op("matmul_v2"); + auto* matmul0_w_var = pattern->NewNode(matmul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul0_out_var = pattern->NewNode(matmul0_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd0 = + pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + auto* eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + auto* reshape2_0_out_var = pattern->NewNode(reshape2_0_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("transpose2"); + + auto* transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2") + 
->AsIntermediate() + ->assert_is_op_input("split", "X"); + + auto* split0 = pattern->NewNode(split0_repr())->assert_is_op("split"); + auto* split0_q_out_var = pattern->NewNode(split0_q_out_repr()) + ->assert_is_op_output("split") + ->AsIntermediate() + ->assert_is_op_input("matmul", "X"); + auto* split0_k_out_var = pattern->NewNode(split0_k_out_repr()) + ->assert_is_op_output("split") + ->AsIntermediate() + ->assert_is_op_input("concat"); + auto* split0_v_out_var = pattern->NewNode(split0_v_out_repr()) + ->assert_is_op_output("split") + ->AsIntermediate() + ->assert_is_op_input("concat"); + + auto* concat_k_in_var = pattern + ->NewNode(concat_k_in_repr()) + // ->AsInput() + ->assert_is_op_input("concat"); + auto* concat_k = pattern->NewNode(concat_k_repr())->assert_is_op("concat"); + auto* concat_k_out_var = pattern->NewNode(concat_k_out_repr()) + ->assert_is_op_output("concat") + ->AsIntermediate() + ->assert_is_op_input("matmul") + ->assert_is_op_input("assign"); + auto* concat_v_in_var = pattern + ->NewNode(concat_v_in_repr()) + // ->AsInput() + ->assert_is_op_input("concat"); + auto* concat_v = pattern->NewNode(concat_v_repr())->assert_is_op("concat"); + auto* concat_v_out_var = pattern->NewNode(concat_v_out_repr()) + ->assert_is_op_output("concat") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2") + ->assert_is_op_input("assign"); + + auto* assign_k = pattern->NewNode(assign_k_repr())->assert_is_op("assign"); + auto* assign_v = pattern->NewNode(assign_v_repr())->assert_is_op("assign"); + + // QKV fused path Links + matmul0->LinksFrom({layer_norm_out_var, matmul0_w_var}) + .LinksTo({matmul0_out_var}); + eltadd0->LinksFrom({matmul0_out_var, eltadd0_b_var}) + .LinksTo({eltadd0_out_var}); + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + split0->LinksFrom({transpose2_0_out_var}) + .LinksTo({split0_q_out_var, split0_k_out_var, split0_v_out_var}); + concat_k->LinksFrom({concat_k_in_var, split0_k_out_var}) + .LinksTo({concat_k_out_var}); + concat_v->LinksFrom({concat_v_in_var, split0_v_out_var}) + .LinksTo({concat_v_out_var}); + assign_k->LinksFrom({concat_k_out_var}); + assign_v->LinksFrom({concat_v_out_var}); + + // QK path Nodes + auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("softmax"); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr()) + ->assert_is_op_output("softmax") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2", "X"); + + // QK path Linsk + matmul_qk->LinksFrom({split0_q_out_var, concat_k_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + + // QKV path Nodes + auto* matmul_qkv = + 
pattern->NewNode(matmul_qkv_repr())->assert_is_op("matmul_v2"); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_op_output("matmul_v2"); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = + pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2"); // -> out_linear + + auto* matmul_linear = + pattern->NewNode(matmul_linear_repr())->assert_is_op("matmul_v2"); + auto* matmul_linear_w_var = pattern->NewNode(matmul_linear_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul_linear_out_var = pattern->NewNode(matmul_linear_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd_linear = + pattern->NewNode(eltadd_linear_repr())->assert_is_op("elementwise_add"); + auto* eltadd_linear_b_var = pattern->NewNode(eltadd_linear_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_linear_out_var = pattern->NewNode(eltadd_linear_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd_out = + pattern->NewNode(eltadd_out_repr())->assert_is_op("elementwise_add"); + auto* attention_output = pattern->NewNode(attention_output_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + // QKV path Links + matmul_qkv->LinksFrom({softmax_qk_out_var, concat_v_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + matmul_linear->LinksFrom({reshape2_qkv_out_var, matmul_linear_w_var}) + .LinksTo({matmul_linear_out_var}); + eltadd_linear->LinksFrom({matmul_linear_out_var, eltadd_linear_b_var}) + .LinksTo({eltadd_linear_out_var}); + eltadd_out->LinksFrom({input0, eltadd_linear_out_var}) + .LinksTo({attention_output}); + + // Feed Forward LayerNorm Nodes + auto* ffn_layer_norm = + pattern->NewNode(ffn_layer_norm_repr())->assert_is_op("layer_norm"); + auto* ffn_layer_norm_scale_var = + pattern->NewNode(ffn_layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* ffn_layer_norm_bias_var = + pattern->NewNode(ffn_layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* ffn_layer_norm_mean_var = + pattern->NewNode(ffn_layer_norm_mean_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Mean"); + auto* ffn_layer_norm_variance_var = + pattern->NewNode(ffn_layer_norm_variance_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Variance"); + auto* ffn_layer_norm_out_var = pattern->NewNode(ffn_layer_norm_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("matmul_v2", "X"); + + ffn_layer_norm + ->LinksFrom( + {attention_output, ffn_layer_norm_bias_var, ffn_layer_norm_scale_var}) + 
.LinksTo({ffn_layer_norm_out_var, + ffn_layer_norm_mean_var, + ffn_layer_norm_variance_var}); + + // Feed Forward fc1 -> gelu -> fc2 + auto* ffn_matmul0 = + pattern->NewNode(ffn_matmul0_repr())->assert_is_op("matmul_v2"); + auto* ffn_matmul0_w_var = pattern->NewNode(ffn_matmul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* ffn_matmul0_out_var = pattern->NewNode(ffn_matmul0_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd0 = + pattern->NewNode(ffn_eltadd0_repr())->assert_is_op("elementwise_add"); + auto* ffn_eltadd0_b_var = pattern->NewNode(ffn_eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* ffn_eltadd0_out_var = pattern->NewNode(ffn_eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("gelu"); + + auto* ffn_gelu = pattern->NewNode(ffn_gelu_repr())->assert_is_op("gelu"); + auto* ffn_gelu_out_var = pattern->NewNode(ffn_gelu_out_repr()) + ->assert_is_op_output("gelu") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2"); + + auto* ffn_matmul1 = + pattern->NewNode(ffn_matmul1_repr())->assert_is_op("matmul_v2"); + auto* ffn_matmul1_w_var = pattern->NewNode(ffn_matmul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* ffn_matmul1_out_var = pattern->NewNode(ffn_matmul1_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd1 = + pattern->NewNode(ffn_eltadd1_repr())->assert_is_op("elementwise_add"); + auto* ffn_eltadd1_b_var = pattern->NewNode(ffn_eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* ffn_eltadd1_out_var = pattern->NewNode(ffn_eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd_out = + pattern->NewNode(ffn_eltadd_out_repr())->assert_is_op("elementwise_add"); + auto* ffn_output = pattern->NewNode(ffn_output_repr()) + ->assert_is_op_output("elementwise_add") + ->AsOutput(); + + ffn_matmul0->LinksFrom({ffn_layer_norm_out_var, ffn_matmul0_w_var}) + .LinksTo({ffn_matmul0_out_var}); + ffn_eltadd0->LinksFrom({ffn_matmul0_out_var, ffn_eltadd0_b_var}) + .LinksTo({ffn_eltadd0_out_var}); + ffn_gelu->LinksFrom({ffn_eltadd0_out_var}).LinksTo({ffn_gelu_out_var}); + ffn_matmul1->LinksFrom({ffn_gelu_out_var, ffn_matmul1_w_var}) + .LinksTo({ffn_matmul1_out_var}); + ffn_eltadd1->LinksFrom({ffn_matmul1_out_var, ffn_eltadd1_b_var}) + .LinksTo({ffn_eltadd1_out_var}); + + ffn_eltadd_out->LinksFrom({attention_output, ffn_eltadd1_out_var}) + .LinksTo({ffn_output}); + + return ffn_output; +} + +PDNode* MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern::operator()() { + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_op_input("layer_norm", "X"); + + // pre-LayerNorm + auto* layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto* layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Mean"); + auto* layer_norm_variance_var = + 
pattern->NewNode(layer_norm_variance_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Variance"); + auto* layer_norm_out_var = pattern->NewNode(layer_norm_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("c_identity", "X"); + + layer_norm->LinksFrom({input0, layer_norm_bias_var, layer_norm_scale_var}) + .LinksTo( + {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var}); + + // communication c_identity + auto* c_identity = + pattern->NewNode(c_identity_repr())->assert_is_op("c_identity"); + auto* c_identity_out_var = pattern->NewNode(c_identity_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("c_identity", "Out") + ->assert_is_op_input("matmul_v2", "X"); + c_identity->LinksFrom({layer_norm_out_var}).LinksTo({c_identity_out_var}); + + // QKV fused path Nodes + auto* matmul0 = pattern->NewNode(matmul0_repr())->assert_is_op("matmul_v2"); + auto* matmul0_w_var = pattern->NewNode(matmul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul0_out_var = pattern->NewNode(matmul0_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd0 = + pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + auto* eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + auto* reshape2_0_out_var = pattern->NewNode(reshape2_0_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("transpose2"); + + auto* transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2") + ->AsIntermediate() + ->assert_is_op_input("split", "X"); + + auto* split0 = pattern->NewNode(split0_repr())->assert_is_op("split"); + auto* split0_q_out_var = pattern->NewNode(split0_q_out_repr()) + ->assert_is_op_output("split") + ->AsIntermediate() + ->assert_is_op_input("matmul", "X"); + auto* split0_k_out_var = pattern->NewNode(split0_k_out_repr()) + ->assert_is_op_output("split") + ->AsIntermediate() + ->assert_is_op_input("concat"); + auto* split0_v_out_var = pattern->NewNode(split0_v_out_repr()) + ->assert_is_op_output("split") + ->AsIntermediate() + ->assert_is_op_input("concat"); + + auto* concat_k_in_var = pattern + ->NewNode(concat_k_in_repr()) + // ->AsInput() + ->assert_is_op_input("concat"); + auto* concat_k = pattern->NewNode(concat_k_repr())->assert_is_op("concat"); + auto* concat_k_out_var = pattern->NewNode(concat_k_out_repr()) + ->assert_is_op_output("concat") + ->AsIntermediate() + ->assert_is_op_input("matmul") + ->assert_is_op_input("assign"); + auto* concat_v_in_var = pattern + ->NewNode(concat_v_in_repr()) + // ->AsInput() + ->assert_is_op_input("concat"); + auto* concat_v = pattern->NewNode(concat_v_repr())->assert_is_op("concat"); + auto* concat_v_out_var = pattern->NewNode(concat_v_out_repr()) + ->assert_is_op_output("concat") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2") + ->assert_is_op_input("assign"); + + auto* assign_k = pattern->NewNode(assign_k_repr())->assert_is_op("assign"); + auto* assign_v = 
pattern->NewNode(assign_v_repr())->assert_is_op("assign"); + + // QKV fused path Links + matmul0->LinksFrom({c_identity_out_var, matmul0_w_var}) + .LinksTo({matmul0_out_var}); + eltadd0->LinksFrom({matmul0_out_var, eltadd0_b_var}) + .LinksTo({eltadd0_out_var}); + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + split0->LinksFrom({transpose2_0_out_var}) + .LinksTo({split0_q_out_var, split0_k_out_var, split0_v_out_var}); + concat_k->LinksFrom({concat_k_in_var, split0_k_out_var}) + .LinksTo({concat_k_out_var}); + concat_v->LinksFrom({concat_v_in_var, split0_v_out_var}) + .LinksTo({concat_v_out_var}); + assign_k->LinksFrom({concat_k_out_var}); + assign_v->LinksFrom({concat_v_out_var}); + + // QK path Nodes + auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("softmax"); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr()) + ->assert_is_op_output("softmax") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2", "X"); + + // QK path Linsk + matmul_qk->LinksFrom({split0_q_out_var, concat_k_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + + // QKV path Nodes + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_op("matmul_v2"); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_op_output("matmul_v2"); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = + pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2"); // -> out_linear + + auto* matmul_linear = + pattern->NewNode(matmul_linear_repr())->assert_is_op("matmul_v2"); + auto* matmul_linear_w_var = pattern->NewNode(matmul_linear_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul_linear_out_var = pattern->NewNode(matmul_linear_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("c_allreduce_sum"); + + // communication c_allreduce_sum + auto* c_allreduce_sum = + pattern->NewNode(c_allreduce_sum_repr())->assert_is_op("c_allreduce_sum"); + auto* c_allreduce_sum_out_var = pattern->NewNode(c_allreduce_sum_out_repr()) + ->assert_is_op_output("c_allreduce_sum") + ->AsIntermediate() + 
->assert_is_op_input("elementwise_add"); + + auto* eltadd_linear = + pattern->NewNode(eltadd_linear_repr())->assert_is_op("elementwise_add"); + auto* eltadd_linear_b_var = pattern->NewNode(eltadd_linear_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_linear_out_var = pattern->NewNode(eltadd_linear_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd_out = + pattern->NewNode(eltadd_out_repr())->assert_is_op("elementwise_add"); + auto* attention_output = pattern->NewNode(attention_output_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + // QKV path Links + matmul_qkv->LinksFrom({softmax_qk_out_var, concat_v_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + matmul_linear->LinksFrom({reshape2_qkv_out_var, matmul_linear_w_var}) + .LinksTo({matmul_linear_out_var}); + c_allreduce_sum->LinksFrom({matmul_linear_out_var}) + .LinksTo({c_allreduce_sum_out_var}); + eltadd_linear->LinksFrom({c_allreduce_sum_out_var, eltadd_linear_b_var}) + .LinksTo({eltadd_linear_out_var}); + eltadd_out->LinksFrom({input0, eltadd_linear_out_var}) + .LinksTo({attention_output}); + + // Feed Forward LayerNorm Nodes + auto* ffn_layer_norm = + pattern->NewNode(ffn_layer_norm_repr())->assert_is_op("layer_norm"); + auto* ffn_layer_norm_scale_var = + pattern->NewNode(ffn_layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* ffn_layer_norm_bias_var = + pattern->NewNode(ffn_layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* ffn_layer_norm_mean_var = + pattern->NewNode(ffn_layer_norm_mean_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Mean"); + auto* ffn_layer_norm_variance_var = + pattern->NewNode(ffn_layer_norm_variance_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Variance"); + auto* ffn_layer_norm_out_var = pattern->NewNode(ffn_layer_norm_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("c_identity", "X"); + + ffn_layer_norm + ->LinksFrom( + {attention_output, ffn_layer_norm_bias_var, ffn_layer_norm_scale_var}) + .LinksTo({ffn_layer_norm_out_var, + ffn_layer_norm_mean_var, + ffn_layer_norm_variance_var}); + + // communication c_identity + auto* ffn_c_identity = + pattern->NewNode(ffn_c_identity_repr())->assert_is_op("c_identity"); + auto* ffn_c_identity_out_var = pattern->NewNode(ffn_c_identity_out_repr()) + ->assert_is_op_output("c_identity", "Out") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2", "X"); + ffn_c_identity->LinksFrom({ffn_layer_norm_out_var}) + .LinksTo({ffn_c_identity_out_var}); + + // Feed Forward fc1 -> gelu -> fc2 + auto* ffn_matmul0 = + pattern->NewNode(ffn_matmul0_repr())->assert_is_op("matmul_v2"); + auto* ffn_matmul0_w_var = pattern->NewNode(ffn_matmul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* ffn_matmul0_out_var = pattern->NewNode(ffn_matmul0_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd0 = + pattern->NewNode(ffn_eltadd0_repr())->assert_is_op("elementwise_add"); + auto* ffn_eltadd0_b_var = pattern->NewNode(ffn_eltadd0_b_repr()) 
+ ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* ffn_eltadd0_out_var = pattern->NewNode(ffn_eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("gelu"); + + auto* ffn_gelu = pattern->NewNode(ffn_gelu_repr())->assert_is_op("gelu"); + auto* ffn_gelu_out_var = pattern->NewNode(ffn_gelu_out_repr()) + ->assert_is_op_output("gelu") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2"); + + auto* ffn_matmul1 = + pattern->NewNode(ffn_matmul1_repr())->assert_is_op("matmul_v2"); + auto* ffn_matmul1_w_var = pattern->NewNode(ffn_matmul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* ffn_matmul1_out_var = pattern->NewNode(ffn_matmul1_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("c_allreduce_sum"); + + // communication c_allreduce_sum + auto* ffn_c_allreduce_sum = pattern->NewNode(ffn_c_allreduce_sum_repr()) + ->assert_is_op("c_allreduce_sum"); + auto* ffn_c_allreduce_sum_out_var = + pattern->NewNode(ffn_c_allreduce_sum_out_repr()) + ->assert_is_op_output("c_allreduce_sum") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd1 = + pattern->NewNode(ffn_eltadd1_repr())->assert_is_op("elementwise_add"); + auto* ffn_eltadd1_b_var = pattern->NewNode(ffn_eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* ffn_eltadd1_out_var = pattern->NewNode(ffn_eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd_out = + pattern->NewNode(ffn_eltadd_out_repr())->assert_is_op("elementwise_add"); + auto* ffn_output = pattern->NewNode(ffn_output_repr()) + ->assert_is_op_output("elementwise_add") + ->AsOutput(); + + ffn_matmul0->LinksFrom({ffn_c_identity_out_var, ffn_matmul0_w_var}) + .LinksTo({ffn_matmul0_out_var}); + ffn_eltadd0->LinksFrom({ffn_matmul0_out_var, ffn_eltadd0_b_var}) + .LinksTo({ffn_eltadd0_out_var}); + ffn_gelu->LinksFrom({ffn_eltadd0_out_var}).LinksTo({ffn_gelu_out_var}); + ffn_matmul1->LinksFrom({ffn_gelu_out_var, ffn_matmul1_w_var}) + .LinksTo({ffn_matmul1_out_var}); + ffn_c_allreduce_sum->LinksFrom({ffn_matmul1_out_var}) + .LinksTo({ffn_c_allreduce_sum_out_var}); + ffn_eltadd1->LinksFrom({ffn_c_allreduce_sum_out_var, ffn_eltadd1_b_var}) + .LinksTo({ffn_eltadd1_out_var}); + + ffn_eltadd_out->LinksFrom({attention_output, ffn_eltadd1_out_var}) + .LinksTo({ffn_output}); + + return ffn_output; +} + +} // namespace patterns + +int FusedMultiTransformerDecoderPass::BuildFusion(Graph* graph, + const std::string& name_scope, + Scope* scope) const { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. 
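+  // The fusion below proceeds in three steps:
+  //   1. instantiate FusedMultiTransformerDecoderPattern on the detector's
+  //      mutable pattern;
+  //   2. fuse_creater builds one fused_multi_transformer op per match,
+  //      wiring QKV / out-linear / FFN weights and biases, reusing the
+  //      per-layer cache_kv variable created for the encoder, and adding a
+  //      shape -> slice chain on SrcMask to produce the TimeStep input;
+  //   3. the handler checks op compat, calls fuse_creater, removes the
+  //      matched nodes, and increments fusion_count, which ApplyImpl later
+  //      uses to set the pass statistics and graph attributes.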
+ patterns::FusedMultiTransformerDecoderPattern fused_multi_transformer_pattern( + pattern, name_scope); + fused_multi_transformer_pattern(); + + // Create New OpDesc + auto fuse_creater = [&](Node* input0, + Node* layer_norm, + Node* layer_norm_scale, + Node* layer_norm_bias, + Node* layer_norm_mean, + Node* layer_norm_variance, + Node* matmul0_w, + Node* matmul1_w, + Node* matmul2_w, + Node* eltadd0_b, + Node* eltadd1_b, + Node* eltadd2_b, + Node* transpose2_1_out, + Node* transpose2_2_out, + Node* eltadd_qk_b, + Node* reshape2_0, + Node* matmul_linear_w, + Node* eltadd_linear_b, + Node* ffn_layer_norm, + Node* ffn_layer_norm_scale, + Node* ffn_layer_norm_bias, + Node* ffn_layer_norm_mean, + Node* ffn_layer_norm_variance, + Node* ffn_matmul0_w, + Node* ffn_matmul1_w, + Node* ffn_eltadd0_b, + Node* ffn_eltadd1_b, + Node* ffn_output) { + // Calc index of transformer layer by LayerNorm Scale name + // This calculation assumes: + // 1. no LayerNorm before all transformer layer + // 2. each transformer layer contains 2 LayerNorm layer + auto ln_scale_name = layer_norm_scale->Name(); + auto ln_name = ln_scale_name.substr(0, ln_scale_name.find('.')); + auto ln_idx_str = ln_name.substr(ln_name.rfind('_') + 1); + int layer_idx = atoi(ln_idx_str.c_str()) / 2; + + // create fused_multi_transformer + OpDesc fused_multi_transformer_op_desc(layer_norm->Op()->Block()); + fused_multi_transformer_op_desc.SetType("fused_multi_transformer"); + + // 1. Input setting + fused_multi_transformer_op_desc.SetInput("X", {input0->Name()}); + + // pre-LayerNorm input + fused_multi_transformer_op_desc.SetInput("LnScale", + {layer_norm_scale->Name()}); + fused_multi_transformer_op_desc.SetInput("LnBias", + {layer_norm_bias->Name()}); + + // QKV computation input + fused_multi_transformer_op_desc.SetInput("QKVW", {matmul0_w->Name()}); + fused_multi_transformer_op_desc.SetInput("QKVBias", {eltadd0_b->Name()}); + fused_multi_transformer_op_desc.SetInput("SrcMask", {eltadd_qk_b->Name()}); + + // Cache KV use cache_kv in encoder + auto cache_kv_name = "cache_kv" + std::to_string(layer_idx); + fused_multi_transformer_op_desc.SetInput("CacheKV", {cache_kv_name}); + + VarDesc shape_out_desc("shape_out." + std::to_string(layer_idx)); + shape_out_desc.SetDataType(proto::VarType::INT32); + shape_out_desc.SetPersistable(false); + auto* shape_out = graph->CreateVarNode(&shape_out_desc); + + OpDesc shape_op_desc(layer_norm->Op()->Block()); + shape_op_desc.SetType("shape"); + shape_op_desc.SetInput("Input", {eltadd_qk_b->Name()}); + shape_op_desc.SetOutput("Out", {shape_out->Name()}); + auto* shape_op = graph->CreateOpNode(&shape_op_desc); + + VarDesc slice_out_desc("slice_out." 
+ std::to_string(layer_idx)); + slice_out_desc.SetDataType(proto::VarType::INT32); + slice_out_desc.SetPersistable(false); + auto* slice_out = graph->CreateVarNode(&slice_out_desc); + + OpDesc slice_op_desc(layer_norm->Op()->Block()); + slice_op_desc.SetType("slice"); + slice_op_desc.SetInput("Input", {shape_out->Name()}); + slice_op_desc.SetOutput("Out", {slice_out->Name()}); + std::vector axes = {0}; + std::vector starts = {3}; + std::vector ends = {4}; + slice_op_desc.SetAttr("axes", axes); + slice_op_desc.SetAttr("starts", starts); + slice_op_desc.SetAttr("ends", ends); + auto* slice_op = graph->CreateOpNode(&slice_op_desc); + + fused_multi_transformer_op_desc.SetInput("TimeStep", {slice_out->Name()}); + + // Out Linear input + fused_multi_transformer_op_desc.SetInput("OutLinearW", + {matmul_linear_w->Name()}); + fused_multi_transformer_op_desc.SetInput("OutLinearBias", + {eltadd_linear_b->Name()}); + + // Feed Forward input + fused_multi_transformer_op_desc.SetInput("FFNLnScale", + {ffn_layer_norm_scale->Name()}); + fused_multi_transformer_op_desc.SetInput("FFNLnBias", + {ffn_layer_norm_bias->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN1Weight", + {ffn_matmul0_w->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN1Bias", + {ffn_eltadd0_b->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN2Weight", + {ffn_matmul1_w->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN2Bias", + {ffn_eltadd1_b->Name()}); + + // 2. Output setting + fused_multi_transformer_op_desc.SetOutput("Out", {ffn_output->Name()}); + fused_multi_transformer_op_desc.SetOutput("CacheKVOut", {cache_kv_name}); + + // Attribute setting + fused_multi_transformer_op_desc.SetAttr("pre_layer_norm", true); + fused_multi_transformer_op_desc.SetAttr( + "epsilon", layer_norm->Op()->GetAttr("epsilon")); + + // output dropout attribute + fused_multi_transformer_op_desc.SetAttr("is_test", true); + fused_multi_transformer_op_desc.SetAttr("dropout_rate", 0.0f); + + auto* fused_multi_transformer = + graph->CreateOpNode(&fused_multi_transformer_op_desc); + IR_NODE_LINK_TO(input0, fused_multi_transformer); + IR_NODE_LINK_TO(layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(layer_norm_bias, fused_multi_transformer); + + IR_NODE_LINK_TO(matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_qk_b, fused_multi_transformer); + + // TimeStep link + IR_NODE_LINK_TO(eltadd_qk_b, shape_op); + IR_NODE_LINK_TO(shape_op, shape_out); + IR_NODE_LINK_TO(shape_out, slice_op); + IR_NODE_LINK_TO(slice_op, slice_out); + IR_NODE_LINK_TO(slice_out, fused_multi_transformer) + + IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_bias, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul1_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd1_b, fused_multi_transformer); + + IR_NODE_LINK_TO(fused_multi_transformer, ffn_output); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "fused_multi_transformer_decoder " + "pass in op compat failed."; + return; + } + + VLOG(4) << "handle MultiTransformer decoder fuse"; + GET_IR_NODE_FROM_SUBGRAPH(input0, input0, 
fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm, layer_norm, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_scale, layer_norm_scale, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_bias, layer_norm_bias, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_mean, layer_norm_mean, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, + layer_norm_variance, + fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_out, layer_norm_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul0, matmul0, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul0_out, matmul0_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul0_w, matmul0_w, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_0, reshape2_0, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_0_out, reshape2_0_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_0, transpose2_0, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_0_out, transpose2_0_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul1, matmul1, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul1_out, matmul1_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul1_w, matmul1_w, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_1, reshape2_1, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_1_out, reshape2_1_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_1, transpose2_1, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_1_out, transpose2_1_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_0, concat_0, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_0_out, concat_0_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + assign_0, assign_0, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul2, matmul2, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul2_out, matmul2_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul2_w, matmul2_w, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_2, reshape2_2, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_2_out, reshape2_2_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_2, transpose2_2, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_2_out, transpose2_2_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_1, concat_1, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_1_out, concat_1_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + assign_1, assign_1, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + attention_output, attention_output, fused_multi_transformer_pattern) + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_layer_norm, ffn_layer_norm, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_scale, + ffn_layer_norm_scale, + fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_bias, + ffn_layer_norm_bias, + fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_mean, + 
ffn_layer_norm_mean, + fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_variance, + ffn_layer_norm_variance, + fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_out, + ffn_layer_norm_out, + fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0, ffn_matmul0, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0_out, ffn_matmul0_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0_w, ffn_matmul0_w, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0, ffn_eltadd0, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0_b, ffn_eltadd0_b, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0_out, ffn_eltadd0_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_gelu, ffn_gelu, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_gelu_out, ffn_gelu_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1, ffn_matmul1, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1_out, ffn_matmul1_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1_w, ffn_matmul1_w, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1, ffn_eltadd1, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1_b, ffn_eltadd1_b, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1_out, ffn_eltadd1_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd_out, ffn_eltadd_out, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + ffn_output, ffn_output, fused_multi_transformer_pattern) + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0, eltadd0, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0_b, eltadd0_b, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0_out, eltadd0_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd1, eltadd1, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd1_b, eltadd1_b, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd1_out, eltadd1_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd2, eltadd2, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd2_b, eltadd2_b, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd2_out, eltadd2_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qk, matmul_qk, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qk_out, matmul_qk_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk, eltadd_qk, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk_b, eltadd_qk_b, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk_out, eltadd_qk_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + softmax_qk, softmax_qk, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + softmax_qk_out, softmax_qk_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qkv, matmul_qkv, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qkv_out, matmul_qkv_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_qkv, reshape2_qkv, fused_multi_transformer_pattern); + 
GET_IR_NODE_FROM_SUBGRAPH( + reshape2_qkv_out, reshape2_qkv_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_qkv, transpose2_qkv, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, + transpose2_qkv_out, + fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_linear, matmul_linear, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + matmul_linear_w, matmul_linear_w, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + matmul_linear_out, matmul_linear_out, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_linear, eltadd_linear, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_linear_b, eltadd_linear_b, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_linear_out, eltadd_linear_out, fused_multi_transformer_pattern) + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_out, eltadd_out, fused_multi_transformer_pattern) + + fuse_creater(input0, + layer_norm, + layer_norm_scale, + layer_norm_bias, + layer_norm_mean, + layer_norm_variance, + matmul0_w, + matmul1_w, + matmul2_w, + eltadd0_b, + eltadd1_b, + eltadd2_b, + transpose2_1_out, + transpose2_2_out, + eltadd_qk_b, + reshape2_0, + matmul_linear_w, + eltadd_linear_b, + ffn_layer_norm, + ffn_layer_norm_scale, + ffn_layer_norm_bias, + ffn_layer_norm_mean, + ffn_layer_norm_variance, + ffn_matmul0_w, + ffn_matmul1_w, + ffn_eltadd0_b, + ffn_eltadd1_b, + ffn_output); + + std::unordered_set marked_nodes({layer_norm, + layer_norm_mean, + layer_norm_variance, + layer_norm_out, + matmul0, + matmul1, + matmul2, + matmul0_out, + matmul1_out, + matmul2_out, + eltadd0, + eltadd1, + eltadd2, + eltadd0_out, + eltadd1_out, + eltadd2_out, + reshape2_0, + reshape2_1, + reshape2_2, + reshape2_0_out, + reshape2_1_out, + reshape2_2_out, + transpose2_0, + transpose2_1, + transpose2_2, + transpose2_0_out, + transpose2_1_out, + transpose2_2_out, + concat_0, + concat_1, + concat_0_out, + concat_1_out, + assign_0, + assign_1, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + reshape2_qkv, + transpose2_qkv, + transpose2_qkv_out, + matmul_linear, + matmul_linear_out, + eltadd_linear, + eltadd_linear_out, + eltadd_out, + ffn_layer_norm, + ffn_layer_norm_mean, + ffn_layer_norm_variance, + ffn_layer_norm_out, + ffn_matmul0, + ffn_matmul1, + ffn_matmul0_out, + ffn_matmul1_out, + ffn_eltadd0, + ffn_eltadd1, + ffn_eltadd0_out, + ffn_eltadd1_out, + ffn_gelu, + ffn_gelu_out, + ffn_eltadd_out}); + + // Remove unneeded nodes. 
+ GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + +void FusedMultiTransformerDecoderPass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal("During the multi_transformer pass, " + "The scope should not be null.")); + + int fusion_count = BuildFusion(graph, name_scope_, scope); + if (fusion_count > 0) { + graph->Set(kFusedMultiTransformerDecoderPass, new bool(true)); + graph->Set(kFusedMultiTransformerDecoderFusionCount, new int(fusion_count)); + } + AddStatis(fusion_count); +} + +FusedMultiTransformerDecoderPass::FusedMultiTransformerDecoderPass() { + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); + + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddInput("Y") // the shape shoule be (N*H, N*H) + .IsTensor() + .End() + .AddOutput("Out") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({2, -1, 0}) + .End(); + + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("shape") // -->(B, S, H, N) <--(B, S, N*H) + .IsType>() + .End(); + + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("axis") // {0, 2, 1, 3} + .IsType>() + .End(); + + AddOpCompat(OpCompat("concat")) + .AddInput("X") // Input("X"): vector + .End() + .AddInput("AxisTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(2) + .End(); + + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumGE(0.0f) + .IsNumLE(1.0f) + .End() + .AddAttr("transpose_X") + .IsBoolEQ(false) + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); + + AddOpCompat(OpCompat("softmax")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 3}) // shape is (B, H, S, S), so axis is -1 or 3 + .End(); + + AddOpCompat(OpCompat("gelu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("approximate") + .IsType() + .End(); +} + +int FusedMultiTransformerDecoderFuseQKVPass::BuildFusion( + Graph* graph, const std::string& name_scope, Scope* scope) const { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. 
+ patterns::FusedMultiTransformerDecoderFuseQKVPattern + fused_multi_transformer_fuse_qkv_pattern(pattern, name_scope); + fused_multi_transformer_fuse_qkv_pattern(); + + // Create New OpDesc + auto fuse_creater = [&](Node* input0, + Node* layer_norm, + Node* layer_norm_scale, + Node* layer_norm_bias, + Node* layer_norm_mean, + Node* layer_norm_variance, + Node* matmul0_w, + Node* eltadd0_b, + Node* eltadd_qk_b, + Node* reshape2_0, + Node* matmul_linear_w, + Node* eltadd_linear_b, + Node* ffn_layer_norm, + Node* ffn_layer_norm_scale, + Node* ffn_layer_norm_bias, + Node* ffn_layer_norm_mean, + Node* ffn_layer_norm_variance, + Node* ffn_matmul0_w, + Node* ffn_matmul1_w, + Node* ffn_eltadd0_b, + Node* ffn_eltadd1_b, + Node* ffn_output) { + // Calc index of transformer layer by LayerNorm Scale name + // This calculation assumes: + // 1. no LayerNorm before all transformer layer + // 2. each transformer layer contains 2 LayerNorm layer + auto ln_scale_name = layer_norm_scale->Name(); + auto ln_name = ln_scale_name.substr(0, ln_scale_name.find('.')); + auto ln_idx_str = ln_name.substr(ln_name.rfind('_') + 1); + int layer_idx = atoi(ln_idx_str.c_str()) / 2; + + // create fused_multi_transformer + OpDesc fused_multi_transformer_op_desc(layer_norm->Op()->Block()); + fused_multi_transformer_op_desc.SetType("fused_multi_transformer"); + + // 1. Input setting + fused_multi_transformer_op_desc.SetInput("X", {input0->Name()}); + + // pre-LayerNorm input + fused_multi_transformer_op_desc.SetInput("LnScale", + {layer_norm_scale->Name()}); + fused_multi_transformer_op_desc.SetInput("LnBias", + {layer_norm_bias->Name()}); + + // QKV computation input + fused_multi_transformer_op_desc.SetInput("QKVW", {matmul0_w->Name()}); + fused_multi_transformer_op_desc.SetInput("QKVBias", {eltadd0_b->Name()}); + fused_multi_transformer_op_desc.SetInput("SrcMask", {eltadd_qk_b->Name()}); + + // Cache KV use cache_kv in encoder + auto cache_kv_name = "cache_kv" + std::to_string(layer_idx); + fused_multi_transformer_op_desc.SetInput("CacheKV", {cache_kv_name}); + + VarDesc shape_out_desc("shape_out." + std::to_string(layer_idx)); + shape_out_desc.SetDataType(proto::VarType::INT32); + shape_out_desc.SetPersistable(false); + auto* shape_out = graph->CreateVarNode(&shape_out_desc); + + OpDesc shape_op_desc(layer_norm->Op()->Block()); + shape_op_desc.SetType("shape"); + shape_op_desc.SetInput("Input", {eltadd_qk_b->Name()}); + shape_op_desc.SetOutput("Out", {shape_out->Name()}); + auto* shape_op = graph->CreateOpNode(&shape_op_desc); + + VarDesc slice_out_desc("slice_out." 
+ std::to_string(layer_idx)); + slice_out_desc.SetDataType(proto::VarType::INT32); + slice_out_desc.SetPersistable(false); + auto* slice_out = graph->CreateVarNode(&slice_out_desc); + + OpDesc slice_op_desc(layer_norm->Op()->Block()); + slice_op_desc.SetType("slice"); + slice_op_desc.SetInput("Input", {shape_out->Name()}); + slice_op_desc.SetOutput("Out", {slice_out->Name()}); + std::vector axes = {0}; + std::vector starts = {3}; + std::vector ends = {4}; + slice_op_desc.SetAttr("axes", axes); + slice_op_desc.SetAttr("starts", starts); + slice_op_desc.SetAttr("ends", ends); + auto* slice_op = graph->CreateOpNode(&slice_op_desc); + + fused_multi_transformer_op_desc.SetInput("TimeStep", {slice_out->Name()}); + + // Out Linear input + fused_multi_transformer_op_desc.SetInput("OutLinearW", + {matmul_linear_w->Name()}); + fused_multi_transformer_op_desc.SetInput("OutLinearBias", + {eltadd_linear_b->Name()}); + + // Feed Forward input + fused_multi_transformer_op_desc.SetInput("FFNLnScale", + {ffn_layer_norm_scale->Name()}); + fused_multi_transformer_op_desc.SetInput("FFNLnBias", + {ffn_layer_norm_bias->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN1Weight", + {ffn_matmul0_w->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN1Bias", + {ffn_eltadd0_b->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN2Weight", + {ffn_matmul1_w->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN2Bias", + {ffn_eltadd1_b->Name()}); + + // 2. Output setting + fused_multi_transformer_op_desc.SetOutput("Out", {ffn_output->Name()}); + fused_multi_transformer_op_desc.SetOutput("CacheKVOut", {cache_kv_name}); + + // Attribute setting + fused_multi_transformer_op_desc.SetAttr("pre_layer_norm", true); + fused_multi_transformer_op_desc.SetAttr( + "epsilon", layer_norm->Op()->GetAttr("epsilon")); + + // output dropout attribute + fused_multi_transformer_op_desc.SetAttr("dropout_rate", 0.0f); + fused_multi_transformer_op_desc.SetAttr("is_test", true); + + auto* fused_multi_transformer = + graph->CreateOpNode(&fused_multi_transformer_op_desc); + IR_NODE_LINK_TO(input0, fused_multi_transformer); + IR_NODE_LINK_TO(layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(layer_norm_bias, fused_multi_transformer); + + IR_NODE_LINK_TO(matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_qk_b, fused_multi_transformer); + + // TimeStep link + IR_NODE_LINK_TO(eltadd_qk_b, shape_op); + IR_NODE_LINK_TO(shape_op, shape_out); + IR_NODE_LINK_TO(shape_out, slice_op); + IR_NODE_LINK_TO(slice_op, slice_out); + IR_NODE_LINK_TO(slice_out, fused_multi_transformer) + + IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_bias, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul1_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd1_b, fused_multi_transformer); + + IR_NODE_LINK_TO(fused_multi_transformer, ffn_output); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "fused_multi_transformer_decoder_fuse_qkv " + "pass in op compat failed."; + return; + } + + VLOG(4) << "handle MultiTransformer decoder(Fuse-QKV) fuse"; + 
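The fuse_creater above recovers the transformer layer index purely from the LayerNorm scale parameter name, and feeds TimeStep from element 3 of the attention-mask shape through the generated shape and slice ops (axes {0}, starts {3}, ends {4}). A minimal standalone sketch of both calculations; the parameter name "layer_norm_4.w_0" and the mask shape below are illustrative assumptions, not values taken from the pass:

#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

// Layer index from a LayerNorm scale name such as "layer_norm_4.w_0":
// strip the suffix after the first '.', take the trailing number after the
// last '_', divide by 2 because each transformer layer contains two
// LayerNorms (pre-attention and pre-FFN).
int LayerIndexFromLnScaleName(const std::string& ln_scale_name) {
  std::string ln_name = ln_scale_name.substr(0, ln_scale_name.find('.'));
  std::string ln_idx_str = ln_name.substr(ln_name.rfind('_') + 1);
  return std::atoi(ln_idx_str.c_str()) / 2;
}

// TimeStep: the generated slice op reads the shape of SrcMask (the
// eltadd_qk bias) and keeps only dimension 3, i.e. shape[3:4].
std::vector<int> TimeStepFromMaskShape(const std::vector<int>& mask_shape) {
  return {mask_shape.at(3)};
}

int main() {
  std::printf("layer index: %d\n",
              LayerIndexFromLnScaleName("layer_norm_4.w_0"));  // prints 2
  // Illustrative mask shape only; dimension 3 is the current sequence length.
  std::vector<int> mask_shape = {1, 16, 1, 129};
  std::printf("time step: %d\n", TimeStepFromMaskShape(mask_shape)[0]);  // 129
  return 0;
}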
GET_IR_NODE_FROM_SUBGRAPH( + input0, input0, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm, layer_norm, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, + layer_norm_scale, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, + layer_norm_bias, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, + layer_norm_mean, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, + layer_norm_variance, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, + layer_norm_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul0, matmul0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul0_out, matmul0_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul0_w, matmul0_w, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_0, reshape2_0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, + reshape2_0_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_0, transpose2_0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, + transpose2_0_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + split0, split0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + split0_q_out, split0_q_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + split0_k_out, split0_k_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + split0_v_out, split0_v_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_k_in, concat_k_in, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_k, concat_k, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_k_out, concat_k_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_v_in, concat_v_in, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_v, concat_v, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_v_out, concat_v_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + assign_k, assign_k, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + assign_v, assign_v, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm, + ffn_layer_norm, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_scale, + ffn_layer_norm_scale, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_bias, + ffn_layer_norm_bias, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_mean, + ffn_layer_norm_mean, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_variance, + ffn_layer_norm_variance, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_out, + ffn_layer_norm_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0, ffn_matmul0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_matmul0_out, + ffn_matmul0_out, + fused_multi_transformer_fuse_qkv_pattern); + 
GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0_w, ffn_matmul0_w, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0, ffn_eltadd0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0_b, ffn_eltadd0_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd0_out, + ffn_eltadd0_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_gelu, ffn_gelu, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_gelu_out, ffn_gelu_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1, ffn_matmul1, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_matmul1_out, + ffn_matmul1_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1_w, ffn_matmul1_w, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1, ffn_eltadd1, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1_b, ffn_eltadd1_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd1_out, + ffn_eltadd1_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd_out, + ffn_eltadd_out, + fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + ffn_output, ffn_output, fused_multi_transformer_fuse_qkv_pattern) + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0, eltadd0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0_b, eltadd0_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0_out, eltadd0_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qk, matmul_qk, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qk_out, matmul_qk_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk, eltadd_qk, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk_b, eltadd_qk_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk_out, eltadd_qk_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + softmax_qk, softmax_qk, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, + softmax_qk_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qkv, matmul_qkv, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, + matmul_qkv_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_qkv, reshape2_qkv, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, + reshape2_qkv_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, + transpose2_qkv, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, + transpose2_qkv_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_linear, matmul_linear, fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH(matmul_linear_w, + matmul_linear_w, + fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH(matmul_linear_out, + matmul_linear_out, + fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_linear, eltadd_linear, fused_multi_transformer_fuse_qkv_pattern) + 
GET_IR_NODE_FROM_SUBGRAPH(eltadd_linear_b, + eltadd_linear_b, + fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH(eltadd_linear_out, + eltadd_linear_out, + fused_multi_transformer_fuse_qkv_pattern) + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_out, eltadd_out, fused_multi_transformer_fuse_qkv_pattern) + + fuse_creater(input0, + layer_norm, + layer_norm_scale, + layer_norm_bias, + layer_norm_mean, + layer_norm_variance, + matmul0_w, + eltadd0_b, + eltadd_qk_b, + reshape2_0, + matmul_linear_w, + eltadd_linear_b, + ffn_layer_norm, + ffn_layer_norm_scale, + ffn_layer_norm_bias, + ffn_layer_norm_mean, + ffn_layer_norm_variance, + ffn_matmul0_w, + ffn_matmul1_w, + ffn_eltadd0_b, + ffn_eltadd1_b, + ffn_output); + + std::unordered_set marked_nodes({layer_norm, + layer_norm_mean, + layer_norm_variance, + layer_norm_out, + matmul0, + matmul0_out, + eltadd0, + eltadd0_out, + reshape2_0, + reshape2_0_out, + transpose2_0, + transpose2_0_out, + split0, + split0_q_out, + split0_k_out, + split0_v_out, + concat_k_in, + concat_k, + concat_k_out, + concat_v_in, + concat_v, + concat_v_out, + assign_k, + assign_v, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + reshape2_qkv, + transpose2_qkv, + transpose2_qkv_out, + matmul_linear, + matmul_linear_out, + eltadd_linear, + eltadd_linear_out, + eltadd_out, + ffn_layer_norm, + ffn_layer_norm_mean, + ffn_layer_norm_variance, + ffn_layer_norm_out, + ffn_matmul0, + ffn_matmul1, + ffn_matmul0_out, + ffn_matmul1_out, + ffn_eltadd0, + ffn_eltadd1, + ffn_eltadd0_out, + ffn_eltadd1_out, + ffn_gelu, + ffn_gelu_out, + ffn_eltadd_out}); + + // Remove unneeded nodes. + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + +void FusedMultiTransformerDecoderFuseQKVPass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal("During the fused_multi_transformer_decoder " + "pass, The scope should not be null.")); + + int fusion_count = BuildFusion(graph, name_scope_, scope); + if (fusion_count > 0) { + graph->Set(kFusedMultiTransformerDecoderFuseQKVPass, new bool(true)); + graph->Set(kFusedMultiTransformerDecoderFusionCount, new int(fusion_count)); + } + AddStatis(fusion_count); +} + +FusedMultiTransformerDecoderFuseQKVPass:: + FusedMultiTransformerDecoderFuseQKVPass() { + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); + + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddInput("Y") // the shape shoule be (N*H, N*H) + .IsTensor() + .End() + .AddOutput("Out") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({2, -1, 0}) + .End(); + + 
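Each AddOpCompat chain in this constructor is a declarative whitelist: the handler bails out via IsCompat() unless every matched op's inputs, outputs, and attributes satisfy these checks, for example layer_norm's epsilon must lie in [0, 0.001] and elementwise_add's axis must be one of {2, -1, 0}. A rough standalone sketch of the equivalent plain checks, with made-up helper names used only for illustration:

#include <cstdio>
#include <set>

// Illustrative helpers only; the real checks are built by the OpCompat chains.
bool LayerNormEpsilonOk(float epsilon) {
  return epsilon >= 0.0f && epsilon <= 0.001f;  // .IsNumGE(0.0f).IsNumLE(0.001f)
}

bool ElementwiseAddAxisOk(int axis) {
  static const std::set<int> allowed = {2, -1, 0};  // .IsIntIn({2, -1, 0})
  return allowed.count(axis) > 0;
}

int main() {
  std::printf("%d %d\n", LayerNormEpsilonOk(1e-5f), ElementwiseAddAxisOk(-1));  // 1 1
  return 0;
}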
AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("shape") // -->(B, S, H, N) <--(B, S, N*H) + .IsType>() + .End(); + + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("axis") // {0, 2, 1, 3} + .IsType>() + .End(); + + AddOpCompat(OpCompat("concat")) + .AddInput("X") // Input("X"): vector + .End() + .AddInput("AxisTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(2) + .End(); + + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumGE(0.0f) + .IsNumLE(1.0f) + .End() + .AddAttr("transpose_X") + .IsBoolEQ(false) + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); + + AddOpCompat(OpCompat("softmax")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 3}) // shape is (B, H, S, S), so axis is -1 or 3 + .End(); + + AddOpCompat(OpCompat("gelu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("approximate") + .IsType() + .End(); +} + +int MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::BuildFusion( + Graph* graph, const std::string& name_scope, Scope* scope) const { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. + patterns::MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern + fused_multi_transformer_fuse_qkv_pattern(pattern, name_scope); + fused_multi_transformer_fuse_qkv_pattern(); + + // Create New OpDesc + auto fuse_creater = [&](Node* input0, + Node* layer_norm, + Node* layer_norm_scale, + Node* layer_norm_bias, + Node* layer_norm_mean, + Node* layer_norm_variance, + Node* c_identity, + Node* matmul0_w, + Node* eltadd0_b, + Node* eltadd_qk_b, + Node* reshape2_0, + Node* matmul_linear_w, + Node* eltadd_linear_b, + Node* ffn_layer_norm, + Node* ffn_layer_norm_scale, + Node* ffn_layer_norm_bias, + Node* ffn_layer_norm_mean, + Node* ffn_layer_norm_variance, + Node* ffn_matmul0_w, + Node* ffn_matmul1_w, + Node* ffn_eltadd0_b, + Node* ffn_eltadd1_b, + Node* ffn_output) { + // Calc index of transformer layer by LayerNorm Scale name + // This calculation assumes: + // 1. no LayerNorm before all transformer layer + // 2. each transformer layer contains 2 LayerNorm layer + auto ln_scale_name = layer_norm_scale->Name(); + auto ln_name = ln_scale_name.substr(0, ln_scale_name.find('.')); + auto ln_idx_str = ln_name.substr(ln_name.rfind('_') + 1); + int layer_idx = atoi(ln_idx_str.c_str()) / 2; + + // create fused_multi_transformer + OpDesc fused_multi_transformer_op_desc(layer_norm->Op()->Block()); + fused_multi_transformer_op_desc.SetType("fused_multi_transformer"); + + // 1. 
Input setting + fused_multi_transformer_op_desc.SetInput("X", {input0->Name()}); + + // pre-LayerNorm input + fused_multi_transformer_op_desc.SetInput("LnScale", + {layer_norm_scale->Name()}); + fused_multi_transformer_op_desc.SetInput("LnBias", + {layer_norm_bias->Name()}); + + // QKV computation input + fused_multi_transformer_op_desc.SetInput("QKVW", {matmul0_w->Name()}); + fused_multi_transformer_op_desc.SetInput("QKVBias", {eltadd0_b->Name()}); + fused_multi_transformer_op_desc.SetInput("SrcMask", {eltadd_qk_b->Name()}); + + // Cache KV use cache_kv in encoder + auto cache_kv_name = "cache_kv" + std::to_string(layer_idx); + fused_multi_transformer_op_desc.SetInput("CacheKV", {cache_kv_name}); + + VarDesc shape_out_desc("shape_out." + std::to_string(layer_idx)); + shape_out_desc.SetDataType(proto::VarType::INT32); + shape_out_desc.SetPersistable(false); + auto* shape_out = graph->CreateVarNode(&shape_out_desc); + + OpDesc shape_op_desc(layer_norm->Op()->Block()); + shape_op_desc.SetType("shape"); + shape_op_desc.SetInput("Input", {eltadd_qk_b->Name()}); + shape_op_desc.SetOutput("Out", {shape_out->Name()}); + auto* shape_op = graph->CreateOpNode(&shape_op_desc); + + VarDesc slice_out_desc("slice_out." + std::to_string(layer_idx)); + slice_out_desc.SetDataType(proto::VarType::INT32); + slice_out_desc.SetPersistable(false); + auto* slice_out = graph->CreateVarNode(&slice_out_desc); + + OpDesc slice_op_desc(layer_norm->Op()->Block()); + slice_op_desc.SetType("slice"); + slice_op_desc.SetInput("Input", {shape_out->Name()}); + slice_op_desc.SetOutput("Out", {slice_out->Name()}); + std::vector axes = {0}; + std::vector starts = {3}; + std::vector ends = {4}; + slice_op_desc.SetAttr("axes", axes); + slice_op_desc.SetAttr("starts", starts); + slice_op_desc.SetAttr("ends", ends); + auto* slice_op = graph->CreateOpNode(&slice_op_desc); + + fused_multi_transformer_op_desc.SetInput("TimeStep", {slice_out->Name()}); + + // Out Linear input + fused_multi_transformer_op_desc.SetInput("OutLinearW", + {matmul_linear_w->Name()}); + fused_multi_transformer_op_desc.SetInput("OutLinearBias", + {eltadd_linear_b->Name()}); + + // Feed Forward input + fused_multi_transformer_op_desc.SetInput("FFNLnScale", + {ffn_layer_norm_scale->Name()}); + fused_multi_transformer_op_desc.SetInput("FFNLnBias", + {ffn_layer_norm_bias->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN1Weight", + {ffn_matmul0_w->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN1Bias", + {ffn_eltadd0_b->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN2Weight", + {ffn_matmul1_w->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN2Bias", + {ffn_eltadd1_b->Name()}); + + // 2. 
Output setting + fused_multi_transformer_op_desc.SetOutput("Out", {ffn_output->Name()}); + fused_multi_transformer_op_desc.SetOutput("CacheKVOut", {cache_kv_name}); + + // Attribute setting + fused_multi_transformer_op_desc.SetAttr("pre_layer_norm", true); + fused_multi_transformer_op_desc.SetAttr( + "epsilon", layer_norm->Op()->GetAttr("epsilon")); + + // output dropout attribute + fused_multi_transformer_op_desc.SetAttr("dropout_rate", 0.0f); + fused_multi_transformer_op_desc.SetAttr("is_test", true); + + // parallel ring id + auto* c_identity_op = c_identity->Op(); + fused_multi_transformer_op_desc.SetAttr("ring_id", + c_identity_op->GetAttr("ring_id")); + + auto* fused_multi_transformer = + graph->CreateOpNode(&fused_multi_transformer_op_desc); + IR_NODE_LINK_TO(input0, fused_multi_transformer); + IR_NODE_LINK_TO(layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(layer_norm_bias, fused_multi_transformer); + + IR_NODE_LINK_TO(matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_qk_b, fused_multi_transformer); + + // TimeStep link + IR_NODE_LINK_TO(eltadd_qk_b, shape_op); + IR_NODE_LINK_TO(shape_op, shape_out); + IR_NODE_LINK_TO(shape_out, slice_op); + IR_NODE_LINK_TO(slice_op, slice_out); + IR_NODE_LINK_TO(slice_out, fused_multi_transformer) + + IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_bias, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul1_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd1_b, fused_multi_transformer); + + IR_NODE_LINK_TO(fused_multi_transformer, ffn_output); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "fused_multi_transformer_decoder_fuse_qkv " + "pass in op compat failed."; + return; + } + + VLOG(4) << "handle MultiTransformer decoder(Fuse-QKV) fuse"; + GET_IR_NODE_FROM_SUBGRAPH( + input0, input0, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm, layer_norm, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, + layer_norm_scale, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, + layer_norm_bias, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, + layer_norm_mean, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, + layer_norm_variance, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, + layer_norm_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + c_identity, c_identity, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(c_identity_out, + c_identity_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul0, matmul0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul0_out, matmul0_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul0_w, matmul0_w, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_0, reshape2_0, fused_multi_transformer_fuse_qkv_pattern); 
+ GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, + reshape2_0_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_0, transpose2_0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, + transpose2_0_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + split0, split0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + split0_q_out, split0_q_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + split0_k_out, split0_k_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + split0_v_out, split0_v_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_k_in, concat_k_in, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_k, concat_k, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_k_out, concat_k_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_v_in, concat_v_in, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_v, concat_v, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + concat_v_out, concat_v_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + assign_k, assign_k, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + assign_v, assign_v, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm, + ffn_layer_norm, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_scale, + ffn_layer_norm_scale, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_bias, + ffn_layer_norm_bias, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_mean, + ffn_layer_norm_mean, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_variance, + ffn_layer_norm_variance, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_out, + ffn_layer_norm_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(ffn_c_identity, + ffn_c_identity, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_c_identity_out, + ffn_c_identity_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0, ffn_matmul0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_matmul0_out, + ffn_matmul0_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0_w, ffn_matmul0_w, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0, ffn_eltadd0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0_b, ffn_eltadd0_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd0_out, + ffn_eltadd0_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_gelu, ffn_gelu, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_gelu_out, ffn_gelu_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1, ffn_matmul1, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_matmul1_out, + ffn_matmul1_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1_w, ffn_matmul1_w, 
fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_c_allreduce_sum, + ffn_c_allreduce_sum, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_c_allreduce_sum_out, + ffn_c_allreduce_sum_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1, ffn_eltadd1, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1_b, ffn_eltadd1_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd1_out, + ffn_eltadd1_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd_out, + ffn_eltadd_out, + fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + ffn_output, ffn_output, fused_multi_transformer_fuse_qkv_pattern) + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0, eltadd0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0_b, eltadd0_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0_out, eltadd0_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qk, matmul_qk, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qk_out, matmul_qk_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk, eltadd_qk, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk_b, eltadd_qk_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk_out, eltadd_qk_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + softmax_qk, softmax_qk, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, + softmax_qk_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qkv, matmul_qkv, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, + matmul_qkv_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_qkv, reshape2_qkv, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, + reshape2_qkv_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, + transpose2_qkv, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, + transpose2_qkv_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_linear, matmul_linear, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_linear_w, + matmul_linear_w, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_linear_out, + matmul_linear_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(c_allreduce_sum, + c_allreduce_sum, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(c_allreduce_sum_out, + c_allreduce_sum_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_linear, eltadd_linear, fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH(eltadd_linear_b, + eltadd_linear_b, + fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH(eltadd_linear_out, + eltadd_linear_out, + fused_multi_transformer_fuse_qkv_pattern) + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_out, eltadd_out, fused_multi_transformer_fuse_qkv_pattern) + + fuse_creater(input0, + layer_norm, + layer_norm_scale, + layer_norm_bias, + layer_norm_mean, + 
layer_norm_variance, + c_identity, + matmul0_w, + eltadd0_b, + eltadd_qk_b, + reshape2_0, + matmul_linear_w, + eltadd_linear_b, + ffn_layer_norm, + ffn_layer_norm_scale, + ffn_layer_norm_bias, + ffn_layer_norm_mean, + ffn_layer_norm_variance, + ffn_matmul0_w, + ffn_matmul1_w, + ffn_eltadd0_b, + ffn_eltadd1_b, + ffn_output); + + std::unordered_set marked_nodes({layer_norm, + layer_norm_mean, + layer_norm_variance, + layer_norm_out, + c_identity, + c_identity_out, + matmul0, + matmul0_out, + eltadd0, + eltadd0_out, + reshape2_0, + reshape2_0_out, + transpose2_0, + transpose2_0_out, + split0, + split0_q_out, + split0_k_out, + split0_v_out, + concat_k_in, + concat_k, + concat_k_out, + concat_v_in, + concat_v, + concat_v_out, + assign_k, + assign_v, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + reshape2_qkv, + transpose2_qkv, + transpose2_qkv_out, + matmul_linear, + matmul_linear_out, + c_allreduce_sum, + c_allreduce_sum_out, + eltadd_linear, + eltadd_linear_out, + eltadd_out, + ffn_layer_norm, + ffn_layer_norm_mean, + ffn_layer_norm_variance, + ffn_layer_norm_out, + ffn_c_identity, + ffn_c_identity_out, + ffn_matmul0, + ffn_matmul1, + ffn_matmul0_out, + ffn_matmul1_out, + ffn_c_allreduce_sum, + ffn_c_allreduce_sum_out, + ffn_eltadd0, + ffn_eltadd1, + ffn_eltadd0_out, + ffn_eltadd1_out, + ffn_gelu, + ffn_gelu_out, + ffn_eltadd_out}); + + // Remove unneeded nodes. + GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + +void MultiDevicesFusedMultiTransformerDecoderFuseQKVPass::ApplyImpl( + Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal("During the fused_multi_transformer_decoder " + "pass, The scope should not be null.")); + + int fusion_count = BuildFusion(graph, name_scope_, scope); + if (fusion_count > 0) { + graph->Set(kFusedMultiTransformerDecoderFuseQKVPass, new bool(true)); + graph->Set(kFusedMultiTransformerDecoderFusionCount, new int(fusion_count)); + } + AddStatis(fusion_count); +} + +MultiDevicesFusedMultiTransformerDecoderFuseQKVPass:: + MultiDevicesFusedMultiTransformerDecoderFuseQKVPass() { + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); + + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddInput("Y") // the shape shoule be (N*H, N*H) + .IsTensor() + .End() + .AddOutput("Out") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({2, -1, 0}) + .End(); + + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + 
.IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("shape") // -->(B, S, H, N) <--(B, S, N*H) + .IsType>() + .End(); + + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("axis") // {0, 2, 1, 3} + .IsType>() + .End(); + + AddOpCompat(OpCompat("concat")) + .AddInput("X") // Input("X"): vector + .End() + .AddInput("AxisTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsNumEQ(2) + .End(); + + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumGE(0.0f) + .IsNumLE(1.0f) + .End() + .AddAttr("transpose_X") + .IsBoolEQ(false) + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); + + AddOpCompat(OpCompat("softmax")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 3}) // shape is (B, H, S, S), so axis is -1 or 3 + .End(); + + AddOpCompat(OpCompat("gelu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("approximate") + .IsType() + .End(); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fused_multi_transformer_decoder_pass, + paddle::framework::ir::FusedMultiTransformerDecoderPass); +REGISTER_PASS(fused_multi_transformer_decoder_fuse_qkv_pass, + paddle::framework::ir::FusedMultiTransformerDecoderFuseQKVPass); +REGISTER_PASS( + multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass, + paddle::framework::ir::MultiDevicesFusedMultiTransformerDecoderFuseQKVPass); + +REGISTER_PASS_CAPABILITY(fused_multi_transformer_decoder_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .EQ("softmax", 0)); +REGISTER_PASS_CAPABILITY(fused_multi_transformer_decoder_fuse_qkv_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .EQ("softmax", 0)); +REGISTER_PASS_CAPABILITY( + multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.h b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.h new file mode 100644 index 00000000000000..fd2cfc8c6677e7 --- /dev/null +++ b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass.h @@ -0,0 +1,398 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct FusedMultiTransformerDecoderPattern : public PatternBase { + FusedMultiTransformerDecoderPattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, "fused_multi_transformer_decoder") {} + + PDNode* operator()(); + + // Q, K, V path + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(layer_norm); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_mean); + PATTERN_DECL_NODE(layer_norm_variance); + PATTERN_DECL_NODE(layer_norm_out); + PATTERN_DECL_NODE(matmul0); + PATTERN_DECL_NODE(matmul1); + PATTERN_DECL_NODE(matmul2); + PATTERN_DECL_NODE(matmul0_w); + PATTERN_DECL_NODE(matmul1_w); + PATTERN_DECL_NODE(matmul2_w); + PATTERN_DECL_NODE(matmul0_out); + PATTERN_DECL_NODE(matmul1_out); + PATTERN_DECL_NODE(matmul2_out); + PATTERN_DECL_NODE(eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(eltadd1_out); + PATTERN_DECL_NODE(eltadd2_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_1); + PATTERN_DECL_NODE(reshape2_2); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(reshape2_1_out); + PATTERN_DECL_NODE(reshape2_2_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_1); + PATTERN_DECL_NODE(transpose2_2); + PATTERN_DECL_NODE(transpose2_0_out); + PATTERN_DECL_NODE(transpose2_1_out); + PATTERN_DECL_NODE(transpose2_2_out); + + PATTERN_DECL_NODE(concat_0_in); + PATTERN_DECL_NODE(concat_0); + PATTERN_DECL_NODE(concat_0_out); + PATTERN_DECL_NODE(assign_0); + PATTERN_DECL_NODE(concat_1_in); + PATTERN_DECL_NODE(concat_1); + PATTERN_DECL_NODE(concat_1_out); + PATTERN_DECL_NODE(assign_1); + + // Q, K matmul + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(eltadd_qk); + PATTERN_DECL_NODE(eltadd_qk_b); + PATTERN_DECL_NODE(eltadd_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + // QK, V matmul + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_qkv); + PATTERN_DECL_NODE(transpose2_qkv_out); + + // out linear + PATTERN_DECL_NODE(matmul_linear); + PATTERN_DECL_NODE(matmul_linear_w); + PATTERN_DECL_NODE(matmul_linear_out); + PATTERN_DECL_NODE(eltadd_linear); + PATTERN_DECL_NODE(eltadd_linear_b); + PATTERN_DECL_NODE(eltadd_linear_out); + + // output elementwise_add + PATTERN_DECL_NODE(eltadd_out) + PATTERN_DECL_NODE(attention_output); + + // while loop + PATTERN_DECL_NODE(while0); + + // Feed Forward nodes + PATTERN_DECL_NODE(ffn_layer_norm); + 
PATTERN_DECL_NODE(ffn_layer_norm_scale); + PATTERN_DECL_NODE(ffn_layer_norm_bias); + PATTERN_DECL_NODE(ffn_layer_norm_mean); + PATTERN_DECL_NODE(ffn_layer_norm_variance); + PATTERN_DECL_NODE(ffn_layer_norm_out); + PATTERN_DECL_NODE(ffn_matmul0); + PATTERN_DECL_NODE(ffn_matmul0_w); + PATTERN_DECL_NODE(ffn_matmul0_out); + PATTERN_DECL_NODE(ffn_eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd0_out); + PATTERN_DECL_NODE(ffn_gelu); + PATTERN_DECL_NODE(ffn_gelu_out); + PATTERN_DECL_NODE(ffn_matmul1); + PATTERN_DECL_NODE(ffn_matmul1_w); + PATTERN_DECL_NODE(ffn_matmul1_out); + PATTERN_DECL_NODE(ffn_eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd1_out); + + // output elementwise_add + PATTERN_DECL_NODE(ffn_eltadd_out) + PATTERN_DECL_NODE(ffn_output); +}; + +struct FusedMultiTransformerDecoderFuseQKVPattern : public PatternBase { + FusedMultiTransformerDecoderFuseQKVPattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase( + pattern, name_scope, "fused_multi_transformer_decoder_fuse_qkv") {} + + PDNode* operator()(); + + // Q, K, V path + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(layer_norm); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_mean); + PATTERN_DECL_NODE(layer_norm_variance); + PATTERN_DECL_NODE(layer_norm_out); + PATTERN_DECL_NODE(matmul0); + PATTERN_DECL_NODE(matmul0_w); + PATTERN_DECL_NODE(matmul0_out); + PATTERN_DECL_NODE(eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_0_out); + + PATTERN_DECL_NODE(split0) + PATTERN_DECL_NODE(split0_q_out) + PATTERN_DECL_NODE(split0_k_out) + PATTERN_DECL_NODE(split0_v_out) + PATTERN_DECL_NODE(concat_k_in) + PATTERN_DECL_NODE(concat_v_in) + PATTERN_DECL_NODE(concat_k) + PATTERN_DECL_NODE(concat_v) + PATTERN_DECL_NODE(concat_k_out) + PATTERN_DECL_NODE(concat_v_out) + PATTERN_DECL_NODE(assign_k) + PATTERN_DECL_NODE(assign_v) + + // Q, K matmul + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(eltadd_qk); + PATTERN_DECL_NODE(eltadd_qk_b); + PATTERN_DECL_NODE(eltadd_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + // QK, V matmul + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_qkv); + PATTERN_DECL_NODE(transpose2_qkv_out); + + // out linear + PATTERN_DECL_NODE(matmul_linear); + PATTERN_DECL_NODE(matmul_linear_w); + PATTERN_DECL_NODE(matmul_linear_out); + PATTERN_DECL_NODE(eltadd_linear); + PATTERN_DECL_NODE(eltadd_linear_b); + PATTERN_DECL_NODE(eltadd_linear_out); + + // output elementwise_add + PATTERN_DECL_NODE(eltadd_out) + PATTERN_DECL_NODE(attention_output); + + // Feed Forward nodes + PATTERN_DECL_NODE(ffn_layer_norm); + PATTERN_DECL_NODE(ffn_layer_norm_scale); + PATTERN_DECL_NODE(ffn_layer_norm_bias); + PATTERN_DECL_NODE(ffn_layer_norm_mean); + PATTERN_DECL_NODE(ffn_layer_norm_variance); + PATTERN_DECL_NODE(ffn_layer_norm_out); + PATTERN_DECL_NODE(ffn_matmul0); + PATTERN_DECL_NODE(ffn_matmul0_w); + PATTERN_DECL_NODE(ffn_matmul0_out); + PATTERN_DECL_NODE(ffn_eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd0_b); // ELEMENTWISE_ADD + 
PATTERN_DECL_NODE(ffn_eltadd0_out); + PATTERN_DECL_NODE(ffn_gelu); + PATTERN_DECL_NODE(ffn_gelu_out); + PATTERN_DECL_NODE(ffn_matmul1); + PATTERN_DECL_NODE(ffn_matmul1_w); + PATTERN_DECL_NODE(ffn_matmul1_out); + PATTERN_DECL_NODE(ffn_eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd1_out); + + // output elementwise_add + PATTERN_DECL_NODE(ffn_eltadd_out) + PATTERN_DECL_NODE(ffn_output); +}; + +struct MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern + : public PatternBase { + MultiDevicesFusedMultiTransformerDecoderFuseQKVPattern( + PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, + name_scope, + "multi_devices_fused_multi_transformer_decoder_fuse_qkv") {} + + PDNode* operator()(); + + // Q, K, V path + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(layer_norm); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_mean); + PATTERN_DECL_NODE(layer_norm_variance); + PATTERN_DECL_NODE(layer_norm_out); + PATTERN_DECL_NODE(c_identity); + PATTERN_DECL_NODE(c_identity_out); + PATTERN_DECL_NODE(matmul0); + PATTERN_DECL_NODE(matmul0_w); + PATTERN_DECL_NODE(matmul0_out); + PATTERN_DECL_NODE(eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_0_out); + + PATTERN_DECL_NODE(split0) + PATTERN_DECL_NODE(split0_q_out) + PATTERN_DECL_NODE(split0_k_out) + PATTERN_DECL_NODE(split0_v_out) + PATTERN_DECL_NODE(concat_k_in) + PATTERN_DECL_NODE(concat_v_in) + PATTERN_DECL_NODE(concat_k) + PATTERN_DECL_NODE(concat_v) + PATTERN_DECL_NODE(concat_k_out) + PATTERN_DECL_NODE(concat_v_out) + PATTERN_DECL_NODE(assign_k) + PATTERN_DECL_NODE(assign_v) + + // Q, K matmul + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(eltadd_qk); + PATTERN_DECL_NODE(eltadd_qk_b); + PATTERN_DECL_NODE(eltadd_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + // QK, V matmul + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_qkv); + PATTERN_DECL_NODE(transpose2_qkv_out); + + // out linear + PATTERN_DECL_NODE(matmul_linear); + PATTERN_DECL_NODE(matmul_linear_w); + PATTERN_DECL_NODE(matmul_linear_out); + PATTERN_DECL_NODE(c_allreduce_sum); + PATTERN_DECL_NODE(c_allreduce_sum_out); + PATTERN_DECL_NODE(eltadd_linear); + PATTERN_DECL_NODE(eltadd_linear_b); + PATTERN_DECL_NODE(eltadd_linear_out); + + // output elementwise_add + PATTERN_DECL_NODE(eltadd_out) + PATTERN_DECL_NODE(attention_output); + + // Feed Forward nodes + PATTERN_DECL_NODE(ffn_layer_norm); + PATTERN_DECL_NODE(ffn_layer_norm_scale); + PATTERN_DECL_NODE(ffn_layer_norm_bias); + PATTERN_DECL_NODE(ffn_layer_norm_mean); + PATTERN_DECL_NODE(ffn_layer_norm_variance); + PATTERN_DECL_NODE(ffn_layer_norm_out); + PATTERN_DECL_NODE(ffn_c_identity); + PATTERN_DECL_NODE(ffn_c_identity_out); + PATTERN_DECL_NODE(ffn_matmul0); + PATTERN_DECL_NODE(ffn_matmul0_w); + PATTERN_DECL_NODE(ffn_matmul0_out); + PATTERN_DECL_NODE(ffn_eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd0_out); + PATTERN_DECL_NODE(ffn_gelu); + PATTERN_DECL_NODE(ffn_gelu_out); + PATTERN_DECL_NODE(ffn_matmul1); + 
PATTERN_DECL_NODE(ffn_matmul1_w); + PATTERN_DECL_NODE(ffn_matmul1_out); + PATTERN_DECL_NODE(ffn_c_allreduce_sum); + PATTERN_DECL_NODE(ffn_c_allreduce_sum_out); + PATTERN_DECL_NODE(ffn_eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd1_out); + + // output elementwise_add + PATTERN_DECL_NODE(ffn_eltadd_out) + PATTERN_DECL_NODE(ffn_output); +}; + +} // namespace patterns + +class FusedMultiTransformerDecoderPass : public FusePassBase { + public: + FusedMultiTransformerDecoderPass(); + virtual ~FusedMultiTransformerDecoderPass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"fused_multi_transformer_decoder"}; + + private: + int BuildFusion(Graph* graph, + const std::string& name_scope, + Scope* scope) const; +}; + +class FusedMultiTransformerDecoderFuseQKVPass : public FusePassBase { + public: + FusedMultiTransformerDecoderFuseQKVPass(); + virtual ~FusedMultiTransformerDecoderFuseQKVPass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"fused_multi_transformer_decoder_fuse_qkv"}; + + private: + int BuildFusion(Graph* graph, + const std::string& name_scope, + Scope* scope) const; +}; + +class MultiDevicesFusedMultiTransformerDecoderFuseQKVPass + : public FusePassBase { + public: + MultiDevicesFusedMultiTransformerDecoderFuseQKVPass(); + virtual ~MultiDevicesFusedMultiTransformerDecoderFuseQKVPass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{ + "multi_devices_fused_multi_transformer_decoder_fuse_qkv"}; + + private: + int BuildFusion(Graph* graph, + const std::string& name_scope, + Scope* scope) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass_tester.cc b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass_tester.cc new file mode 100644 index 00000000000000..28357f4b20e426 --- /dev/null +++ b/paddle/fluid/framework/ir/fused_multi_transformer_decoder_pass_tester.cc @@ -0,0 +1,552 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include <gtest/gtest.h>
+
+#include "paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.h"  // NOLINT
+#include "paddle/fluid/framework/ir/pass_tester_helper.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void AddVarToScope(Scope* param_scope,
+                   const std::string& name,
+                   const DDim& dims) {
+  auto* tensor = param_scope->Var(name)->GetMutable<LoDTensor>();
+  tensor->Resize(dims);
+  tensor->mutable_data<float>(platform::CPUPlace());
+}
+
+Scope* CreateParamScope() {
+  auto param_scope = new Scope();
+
+  // MHA: pre Layer Norm
+  AddVarToScope(param_scope, "ln_scale", {1024});
+  AddVarToScope(param_scope, "ln_bias", {1024});
+
+  // MHA: QKV fc
+  AddVarToScope(param_scope, "weights0", {1024, 1024});
+  AddVarToScope(param_scope, "weights1", {1024, 1024});
+  AddVarToScope(param_scope, "weights2", {1024, 1024});
+  AddVarToScope(param_scope, "bias_0", {1024});
+  AddVarToScope(param_scope, "bias_1", {1024});
+  AddVarToScope(param_scope, "bias_2", {1024});
+
+  // MHA: QK bias
+  AddVarToScope(param_scope, "biasqk", {1024});
+
+  // MHA: out Linear
+  AddVarToScope(param_scope, "weights_l", {1024, 1024});
+  AddVarToScope(param_scope, "bias_l", {1024});
+
+  // FFN: pre Layer Norm
+  AddVarToScope(param_scope, "ffn_ln_scale", {1024});
+  AddVarToScope(param_scope, "ffn_ln_bias", {1024});
+
+  // FFN: fc1 -> (gelu) -> fc2
+  AddVarToScope(param_scope, "ffn_weights0", {1024, 4096});
+  AddVarToScope(param_scope, "ffn_weights1", {4096, 1024});
+  AddVarToScope(param_scope, "ffn_bias_0", {4096});
+  AddVarToScope(param_scope, "ffn_bias_1", {1024});
+
+  return param_scope;
+}
+
+TEST(FusedMultiTransformerDecoderPass, basic) {
+  // inputs                           operator              output
+  // --------------------------------------------------------------------
+  // (x, ln_scale, ln_bias)           layer_norm            -> layer_norm_out
+  // (layer_norm_out, weights_0)      matmul_v2             -> matmul_out0
+  // (layer_norm_out, weights_1)      matmul_v2             -> matmul_out1
+  // (layer_norm_out, weights_2)      matmul_v2             -> matmul_out2
+  // (matmul_out0, bias_0)            elementwise_add       -> eltadd_0
+  // (matmul_out1, bias_1)            elementwise_add       -> eltadd_1
+  // (matmul_out2, bias_2)            elementwise_add       -> eltadd_2
+  // (eltadd_0)                       reshape2              -> reshape_0
+  // (eltadd_1)                       reshape2              -> reshape_1
+  // (eltadd_2)                       reshape2              -> reshape_2
+  // (reshape_0)                      transpose2            -> transpose_0
+  // (reshape_1)                      transpose2            -> transpose_1
+  // (reshape_2)                      transpose2            -> transpose_2
+  // (transpose_1)                    concat                -> concat_0
+  // (transpose_2)                    concat                -> concat_1
+  // (concat_0)                       assign                -> assign_0
+  // (concat_1)                       assign                -> assign_1
+  // (transpose_0, transpose_1)       matmul                -> matmul_qk
+  // (matmul_qk, bias_qk)             elementwise_add       -> eltadd_qk
+  // (eltadd_qk)                      softmax               -> softmax_qk
+  // (softmax_qk, transpose_2)        matmul_v2             -> matmul_qkv
+  // (matmul_qkv)                     transpose             -> transpose_qkv
+  // (transpose_qkv)                  reshape               -> reshape_qkv
+  // (reshape_qkv)                    matmul_v2             -> matmul_linear
+  // (matmul_linear)                  elementwise_add       -> eltadd_linear
+  // (eltadd_out)                     elementwise_add       -> attention_out
+  //
+  // (attention_out, scale, bias)     layer_norm            -> ffn_layer_norm_out
+  // (layer_norm_out, ffn_matmul0_w)  matmul_v2             -> ffn_matmul0
+  // (ffn_matmul0, ffn_bias0)         elementwise_add       -> ffn_eltadd0
+  // (ffn_eltadd0)                    gelu                  -> ffn_gelu
+  // (ffn_gelu)                       matmul_v2             -> ffn_matmul1
+  // (ffn_matmul1, ffn_bias1)         elementwise_add       -> ffn_eltadd1
+  // (attention_out, ffn_eltadd1)     elementwise_add       -> ffn_output
+
+  Layers layers;
+  // MHA: pre LayerNorm
+  auto* x = layers.data("x", {1, 128, 1024});
+  auto* ln_scale = layers.data("ln_scale", {1024},
true);
+  auto* ln_bias = layers.data("ln_bias", {1024}, true);
+  auto* ln_out = layers.layer_norm(x, ln_scale, ln_bias)[0];
+
+  // MHA: QKV fc
+  auto* weights_0 = layers.data("weights0", {1024, 1024}, true);
+  auto* weights_1 = layers.data("weights1", {1024, 1024}, true);
+  auto* weights_2 = layers.data("weights2", {1024, 1024}, true);
+  auto* matmul_out_0 =
+      layers.matmul_v2(ln_out, weights_0, nullptr, false, true);
+  auto* matmul_out_1 =
+      layers.matmul_v2(ln_out, weights_1, nullptr, false, true);
+  auto* matmul_out_2 =
+      layers.matmul_v2(ln_out, weights_2, nullptr, false, true);
+
+  auto* b0 = layers.data("bias_0", {1024}, true);
+  auto* b1 = layers.data("bias_1", {1024}, true);
+  auto* b2 = layers.data("bias_2", {1024}, true);
+  auto* elementwise_out_0 =
+      layers.elementwise_add(matmul_out_0, b0, nullptr, 2);
+  auto* elementwise_out_1 =
+      layers.elementwise_add(matmul_out_1, b1, nullptr, 2);
+  auto* elementwise_out_2 =
+      layers.elementwise_add(matmul_out_2, b2, nullptr, 2);
+
+  std::vector<int> shape = {1, 128, 16, 64};
+  auto* reshape_0 = layers.reshape2(elementwise_out_0, shape, true);
+  auto* reshape_1 = layers.reshape2(elementwise_out_1, shape, true);
+  auto* reshape_2 = layers.reshape2(elementwise_out_2, shape, true);
+
+  std::vector<int> axis = {0, 2, 1, 3};
+  auto* transpose_0 = layers.transpose2(reshape_0, axis, true);
+  auto* transpose_1 = layers.transpose2(reshape_1, axis, true);
+  auto* transpose_2 = layers.transpose2(reshape_2, axis, true);
+
+  auto* cache_k = layers.data("cache_k", {1, 16, 128, 64});
+  auto* cache_v = layers.data("cache_v", {1, 16, 128, 64});
+  auto* concat_k = layers.concat({cache_k, transpose_1}, 2);
+  auto* concat_v = layers.concat({cache_v, transpose_2}, 2);
+  layers.assign(concat_k);
+  layers.assign(concat_v);
+
+  // MHA: QK matmul
+  auto* matmul_qk = layers.matmul(transpose_0, concat_k, nullptr, false, true);
+
+  auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true);
+  auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk);
+  auto* softmax_qk = layers.softmax(elementwise_qk, -1);
+
+  // MHA: QKV matmul
+  auto* matmul_qkv = layers.matmul_v2(softmax_qk, concat_v);
+
+  auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true);
+  auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 1024}, true);
+
+  // MHA: out Linear
+  auto* weights_l = layers.data("weights_l", {1024, 1024}, true);
+  auto* bias_l = layers.data("weightsl", {1024, 1024}, true);
+  auto* linear_matmut_out =
+      layers.matmul_v2(reshape_qkv_out, weights_l, nullptr, false, true);
+  auto* linear_eltadd_out =
+      layers.elementwise_add(linear_matmut_out, bias_l, nullptr, 2);
+
+  auto* attention_out = layers.elementwise_add(x, linear_eltadd_out);
+
+  // FFN: pre LayerNorm
+  auto* ffn_ln_scale = layers.data("ffn_ln_scale", {1024}, true);
+  auto* ffn_ln_bias = layers.data("ffn_ln_bias", {1024}, true);
+  auto* ffn_ln_out =
+      layers.layer_norm(attention_out, ffn_ln_scale, ffn_ln_bias)[0];
+
+  // FFN: fc1 -> gelu -> fc2
+  auto* ffn_weights0 = layers.data("ffn_weights0", {1024, 4096}, true);
+  auto* ffn_weights1 = layers.data("ffn_weights1", {4096, 1024}, true);
+  auto* ffn_bias0 = layers.data("ffn_bias0", {4096}, true);
+  auto* ffn_bias1 = layers.data("ffn_bias1", {1024}, true);
+  auto* ffn_matmul0_out =
+      layers.matmul_v2(ffn_ln_out, ffn_weights0, nullptr, false, true);
+  auto* ffn_eltadd0_out =
+      layers.elementwise_add(ffn_matmul0_out, ffn_bias0, nullptr, 2);
+  auto* ffn_gelu_out = layers.gelu(ffn_eltadd0_out);
+  auto* ffn_matmul1_out =
+      layers.matmul_v2(ffn_gelu_out,
ffn_weights1, nullptr, false, true); + auto* ffn_eltadd1_out = + layers.elementwise_add(ffn_matmul1_out, ffn_bias1, nullptr, 2); + + layers.elementwise_add(attention_out, ffn_eltadd1_out); + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + graph->Set("__param_scope__", CreateParamScope()); + + auto pass = + PassRegistry::Instance().Get("fused_multi_transformer_decoder_pass"); + if (pass.get() == nullptr) + LOG(INFO) << "get fused_multi_transformer_decoder_pass failed"; + int num_nodes_before = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + + graph.reset(pass->Apply(graph.release())); + int num_nodes_after = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + int num_fused_nodes_after = GetNumOpNodes(graph, "fused_multi_transformer"); + + PADDLE_ENFORCE_EQ(num_nodes_before, + num_nodes_after + 60, + platform::errors::InvalidArgument( + "After the fused_multi_transformer_decoder_pass, The " + "node num in graph " + "should be %d, but the result is %d", + num_nodes_before - 60, + num_nodes_after)); + PADDLE_ENFORCE_EQ(num_fused_nodes_after, + 1, + platform::errors::InvalidArgument( + "After the fused_multi_transformer_decoder pass, " + "there should be one fused_multi_transformer op, " + "but the result is %d", + num_fused_nodes_after)); +} + +TEST(FusedMultiTransformerDecoderPass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("fused_multi_transformer_decoder_pass")); +} + +TEST(FusedMultiTransformerDecoderFuseQKVPass, basic) { + // inputs operator output + // -------------------------------------------------------------------- + // (x, ln_scale, ln_bias) layer_norm -> layer_norm_out + // (layer_norm_out, weights_0) matmul_v2 -> matmul_out0 + // (matmul_out0, bias_0) elementwise_add -> eltadd_0 + // (eltadd_0) reshape2 -> reshape_0 + // (reshape_0) transpose2 -> transpose_0 + // (transpose_0) split -> split_q, split_k, + // split_v (split_k) concat -> concat_k + // (split_v) concat -> concat_v + // (concat_k) assign -> assign_k + // (concat_v) assign -> assign_v + // (split_q, split_k) matmul -> matmul_qk + // (matmul_qk, bias_qk) elementwise_add -> eltadd_qk + // (eltadd_qk) softmax -> softmax_qk + // (softmax_qk, transpose_2) matmul_v2 -> matmul_qkv + // (matmul_qkv) transpose -> transpose_qkv + // (transpose_qkv) reshape -> reshape_qkv + // (reshape_qkv) matmul_v2 -> matmul_linear + // (matmul_linear) elementwise_add -> eltadd_linear + // (eltadd_out) elementwise_add -> attention_out + // + // (attention_out, scale, bias) layer_norm -> ffn_layer_norm_out + // (layer_norm_out, ffn_matmul0_w) matmul_v2 -> ffn_matmul0 + // (ffn_matmul0, ffn_bias0) elementwise_add -> ffn_eltadd0 + // (ffn_eltadd0) gelu -> ffn_gelu + // (ffn_gelu) matmul_v2 -> ffn_matmul1 + // (ffn_matmul1, ffn_bias1) elementwise_add -> ffn_eltadd1 + // (attention_out, ffn_eltadd1) elementwise_add -> ffn_output + // + // (transpose_1, transpose_2) while -> decoder block + + Layers layers; + // MHA: pre LayerNorm + auto* x = layers.data("x", {1, 128, 1024}); + auto* ln_scale = layers.data("ln_scale", {1024}, true); + auto* ln_bias = layers.data("ln_bias", {1024}, true); + auto* ln_out = layers.layer_norm(x, ln_scale, ln_bias)[0]; + + // MHA: QKV fc + auto* weights_0 = layers.data("weights0", {1024, 3072}, true); + auto* matmul_out_0 = + layers.matmul_v2(ln_out, weights_0, nullptr, false, true); + + auto* b0 = layers.data("bias_0", {3072}, true); + auto* elementwise_out_0 = + 
layers.elementwise_add(matmul_out_0, b0, nullptr, 2); + + std::vector shape = {1, 128, 16, 64}; + auto* reshape_0 = layers.reshape2(elementwise_out_0, shape, true); + + std::vector axis = {0, 2, 1, 3}; + auto* transpose_0 = layers.transpose2(reshape_0, axis, true); + + auto split_outs = layers.split(transpose_0, 3, 3); + auto* split_q = split_outs[0]; + auto* split_k = split_outs[1]; + auto* split_v = split_outs[2]; + + auto* cache_k = layers.data("cache_k", {1, 16, 128, 64}); + auto* cache_v = layers.data("cache_v", {1, 16, 128, 64}); + auto* concat_k = layers.concat({cache_k, split_k}, 2); + auto* concat_v = layers.concat({cache_v, split_v}, 2); + layers.assign(concat_k); + layers.assign(concat_v); + + // MHA: QK matmul + auto* matmul_qk = layers.matmul(split_q, concat_k, nullptr, false, true); + + auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true); + auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk); + auto* softmax_qk = layers.softmax(elementwise_qk, -1); + + // MHA: QKV matmul + auto* matmul_qkv = layers.matmul_v2(softmax_qk, concat_v); + + auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true); + auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 1024}, true); + + // MHA: out Linear + auto* weights_l = layers.data("weights_l", {1024, 1024}, true); + auto* bias_l = layers.data("weightsl", {1024, 1024}, true); + auto* linear_matmut_out = + layers.matmul_v2(reshape_qkv_out, weights_l, nullptr, false, true); + auto* linear_eltadd_out = + layers.elementwise_add(linear_matmut_out, bias_l, nullptr, 2); + + auto* attention_out = layers.elementwise_add(x, linear_eltadd_out); + + // FFN: pre LayerNorm + auto* ffn_ln_scale = layers.data("ffn_ln_scale", {1024}, true); + auto* ffn_ln_bias = layers.data("ffn_ln_bias", {1024}, true); + auto* ffn_ln_out = + layers.layer_norm(attention_out, ffn_ln_scale, ffn_ln_bias)[0]; + + // FFN: fc1 -> gelu -> fc2 + auto* ffn_weights0 = layers.data("ffn_weights0", {1024, 4096}, true); + auto* ffn_weights1 = layers.data("ffn_weights1", {4096, 1024}, true); + auto* ffn_bias0 = layers.data("ffn_bias0", {4096}, true); + auto* ffn_bias1 = layers.data("ffn_bias1", {1024}, true); + auto* ffn_matmul0_out = + layers.matmul_v2(ffn_ln_out, ffn_weights0, nullptr, false, true); + auto* ffn_eltadd0_out = + layers.elementwise_add(ffn_matmul0_out, ffn_bias0, nullptr, 2); + auto* ffn_gelu_out = layers.gelu(ffn_eltadd0_out); + auto* ffn_matmul1_out = + layers.matmul_v2(ffn_gelu_out, ffn_weights1, nullptr, false, true); + auto* ffn_eltadd1_out = + layers.elementwise_add(ffn_matmul1_out, ffn_bias1, nullptr, 2); + + layers.elementwise_add(attention_out, ffn_eltadd1_out); + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + graph->Set("__param_scope__", CreateParamScope()); + + auto pass = PassRegistry::Instance().Get( + "fused_multi_transformer_decoder_fuse_qkv_pass"); + if (pass.get() == nullptr) + LOG(INFO) << "get fused_multi_transformer_decoder_fuse_qkv_pass failed"; + int num_nodes_before = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + + graph.reset(pass->Apply(graph.release())); + int num_nodes_after = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + int num_fused_nodes_after = GetNumOpNodes(graph, "fused_multi_transformer"); + + PADDLE_ENFORCE_EQ( + num_nodes_before, + num_nodes_after + 50, + platform::errors::InvalidArgument( + "After the fused_multi_transformer_decoder_fuse_qkv_pass, " + "The node num in graph should be %d, but the result is %d", + num_nodes_before - 50, + num_nodes_after)); 
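+  // Exactly one fused_multi_transformer op should remain after the fuse_qkv pass.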
+ PADDLE_ENFORCE_EQ(num_fused_nodes_after, + 1, + platform::errors::InvalidArgument( + "After the fused_multi_transformer_decoder_fuse_qkv " + "pass, there should be one fused_multi_transformer " + "op, but the result is %d", + num_fused_nodes_after)); +} + +TEST(FusedMultiTransformerDecoderFuseQKVPass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("fused_multi_transformer_decoder_fuse_qkv_pass")); +} + +TEST(MultiDevicesFusedMultiTransformerDecoderFuseQKVPass, basic) { + // inputs operator output + // -------------------------------------------------------------------- + // (x, ln_scale, ln_bias) layer_norm -> layer_norm_out + // (layer_norm_out) c_identity -> c_identity_out + // (c_identity_out, weights_0) matmul_v2 -> matmul_out0 + // (matmul_out0, bias_0) elementwise_add -> eltadd_0 + // (eltadd_0) reshape2 -> reshape_0 + // (reshape_0) transpose2 -> transpose_0 + // (transpose_0) split -> split_q, split_k, + // split_v (split_k) concat -> concat_k + // (split_v) concat -> concat_v + // (concat_k) assign -> assign_k + // (concat_v) assign -> assign_v + // (split_q, split_k) matmul -> matmul_qk + // (matmul_qk, bias_qk) elementwise_add -> eltadd_qk + // (eltadd_qk) softmax -> softmax_qk + // (softmax_qk, transpose_2) matmul_v2 -> matmul_qkv + // (matmul_qkv) transpose -> transpose_qkv + // (transpose_qkv) reshape -> reshape_qkv + // (reshape_qkv) matmul_v2 -> matmul_linear + // (matmul_linear) c_allreduce_sum -> c_all_reduce_out + // (matmul_linear) elementwise_add -> eltadd_linear + // (eltadd_out) elementwise_add -> attention_out + // + // (attention_out, scale, bias) layer_norm -> ffn_layer_norm_out + // (ffn_layer_norm_out) c_identity -> ffn_c_identity_out + // (layer_norm_out, ffn_matmul0_w) matmul_v2 -> ffn_matmul0 + // (ffn_matmul0, ffn_bias0) elementwise_add -> ffn_eltadd0 + // (ffn_eltadd0) gelu -> ffn_gelu + // (ffn_gelu) matmul_v2 -> ffn_matmul1 + // (ffn_matmul1) c_allreduce_sum -> c_allreduce_out + // (ffn_matmul1, ffn_bias1) elementwise_add -> ffn_eltadd1 + // (attention_out, ffn_eltadd1) elementwise_add -> ffn_output + // + // (transpose_1, transpose_2) while -> decoder block + + Layers layers; + // MHA: pre LayerNorm + auto* x = layers.data("x", {1, 128, 1024}); + auto* ln_scale = layers.data("ln_scale", {1024}, true); + auto* ln_bias = layers.data("ln_bias", {1024}, true); + auto* ln_out = layers.layer_norm(x, ln_scale, ln_bias)[0]; + auto* c_identity_out = layers.c_identity(ln_out); + + // MHA: QKV fc + auto* weights_0 = layers.data("weights0", {1024, 3072}, true); + auto* matmul_out_0 = + layers.matmul_v2(c_identity_out, weights_0, nullptr, false, true); + + auto* b0 = layers.data("bias_0", {3072}, true); + auto* elementwise_out_0 = + layers.elementwise_add(matmul_out_0, b0, nullptr, 2); + + std::vector shape = {1, 128, 16, 64}; + auto* reshape_0 = layers.reshape2(elementwise_out_0, shape, true); + + std::vector axis = {0, 2, 1, 3}; + auto* transpose_0 = layers.transpose2(reshape_0, axis, true); + + auto split_outs = layers.split(transpose_0, 3, 3); + auto* split_q = split_outs[0]; + auto* split_k = split_outs[1]; + auto* split_v = split_outs[2]; + + auto* cache_k = layers.data("cache_k", {1, 16, 128, 64}); + auto* cache_v = layers.data("cache_v", {1, 16, 128, 64}); + auto* concat_k = layers.concat({cache_k, split_k}, 2); + auto* concat_v = layers.concat({cache_v, split_v}, 2); + layers.assign(concat_k); + layers.assign(concat_v); + + // MHA: QK matmul + auto* matmul_qk = 
layers.matmul(split_q, concat_k, nullptr, false, true); + + auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true); + auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk); + auto* softmax_qk = layers.softmax(elementwise_qk, -1); + + // MHA: QKV matmul + auto* matmul_qkv = layers.matmul_v2(softmax_qk, concat_v); + + auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true); + auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 1024}, true); + + // MHA: out Linear + auto* weights_l = layers.data("weights_l", {1024, 1024}, true); + auto* bias_l = layers.data("weightsl", {1024, 1024}, true); + auto* linear_matmut_out = + layers.matmul_v2(reshape_qkv_out, weights_l, nullptr, false, true); + auto* c_allreduce_out = layers.c_allreduce_sum(linear_matmut_out); + auto* linear_eltadd_out = + layers.elementwise_add(c_allreduce_out, bias_l, nullptr, 2); + + auto* attention_out = layers.elementwise_add(x, linear_eltadd_out); + + // FFN: pre LayerNorm + auto* ffn_ln_scale = layers.data("ffn_ln_scale", {1024}, true); + auto* ffn_ln_bias = layers.data("ffn_ln_bias", {1024}, true); + auto* ffn_ln_out = + layers.layer_norm(attention_out, ffn_ln_scale, ffn_ln_bias)[0]; + auto* ffn_c_identity_out = layers.c_identity(ffn_ln_out); + + // FFN: fc1 -> gelu -> fc2 + auto* ffn_weights0 = layers.data("ffn_weights0", {1024, 4096}, true); + auto* ffn_weights1 = layers.data("ffn_weights1", {4096, 1024}, true); + auto* ffn_bias0 = layers.data("ffn_bias0", {4096}, true); + auto* ffn_bias1 = layers.data("ffn_bias1", {1024}, true); + auto* ffn_matmul0_out = + layers.matmul_v2(ffn_c_identity_out, ffn_weights0, nullptr, false, true); + auto* ffn_eltadd0_out = + layers.elementwise_add(ffn_matmul0_out, ffn_bias0, nullptr, 2); + auto* ffn_gelu_out = layers.gelu(ffn_eltadd0_out); + auto* ffn_matmul1_out = + layers.matmul_v2(ffn_gelu_out, ffn_weights1, nullptr, false, true); + auto* ffn_c_allreduce_out = layers.c_allreduce_sum(ffn_matmul1_out); + auto* ffn_eltadd1_out = + layers.elementwise_add(ffn_c_allreduce_out, ffn_bias1, nullptr, 2); + + layers.elementwise_add(attention_out, ffn_eltadd1_out); + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + graph->Set("__param_scope__", CreateParamScope()); + + auto pass = PassRegistry::Instance().Get( + "multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass"); + if (pass.get() == nullptr) + LOG(INFO) + << "get multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass " + "failed"; + int num_nodes_before = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + + graph.reset(pass->Apply(graph.release())); + int num_nodes_after = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + int num_fused_nodes_after = GetNumOpNodes(graph, "fused_multi_transformer"); + + PADDLE_ENFORCE_EQ( + num_nodes_before, + num_nodes_after + 58, + platform::errors::InvalidArgument( + "After the fused_multi_transformer_decoder_fuse_qkv_pass, " + "The node num in graph should be %d, but the result is %d", + num_nodes_before - 58, + num_nodes_after)); + PADDLE_ENFORCE_EQ(num_fused_nodes_after, + 1, + platform::errors::InvalidArgument( + "After the fused_multi_transformer_decoder_fuse_qkv " + "multi-devices pass, there should be one " + "fused_multi_transformer op, but the result is %d", + num_fused_nodes_after)); +} + +TEST(MultiDevicesFusedMultiTransformerDecoderFuseQKVPass, + pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible( + 
"multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass")); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(fused_multi_transformer_decoder_pass); +USE_PASS(fused_multi_transformer_decoder_fuse_qkv_pass); +USE_PASS(multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass); diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc new file mode 100644 index 00000000000000..f9f0cc9f937d27 --- /dev/null +++ b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.cc @@ -0,0 +1,3284 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.h" + +namespace paddle { +namespace framework { +class Scope; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +PDNode* FusedMultiTransformerEncoderPattern::operator()() { + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_op_input("layer_norm", "X"); + + // pre-LayerNorm + auto* layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto* layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Mean"); + auto* layer_norm_variance_var = + pattern->NewNode(layer_norm_variance_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Variance"); + auto* layer_norm_out_var = pattern->NewNode(layer_norm_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("matmul_v2", "X") + ->assert_more([](Node* x) { + if (x->outputs.size() == 3) { + return true; + } else { + return false; + } + }); + + layer_norm->LinksFrom({input0, layer_norm_bias_var, layer_norm_scale_var}) + .LinksTo( + {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var}); + + // Q path Nodes + auto* matmul0 = pattern->NewNode(matmul0_repr())->assert_is_op("matmul_v2"); + auto* matmul0_w_var = pattern->NewNode(matmul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul0_out_var = pattern->NewNode(matmul0_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd0 = + pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + auto* eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add") 
+ ->AsIntermediate() + ->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + auto* reshape2_0_out_var = pattern->NewNode(reshape2_0_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("transpose2"); + + auto* transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2") + ->AsIntermediate() + ->assert_is_op_input("matmul", "X"); + + // Q path Links + matmul0->LinksFrom({layer_norm_out_var, matmul0_w_var}) + .LinksTo({matmul0_out_var}); + eltadd0->LinksFrom({matmul0_out_var, eltadd0_b_var}) + .LinksTo({eltadd0_out_var}); + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + + // K path Nodes + auto* matmul1 = pattern->NewNode(matmul1_repr())->assert_is_op("matmul_v2"); + auto* matmul1_w_var = pattern->NewNode(matmul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul1_out_var = pattern->NewNode(matmul1_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd1 = + pattern->NewNode(eltadd1_repr())->assert_is_op("elementwise_add"); + auto* eltadd1_b_var = pattern->NewNode(eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + + auto* eltadd1_out_var = pattern->NewNode(eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("reshape2"); + + auto* reshape2_1 = + pattern->NewNode(reshape2_1_repr())->assert_is_op("reshape2"); + auto* reshape2_1_out_var = pattern->NewNode(reshape2_1_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("transpose2"); + + auto* transpose2_1 = + pattern->NewNode(transpose2_1_repr())->assert_is_op("transpose2"); + auto* transpose2_1_out_var = pattern->NewNode(transpose2_1_out_repr()) + ->assert_is_op_output("transpose2") + ->AsOutput() + ->assert_is_op_input("matmul", "Y") + ->assert_is_op_input("while") + ->assert_more([](Node* x) { + if (x->outputs.size() == 2) { + return true; + } else { + return false; + } + }); + + // K path Links + matmul1->LinksFrom({layer_norm_out_var, matmul1_w_var}) + .LinksTo({matmul1_out_var}); + eltadd1->LinksFrom({matmul1_out_var, eltadd1_b_var}) + .LinksTo({eltadd1_out_var}); + reshape2_1->LinksFrom({eltadd1_out_var}).LinksTo({reshape2_1_out_var}); + transpose2_1->LinksFrom({reshape2_1_out_var}).LinksTo({transpose2_1_out_var}); + + // V path Nodes + auto* matmul2 = pattern->NewNode(matmul2_repr())->assert_is_op("matmul_v2"); + auto* matmul2_w_var = pattern->NewNode(matmul2_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul2_out_var = pattern->NewNode(matmul2_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd2 = + pattern->NewNode(eltadd2_repr())->assert_is_op("elementwise_add"); + auto* eltadd2_b_var = pattern->NewNode(eltadd2_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd2_out_var = pattern->NewNode(eltadd2_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("reshape2"); + + auto* reshape2_2 = + pattern->NewNode(reshape2_2_repr())->assert_is_op("reshape2"); + auto* reshape2_2_out_var = 
pattern->NewNode(reshape2_2_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("transpose2"); + + auto* transpose2_2 = + pattern->NewNode(transpose2_2_repr())->assert_is_op("transpose2"); + auto* transpose2_2_out_var = pattern->NewNode(transpose2_2_out_repr()) + ->assert_is_op_output("transpose2") + ->AsOutput() + ->assert_is_op_input("matmul_v2", "Y") + ->assert_is_op_input("while") + ->assert_more([](Node* x) { + if (x->outputs.size() == 2) { + return true; + } else { + return false; + } + }); + + // V path Links + matmul2->LinksFrom({layer_norm_out_var, matmul2_w_var}) + .LinksTo({matmul2_out_var}); + eltadd2->LinksFrom({matmul2_out_var, eltadd2_b_var}) + .LinksTo({eltadd2_out_var}); + reshape2_2->LinksFrom({eltadd2_out_var}).LinksTo({reshape2_2_out_var}); + transpose2_2->LinksFrom({reshape2_2_out_var}).LinksTo({transpose2_2_out_var}); + + // QK path Nodes + auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("softmax"); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr()) + ->assert_is_op_output("softmax") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2", "X"); + + // QK path Linsk + matmul_qk->LinksFrom({transpose2_0_out_var, transpose2_1_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + + // QKV path Nodes + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_op("matmul_v2"); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_op_output("matmul_v2"); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = + pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2"); // -> out_linear + + auto* matmul_linear = + pattern->NewNode(matmul_linear_repr())->assert_is_op("matmul_v2"); + auto* matmul_linear_w_var = pattern->NewNode(matmul_linear_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul_linear_out_var = pattern->NewNode(matmul_linear_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd_linear = + pattern->NewNode(eltadd_linear_repr())->assert_is_op("elementwise_add"); + auto* eltadd_linear_b_var = pattern->NewNode(eltadd_linear_b_repr()) + ->AsInput() + 
->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_linear_out_var = pattern->NewNode(eltadd_linear_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd_out = + pattern->NewNode(eltadd_out_repr())->assert_is_op("elementwise_add"); + auto* attention_output = pattern->NewNode(attention_output_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + // QKV path Links + matmul_qkv->LinksFrom({softmax_qk_out_var, transpose2_2_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + matmul_linear->LinksFrom({reshape2_qkv_out_var, matmul_linear_w_var}) + .LinksTo({matmul_linear_out_var}); + eltadd_linear->LinksFrom({matmul_linear_out_var, eltadd_linear_b_var}) + .LinksTo({eltadd_linear_out_var}); + eltadd_out->LinksFrom({input0, eltadd_linear_out_var}) + .LinksTo({attention_output}); + + // while loop + auto* while0 = pattern->NewNode(while0_repr())->assert_is_op("while"); + while0->LinksFrom({transpose2_1_out_var, transpose2_2_out_var}); + + // Feed Forward LayerNorm Nodes + auto* ffn_layer_norm = + pattern->NewNode(ffn_layer_norm_repr())->assert_is_op("layer_norm"); + auto* ffn_layer_norm_scale_var = + pattern->NewNode(ffn_layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* ffn_layer_norm_bias_var = + pattern->NewNode(ffn_layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* ffn_layer_norm_mean_var = + pattern->NewNode(ffn_layer_norm_mean_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Mean"); + auto* ffn_layer_norm_variance_var = + pattern->NewNode(ffn_layer_norm_variance_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Variance"); + auto* ffn_layer_norm_out_var = pattern->NewNode(ffn_layer_norm_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("matmul_v2", "X"); + + ffn_layer_norm + ->LinksFrom( + {attention_output, ffn_layer_norm_bias_var, ffn_layer_norm_scale_var}) + .LinksTo({ffn_layer_norm_out_var, + ffn_layer_norm_mean_var, + ffn_layer_norm_variance_var}); + + // Feed Forward fc1 -> gelu -> fc2 + auto* ffn_matmul0 = + pattern->NewNode(ffn_matmul0_repr())->assert_is_op("matmul_v2"); + auto* ffn_matmul0_w_var = pattern->NewNode(ffn_matmul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* ffn_matmul0_out_var = pattern->NewNode(ffn_matmul0_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd0 = + pattern->NewNode(ffn_eltadd0_repr())->assert_is_op("elementwise_add"); + auto* ffn_eltadd0_b_var = pattern->NewNode(ffn_eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* ffn_eltadd0_out_var = pattern->NewNode(ffn_eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("gelu"); + + auto* ffn_gelu = pattern->NewNode(ffn_gelu_repr())->assert_is_op("gelu"); + auto* ffn_gelu_out_var = pattern->NewNode(ffn_gelu_out_repr()) + ->assert_is_op_output("gelu") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2"); + + auto* ffn_matmul1 = + pattern->NewNode(ffn_matmul1_repr())->assert_is_op("matmul_v2"); + auto* ffn_matmul1_w_var 
= pattern->NewNode(ffn_matmul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* ffn_matmul1_out_var = pattern->NewNode(ffn_matmul1_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd1 = + pattern->NewNode(ffn_eltadd1_repr())->assert_is_op("elementwise_add"); + auto* ffn_eltadd1_b_var = pattern->NewNode(ffn_eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* ffn_eltadd1_out_var = pattern->NewNode(ffn_eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd_out = + pattern->NewNode(ffn_eltadd_out_repr())->assert_is_op("elementwise_add"); + auto* ffn_output = pattern->NewNode(ffn_output_repr()) + ->assert_is_op_output("elementwise_add") + ->AsOutput(); + + ffn_matmul0->LinksFrom({ffn_layer_norm_out_var, ffn_matmul0_w_var}) + .LinksTo({ffn_matmul0_out_var}); + ffn_eltadd0->LinksFrom({ffn_matmul0_out_var, ffn_eltadd0_b_var}) + .LinksTo({ffn_eltadd0_out_var}); + ffn_gelu->LinksFrom({ffn_eltadd0_out_var}).LinksTo({ffn_gelu_out_var}); + ffn_matmul1->LinksFrom({ffn_gelu_out_var, ffn_matmul1_w_var}) + .LinksTo({ffn_matmul1_out_var}); + ffn_eltadd1->LinksFrom({ffn_matmul1_out_var, ffn_eltadd1_b_var}) + .LinksTo({ffn_eltadd1_out_var}); + + ffn_eltadd_out->LinksFrom({attention_output, ffn_eltadd1_out_var}) + .LinksTo({ffn_output}); + + return ffn_output; +} + +PDNode* FusedMultiTransformerEncoderFuseQKVPattern::operator()() { + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_op_input("layer_norm", "X"); + + // pre-LayerNorm + auto* layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto* layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Mean"); + auto* layer_norm_variance_var = + pattern->NewNode(layer_norm_variance_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Variance"); + auto* layer_norm_out_var = pattern->NewNode(layer_norm_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("matmul_v2", "X"); + + layer_norm->LinksFrom({input0, layer_norm_bias_var, layer_norm_scale_var}) + .LinksTo( + {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var}); + + // QKV fused path Nodes + auto* matmul0 = pattern->NewNode(matmul0_repr())->assert_is_op("matmul_v2"); + auto* matmul0_w_var = pattern->NewNode(matmul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul0_out_var = pattern->NewNode(matmul0_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd0 = + pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + auto* eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + 
pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + auto* reshape2_0_out_var = pattern->NewNode(reshape2_0_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("transpose2"); + + auto* transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2") + ->AsIntermediate() + ->assert_is_op_input("split", "X"); + + auto* split0 = pattern->NewNode(split0_repr())->assert_is_op("split"); + auto* split0_q_out_var = pattern->NewNode(split0_q_out_repr()) + ->assert_is_op_output("split") + ->AsIntermediate() + ->assert_is_op_input("matmul", "X"); + auto* split0_k_out_var = pattern->NewNode(split0_k_out_repr()) + ->assert_is_op_output("split") + ->AsOutput() + ->assert_is_op_input("matmul", "Y") + ->assert_is_op_input("while"); + auto* split0_v_out_var = pattern->NewNode(split0_v_out_repr()) + ->assert_is_op_output("split") + ->AsOutput() + ->assert_is_op_input("matmul_v2", "Y") + ->assert_is_op_input("while"); + + // QKV fused path Links + matmul0->LinksFrom({layer_norm_out_var, matmul0_w_var}) + .LinksTo({matmul0_out_var}); + eltadd0->LinksFrom({matmul0_out_var, eltadd0_b_var}) + .LinksTo({eltadd0_out_var}); + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + split0->LinksFrom({transpose2_0_out_var}) + .LinksTo({split0_q_out_var, split0_k_out_var, split0_v_out_var}); + + // while loop + auto* while0 = pattern->NewNode(while0_repr())->assert_is_op("while"); + while0->LinksFrom({split0_k_out_var, split0_v_out_var}); + + // QK path Nodes + auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("softmax"); + + auto* softmax_qk = + pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr()) + ->assert_is_op_output("softmax") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2", "X"); + + // QK path Linsk + matmul_qk->LinksFrom({split0_q_out_var, split0_k_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + + // QKV path Nodes + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_op("matmul_v2"); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_op_output("matmul_v2"); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + 
pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = + pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2"); // -> out_linear + + auto* matmul_linear = + pattern->NewNode(matmul_linear_repr())->assert_is_op("matmul_v2"); + auto* matmul_linear_w_var = pattern->NewNode(matmul_linear_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul_linear_out_var = pattern->NewNode(matmul_linear_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd_linear = + pattern->NewNode(eltadd_linear_repr())->assert_is_op("elementwise_add"); + auto* eltadd_linear_b_var = pattern->NewNode(eltadd_linear_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_linear_out_var = pattern->NewNode(eltadd_linear_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd_out = + pattern->NewNode(eltadd_out_repr())->assert_is_op("elementwise_add"); + auto* attention_output = pattern->NewNode(attention_output_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + // QKV path Links + matmul_qkv->LinksFrom({softmax_qk_out_var, split0_v_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + matmul_linear->LinksFrom({reshape2_qkv_out_var, matmul_linear_w_var}) + .LinksTo({matmul_linear_out_var}); + eltadd_linear->LinksFrom({matmul_linear_out_var, eltadd_linear_b_var}) + .LinksTo({eltadd_linear_out_var}); + eltadd_out->LinksFrom({input0, eltadd_linear_out_var}) + .LinksTo({attention_output}); + + // Feed Forward LayerNorm Nodes + auto* ffn_layer_norm = + pattern->NewNode(ffn_layer_norm_repr())->assert_is_op("layer_norm"); + auto* ffn_layer_norm_scale_var = + pattern->NewNode(ffn_layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* ffn_layer_norm_bias_var = + pattern->NewNode(ffn_layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* ffn_layer_norm_mean_var = + pattern->NewNode(ffn_layer_norm_mean_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Mean"); + auto* ffn_layer_norm_variance_var = + pattern->NewNode(ffn_layer_norm_variance_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Variance"); + auto* ffn_layer_norm_out_var = pattern->NewNode(ffn_layer_norm_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("matmul_v2", "X"); + + ffn_layer_norm + ->LinksFrom( + {attention_output, ffn_layer_norm_bias_var, ffn_layer_norm_scale_var}) + .LinksTo({ffn_layer_norm_out_var, + ffn_layer_norm_mean_var, + ffn_layer_norm_variance_var}); + + // Feed Forward fc1 -> gelu -> fc2 + auto* ffn_matmul0 = + pattern->NewNode(ffn_matmul0_repr())->assert_is_op("matmul_v2"); + auto* ffn_matmul0_w_var = pattern->NewNode(ffn_matmul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* ffn_matmul0_out_var = pattern->NewNode(ffn_matmul0_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd0 = + 
pattern->NewNode(ffn_eltadd0_repr())->assert_is_op("elementwise_add"); + auto* ffn_eltadd0_b_var = pattern->NewNode(ffn_eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* ffn_eltadd0_out_var = pattern->NewNode(ffn_eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("gelu"); + + auto* ffn_gelu = pattern->NewNode(ffn_gelu_repr())->assert_is_op("gelu"); + auto* ffn_gelu_out_var = pattern->NewNode(ffn_gelu_out_repr()) + ->assert_is_op_output("gelu") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2"); + + auto* ffn_matmul1 = + pattern->NewNode(ffn_matmul1_repr())->assert_is_op("matmul_v2"); + auto* ffn_matmul1_w_var = pattern->NewNode(ffn_matmul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* ffn_matmul1_out_var = pattern->NewNode(ffn_matmul1_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd1 = + pattern->NewNode(ffn_eltadd1_repr())->assert_is_op("elementwise_add"); + auto* ffn_eltadd1_b_var = pattern->NewNode(ffn_eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* ffn_eltadd1_out_var = pattern->NewNode(ffn_eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd_out = + pattern->NewNode(ffn_eltadd_out_repr())->assert_is_op("elementwise_add"); + auto* ffn_output = pattern->NewNode(ffn_output_repr()) + ->assert_is_op_output("elementwise_add") + ->AsOutput(); + + ffn_matmul0->LinksFrom({ffn_layer_norm_out_var, ffn_matmul0_w_var}) + .LinksTo({ffn_matmul0_out_var}); + ffn_eltadd0->LinksFrom({ffn_matmul0_out_var, ffn_eltadd0_b_var}) + .LinksTo({ffn_eltadd0_out_var}); + ffn_gelu->LinksFrom({ffn_eltadd0_out_var}).LinksTo({ffn_gelu_out_var}); + ffn_matmul1->LinksFrom({ffn_gelu_out_var, ffn_matmul1_w_var}) + .LinksTo({ffn_matmul1_out_var}); + ffn_eltadd1->LinksFrom({ffn_matmul1_out_var, ffn_eltadd1_b_var}) + .LinksTo({ffn_eltadd1_out_var}); + + ffn_eltadd_out->LinksFrom({attention_output, ffn_eltadd1_out_var}) + .LinksTo({ffn_output}); + + return ffn_output; +} + +PDNode* MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern::operator()() { + auto* input0 = pattern->NewNode(input0_repr()); + input0->assert_is_op_input("layer_norm", "X"); + + // pre-LayerNorm + auto* layer_norm = + pattern->NewNode(layer_norm_repr())->assert_is_op("layer_norm"); + auto* layer_norm_scale_var = pattern->NewNode(layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* layer_norm_bias_var = pattern->NewNode(layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* layer_norm_mean_var = pattern->NewNode(layer_norm_mean_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Mean"); + auto* layer_norm_variance_var = + pattern->NewNode(layer_norm_variance_repr()) + ->AsOutput() + ->assert_is_op_output("layer_norm", "Variance"); + auto* layer_norm_out_var = pattern->NewNode(layer_norm_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("c_identity", "X"); + + layer_norm->LinksFrom({input0, layer_norm_bias_var, layer_norm_scale_var}) + .LinksTo( + {layer_norm_out_var, layer_norm_mean_var, layer_norm_variance_var}); + + // communication c_identity + auto* c_identity = + 
pattern->NewNode(c_identity_repr())->assert_is_op("c_identity"); + auto* c_identity_out_var = pattern->NewNode(c_identity_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("c_identity", "Out") + ->assert_is_op_input("matmul_v2", "X"); + c_identity->LinksFrom({layer_norm_out_var}).LinksTo({c_identity_out_var}); + + // QKV fused path Nodes + auto* matmul0 = pattern->NewNode(matmul0_repr())->assert_is_op("matmul_v2"); + auto* matmul0_w_var = pattern->NewNode(matmul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul0_out_var = pattern->NewNode(matmul0_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd0 = + pattern->NewNode(eltadd0_repr())->assert_is_op("elementwise_add"); + auto* eltadd0_b_var = pattern->NewNode(eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd0_out_var = pattern->NewNode(eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("reshape2"); + + auto* reshape2_0 = + pattern->NewNode(reshape2_0_repr())->assert_is_op("reshape2"); + auto* reshape2_0_out_var = pattern->NewNode(reshape2_0_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("transpose2"); + + auto* transpose2_0 = + pattern->NewNode(transpose2_0_repr())->assert_is_op("transpose2"); + auto* transpose2_0_out_var = pattern->NewNode(transpose2_0_out_repr()) + ->assert_is_op_output("transpose2") + ->AsIntermediate() + ->assert_is_op_input("split", "X"); + + auto* split0 = pattern->NewNode(split0_repr())->assert_is_op("split"); + auto* split0_q_out_var = pattern->NewNode(split0_q_out_repr()) + ->assert_is_op_output("split") + ->AsIntermediate() + ->assert_is_op_input("matmul", "X"); + auto* split0_k_out_var = pattern->NewNode(split0_k_out_repr()) + ->assert_is_op_output("split") + ->AsOutput() + ->assert_is_op_input("matmul", "Y") + ->assert_is_op_input("while"); + auto* split0_v_out_var = pattern->NewNode(split0_v_out_repr()) + ->assert_is_op_output("split") + ->AsOutput() + ->assert_is_op_input("matmul_v2", "Y") + ->assert_is_op_input("while"); + + // QKV fused path Links + matmul0->LinksFrom({c_identity_out_var, matmul0_w_var}) + .LinksTo({matmul0_out_var}); + eltadd0->LinksFrom({matmul0_out_var, eltadd0_b_var}) + .LinksTo({eltadd0_out_var}); + reshape2_0->LinksFrom({eltadd0_out_var}).LinksTo({reshape2_0_out_var}); + transpose2_0->LinksFrom({reshape2_0_out_var}).LinksTo({transpose2_0_out_var}); + split0->LinksFrom({transpose2_0_out_var}) + .LinksTo({split0_q_out_var, split0_k_out_var, split0_v_out_var}); + + // while loop + auto* while0 = pattern->NewNode(while0_repr())->assert_is_op("while"); + while0->LinksFrom({split0_k_out_var, split0_v_out_var}); + + // QK path Nodes + auto* matmul_qk = pattern->NewNode(matmul_qk_repr())->assert_is_op("matmul"); + auto* matmul_qk_out_var = + pattern->NewNode(matmul_qk_out_repr())->assert_is_op_output("matmul"); + matmul_qk_out_var->AsIntermediate()->assert_is_op_input("elementwise_add"); + + auto* eltadd_qk = + pattern->NewNode(eltadd_qk_repr())->assert_is_op("elementwise_add"); + auto* eltadd_qk_b_var = pattern->NewNode(eltadd_qk_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_qk_out_var = pattern->NewNode(eltadd_qk_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("softmax"); + + auto* softmax_qk = + 
pattern->NewNode(softmax_qk_repr())->assert_is_op("softmax"); + auto* softmax_qk_out_var = pattern->NewNode(softmax_qk_out_repr()) + ->assert_is_op_output("softmax") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2", "X"); + + // QK path Linsk + matmul_qk->LinksFrom({split0_q_out_var, split0_k_out_var}) + .LinksTo({matmul_qk_out_var}); + eltadd_qk->LinksFrom({matmul_qk_out_var, eltadd_qk_b_var}) + .LinksTo({eltadd_qk_out_var}); + softmax_qk->LinksFrom({eltadd_qk_out_var}).LinksTo({softmax_qk_out_var}); + + // QKV path Nodes + auto* matmul_qkv = + pattern->NewNode(matmul_qkv_repr())->assert_is_op("matmul_v2"); + auto* matmul_qkv_out_var = + pattern->NewNode(matmul_qkv_out_repr())->assert_is_op_output("matmul_v2"); + matmul_qkv_out_var->AsIntermediate()->assert_is_op_input("transpose2"); + + auto* transpose2_qkv = + pattern->NewNode(transpose2_qkv_repr())->assert_is_op("transpose2"); + auto* transpose2_qkv_out_var = pattern->NewNode(transpose2_qkv_out_repr()) + ->assert_is_op_output("transpose2"); + transpose2_qkv_out_var->AsIntermediate()->assert_is_op_input("reshape2"); + + auto* reshape2_qkv = + pattern->NewNode(reshape2_qkv_repr())->assert_is_op("reshape2"); + auto* reshape2_qkv_out_var = + pattern->NewNode(reshape2_qkv_out_repr()) + ->assert_is_op_output("reshape2") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2"); // -> out_linear + + auto* matmul_linear = + pattern->NewNode(matmul_linear_repr())->assert_is_op("matmul_v2"); + auto* matmul_linear_w_var = pattern->NewNode(matmul_linear_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* matmul_linear_out_var = pattern->NewNode(matmul_linear_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("c_allreduce_sum"); + + // communication c_allreduce_sum + auto* c_allreduce_sum = + pattern->NewNode(c_allreduce_sum_repr())->assert_is_op("c_allreduce_sum"); + auto* c_allreduce_sum_out_var = pattern->NewNode(c_allreduce_sum_out_repr()) + ->assert_is_op_output("c_allreduce_sum") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd_linear = + pattern->NewNode(eltadd_linear_repr())->assert_is_op("elementwise_add"); + auto* eltadd_linear_b_var = pattern->NewNode(eltadd_linear_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* eltadd_linear_out_var = pattern->NewNode(eltadd_linear_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* eltadd_out = + pattern->NewNode(eltadd_out_repr())->assert_is_op("elementwise_add"); + auto* attention_output = pattern->NewNode(attention_output_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + // QKV path Links + matmul_qkv->LinksFrom({softmax_qk_out_var, split0_v_out_var}) + .LinksTo({matmul_qkv_out_var}); + transpose2_qkv->LinksFrom({matmul_qkv_out_var}) + .LinksTo({transpose2_qkv_out_var}); + reshape2_qkv->LinksFrom({transpose2_qkv_out_var}) + .LinksTo({reshape2_qkv_out_var}); + matmul_linear->LinksFrom({reshape2_qkv_out_var, matmul_linear_w_var}) + .LinksTo({matmul_linear_out_var}); + c_allreduce_sum->LinksFrom({matmul_linear_out_var}) + .LinksTo({c_allreduce_sum_out_var}); + eltadd_linear->LinksFrom({c_allreduce_sum_out_var, eltadd_linear_b_var}) + .LinksTo({eltadd_linear_out_var}); + eltadd_out->LinksFrom({input0, eltadd_linear_out_var}) + .LinksTo({attention_output}); + + // Feed Forward LayerNorm Nodes + auto* ffn_layer_norm = + 
pattern->NewNode(ffn_layer_norm_repr())->assert_is_op("layer_norm"); + auto* ffn_layer_norm_scale_var = + pattern->NewNode(ffn_layer_norm_scale_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Scale"); + auto* ffn_layer_norm_bias_var = + pattern->NewNode(ffn_layer_norm_bias_repr()) + ->AsInput() + ->assert_is_persistable_var() + ->assert_is_op_input("layer_norm", "Bias"); + auto* ffn_layer_norm_mean_var = + pattern->NewNode(ffn_layer_norm_mean_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Mean"); + auto* ffn_layer_norm_variance_var = + pattern->NewNode(ffn_layer_norm_variance_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Variance"); + auto* ffn_layer_norm_out_var = pattern->NewNode(ffn_layer_norm_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("layer_norm", "Y") + ->assert_is_op_input("c_identity", "X"); + + ffn_layer_norm + ->LinksFrom( + {attention_output, ffn_layer_norm_bias_var, ffn_layer_norm_scale_var}) + .LinksTo({ffn_layer_norm_out_var, + ffn_layer_norm_mean_var, + ffn_layer_norm_variance_var}); + + // communication c_identity + auto* ffn_c_identity = + pattern->NewNode(ffn_c_identity_repr())->assert_is_op("c_identity"); + auto* ffn_c_identity_out_var = pattern->NewNode(ffn_c_identity_out_repr()) + ->assert_is_op_output("c_identity", "Out") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2", "X"); + ffn_c_identity->LinksFrom({ffn_layer_norm_out_var}) + .LinksTo({ffn_c_identity_out_var}); + + // Feed Forward fc1 -> gelu -> fc2 + auto* ffn_matmul0 = + pattern->NewNode(ffn_matmul0_repr())->assert_is_op("matmul_v2"); + auto* ffn_matmul0_w_var = pattern->NewNode(ffn_matmul0_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* ffn_matmul0_out_var = pattern->NewNode(ffn_matmul0_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd0 = + pattern->NewNode(ffn_eltadd0_repr())->assert_is_op("elementwise_add"); + auto* ffn_eltadd0_b_var = pattern->NewNode(ffn_eltadd0_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* ffn_eltadd0_out_var = pattern->NewNode(ffn_eltadd0_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("gelu"); + + auto* ffn_gelu = pattern->NewNode(ffn_gelu_repr())->assert_is_op("gelu"); + auto* ffn_gelu_out_var = pattern->NewNode(ffn_gelu_out_repr()) + ->assert_is_op_output("gelu") + ->AsIntermediate() + ->assert_is_op_input("matmul_v2"); + + auto* ffn_matmul1 = + pattern->NewNode(ffn_matmul1_repr())->assert_is_op("matmul_v2"); + auto* ffn_matmul1_w_var = pattern->NewNode(ffn_matmul1_w_repr()) + ->AsInput() + ->assert_is_op_input("matmul_v2", "Y"); + auto* ffn_matmul1_out_var = pattern->NewNode(ffn_matmul1_out_repr()) + ->assert_is_op_output("matmul_v2") + ->AsIntermediate() + ->assert_is_op_input("c_allreduce_sum"); + + // communication c_allreduce_sum + auto* ffn_c_allreduce_sum = pattern->NewNode(ffn_c_allreduce_sum_repr()) + ->assert_is_op("c_allreduce_sum"); + auto* ffn_c_allreduce_sum_out_var = + pattern->NewNode(ffn_c_allreduce_sum_out_repr()) + ->assert_is_op_output("c_allreduce_sum") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd1 = + pattern->NewNode(ffn_eltadd1_repr())->assert_is_op("elementwise_add"); + auto* ffn_eltadd1_b_var = pattern->NewNode(ffn_eltadd1_b_repr()) + ->AsInput() + ->assert_is_op_input("elementwise_add", "Y"); + auto* ffn_eltadd1_out_var 
= pattern->NewNode(ffn_eltadd1_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate() + ->assert_is_op_input("elementwise_add"); + + auto* ffn_eltadd_out = + pattern->NewNode(ffn_eltadd_out_repr())->assert_is_op("elementwise_add"); + auto* ffn_output = pattern->NewNode(ffn_output_repr()) + ->assert_is_op_output("elementwise_add") + ->AsOutput(); + + ffn_matmul0->LinksFrom({ffn_c_identity_out_var, ffn_matmul0_w_var}) + .LinksTo({ffn_matmul0_out_var}); + ffn_eltadd0->LinksFrom({ffn_matmul0_out_var, ffn_eltadd0_b_var}) + .LinksTo({ffn_eltadd0_out_var}); + ffn_gelu->LinksFrom({ffn_eltadd0_out_var}).LinksTo({ffn_gelu_out_var}); + ffn_matmul1->LinksFrom({ffn_gelu_out_var, ffn_matmul1_w_var}) + .LinksTo({ffn_matmul1_out_var}); + ffn_c_allreduce_sum->LinksFrom({ffn_matmul1_out_var}) + .LinksTo({ffn_c_allreduce_sum_out_var}); + ffn_eltadd1->LinksFrom({ffn_c_allreduce_sum_out_var, ffn_eltadd1_b_var}) + .LinksTo({ffn_eltadd1_out_var}); + + ffn_eltadd_out->LinksFrom({attention_output, ffn_eltadd1_out_var}) + .LinksTo({ffn_output}); + + return ffn_output; +} + +} // namespace patterns + +template +inline void QKVWeightsProcess(framework::LoDTensor* wq_tensor, + framework::LoDTensor* wk_tensor, + framework::LoDTensor* wv_tensor, + framework::LoDTensor* bq_tensor, + framework::LoDTensor* bk_tensor, + framework::LoDTensor* bv_tensor, + const int num_head, + const int dim_head, + const int dim_embed) { + auto* wq_data = wq_tensor->mutable_data(platform::CPUPlace()); + auto* wk_data = wk_tensor->mutable_data(platform::CPUPlace()); + auto* wv_data = wv_tensor->mutable_data(platform::CPUPlace()); + auto* bq_data = bq_tensor->mutable_data(platform::CPUPlace()); + auto* bk_data = bk_tensor->mutable_data(platform::CPUPlace()); + auto* bv_data = bv_tensor->mutable_data(platform::CPUPlace()); + + auto combined_w_dims = phi::make_ddim({3, num_head, dim_head, dim_embed}); + auto combined_bias_dims = phi::make_ddim({3, num_head, dim_head}); + + framework::LoDTensor tmp_combined_w_tensor; + tmp_combined_w_tensor.Resize(combined_w_dims); + auto* tmp_combined_w_data = + tmp_combined_w_tensor.mutable_data(platform::CPUPlace()); + + std::vector w_vec = {wq_data, wk_data, wv_data}; + // Combine the three fc weights together. 
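+  // (Illustrative note, not part of the original change) Each source weight
+  // w_vec[i] is laid out as (dim_embed, num_head * dim_head) and is read below
+  // with in_idx = l * num_head * dim_head + j * dim_head + k. The combined
+  // tensor is (3, num_head, dim_head, dim_embed): every per-head slice of
+  // Q/K/V is transposed so dim_embed becomes the innermost axis, and the three
+  // weights are stacked along the leading axis (i = 0/1/2 -> Q/K/V).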
+ for (int i = 0; i < 3; i++) { + for (int j = 0; j < num_head; j++) { + for (int k = 0; k < dim_head; k++) { + for (int l = 0; l < dim_embed; l++) { + int out_idx = i * num_head * dim_head * dim_embed + + j * dim_head * dim_embed + k * dim_embed + l; + int in_idx = l * num_head * dim_head + j * dim_head + k; + tmp_combined_w_data[out_idx] = w_vec[i][in_idx]; + } + } + } + } + + wq_tensor->Resize(combined_w_dims); + auto* new_combined_w_data = wq_tensor->mutable_data(platform::CPUPlace()); + memcpy( + new_combined_w_data, tmp_combined_w_data, sizeof(T) * wq_tensor->numel()); + + framework::LoDTensor tmp_combined_bias_tensor; + tmp_combined_bias_tensor.Resize(combined_bias_dims); + auto* tmp_combined_bias_data = + tmp_combined_bias_tensor.mutable_data(platform::CPUPlace()); + + size_t bias_size = bq_tensor->numel(); + memcpy(tmp_combined_bias_data, bq_data, sizeof(T) * bias_size); + memcpy(tmp_combined_bias_data + bias_size, bk_data, sizeof(T) * bias_size); + memcpy( + tmp_combined_bias_data + 2 * bias_size, bv_data, sizeof(T) * bias_size); + + bq_tensor->Resize(combined_bias_dims); + auto* new_combined_bias_data = + bq_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_combined_bias_data, + tmp_combined_bias_data, + sizeof(T) * bq_tensor->numel()); +} + +template +inline void QKVWeightsProcessFuseQKV(framework::LoDTensor* qkv_w_tensor, + framework::LoDTensor* qkv_b_tensor, + const int num_head, + const int dim_head, + const int dim_embed) { + auto* qkv_w_data = qkv_w_tensor->mutable_data(platform::CPUPlace()); + auto transpose_w_dims = phi::make_ddim({3, num_head, dim_head, dim_embed}); + + framework::LoDTensor tmp_transpose_w_tensor; + tmp_transpose_w_tensor.Resize(transpose_w_dims); + auto* tmp_transpose_w_data = + tmp_transpose_w_tensor.mutable_data(platform::CPUPlace()); + + // transpose qkv matmul Y to QKVWeights + for (int i = 0; i < 3; i++) { + for (int j = 0; j < num_head; j++) { + for (int k = 0; k < dim_head; k++) { + for (int l = 0; l < dim_embed; l++) { + int out_idx = i * num_head * dim_head * dim_embed + + j * dim_head * dim_embed + k * dim_embed + l; + int in_idx = + l * num_head * 3 * dim_head + j * 3 * dim_head + i * dim_head + k; + tmp_transpose_w_data[out_idx] = qkv_w_data[in_idx]; + } + } + } + } + + qkv_w_tensor->Resize(transpose_w_dims); + auto* new_transpose_w_data = + qkv_w_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_transpose_w_data, + tmp_transpose_w_data, + sizeof(T) * qkv_w_tensor->numel()); + + auto* qkv_b_data = qkv_b_tensor->mutable_data(platform::CPUPlace()); + auto transpose_b_dims = phi::make_ddim({3, num_head, dim_head}); + + framework::LoDTensor tmp_transpose_b_tensor; + tmp_transpose_b_tensor.Resize(transpose_b_dims); + auto* tmp_transpose_b_data = + tmp_transpose_b_tensor.mutable_data(platform::CPUPlace()); + + // transpose qkv elemenwise_add Y to QKVBias + for (int i = 0; i < 3; i++) { + for (int j = 0; j < num_head; j++) { + for (int k = 0; k < dim_head; k++) { + int out_idx = i * num_head * dim_head + j * dim_head + k; + int in_idx = j * 3 * dim_head + i * dim_head + k; + tmp_transpose_b_data[out_idx] = qkv_b_data[in_idx]; + } + } + } + + qkv_b_tensor->Resize({3, num_head, dim_head}); + auto* new_transpose_b_data = + qkv_b_tensor->mutable_data(platform::CPUPlace()); + memcpy(new_transpose_b_data, + tmp_transpose_b_data, + sizeof(T) * qkv_b_tensor->numel()); +} + +int FusedMultiTransformerEncoderPass::BuildFusion(Graph* graph, + const std::string& name_scope, + Scope* scope) const { + GraphPatternDetector gpd; + auto* pattern = 
gpd.mutable_pattern(); + + // Create pattern. + patterns::FusedMultiTransformerEncoderPattern fused_multi_transformer_pattern( + pattern, name_scope); + fused_multi_transformer_pattern(); + + // Create New OpDesc + auto fuse_creater = [&](Node* input0, + Node* layer_norm, + Node* layer_norm_scale, + Node* layer_norm_bias, + Node* layer_norm_mean, + Node* layer_norm_variance, + Node* matmul0_w, + Node* matmul1_w, + Node* matmul2_w, + Node* eltadd0_b, + Node* eltadd1_b, + Node* eltadd2_b, + Node* transpose2_1_out, + Node* transpose2_2_out, + Node* eltadd_qk_b, + Node* reshape2_0, + Node* matmul_linear_w, + Node* eltadd_linear_b, + Node* while0, + Node* ffn_layer_norm, + Node* ffn_layer_norm_scale, + Node* ffn_layer_norm_bias, + Node* ffn_layer_norm_mean, + Node* ffn_layer_norm_variance, + Node* ffn_matmul0_w, + Node* ffn_matmul1_w, + Node* ffn_eltadd0_b, + Node* ffn_eltadd1_b, + Node* ffn_output) { + auto reshape_desc = reshape2_0->Op(); + int num_head = + PADDLE_GET_CONST(std::vector, reshape_desc->GetAttr("shape")) + .at(2); + int dim_head = + PADDLE_GET_CONST(std::vector, reshape_desc->GetAttr("shape")) + .at(3); + int dim_embed = num_head * dim_head; + + // Calc index of transformer layer by LayerNorm Scale name + // This calculation assumes: + // 1. no LayerNorm before all transformer layer + // 2. each transformer layer contains 2 LayerNorm layer + auto ln_scale_name = layer_norm_scale->Name(); + auto ln_name = ln_scale_name.substr(0, ln_scale_name.find('.')); + auto ln_idx_str = ln_name.substr(ln_name.rfind('_') + 1); + int layer_idx = atoi(ln_idx_str.c_str()) / 2; + + auto* wq_tensor = + scope->FindVar(matmul0_w->Name())->GetMutable(); + auto* wk_tensor = + scope->FindVar(matmul1_w->Name())->GetMutable(); + auto* wv_tensor = + scope->FindVar(matmul2_w->Name())->GetMutable(); + + auto* bq_tensor = + scope->FindVar(eltadd0_b->Name())->GetMutable(); + auto* bk_tensor = + scope->FindVar(eltadd1_b->Name())->GetMutable(); + auto* bv_tensor = + scope->FindVar(eltadd2_b->Name())->GetMutable(); + + if (wq_tensor->dtype() == phi::DataType::FLOAT32) { + QKVWeightsProcess(wq_tensor, + wk_tensor, + wv_tensor, + bq_tensor, + bk_tensor, + bv_tensor, + num_head, + dim_head, + dim_embed); + } else if (wq_tensor->dtype() == phi::DataType::FLOAT16) { + QKVWeightsProcess(wq_tensor, + wk_tensor, + wv_tensor, + bq_tensor, + bk_tensor, + bv_tensor, + num_head, + dim_head, + dim_embed); + } else { + PADDLE_THROW(platform::errors::Unavailable( + "fused_multi_transformer not supported weight dtype. " + "we now only support fp32 and fp16.")); + } + + // reuse the mul0_w and eltadd_0_b nodes for the combined nodes. + auto* combined_w_desc = matmul0_w->Var(); + combined_w_desc->SetShape({3, num_head, dim_head, dim_embed}); + combined_w_desc->SetPersistable(true); + + auto* combined_bias_desc = eltadd0_b->Var(); + combined_bias_desc->SetShape({3, num_head, dim_head}); + combined_bias_desc->SetPersistable(true); + + scope->EraseVars({matmul1_w->Name(), matmul2_w->Name()}); + scope->EraseVars({eltadd1_b->Name(), eltadd2_b->Name()}); + + // create fused_multi_transformer + OpDesc fused_multi_transformer_op_desc(layer_norm->Op()->Block()); + fused_multi_transformer_op_desc.SetType("fused_multi_transformer"); + + // 1. 
Input setting + fused_multi_transformer_op_desc.SetInput("X", {input0->Name()}); + + // pre-LayerNorm input + fused_multi_transformer_op_desc.SetInput("LnScale", + {layer_norm_scale->Name()}); + fused_multi_transformer_op_desc.SetInput("LnBias", + {layer_norm_bias->Name()}); + + // QKV computation input + fused_multi_transformer_op_desc.SetInput("QKVW", {matmul0_w->Name()}); + fused_multi_transformer_op_desc.SetInput("QKVBias", {eltadd0_b->Name()}); + fused_multi_transformer_op_desc.SetInput("SrcMask", {eltadd_qk_b->Name()}); + + // CacheKV input + VarDesc cache_kv_desc("cache_kv" + std::to_string(layer_idx)); + // FIXME: only support max_seq_len <= 1024 + cache_kv_desc.SetDataType( + framework::TransToProtoVarType(wq_tensor->dtype())); + cache_kv_desc.SetPersistable(false); + auto* cache_kv = graph->CreateVarNode(&cache_kv_desc); + + OpDesc fill_const_op_desc(layer_norm->Op()->Block()); + fill_const_op_desc.SetType("fill_constant_batch_size_like"); + fill_const_op_desc.SetInput("Input", {input0->Name()}); + fill_const_op_desc.SetOutput("Out", {cache_kv->Name()}); + std::vector shape = {2, -1, num_head, 1024, dim_head}; + fill_const_op_desc.SetAttr("shape", shape); + fill_const_op_desc.SetAttr("input_dim_idx", 0); + fill_const_op_desc.SetAttr("output_dim_idx", 1); + fill_const_op_desc.SetAttr("value", 0); + fill_const_op_desc.SetAttr( + "dtype", + static_cast(framework::TransToProtoVarType(wq_tensor->dtype()))); + auto* fill_const_op = graph->CreateOpNode(&fill_const_op_desc); + + fused_multi_transformer_op_desc.SetInput("CacheKV", {cache_kv->Name()}); + + // Out Linear input + fused_multi_transformer_op_desc.SetInput("OutLinearW", + {matmul_linear_w->Name()}); + fused_multi_transformer_op_desc.SetInput("OutLinearBias", + {eltadd_linear_b->Name()}); + + // Feed Forward input + fused_multi_transformer_op_desc.SetInput("FFNLnScale", + {ffn_layer_norm_scale->Name()}); + fused_multi_transformer_op_desc.SetInput("FFNLnBias", + {ffn_layer_norm_bias->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN1Weight", + {ffn_matmul0_w->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN1Bias", + {ffn_eltadd0_b->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN2Weight", + {ffn_matmul1_w->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN2Bias", + {ffn_eltadd1_b->Name()}); + + // 2. 
Output setting + fused_multi_transformer_op_desc.SetOutput("Out", {ffn_output->Name()}); + fused_multi_transformer_op_desc.SetOutput("CacheKVOut", {cache_kv->Name()}); + + // Attribute setting + fused_multi_transformer_op_desc.SetAttr("pre_layer_norm", true); + fused_multi_transformer_op_desc.SetAttr( + "epsilon", layer_norm->Op()->GetAttr("epsilon")); + + fused_multi_transformer_op_desc.SetAttr("is_test", true); + fused_multi_transformer_op_desc.SetAttr("dropout_rate", 0.0f); + + auto* fused_multi_transformer = + graph->CreateOpNode(&fused_multi_transformer_op_desc); + IR_NODE_LINK_TO(input0, fused_multi_transformer); + IR_NODE_LINK_TO(layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(layer_norm_bias, fused_multi_transformer); + + IR_NODE_LINK_TO(matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_qk_b, fused_multi_transformer); + + IR_NODE_LINK_TO(input0, fill_const_op); + IR_NODE_LINK_TO(fill_const_op, cache_kv); + IR_NODE_LINK_TO(cache_kv, fused_multi_transformer); + + IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_bias, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul1_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd1_b, fused_multi_transformer); + + IR_NODE_LINK_TO(fused_multi_transformer, ffn_output); + + // rewrite while OP input + // 1. delete k, v + // 2. delete matmul1/2_w eltadd1/2_w + // 3. add cache_kv + auto while_Xs = while0->Op()->Input("X"); + while_Xs.erase( + std::remove( + std::begin(while_Xs), std::end(while_Xs), transpose2_1_out->Name()), + std::end(while_Xs)); + while_Xs.erase( + std::remove( + std::begin(while_Xs), std::end(while_Xs), transpose2_2_out->Name()), + std::end(while_Xs)); + while_Xs.erase( + std::remove( + std::begin(while_Xs), std::end(while_Xs), matmul1_w->Name()), + std::end(while_Xs)); + while_Xs.erase( + std::remove( + std::begin(while_Xs), std::end(while_Xs), matmul2_w->Name()), + std::end(while_Xs)); + while_Xs.erase( + std::remove( + std::begin(while_Xs), std::end(while_Xs), eltadd1_b->Name()), + std::end(while_Xs)); + while_Xs.erase( + std::remove( + std::begin(while_Xs), std::end(while_Xs), eltadd2_b->Name()), + std::end(while_Xs)); + while_Xs.emplace_back(cache_kv->Name()); + while0->Op()->SetInput("X", while_Xs); + + // rewrite while OP output + // 1. delete k, v + // 2. 
add cache_kv + auto while_Outs = while0->Op()->Output("Out"); + while_Outs.erase(std::remove(std::begin(while_Outs), + std::end(while_Outs), + transpose2_1_out->Name()), + std::end(while_Outs)); + while_Outs.erase(std::remove(std::begin(while_Outs), + std::end(while_Outs), + transpose2_2_out->Name()), + std::end(while_Outs)); + while_Outs.emplace_back(cache_kv->Name()); + while0->Op()->SetOutput("Out", while_Outs); + + // link CacheKV to while + IR_NODE_LINK_TO(cache_kv, while0) + // unlink origin KV output to while + IR_NODE_UNLINK(transpose2_1_out, while0); + IR_NODE_UNLINK(transpose2_2_out, while0); + IR_NODE_UNLINK(while0, transpose2_1_out); + IR_NODE_UNLINK(while0, transpose2_2_out); + // unlink KV weight/bias to while after merged into Q weight/bias + IR_NODE_UNLINK(matmul1_w, while0); + IR_NODE_UNLINK(matmul2_w, while0); + IR_NODE_UNLINK(eltadd1_b, while0); + IR_NODE_UNLINK(eltadd2_b, while0); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "fused_multi_transformer_encoder pass in " + "op compat failed."; + return; + } + + VLOG(4) << "handle MultiTransformer encoder fuse"; + GET_IR_NODE_FROM_SUBGRAPH(input0, input0, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm, layer_norm, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_scale, layer_norm_scale, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_bias, layer_norm_bias, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_mean, layer_norm_mean, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, + layer_norm_variance, + fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm_out, layer_norm_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul0, matmul0, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul0_out, matmul0_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul0_w, matmul0_w, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_0, reshape2_0, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_0_out, reshape2_0_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_0, transpose2_0, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_0_out, transpose2_0_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul1, matmul1, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul1_out, matmul1_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul1_w, matmul1_w, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_1, reshape2_1, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_1_out, reshape2_1_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_1, transpose2_1, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_1_out, transpose2_1_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul2, matmul2, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul2_out, matmul2_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul2_w, matmul2_w, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_2, reshape2_2, fused_multi_transformer_pattern); + 
GET_IR_NODE_FROM_SUBGRAPH( + reshape2_2_out, reshape2_2_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_2, transpose2_2, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_2_out, transpose2_2_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + attention_output, attention_output, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH(while0, while0, fused_multi_transformer_pattern) + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_layer_norm, ffn_layer_norm, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_scale, + ffn_layer_norm_scale, + fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_bias, + ffn_layer_norm_bias, + fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_mean, + ffn_layer_norm_mean, + fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_variance, + ffn_layer_norm_variance, + fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_out, + ffn_layer_norm_out, + fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0, ffn_matmul0, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0_out, ffn_matmul0_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0_w, ffn_matmul0_w, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0, ffn_eltadd0, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0_b, ffn_eltadd0_b, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0_out, ffn_eltadd0_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_gelu, ffn_gelu, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_gelu_out, ffn_gelu_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1, ffn_matmul1, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1_out, ffn_matmul1_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1_w, ffn_matmul1_w, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1, ffn_eltadd1, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1_b, ffn_eltadd1_b, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1_out, ffn_eltadd1_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd_out, ffn_eltadd_out, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + ffn_output, ffn_output, fused_multi_transformer_pattern) + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0, eltadd0, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0_b, eltadd0_b, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0_out, eltadd0_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd1, eltadd1, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd1_b, eltadd1_b, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd1_out, eltadd1_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd2, eltadd2, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd2_b, eltadd2_b, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd2_out, eltadd2_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qk, matmul_qk, fused_multi_transformer_pattern); + 
GET_IR_NODE_FROM_SUBGRAPH( + matmul_qk_out, matmul_qk_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk, eltadd_qk, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk_b, eltadd_qk_b, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk_out, eltadd_qk_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + softmax_qk, softmax_qk, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + softmax_qk_out, softmax_qk_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qkv, matmul_qkv, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qkv_out, matmul_qkv_out, fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_qkv, reshape2_qkv, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_qkv_out, reshape2_qkv_out, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_qkv, transpose2_qkv, fused_multi_transformer_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, + transpose2_qkv_out, + fused_multi_transformer_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_linear, matmul_linear, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + matmul_linear_w, matmul_linear_w, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + matmul_linear_out, matmul_linear_out, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_linear, eltadd_linear, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_linear_b, eltadd_linear_b, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_linear_out, eltadd_linear_out, fused_multi_transformer_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_out, eltadd_out, fused_multi_transformer_pattern) + + fuse_creater(input0, + layer_norm, + layer_norm_scale, + layer_norm_bias, + layer_norm_mean, + layer_norm_variance, + matmul0_w, + matmul1_w, + matmul2_w, + eltadd0_b, + eltadd1_b, + eltadd2_b, + transpose2_1_out, + transpose2_2_out, + eltadd_qk_b, + reshape2_0, + matmul_linear_w, + eltadd_linear_b, + while0, + ffn_layer_norm, + ffn_layer_norm_scale, + ffn_layer_norm_bias, + ffn_layer_norm_mean, + ffn_layer_norm_variance, + ffn_matmul0_w, + ffn_matmul1_w, + ffn_eltadd0_b, + ffn_eltadd1_b, + ffn_output); + + std::unordered_set marked_nodes({layer_norm, + layer_norm_mean, + layer_norm_variance, + layer_norm_out, + matmul0, + matmul1, + matmul2, + matmul0_out, + matmul1_out, + matmul2_out, + eltadd0, + eltadd1, + eltadd2, + eltadd0_out, + eltadd1_out, + eltadd2_out, + reshape2_0, + reshape2_1, + reshape2_2, + reshape2_0_out, + reshape2_1_out, + reshape2_2_out, + transpose2_0, + transpose2_1, + transpose2_2, + transpose2_0_out, + transpose2_1_out, + transpose2_2_out, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + reshape2_qkv, + transpose2_qkv, + transpose2_qkv_out, + matmul_linear, + matmul_linear_out, + eltadd_linear, + eltadd_linear_out, + eltadd_out, + ffn_layer_norm, + ffn_layer_norm_mean, + ffn_layer_norm_variance, + ffn_layer_norm_out, + ffn_matmul0, + ffn_matmul1, + ffn_matmul0_out, + ffn_matmul1_out, + ffn_eltadd0, + ffn_eltadd1, + ffn_eltadd0_out, + ffn_eltadd1_out, + ffn_gelu, + ffn_gelu_out, + ffn_eltadd_out}); + + // Remove unneeded nodes. 
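+    // (Illustrative note, not part of the original change) Only the replaced
+    // compute ops and their intermediate activations are listed above; the
+    // persistable Q weight/bias nodes are kept and reused to hold the combined
+    // QKV parameters, while the K/V weight/bias tensors were already erased
+    // from the scope inside fuse_creater.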
+ GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + +void FusedMultiTransformerEncoderPass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal( + "During the multi_transformer pass, The scope should not be null.")); + + int fusion_count = BuildFusion(graph, name_scope_, scope); + if (fusion_count > 0) { + graph->Set(kFusedMultiTransformerEncoderPass, new bool(true)); + graph->Set(kFusedMultiTransformerEncoderFusionCount, new int(fusion_count)); + } + AddStatis(fusion_count); +} + +FusedMultiTransformerEncoderPass::FusedMultiTransformerEncoderPass() { + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); + + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddInput("Y") // the shape shoule be (N*H, N*H) + .IsTensor() + .End() + .AddOutput("Out") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({2, -1, 0}) + .End(); + + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("shape") // -->(B, S, H, N) <--(B, S, N*H) + .IsType>() + .End(); + + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("axis") // {0, 2, 1, 3} + .IsType>() + .End(); + + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumGE(0.0f) + .IsNumLE(1.0f) + .End() + .AddAttr("transpose_X") + .IsBoolEQ(false) + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); + + AddOpCompat(OpCompat("softmax")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 3}) // shape is (B, H, S, S), so axis is -1 or 3 + .End(); + + AddOpCompat(OpCompat("gelu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("approximate") + .IsType() + .End(); + + AddOpCompat(OpCompat("while")) + .AddInput("X") // A set of variables, unconstrained + .End() + .AddInput("Condition") // An scalar + .IsTensor() + .End() + .AddOutput("Out") // A set of variables, unconstrained + .End() + .AddOutput("StepScopes") // A vector of local scope, unconstrained + .End() + .AddAttr("sub_block") + .IsType() + .End(); +} + +int FusedMultiTransformerEncoderFuseQKVPass::BuildFusion( + Graph* graph, const std::string& name_scope, Scope* scope) const { + 
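+  // (Illustrative note, not part of the original change) In this Fuse-QKV
+  // variant the matched subgraph computes Q, K and V with a single matmul
+  // followed by a split op, so fuse_creater only transposes the already fused
+  // weight/bias via QKVWeightsProcessFuseQKV instead of merging three separate
+  // Q/K/V weights as the pass above does.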
GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. + patterns::FusedMultiTransformerEncoderFuseQKVPattern + fused_multi_transformer_fuse_qkv_pattern(pattern, name_scope); + fused_multi_transformer_fuse_qkv_pattern(); + + // Create New OpDesc + auto fuse_creater = [&](Node* input0, + Node* layer_norm, + Node* layer_norm_scale, + Node* layer_norm_bias, + Node* layer_norm_mean, + Node* layer_norm_variance, + Node* matmul0_w, + Node* eltadd0_b, + Node* split0_k_out, + Node* split0_v_out, + Node* eltadd_qk_b, + Node* reshape2_0, + Node* matmul_linear_w, + Node* eltadd_linear_b, + Node* while0, + Node* ffn_layer_norm, + Node* ffn_layer_norm_scale, + Node* ffn_layer_norm_bias, + Node* ffn_layer_norm_mean, + Node* ffn_layer_norm_variance, + Node* ffn_matmul0_w, + Node* ffn_matmul1_w, + Node* ffn_eltadd0_b, + Node* ffn_eltadd1_b, + Node* ffn_output) { + auto reshape_desc = reshape2_0->Op(); + int num_head = + PADDLE_GET_CONST(std::vector, reshape_desc->GetAttr("shape")) + .at(2); + int dim_head = + PADDLE_GET_CONST(std::vector, reshape_desc->GetAttr("shape")) + .at(3) / + 3; // 3 for qkv + int dim_embed = num_head * dim_head; + + // Calc index of transformer layer by LayerNorm Scale name + // This calculation assumes: + // 1. no LayerNorm before all transformer layer + // 2. each transformer layer contains 2 LayerNorm layer + auto ln_scale_name = layer_norm_scale->Name(); + auto ln_name = ln_scale_name.substr(0, ln_scale_name.find('.')); + auto ln_idx_str = ln_name.substr(ln_name.rfind('_') + 1); + int layer_idx = atoi(ln_idx_str.c_str()) / 2; + + auto* qkv_w_tensor = + scope->FindVar(matmul0_w->Name())->GetMutable(); + auto* qkv_b_tensor = + scope->FindVar(eltadd0_b->Name())->GetMutable(); + + if (qkv_w_tensor->dtype() == phi::DataType::FLOAT32) { + QKVWeightsProcessFuseQKV( + qkv_w_tensor, qkv_b_tensor, num_head, dim_head, dim_embed); + } else if (qkv_w_tensor->dtype() == phi::DataType::FLOAT16) { + QKVWeightsProcessFuseQKV( + qkv_w_tensor, qkv_b_tensor, num_head, dim_head, dim_embed); + } else { + PADDLE_THROW(platform::errors::Unavailable( + "fused_multi_transformer not supported weight dtype. " + "we now only support fp32 and fp16.")); + } + + // create fused_multi_transformer + OpDesc fused_multi_transformer_op_desc(layer_norm->Op()->Block()); + fused_multi_transformer_op_desc.SetType("fused_multi_transformer"); + + // 1. 
Input setting + fused_multi_transformer_op_desc.SetInput("X", {input0->Name()}); + + // pre-LayerNorm input + fused_multi_transformer_op_desc.SetInput("LnScale", + {layer_norm_scale->Name()}); + fused_multi_transformer_op_desc.SetInput("LnBias", + {layer_norm_bias->Name()}); + + // QKV computation input + fused_multi_transformer_op_desc.SetInput("QKVW", {matmul0_w->Name()}); + fused_multi_transformer_op_desc.SetInput("QKVBias", {eltadd0_b->Name()}); + fused_multi_transformer_op_desc.SetInput("SrcMask", {eltadd_qk_b->Name()}); + + // CacheKV input + VarDesc cache_kv_desc("cache_kv" + std::to_string(layer_idx)); + // FIXME: only support max_seq_len <= 1024 + cache_kv_desc.SetDataType( + framework::TransToProtoVarType(qkv_w_tensor->dtype())); + cache_kv_desc.SetPersistable(false); + auto* cache_kv = graph->CreateVarNode(&cache_kv_desc); + + OpDesc fill_const_op_desc(layer_norm->Op()->Block()); + fill_const_op_desc.SetType("fill_constant_batch_size_like"); + fill_const_op_desc.SetInput("Input", {input0->Name()}); + fill_const_op_desc.SetOutput("Out", {cache_kv->Name()}); + std::vector shape = {2, -1, num_head, 1024, dim_head}; + fill_const_op_desc.SetAttr("shape", shape); + fill_const_op_desc.SetAttr("input_dim_idx", 0); + fill_const_op_desc.SetAttr("output_dim_idx", 1); + fill_const_op_desc.SetAttr("value", 0); + fill_const_op_desc.SetAttr("dtype", + static_cast(framework::TransToProtoVarType( + qkv_w_tensor->dtype()))); + auto* fill_const_op = graph->CreateOpNode(&fill_const_op_desc); + + fused_multi_transformer_op_desc.SetInput("CacheKV", {cache_kv->Name()}); + + // Out Linear input + fused_multi_transformer_op_desc.SetInput("OutLinearW", + {matmul_linear_w->Name()}); + fused_multi_transformer_op_desc.SetInput("OutLinearBias", + {eltadd_linear_b->Name()}); + + // Feed Forward input + fused_multi_transformer_op_desc.SetInput("FFNLnScale", + {ffn_layer_norm_scale->Name()}); + fused_multi_transformer_op_desc.SetInput("FFNLnBias", + {ffn_layer_norm_bias->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN1Weight", + {ffn_matmul0_w->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN1Bias", + {ffn_eltadd0_b->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN2Weight", + {ffn_matmul1_w->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN2Bias", + {ffn_eltadd1_b->Name()}); + + // 2. 
Output setting + fused_multi_transformer_op_desc.SetOutput("Out", {ffn_output->Name()}); + fused_multi_transformer_op_desc.SetOutput("CacheKVOut", {cache_kv->Name()}); + + // Attribute setting + fused_multi_transformer_op_desc.SetAttr("pre_layer_norm", true); + fused_multi_transformer_op_desc.SetAttr( + "epsilon", layer_norm->Op()->GetAttr("epsilon")); + + // output dropout attribute + fused_multi_transformer_op_desc.SetAttr("is_test", true); + fused_multi_transformer_op_desc.SetAttr("dropout_rate", 0.0f); + + auto* fused_multi_transformer = + graph->CreateOpNode(&fused_multi_transformer_op_desc); + IR_NODE_LINK_TO(input0, fused_multi_transformer); + IR_NODE_LINK_TO(layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(layer_norm_bias, fused_multi_transformer); + + IR_NODE_LINK_TO(matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_qk_b, fused_multi_transformer); + + IR_NODE_LINK_TO(input0, fill_const_op); + IR_NODE_LINK_TO(fill_const_op, cache_kv); + IR_NODE_LINK_TO(cache_kv, fused_multi_transformer); + + IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_bias, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul1_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd1_b, fused_multi_transformer); + + IR_NODE_LINK_TO(fused_multi_transformer, ffn_output); + + // rewrite while OP input + // 1. delete k, v + // 2. delete matmul1/2_w eltadd1/2_w + // 3. add cache_kv + auto while_Xs = while0->Op()->Input("X"); + while_Xs.erase( + std::remove( + std::begin(while_Xs), std::end(while_Xs), split0_k_out->Name()), + std::end(while_Xs)); + while_Xs.erase( + std::remove( + std::begin(while_Xs), std::end(while_Xs), split0_v_out->Name()), + std::end(while_Xs)); + while_Xs.emplace_back(cache_kv->Name()); + while0->Op()->SetInput("X", while_Xs); + + // rewrite while OP output + // 1. delete k, v + // 2. 
add cache_kv + auto while_Outs = while0->Op()->Output("Out"); + while_Outs.erase( + std::remove( + std::begin(while_Outs), std::end(while_Outs), split0_k_out->Name()), + std::end(while_Outs)); + while_Outs.erase( + std::remove( + std::begin(while_Outs), std::end(while_Outs), split0_v_out->Name()), + std::end(while_Outs)); + while_Outs.emplace_back(cache_kv->Name()); + while0->Op()->SetOutput("Out", while_Outs); + + // link CacheKV to while + IR_NODE_LINK_TO(cache_kv, while0) + // unlink origin KV output to while + IR_NODE_UNLINK(split0_k_out, while0); + IR_NODE_UNLINK(split0_v_out, while0); + IR_NODE_UNLINK(while0, split0_k_out); + IR_NODE_UNLINK(while0, split0_v_out); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "fused_multi_transformer_encoder_fuse_qkv " + "pass in op compat failed."; + return; + } + + VLOG(4) << "handle MultiTransformer encoder(Fuse-QKV) fuse"; + GET_IR_NODE_FROM_SUBGRAPH( + input0, input0, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm, layer_norm, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, + layer_norm_scale, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, + layer_norm_bias, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, + layer_norm_mean, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, + layer_norm_variance, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, + layer_norm_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul0, matmul0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul0_out, matmul0_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul0_w, matmul0_w, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_0, reshape2_0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, + reshape2_0_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_0, transpose2_0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, + transpose2_0_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + split0, split0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + split0_q_out, split0_q_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + split0_k_out, split0_k_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + split0_v_out, split0_v_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm, + ffn_layer_norm, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_scale, + ffn_layer_norm_scale, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_bias, + ffn_layer_norm_bias, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_mean, + ffn_layer_norm_mean, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_variance, + ffn_layer_norm_variance, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_out, + ffn_layer_norm_out, + 
fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0, ffn_matmul0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_matmul0_out, + ffn_matmul0_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0_w, ffn_matmul0_w, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0, ffn_eltadd0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0_b, ffn_eltadd0_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd0_out, + ffn_eltadd0_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_gelu, ffn_gelu, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_gelu_out, ffn_gelu_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1, ffn_matmul1, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_matmul1_out, + ffn_matmul1_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1_w, ffn_matmul1_w, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1, ffn_eltadd1, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1_b, ffn_eltadd1_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd1_out, + ffn_eltadd1_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd_out, + ffn_eltadd_out, + fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + ffn_output, ffn_output, fused_multi_transformer_fuse_qkv_pattern) + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0, eltadd0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0_b, eltadd0_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0_out, eltadd0_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qk, matmul_qk, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qk_out, matmul_qk_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk, eltadd_qk, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk_b, eltadd_qk_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk_out, eltadd_qk_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + softmax_qk, softmax_qk, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, + softmax_qk_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qkv, matmul_qkv, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, + matmul_qkv_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_qkv, reshape2_qkv, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, + reshape2_qkv_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, + transpose2_qkv, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, + transpose2_qkv_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_linear, matmul_linear, fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH(matmul_linear_w, + matmul_linear_w, + 
fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH(matmul_linear_out, + matmul_linear_out, + fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_linear, eltadd_linear, fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH(eltadd_linear_b, + eltadd_linear_b, + fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH(eltadd_linear_out, + eltadd_linear_out, + fused_multi_transformer_fuse_qkv_pattern) + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_out, eltadd_out, fused_multi_transformer_fuse_qkv_pattern) + + GET_IR_NODE_FROM_SUBGRAPH( + while0, while0, fused_multi_transformer_fuse_qkv_pattern) + + fuse_creater(input0, + layer_norm, + layer_norm_scale, + layer_norm_bias, + layer_norm_mean, + layer_norm_variance, + matmul0_w, + eltadd0_b, + split0_k_out, + split0_v_out, + eltadd_qk_b, + reshape2_0, + matmul_linear_w, + eltadd_linear_b, + while0, + ffn_layer_norm, + ffn_layer_norm_scale, + ffn_layer_norm_bias, + ffn_layer_norm_mean, + ffn_layer_norm_variance, + ffn_matmul0_w, + ffn_matmul1_w, + ffn_eltadd0_b, + ffn_eltadd1_b, + ffn_output); + + std::unordered_set marked_nodes({layer_norm, + layer_norm_mean, + layer_norm_variance, + layer_norm_out, + matmul0, + matmul0_out, + eltadd0, + eltadd0_out, + reshape2_0, + reshape2_0_out, + transpose2_0, + transpose2_0_out, + split0, + split0_q_out, + split0_k_out, + split0_v_out, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + reshape2_qkv, + transpose2_qkv, + transpose2_qkv_out, + matmul_linear, + matmul_linear_out, + eltadd_linear, + eltadd_linear_out, + eltadd_out, + ffn_layer_norm, + ffn_layer_norm_mean, + ffn_layer_norm_variance, + ffn_layer_norm_out, + ffn_matmul0, + ffn_matmul1, + ffn_matmul0_out, + ffn_matmul1_out, + ffn_eltadd0, + ffn_eltadd1, + ffn_eltadd0_out, + ffn_eltadd1_out, + ffn_gelu, + ffn_gelu_out, + ffn_eltadd_out}); + + // Remove unneeded nodes. 
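+    // (Illustrative note, not part of the original change) Unlike the
+    // non-fused-QKV pass, nothing is erased from the scope here: the QKV
+    // weight and bias are already a single pair of persistable vars, so only
+    // the split op, its q/k/v outputs and the other intermediate nodes are
+    // removed.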
+ GraphSafeRemoveNodes(graph, marked_nodes); + ++fusion_count; + }; + gpd(graph, handler); + + return fusion_count; +} + +void FusedMultiTransformerEncoderFuseQKVPass::ApplyImpl(Graph* graph) const { + FusePassBase::Init(name_scope_, graph); + auto* scope = param_scope(); + PADDLE_ENFORCE_NOT_NULL( + scope, + platform::errors::Fatal( + "During the fused_multi_transformer_encoder pass, " + "The scope should not be null.")); + + int fusion_count = BuildFusion(graph, name_scope_, scope); + if (fusion_count > 0) { + graph->Set(kFusedMultiTransformerEncoderFuseQKVPass, new bool(true)); + graph->Set(kFusedMultiTransformerEncoderFusionCount, new int(fusion_count)); + } + AddStatis(fusion_count); +} + +FusedMultiTransformerEncoderFuseQKVPass:: + FusedMultiTransformerEncoderFuseQKVPass() { + AddOpCompat(OpCompat("layer_norm")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Scale") + .IsTensor() + .End() + .AddInput("Bias") + .IsTensor() + .End() + .AddOutput("Y") + .IsTensor() + .End() + .AddOutput("Mean") + .IsTensor() + .End() + .AddOutput("Variance") + .IsTensor() + .End() + .AddAttr("epsilon") + .IsNumGE(0.0f) + .IsNumLE(0.001f) + .End() + .AddAttr("begin_norm_axis") + .IsNumGT(0) + .End(); + + AddOpCompat(OpCompat("matmul_v2")) + .AddInput("X") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddInput("Y") // the shape shoule be (N*H, N*H) + .IsTensor() + .End() + .AddOutput("Out") // the shape shoule be (B, S, N*H) + .IsTensor() + .End() + .AddAttr("trans_x") + .IsType() + .End() + .AddAttr("trans_y") + .IsType() + .End(); + + AddOpCompat(OpCompat("elementwise_add")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({2, -1, 0}) + .End(); + + AddOpCompat(OpCompat("reshape2")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Shape") + .IsTensor() + .IsOptional() + .End() + .AddInput("ShapeTensor") + .IsTensor() + .IsOptional() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("shape") // -->(B, S, H, N) <--(B, S, N*H) + .IsType>() + .End(); + + AddOpCompat(OpCompat("transpose2")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddOutput("XShape") + .IsOptional() + .IsTensor() + .End() + .AddAttr("axis") // {0, 2, 1, 3} + .IsType>() + .End(); + + AddOpCompat(OpCompat("matmul")) + .AddInput("X") + .IsTensor() + .End() + .AddInput("Y") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("alpha") + .IsNumGE(0.0f) + .IsNumLE(1.0f) + .End() + .AddAttr("transpose_X") + .IsBoolEQ(false) + .End() + .AddAttr("transpose_Y") + .IsType() + .End(); + + AddOpCompat(OpCompat("softmax")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsIntIn({-1, 3}) // shape is (B, H, S, S), so axis is -1 or 3 + .End(); + + AddOpCompat(OpCompat("gelu")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("approximate") + .IsType() + .End(); + + AddOpCompat(OpCompat("while")) + .AddInput("X") // A set of variables, unconstrained + .End() + .AddInput("Condition") // An scalar + .IsTensor() + .End() + .AddOutput("Out") // A set of variables, unconstrained + .End() + .AddOutput("StepScopes") // A vector of local scope, unconstrained + .End() + .AddAttr("sub_block") + .IsType() + .End(); +} + +int MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::BuildFusion( + Graph* 
graph, const std::string& name_scope, Scope* scope) const { + GraphPatternDetector gpd; + auto* pattern = gpd.mutable_pattern(); + + // Create pattern. + patterns::MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern + fused_multi_transformer_fuse_qkv_pattern(pattern, name_scope); + fused_multi_transformer_fuse_qkv_pattern(); + + // Create New OpDesc + auto fuse_creater = [&](Node* input0, + Node* layer_norm, + Node* layer_norm_scale, + Node* layer_norm_bias, + Node* layer_norm_mean, + Node* layer_norm_variance, + Node* c_identity, + Node* matmul0_w, + Node* eltadd0_b, + Node* split0_k_out, + Node* split0_v_out, + Node* eltadd_qk_b, + Node* reshape2_0, + Node* matmul_linear_w, + Node* eltadd_linear_b, + Node* while0, + Node* ffn_layer_norm, + Node* ffn_layer_norm_scale, + Node* ffn_layer_norm_bias, + Node* ffn_layer_norm_mean, + Node* ffn_layer_norm_variance, + Node* ffn_matmul0_w, + Node* ffn_matmul1_w, + Node* ffn_eltadd0_b, + Node* ffn_eltadd1_b, + Node* ffn_output) { + auto reshape_desc = reshape2_0->Op(); + int num_head = + PADDLE_GET_CONST(std::vector, reshape_desc->GetAttr("shape")) + .at(2); + int dim_head = + PADDLE_GET_CONST(std::vector, reshape_desc->GetAttr("shape")) + .at(3) / + 3; // 3 for qkv + + // Calc index of transformer layer by LayerNorm Scale name + // This calculation assumes: + // 1. no LayerNorm before all transformer layer + // 2. each transformer layer contains 2 LayerNorm layer + auto ln_scale_name = layer_norm_scale->Name(); + auto ln_name = ln_scale_name.substr(0, ln_scale_name.find('.')); + auto ln_idx_str = ln_name.substr(ln_name.rfind('_') + 1); + int layer_idx = atoi(ln_idx_str.c_str()) / 2; + + auto* qkv_w_tensor = + scope->FindVar(matmul0_w->Name())->GetMutable(); + auto* qkv_b_tensor = + scope->FindVar(eltadd0_b->Name())->GetMutable(); + + int dim_embed = qkv_w_tensor->dims()[0]; + + if (qkv_w_tensor->dtype() == phi::DataType::FLOAT32) { + QKVWeightsProcessFuseQKV( + qkv_w_tensor, qkv_b_tensor, num_head, dim_head, dim_embed); + } else if (qkv_w_tensor->dtype() == phi::DataType::FLOAT16) { + QKVWeightsProcessFuseQKV( + qkv_w_tensor, qkv_b_tensor, num_head, dim_head, dim_embed); + } else { + PADDLE_THROW(platform::errors::Unavailable( + "fused_multi_transformer not supported weight dtype. " + "we now only support fp32 and fp16.")); + } + + // create fused_multi_transformer + OpDesc fused_multi_transformer_op_desc(layer_norm->Op()->Block()); + fused_multi_transformer_op_desc.SetType("fused_multi_transformer"); + + // 1. 
Input setting + fused_multi_transformer_op_desc.SetInput("X", {input0->Name()}); + + // pre-LayerNorm input + fused_multi_transformer_op_desc.SetInput("LnScale", + {layer_norm_scale->Name()}); + fused_multi_transformer_op_desc.SetInput("LnBias", + {layer_norm_bias->Name()}); + + // QKV computation input + fused_multi_transformer_op_desc.SetInput("QKVW", {matmul0_w->Name()}); + fused_multi_transformer_op_desc.SetInput("QKVBias", {eltadd0_b->Name()}); + fused_multi_transformer_op_desc.SetInput("SrcMask", {eltadd_qk_b->Name()}); + + // CacheKV input + VarDesc cache_kv_desc("cache_kv" + std::to_string(layer_idx)); + // FIXME: only support max_seq_len <= 1024 + cache_kv_desc.SetDataType( + framework::TransToProtoVarType(qkv_w_tensor->dtype())); + cache_kv_desc.SetPersistable(false); + auto* cache_kv = graph->CreateVarNode(&cache_kv_desc); + + OpDesc fill_const_op_desc(layer_norm->Op()->Block()); + fill_const_op_desc.SetType("fill_constant_batch_size_like"); + fill_const_op_desc.SetInput("Input", {input0->Name()}); + fill_const_op_desc.SetOutput("Out", {cache_kv->Name()}); + std::vector shape = {2, -1, num_head, 1024, dim_head}; + fill_const_op_desc.SetAttr("shape", shape); + fill_const_op_desc.SetAttr("input_dim_idx", 0); + fill_const_op_desc.SetAttr("output_dim_idx", 1); + fill_const_op_desc.SetAttr("value", 0); + fill_const_op_desc.SetAttr("dtype", + static_cast(framework::TransToProtoVarType( + qkv_w_tensor->dtype()))); + auto* fill_const_op = graph->CreateOpNode(&fill_const_op_desc); + + fused_multi_transformer_op_desc.SetInput("CacheKV", {cache_kv->Name()}); + + // Out Linear input + fused_multi_transformer_op_desc.SetInput("OutLinearW", + {matmul_linear_w->Name()}); + fused_multi_transformer_op_desc.SetInput("OutLinearBias", + {eltadd_linear_b->Name()}); + + // Feed Forward input + fused_multi_transformer_op_desc.SetInput("FFNLnScale", + {ffn_layer_norm_scale->Name()}); + fused_multi_transformer_op_desc.SetInput("FFNLnBias", + {ffn_layer_norm_bias->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN1Weight", + {ffn_matmul0_w->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN1Bias", + {ffn_eltadd0_b->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN2Weight", + {ffn_matmul1_w->Name()}); + fused_multi_transformer_op_desc.SetInput("FFN2Bias", + {ffn_eltadd1_b->Name()}); + + // 2. 
Output setting + fused_multi_transformer_op_desc.SetOutput("Out", {ffn_output->Name()}); + fused_multi_transformer_op_desc.SetOutput("CacheKVOut", {cache_kv->Name()}); + + // Attribute setting + fused_multi_transformer_op_desc.SetAttr("pre_layer_norm", true); + fused_multi_transformer_op_desc.SetAttr( + "epsilon", layer_norm->Op()->GetAttr("epsilon")); + + // output dropout attribute + fused_multi_transformer_op_desc.SetAttr("dropout_rate", 0.0f); + fused_multi_transformer_op_desc.SetAttr("is_test", true); + + // parallel ring id + auto* c_identity_op = c_identity->Op(); + fused_multi_transformer_op_desc.SetAttr("ring_id", + c_identity_op->GetAttr("ring_id")); + + auto* fused_multi_transformer = + graph->CreateOpNode(&fused_multi_transformer_op_desc); + IR_NODE_LINK_TO(input0, fused_multi_transformer); + IR_NODE_LINK_TO(layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(layer_norm_bias, fused_multi_transformer); + + IR_NODE_LINK_TO(matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_qk_b, fused_multi_transformer); + + IR_NODE_LINK_TO(input0, fill_const_op); + IR_NODE_LINK_TO(fill_const_op, cache_kv); + IR_NODE_LINK_TO(cache_kv, fused_multi_transformer); + + IR_NODE_LINK_TO(matmul_linear_w, fused_multi_transformer); + IR_NODE_LINK_TO(eltadd_linear_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_scale, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_layer_norm_bias, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul0_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd0_b, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_matmul1_w, fused_multi_transformer); + IR_NODE_LINK_TO(ffn_eltadd1_b, fused_multi_transformer); + + IR_NODE_LINK_TO(fused_multi_transformer, ffn_output); + + // rewrite while OP input + // 1. delete k, v + // 2. delete matmul1/2_w eltadd1/2_w + // 3. add cache_kv + auto while_Xs = while0->Op()->Input("X"); + while_Xs.erase( + std::remove( + std::begin(while_Xs), std::end(while_Xs), split0_k_out->Name()), + std::end(while_Xs)); + while_Xs.erase( + std::remove( + std::begin(while_Xs), std::end(while_Xs), split0_v_out->Name()), + std::end(while_Xs)); + while_Xs.emplace_back(cache_kv->Name()); + while0->Op()->SetInput("X", while_Xs); + + // rewrite while OP output + // 1. delete k, v + // 2. 
add cache_kv + auto while_Outs = while0->Op()->Output("Out"); + while_Outs.erase( + std::remove( + std::begin(while_Outs), std::end(while_Outs), split0_k_out->Name()), + std::end(while_Outs)); + while_Outs.erase( + std::remove( + std::begin(while_Outs), std::end(while_Outs), split0_v_out->Name()), + std::end(while_Outs)); + while_Outs.emplace_back(cache_kv->Name()); + while0->Op()->SetOutput("Out", while_Outs); + + // link CacheKV to while + IR_NODE_LINK_TO(cache_kv, while0) + // unlink origin KV output to while + IR_NODE_UNLINK(split0_k_out, while0); + IR_NODE_UNLINK(split0_v_out, while0); + IR_NODE_UNLINK(while0, split0_k_out); + IR_NODE_UNLINK(while0, split0_v_out); + }; + + int fusion_count{0}; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* graph) { + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "fused_multi_transformer_encoder_fuse_qkv " + "pass in op compat failed."; + return; + } + + VLOG(4) << "handle MultiTransformer encoder(Fuse-QKV) fuse"; + GET_IR_NODE_FROM_SUBGRAPH( + input0, input0, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + layer_norm, layer_norm, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_scale, + layer_norm_scale, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_bias, + layer_norm_bias, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, + layer_norm_mean, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, + layer_norm_variance, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(layer_norm_out, + layer_norm_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + c_identity, c_identity, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(c_identity_out, + c_identity_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul0, matmul0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul0_out, matmul0_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul0_w, matmul0_w, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_0, reshape2_0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_0_out, + reshape2_0_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_0, transpose2_0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_0_out, + transpose2_0_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + split0, split0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + split0_q_out, split0_q_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + split0_k_out, split0_k_out, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + split0_v_out, split0_v_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm, + ffn_layer_norm, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_scale, + ffn_layer_norm_scale, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_bias, + ffn_layer_norm_bias, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_mean, + ffn_layer_norm_mean, + fused_multi_transformer_fuse_qkv_pattern); + 
GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_variance, + ffn_layer_norm_variance, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_layer_norm_out, + ffn_layer_norm_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(ffn_c_identity, + ffn_c_identity, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_c_identity_out, + ffn_c_identity_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0, ffn_matmul0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_matmul0_out, + ffn_matmul0_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul0_w, ffn_matmul0_w, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0, ffn_eltadd0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd0_b, ffn_eltadd0_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd0_out, + ffn_eltadd0_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_gelu, ffn_gelu, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_gelu_out, ffn_gelu_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1, ffn_matmul1, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_matmul1_out, + ffn_matmul1_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_matmul1_w, ffn_matmul1_w, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_c_allreduce_sum, + ffn_c_allreduce_sum, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_c_allreduce_sum_out, + ffn_c_allreduce_sum_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1, ffn_eltadd1, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + ffn_eltadd1_b, ffn_eltadd1_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd1_out, + ffn_eltadd1_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH(ffn_eltadd_out, + ffn_eltadd_out, + fused_multi_transformer_fuse_qkv_pattern) + GET_IR_NODE_FROM_SUBGRAPH( + ffn_output, ffn_output, fused_multi_transformer_fuse_qkv_pattern) + + // nodes need be removed + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0, eltadd0, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0_b, eltadd0_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd0_out, eltadd0_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qk, matmul_qk, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qk_out, matmul_qk_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk, eltadd_qk, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk_b, eltadd_qk_b, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_qk_out, eltadd_qk_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + softmax_qk, softmax_qk, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(softmax_qk_out, + softmax_qk_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_qkv, matmul_qkv, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_qkv_out, + matmul_qkv_out, + 
fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + reshape2_qkv, reshape2_qkv, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_qkv_out, + reshape2_qkv_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv, + transpose2_qkv, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(transpose2_qkv_out, + transpose2_qkv_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + matmul_linear, matmul_linear, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_linear_w, + matmul_linear_w, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(matmul_linear_out, + matmul_linear_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(c_allreduce_sum, + c_allreduce_sum, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(c_allreduce_sum_out, + c_allreduce_sum_out, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_linear, eltadd_linear, fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_linear_b, + eltadd_linear_b, + fused_multi_transformer_fuse_qkv_pattern); + GET_IR_NODE_FROM_SUBGRAPH(eltadd_linear_out, + eltadd_linear_out, + fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + eltadd_out, eltadd_out, fused_multi_transformer_fuse_qkv_pattern); + + GET_IR_NODE_FROM_SUBGRAPH( + while0, while0, fused_multi_transformer_fuse_qkv_pattern); + + fuse_creater(input0, + layer_norm, + layer_norm_scale, + layer_norm_bias, + layer_norm_mean, + layer_norm_variance, + c_identity, + matmul0_w, + eltadd0_b, + split0_k_out, + split0_v_out, + eltadd_qk_b, + reshape2_0, + matmul_linear_w, + eltadd_linear_b, + while0, + ffn_layer_norm, + ffn_layer_norm_scale, + ffn_layer_norm_bias, + ffn_layer_norm_mean, + ffn_layer_norm_variance, + ffn_matmul0_w, + ffn_matmul1_w, + ffn_eltadd0_b, + ffn_eltadd1_b, + ffn_output); + + std::unordered_set marked_nodes({layer_norm, + layer_norm_mean, + layer_norm_variance, + layer_norm_out, + c_identity, + c_identity_out, + matmul0, + matmul0_out, + eltadd0, + eltadd0_out, + reshape2_0, + reshape2_0_out, + transpose2_0, + transpose2_0_out, + split0, + split0_q_out, + split0_k_out, + split0_v_out, + matmul_qk, + matmul_qk_out, + eltadd_qk, + eltadd_qk_out, + softmax_qk, + softmax_qk_out, + transpose2_qkv, + transpose2_qkv_out, + matmul_qkv, + matmul_qkv_out, + reshape2_qkv, + transpose2_qkv, + transpose2_qkv_out, + matmul_linear, + matmul_linear_out, + c_allreduce_sum, + c_allreduce_sum_out, + eltadd_linear, + eltadd_linear_out, + eltadd_out, + ffn_layer_norm, + ffn_layer_norm_mean, + ffn_layer_norm_variance, + ffn_layer_norm_out, + ffn_c_identity, + ffn_c_identity_out, + ffn_matmul0, + ffn_matmul1, + ffn_matmul0_out, + ffn_matmul1_out, + ffn_c_allreduce_sum, + ffn_c_allreduce_sum_out, + ffn_eltadd0, + ffn_eltadd1, + ffn_eltadd0_out, + ffn_eltadd1_out, + ffn_gelu, + ffn_gelu_out, + ffn_eltadd_out}); + + // Remove unneeded nodes. 
+    GraphSafeRemoveNodes(graph, marked_nodes);
+    ++fusion_count;
+  };
+  gpd(graph, handler);
+
+  return fusion_count;
+}
+
+void MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::ApplyImpl(
+    Graph* graph) const {
+  FusePassBase::Init(name_scope_, graph);
+  auto* scope = param_scope();
+  PADDLE_ENFORCE_NOT_NULL(
+      scope,
+      platform::errors::Fatal(
+          "During the fused_multi_transformer_encoder pass, "
+          "the scope should not be null."));
+
+  int fusion_count = BuildFusion(graph, name_scope_, scope);
+  if (fusion_count > 0) {
+    graph->Set(kMultiDevicesFusedMultiTransformerEncoderFuseQKVPass,
+               new bool(true));
+    graph->Set(kFusedMultiTransformerEncoderFusionCount, new int(fusion_count));
+  }
+  AddStatis(fusion_count);
+}
+
+MultiDevicesFusedMultiTransformerEncoderFuseQKVPass::
+    MultiDevicesFusedMultiTransformerEncoderFuseQKVPass() {
+  AddOpCompat(OpCompat("layer_norm"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Scale")
+      .IsTensor()
+      .End()
+      .AddInput("Bias")
+      .IsTensor()
+      .End()
+      .AddOutput("Y")
+      .IsTensor()
+      .End()
+      .AddOutput("Mean")
+      .IsTensor()
+      .End()
+      .AddOutput("Variance")
+      .IsTensor()
+      .End()
+      .AddAttr("epsilon")
+      .IsNumGE(0.0f)
+      .IsNumLE(0.001f)
+      .End()
+      .AddAttr("begin_norm_axis")
+      .IsNumGT(0)
+      .End();
+
+  AddOpCompat(OpCompat("matmul_v2"))
+      .AddInput("X")  // the shape should be (B, S, N*H)
+      .IsTensor()
+      .End()
+      .AddInput("Y")  // the shape should be (N*H, N*H)
+      .IsTensor()
+      .End()
+      .AddOutput("Out")  // the shape should be (B, S, N*H)
+      .IsTensor()
+      .End()
+      .AddAttr("trans_x")
+      .IsType<bool>()
+      .End()
+      .AddAttr("trans_y")
+      .IsType<bool>()
+      .End();
+
+  AddOpCompat(OpCompat("elementwise_add"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("axis")
+      .IsIntIn({2, -1, 0})
+      .End();
+
+  AddOpCompat(OpCompat("reshape2"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Shape")
+      .IsTensor()
+      .IsOptional()
+      .End()
+      .AddInput("ShapeTensor")
+      .IsTensor()
+      .IsOptional()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddOutput("XShape")
+      .IsOptional()
+      .IsTensor()
+      .End()
+      .AddAttr("shape")  // -->(B, S, H, N) <--(B, S, N*H)
+      .IsType<std::vector<int>>()
+      .End();
+
+  AddOpCompat(OpCompat("transpose2"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddOutput("XShape")
+      .IsOptional()
+      .IsTensor()
+      .End()
+      .AddAttr("axis")  // {0, 2, 1, 3}
+      .IsType<std::vector<int>>()
+      .End();
+
+  AddOpCompat(OpCompat("matmul"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddInput("Y")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("alpha")
+      .IsNumGE(0.0f)
+      .IsNumLE(1.0f)
+      .End()
+      .AddAttr("transpose_X")
+      .IsBoolEQ(false)
+      .End()
+      .AddAttr("transpose_Y")
+      .IsType<bool>()
+      .End();
+
+  AddOpCompat(OpCompat("softmax"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("axis")
+      .IsIntIn({-1, 3})  // shape is (B, H, S, S), so axis is -1 or 3
+      .End();
+
+  AddOpCompat(OpCompat("gelu"))
+      .AddInput("X")
+      .IsTensor()
+      .End()
+      .AddOutput("Out")
+      .IsTensor()
+      .End()
+      .AddAttr("approximate")
+      .IsType<bool>()
+      .End();
+
+  AddOpCompat(OpCompat("while"))
+      .AddInput("X")  // A set of variables, unconstrained
+      .End()
+      .AddInput("Condition")  // A scalar
+      .IsTensor()
+      .End()
+      .AddOutput("Out")  // A set of variables, unconstrained
+      .End()
+      .AddOutput("StepScopes")  // A vector of local scopes, unconstrained
+      .End()
+      .AddAttr("sub_block")
+      .IsType<framework::BlockDesc*>()
+      .End();
+}
+
+}  // namespace ir
+}  //
namespace framework +} // namespace paddle + +REGISTER_PASS(fused_multi_transformer_encoder_pass, + paddle::framework::ir::FusedMultiTransformerEncoderPass); +REGISTER_PASS(fused_multi_transformer_encoder_fuse_qkv_pass, + paddle::framework::ir::FusedMultiTransformerEncoderFuseQKVPass); +REGISTER_PASS( + multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass, + paddle::framework::ir::MultiDevicesFusedMultiTransformerEncoderFuseQKVPass); + +REGISTER_PASS_CAPABILITY(fused_multi_transformer_encoder_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .EQ("softmax", 0)); +REGISTER_PASS_CAPABILITY(fused_multi_transformer_encoder_fuse_qkv_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .EQ("softmax", 0)); +REGISTER_PASS_CAPABILITY( + multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .LE("elementwise_add", 1) + .EQ("reshape2", 0) + .EQ("transpose2", 0) + .EQ("scale", 0) + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .EQ("softmax", 0)); diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.h b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.h new file mode 100644 index 00000000000000..55792456b8c834 --- /dev/null +++ b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.h @@ -0,0 +1,380 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { +namespace patterns { + +struct FusedMultiTransformerEncoderPattern : public PatternBase { + FusedMultiTransformerEncoderPattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase(pattern, name_scope, "fused_multi_transformer_encoder") {} + + PDNode* operator()(); + + // Q, K, V path + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(layer_norm); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_mean); + PATTERN_DECL_NODE(layer_norm_variance); + PATTERN_DECL_NODE(layer_norm_out); + PATTERN_DECL_NODE(matmul0); + PATTERN_DECL_NODE(matmul1); + PATTERN_DECL_NODE(matmul2); + PATTERN_DECL_NODE(matmul0_w); + PATTERN_DECL_NODE(matmul1_w); + PATTERN_DECL_NODE(matmul2_w); + PATTERN_DECL_NODE(matmul0_out); + PATTERN_DECL_NODE(matmul1_out); + PATTERN_DECL_NODE(matmul2_out); + PATTERN_DECL_NODE(eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd2_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(eltadd1_out); + PATTERN_DECL_NODE(eltadd2_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_1); + PATTERN_DECL_NODE(reshape2_2); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(reshape2_1_out); + PATTERN_DECL_NODE(reshape2_2_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_1); + PATTERN_DECL_NODE(transpose2_2); + PATTERN_DECL_NODE(transpose2_0_out); + PATTERN_DECL_NODE(transpose2_1_out); + PATTERN_DECL_NODE(transpose2_2_out); + + // Q, K matmul + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(eltadd_qk); + PATTERN_DECL_NODE(eltadd_qk_b); + PATTERN_DECL_NODE(eltadd_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + // QK, V matmul + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_qkv); + PATTERN_DECL_NODE(transpose2_qkv_out); + + // out linear + PATTERN_DECL_NODE(matmul_linear); + PATTERN_DECL_NODE(matmul_linear_w); + PATTERN_DECL_NODE(matmul_linear_out); + PATTERN_DECL_NODE(eltadd_linear); + PATTERN_DECL_NODE(eltadd_linear_b); + PATTERN_DECL_NODE(eltadd_linear_out); + + // output elementwise_add + PATTERN_DECL_NODE(eltadd_out) + PATTERN_DECL_NODE(attention_output); + + // while loop + PATTERN_DECL_NODE(while0); + + // Feed Forward nodes + PATTERN_DECL_NODE(ffn_layer_norm); + PATTERN_DECL_NODE(ffn_layer_norm_scale); + PATTERN_DECL_NODE(ffn_layer_norm_bias); + PATTERN_DECL_NODE(ffn_layer_norm_mean); + PATTERN_DECL_NODE(ffn_layer_norm_variance); + PATTERN_DECL_NODE(ffn_layer_norm_out); + PATTERN_DECL_NODE(ffn_matmul0); + PATTERN_DECL_NODE(ffn_matmul0_w); + PATTERN_DECL_NODE(ffn_matmul0_out); + PATTERN_DECL_NODE(ffn_eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd0_out); + PATTERN_DECL_NODE(ffn_gelu); + PATTERN_DECL_NODE(ffn_gelu_out); + 
PATTERN_DECL_NODE(ffn_matmul1); + PATTERN_DECL_NODE(ffn_matmul1_w); + PATTERN_DECL_NODE(ffn_matmul1_out); + PATTERN_DECL_NODE(ffn_eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd1_out); + + // output elementwise_add + PATTERN_DECL_NODE(ffn_eltadd_out) + PATTERN_DECL_NODE(ffn_output); +}; + +struct FusedMultiTransformerEncoderFuseQKVPattern : public PatternBase { + FusedMultiTransformerEncoderFuseQKVPattern(PDPattern* pattern, + const std::string& name_scope) + : PatternBase( + pattern, name_scope, "fused_multi_transformer_encoder_fuse_qkv") {} + + PDNode* operator()(); + + // Q, K, V path + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(layer_norm); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_mean); + PATTERN_DECL_NODE(layer_norm_variance); + PATTERN_DECL_NODE(layer_norm_out); + PATTERN_DECL_NODE(matmul0); + PATTERN_DECL_NODE(matmul0_w); + PATTERN_DECL_NODE(matmul0_out); + PATTERN_DECL_NODE(eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_0_out); + + PATTERN_DECL_NODE(split0) + PATTERN_DECL_NODE(split0_q_out) + PATTERN_DECL_NODE(split0_k_out) + PATTERN_DECL_NODE(split0_v_out) + + // Q, K matmul + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(eltadd_qk); + PATTERN_DECL_NODE(eltadd_qk_b); + PATTERN_DECL_NODE(eltadd_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + // QK, V matmul + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_qkv); + PATTERN_DECL_NODE(transpose2_qkv_out); + + // while loop + PATTERN_DECL_NODE(while0); + + // out linear + PATTERN_DECL_NODE(matmul_linear); + PATTERN_DECL_NODE(matmul_linear_w); + PATTERN_DECL_NODE(matmul_linear_out); + PATTERN_DECL_NODE(eltadd_linear); + PATTERN_DECL_NODE(eltadd_linear_b); + PATTERN_DECL_NODE(eltadd_linear_out); + + // output elementwise_add + PATTERN_DECL_NODE(eltadd_out) + PATTERN_DECL_NODE(attention_output); + + // Feed Forward nodes + PATTERN_DECL_NODE(ffn_layer_norm); + PATTERN_DECL_NODE(ffn_layer_norm_scale); + PATTERN_DECL_NODE(ffn_layer_norm_bias); + PATTERN_DECL_NODE(ffn_layer_norm_mean); + PATTERN_DECL_NODE(ffn_layer_norm_variance); + PATTERN_DECL_NODE(ffn_layer_norm_out); + PATTERN_DECL_NODE(ffn_matmul0); + PATTERN_DECL_NODE(ffn_matmul0_w); + PATTERN_DECL_NODE(ffn_matmul0_out); + PATTERN_DECL_NODE(ffn_eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd0_out); + PATTERN_DECL_NODE(ffn_gelu); + PATTERN_DECL_NODE(ffn_gelu_out); + PATTERN_DECL_NODE(ffn_matmul1); + PATTERN_DECL_NODE(ffn_matmul1_w); + PATTERN_DECL_NODE(ffn_matmul1_out); + PATTERN_DECL_NODE(ffn_eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd1_out); + + // output elementwise_add + PATTERN_DECL_NODE(ffn_eltadd_out) + PATTERN_DECL_NODE(ffn_output); +}; + +struct MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern + : public PatternBase { + MultiDevicesFusedMultiTransformerEncoderFuseQKVPattern( + PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, + name_scope, + 
"multi_devices_fused_multi_transformer_encoder_fuse_qkv") {} + + PDNode* operator()(); + + // Q, K, V path + PATTERN_DECL_NODE(input0); + PATTERN_DECL_NODE(layer_norm); + PATTERN_DECL_NODE(layer_norm_scale); + PATTERN_DECL_NODE(layer_norm_bias); + PATTERN_DECL_NODE(layer_norm_mean); + PATTERN_DECL_NODE(layer_norm_variance); + PATTERN_DECL_NODE(layer_norm_out); + PATTERN_DECL_NODE(c_identity); + PATTERN_DECL_NODE(c_identity_out); + PATTERN_DECL_NODE(matmul0); + PATTERN_DECL_NODE(matmul0_w); + PATTERN_DECL_NODE(matmul0_out); + PATTERN_DECL_NODE(eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(eltadd0_out); + PATTERN_DECL_NODE(reshape2_0); + PATTERN_DECL_NODE(reshape2_0_out); + PATTERN_DECL_NODE(transpose2_0); + PATTERN_DECL_NODE(transpose2_0_out); + + PATTERN_DECL_NODE(split0) + PATTERN_DECL_NODE(split0_q_out) + PATTERN_DECL_NODE(split0_k_out) + PATTERN_DECL_NODE(split0_v_out) + + // Q, K matmul + PATTERN_DECL_NODE(matmul_qk); + PATTERN_DECL_NODE(matmul_qk_out); + PATTERN_DECL_NODE(eltadd_qk); + PATTERN_DECL_NODE(eltadd_qk_b); + PATTERN_DECL_NODE(eltadd_qk_out); + PATTERN_DECL_NODE(softmax_qk); + PATTERN_DECL_NODE(softmax_qk_out); + + // QK, V matmul + PATTERN_DECL_NODE(matmul_qkv); + PATTERN_DECL_NODE(matmul_qkv_out); + PATTERN_DECL_NODE(reshape2_qkv); + PATTERN_DECL_NODE(reshape2_qkv_out); + PATTERN_DECL_NODE(transpose2_qkv); + PATTERN_DECL_NODE(transpose2_qkv_out); + + // while loop + PATTERN_DECL_NODE(while0); + + // out linear + PATTERN_DECL_NODE(matmul_linear); + PATTERN_DECL_NODE(matmul_linear_w); + PATTERN_DECL_NODE(matmul_linear_out); + PATTERN_DECL_NODE(c_allreduce_sum); + PATTERN_DECL_NODE(c_allreduce_sum_out); + PATTERN_DECL_NODE(eltadd_linear); + PATTERN_DECL_NODE(eltadd_linear_b); + PATTERN_DECL_NODE(eltadd_linear_out); + + // output elementwise_add + PATTERN_DECL_NODE(eltadd_out) + PATTERN_DECL_NODE(attention_output); + + // Feed Forward nodes + PATTERN_DECL_NODE(ffn_layer_norm); + PATTERN_DECL_NODE(ffn_layer_norm_scale); + PATTERN_DECL_NODE(ffn_layer_norm_bias); + PATTERN_DECL_NODE(ffn_layer_norm_mean); + PATTERN_DECL_NODE(ffn_layer_norm_variance); + PATTERN_DECL_NODE(ffn_layer_norm_out); + PATTERN_DECL_NODE(ffn_c_identity); + PATTERN_DECL_NODE(ffn_c_identity_out); + PATTERN_DECL_NODE(ffn_matmul0); + PATTERN_DECL_NODE(ffn_matmul0_w); + PATTERN_DECL_NODE(ffn_matmul0_out); + PATTERN_DECL_NODE(ffn_eltadd0); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd0_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd0_out); + PATTERN_DECL_NODE(ffn_gelu); + PATTERN_DECL_NODE(ffn_gelu_out); + PATTERN_DECL_NODE(ffn_matmul1); + PATTERN_DECL_NODE(ffn_matmul1_w); + PATTERN_DECL_NODE(ffn_matmul1_out); + PATTERN_DECL_NODE(ffn_c_allreduce_sum); + PATTERN_DECL_NODE(ffn_c_allreduce_sum_out); + PATTERN_DECL_NODE(ffn_eltadd1); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd1_b); // ELEMENTWISE_ADD + PATTERN_DECL_NODE(ffn_eltadd1_out); + + // output elementwise_add + PATTERN_DECL_NODE(ffn_eltadd_out) + PATTERN_DECL_NODE(ffn_output); +}; +} // namespace patterns + +class FusedMultiTransformerEncoderPass : public FusePassBase { + public: + FusedMultiTransformerEncoderPass(); + virtual ~FusedMultiTransformerEncoderPass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"fused_multi_transformer_encoder"}; + + private: + int BuildFusion(Graph* graph, + const std::string& name_scope, + Scope* scope) const; +}; + +class FusedMultiTransformerEncoderFuseQKVPass : public FusePassBase { + public: + 
FusedMultiTransformerEncoderFuseQKVPass(); + virtual ~FusedMultiTransformerEncoderFuseQKVPass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{"fused_multi_transformer_encoder_fuse_qkv"}; + + private: + int BuildFusion(Graph* graph, + const std::string& name_scope, + Scope* scope) const; +}; + +class MultiDevicesFusedMultiTransformerEncoderFuseQKVPass + : public FusePassBase { + public: + MultiDevicesFusedMultiTransformerEncoderFuseQKVPass(); + virtual ~MultiDevicesFusedMultiTransformerEncoderFuseQKVPass() {} + + protected: + void ApplyImpl(Graph* graph) const; + + const std::string name_scope_{ + "multi_devices_fused_multi_transformer_encoder_fuse_qkv"}; + + private: + int BuildFusion(Graph* graph, + const std::string& name_scope, + Scope* scope) const; +}; +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass_tester.cc b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass_tester.cc new file mode 100644 index 00000000000000..4954bf3576e2a0 --- /dev/null +++ b/paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass_tester.cc @@ -0,0 +1,539 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include + +#include "paddle/fluid/framework/ir/fused_multi_transformer_encoder_pass.h" // NOLINT +#include "paddle/fluid/framework/ir/pass_tester_helper.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace ir { + +void AddVarToScope(Scope* param_scope, + const std::string& name, + const DDim& dims) { + auto* tensor = param_scope->Var(name)->GetMutable(); + tensor->Resize(dims); + tensor->mutable_data(platform::CPUPlace()); +} + +Scope* CreateParamScope() { + auto param_scope = new Scope(); + + // MHA: pre Layer Norm + AddVarToScope(param_scope, "ln_scale", {1024}); + AddVarToScope(param_scope, "ln_bias", {1024}); + + // MHA: QKV fc + AddVarToScope(param_scope, "weights0", {1024, 1024}); + AddVarToScope(param_scope, "weights1", {1024, 1024}); + AddVarToScope(param_scope, "weights2", {1024, 1024}); + AddVarToScope(param_scope, "bias_0", {1024}); + AddVarToScope(param_scope, "bias_1", {1024}); + AddVarToScope(param_scope, "bias_2", {1024}); + + // MHA: QK bias + AddVarToScope(param_scope, "biasqk", {1024}); + + // MHA: out Linear + AddVarToScope(param_scope, "weights_l", {1024, 1024}); + AddVarToScope(param_scope, "bias_l", {1024}); + + // MHA: pre Layer Norm + AddVarToScope(param_scope, "ffn_ln_scale", {1024}); + AddVarToScope(param_scope, "ffn_ln_bias", {1024}); + + // FFN: fc1 -> (gelu) -> fc2 + AddVarToScope(param_scope, "ffn_weights0", {1024, 4096}); + AddVarToScope(param_scope, "ffn_weights1", {4096, 1024}); + AddVarToScope(param_scope, "ffn_bias_0", {4096}); + AddVarToScope(param_scope, "ffn_bias_1", {1024}); + + return param_scope; +} + +TEST(FusedMultiTransformerEncoderPass, basic) { + // inputs operator output + // -------------------------------------------------------------------- + // (x, ln_scale, ln_bias) layer_norm -> layer_norm_out + // (layer_norm_out, weights_0) matmul_v2 -> matmul_out0 + // (layer_norm_out, weights_1) matmul_v2 -> matmul_out1 + // (layer_norm_out, weights_2) matmul_v2 -> matmul_out2 + // (matmul_out0, bias_0) elementwise_add -> eltadd_0 + // (matmul_out1, bias_1) elementwise_add -> eltadd_1 + // (matmul_out2, bias_2) elementwise_add -> eltadd_2 + // (eltadd_0) reshape2 -> reshape_0 + // (eltadd_1) reshape2 -> reshape_1 + // (eltadd_2) reshape2 -> reshape_2 + // (reshape_0) transpose2 -> transpose_0 + // (reshape_1) transpose2 -> transpose_1 + // (reshape_2) transpose2 -> transpose_2 + // (transpose_0, transpose_1) matmul -> matmul_qk + // (matmul_qk, bias_qk) elementwise_add -> eltadd_qk + // (eltadd_qk) softmax -> softmax_qk + // (softmax_qk, transpose_2) matmul_v2 -> matmul_qkv + // (matmul_qkv) transpose -> transpose_qkv + // (transpose_qkv) reshape -> reshape_qkv + // (reshape_qkv) matmul_v2 -> matmul_linear + // (matmul_linear) elementwise_add -> eltadd_linear + // (eltadd_out) elementwise_add -> attention_out + // + // (attention_out, scale, bias) layer_norm -> ffn_layer_norm_out + // (layer_norm_out, ffn_matmul0_w) matmul_v2 -> ffn_matmul0 + // (ffn_matmul0, ffn_bias0) elementwise_add -> ffn_eltadd0 + // (ffn_eltadd0) gelu -> ffn_gelu + // (ffn_gelu) matmul_v2 -> ffn_matmul1 + // (ffn_matmul1, ffn_bias1) elementwise_add -> ffn_eltadd1 + // (attention_out, ffn_eltadd1) elementwise_add -> ffn_output + // + // (transpose_1, transpose_2) while -> decoder block + + Layers layers; + // MHA: pre LayerNorm + auto* x = layers.data("x", {1, 128, 1024}); + auto* ln_scale = layers.data("ln_scale", {1024}, true); + auto* ln_bias = layers.data("ln_bias", {1024}, true); + auto* ln_out = 
layers.layer_norm(x, ln_scale, ln_bias)[0]; + + // MHA: QKV fc + auto* weights_0 = layers.data("weights0", {1024, 1024}, true); + auto* weights_1 = layers.data("weights1", {1024, 1024}, true); + auto* weights_2 = layers.data("weights2", {1024, 1024}, true); + auto* matmul_out_0 = + layers.matmul_v2(ln_out, weights_0, nullptr, false, true); + auto* matmul_out_1 = + layers.matmul_v2(ln_out, weights_1, nullptr, false, true); + auto* matmul_out_2 = + layers.matmul_v2(ln_out, weights_2, nullptr, false, true); + + auto* b0 = layers.data("bias_0", {1024}, true); + auto* b1 = layers.data("bias_1", {1024}, true); + auto* b2 = layers.data("bias_2", {1024}, true); + auto* elementwise_out_0 = + layers.elementwise_add(matmul_out_0, b0, nullptr, 2); + auto* elementwise_out_1 = + layers.elementwise_add(matmul_out_1, b1, nullptr, 2); + auto* elementwise_out_2 = + layers.elementwise_add(matmul_out_2, b2, nullptr, 2); + + std::vector shape = {1, 128, 16, 64}; + auto* reshape_0 = layers.reshape2(elementwise_out_0, shape, true); + auto* reshape_1 = layers.reshape2(elementwise_out_1, shape, true); + auto* reshape_2 = layers.reshape2(elementwise_out_2, shape, true); + + std::vector axis = {0, 2, 1, 3}; + auto* transpose_0 = layers.transpose2(reshape_0, axis, true); + auto* transpose_1 = layers.transpose2(reshape_1, axis, true); + auto* transpose_2 = layers.transpose2(reshape_2, axis, true); + + // Link to decoder while block + layers.while_loop({transpose_1, transpose_2}); + + // MHA: QK matmul + auto* matmul_qk = + layers.matmul(transpose_0, transpose_1, nullptr, false, true); + + auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true); + auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk, nullptr, -1); + auto* softmax_qk = layers.softmax(elementwise_qk, -1); + + // MHA: QKV matmul + auto* matmul_qkv = layers.matmul_v2(softmax_qk, transpose_2); + + auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true); + auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 1024}, true); + + // MHA: out Linear + auto* weights_l = layers.data("weights_l", {1024, 1024}, true); + auto* bias_l = layers.data("weightsl", {1024, 1024}, true); + auto* linear_matmut_out = + layers.matmul_v2(reshape_qkv_out, weights_l, nullptr, false, true); + auto* linear_eltadd_out = + layers.elementwise_add(linear_matmut_out, bias_l, nullptr, 2); + + auto* attention_out = layers.elementwise_add(x, linear_eltadd_out); + + // FFN: pre LayerNorm + auto* ffn_ln_scale = layers.data("ffn_ln_scale", {1024}, true); + auto* ffn_ln_bias = layers.data("ffn_ln_bias", {1024}, true); + auto* ffn_ln_out = + layers.layer_norm(attention_out, ffn_ln_scale, ffn_ln_bias)[0]; + + // FFN: fc1 -> gelu -> fc2 + auto* ffn_weights0 = layers.data("ffn_weights0", {1024, 4096}, true); + auto* ffn_weights1 = layers.data("ffn_weights1", {4096, 1024}, true); + auto* ffn_bias0 = layers.data("ffn_bias0", {4096}, true); + auto* ffn_bias1 = layers.data("ffn_bias1", {1024}, true); + auto* ffn_matmul0_out = + layers.matmul_v2(ffn_ln_out, ffn_weights0, nullptr, false, true); + auto* ffn_eltadd0_out = + layers.elementwise_add(ffn_matmul0_out, ffn_bias0, nullptr, 2); + auto* ffn_gelu_out = layers.gelu(ffn_eltadd0_out); + auto* ffn_matmul1_out = + layers.matmul_v2(ffn_gelu_out, ffn_weights1, nullptr, false, true); + auto* ffn_eltadd1_out = + layers.elementwise_add(ffn_matmul1_out, ffn_bias1, nullptr, 2); + + layers.elementwise_add(attention_out, ffn_eltadd1_out); + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + 
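+  // Attach the parameter scope before applying the pass: ApplyImpl() enforces
+  // a non-null param_scope(), which is forwarded to BuildFusion() so the pass
+  // can resolve the persistable weights/biases created in CreateParamScope().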
graph->Set("__param_scope__", CreateParamScope()); + + auto pass = + PassRegistry::Instance().Get("fused_multi_transformer_encoder_pass"); + if (pass.get() == nullptr) + LOG(INFO) << "get fused_multi_transformer_encoder_pass failed"; + int num_nodes_before = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + + graph.reset(pass->Apply(graph.release())); + int num_nodes_after = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + int num_fused_nodes_after = GetNumOpNodes(graph, "fused_multi_transformer"); + + PADDLE_ENFORCE_EQ(num_nodes_before, + num_nodes_after + 56, + platform::errors::InvalidArgument( + "After the fused_multi_transformer_encoder_pass, The " + "node num in graph " + "should be %d, but the result is %d", + num_nodes_before - 56, + num_nodes_after)); + PADDLE_ENFORCE_EQ(num_fused_nodes_after, + 1, + platform::errors::InvalidArgument( + "After the fused_multi_transformer_encoder pass, " + "there should be one fused_multi_transformer op, " + "but the result is %d", + num_fused_nodes_after)); +} + +TEST(FusedMultiTransformerEncoderPass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("fused_multi_transformer_encoder_pass")); +} + +TEST(FusedMultiTransformerEncoderFuseQKVPass, basic) { + // inputs operator output + // -------------------------------------------------------------------- + // (x, ln_scale, ln_bias) layer_norm -> layer_norm_out + // (layer_norm_out, weights_0) matmul_v2 -> matmul_out0 + // (matmul_out0, bias_0) elementwise_add -> eltadd_0 + // (eltadd_0) reshape2 -> reshape_0 + // (reshape_0) transpose2 -> transpose_0 + // (transpose_0) split -> split_q, split_k, + // split_v (split_k) assign -> assign_k + // (split_v) assign -> assign_v + // (split_q, split_k) matmul -> matmul_qk + // (matmul_qk, bias_qk) elementwise_add -> eltadd_qk + // (eltadd_qk) softmax -> softmax_qk + // (softmax_qk, transpose_2) matmul_v2 -> matmul_qkv + // (matmul_qkv) transpose -> transpose_qkv + // (transpose_qkv) reshape -> reshape_qkv + // (reshape_qkv) matmul_v2 -> matmul_linear + // (matmul_linear) elementwise_add -> eltadd_linear + // (eltadd_out) elementwise_add -> attention_out + // + // (attention_out, scale, bias) layer_norm -> ffn_layer_norm_out + // (layer_norm_out, ffn_matmul0_w) matmul_v2 -> ffn_matmul0 + // (ffn_matmul0, ffn_bias0) elementwise_add -> ffn_eltadd0 + // (ffn_eltadd0) gelu -> ffn_gelu + // (ffn_gelu) matmul_v2 -> ffn_matmul1 + // (ffn_matmul1, ffn_bias1) elementwise_add -> ffn_eltadd1 + // (attention_out, ffn_eltadd1) elementwise_add -> ffn_output + // + // (transpose_1, transpose_2) while -> decoder block + + Layers layers; + // MHA: pre LayerNorm + auto* x = layers.data("x", {1, 128, 1024}); + auto* ln_scale = layers.data("ln_scale", {1024}, true); + auto* ln_bias = layers.data("ln_bias", {1024}, true); + auto* ln_out = layers.layer_norm(x, ln_scale, ln_bias)[0]; + + // MHA: QKV fc + auto* weights_0 = layers.data("weights0", {1024, 3072}, true); + auto* matmul_out_0 = + layers.matmul_v2(ln_out, weights_0, nullptr, false, true); + + auto* b0 = layers.data("bias_0", {3072}, true); + auto* elementwise_out_0 = + layers.elementwise_add(matmul_out_0, b0, nullptr, 2); + + std::vector shape = {1, 128, 16, 64}; + auto* reshape_0 = layers.reshape2(elementwise_out_0, shape, true); + + std::vector axis = {0, 2, 1, 3}; + auto* transpose_0 = layers.transpose2(reshape_0, axis, true); + + auto split_outs = layers.split(transpose_0, 3, 3); + auto* split_q = split_outs[0]; + 
auto* split_k = split_outs[1]; + auto* split_v = split_outs[2]; + layers.assign(split_k); + layers.assign(split_v); + + // Link to decoder while block + layers.while_loop({split_k, split_v}); + + // MHA: QK matmul + auto* matmul_qk = layers.matmul(split_q, split_k, nullptr, false, true); + + auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true); + auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk); + auto* softmax_qk = layers.softmax(elementwise_qk, -1); + + // MHA: QKV matmul + auto* matmul_qkv = layers.matmul_v2(softmax_qk, split_v); + + auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true); + auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 1024}, true); + + // MHA: out Linear + auto* weights_l = layers.data("weights_l", {1024, 1024}, true); + auto* bias_l = layers.data("weightsl", {1024, 1024}, true); + auto* linear_matmut_out = + layers.matmul_v2(reshape_qkv_out, weights_l, nullptr, false, true); + auto* linear_eltadd_out = + layers.elementwise_add(linear_matmut_out, bias_l, nullptr, 2); + + auto* attention_out = layers.elementwise_add(x, linear_eltadd_out); + + // FFN: pre LayerNorm + auto* ffn_ln_scale = layers.data("ffn_ln_scale", {1024}, true); + auto* ffn_ln_bias = layers.data("ffn_ln_bias", {1024}, true); + auto* ffn_ln_out = + layers.layer_norm(attention_out, ffn_ln_scale, ffn_ln_bias)[0]; + + // FFN: fc1 -> gelu -> fc2 + auto* ffn_weights0 = layers.data("ffn_weights0", {1024, 4096}, true); + auto* ffn_weights1 = layers.data("ffn_weights1", {4096, 1024}, true); + auto* ffn_bias0 = layers.data("ffn_bias0", {4096}, true); + auto* ffn_bias1 = layers.data("ffn_bias1", {1024}, true); + auto* ffn_matmul0_out = + layers.matmul_v2(ffn_ln_out, ffn_weights0, nullptr, false, true); + auto* ffn_eltadd0_out = + layers.elementwise_add(ffn_matmul0_out, ffn_bias0, nullptr, 2); + auto* ffn_gelu_out = layers.gelu(ffn_eltadd0_out); + auto* ffn_matmul1_out = + layers.matmul_v2(ffn_gelu_out, ffn_weights1, nullptr, false, true); + auto* ffn_eltadd1_out = + layers.elementwise_add(ffn_matmul1_out, ffn_bias1, nullptr, 2); + + layers.elementwise_add(attention_out, ffn_eltadd1_out); + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + graph->Set("__param_scope__", CreateParamScope()); + + auto pass = PassRegistry::Instance().Get( + "fused_multi_transformer_encoder_fuse_qkv_pass"); + if (pass.get() == nullptr) + LOG(INFO) << "get fused_multi_transformer_encoder_fuse_qkv_pass failed"; + int num_nodes_before = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + + graph.reset(pass->Apply(graph.release())); + int num_nodes_after = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + int num_fused_nodes_after = GetNumOpNodes(graph, "fused_multi_transformer"); + + PADDLE_ENFORCE_EQ( + num_nodes_before, + num_nodes_after + 44, + platform::errors::InvalidArgument( + "After the fused_multi_transformer_encoder_fuse_qkv_pass, " + "The node num in graph should be %d, but the result is %d", + num_nodes_before - 44, + num_nodes_after)); + PADDLE_ENFORCE_EQ(num_fused_nodes_after, + 1, + platform::errors::InvalidArgument( + "After the fused_multi_transformer_encoder_fuse_qkv " + "pass, there should be one fused_multi_transformer " + "op, but the result is %d", + num_fused_nodes_after)); +} + +TEST(FusedMultiTransformerEncoderFuseQKVPass, pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible("fused_multi_transformer_encoder_fuse_qkv_pass")); +} + 
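A registered fusion pass only takes effect if it appears in the inference pass pipeline. The sketch below shows one way to enable the fuse-QKV pass explicitly through the public paddle_infer API; the model directory and GPU settings are placeholder assumptions, not values taken from this patch.

    // Hedged sketch: appending the pass registered above to a GPU inference
    // config. "./transformer_model" is a hypothetical model directory.
    #include "paddle_inference_api.h"

    std::shared_ptr<paddle_infer::Predictor> BuildPredictor() {
      paddle_infer::Config config("./transformer_model");
      config.EnableUseGpu(/*memory_pool_init_size_mb=*/1000, /*device_id=*/0);
      // The pass name must match the REGISTER_PASS call.
      config.pass_builder()->AppendPass(
          "fused_multi_transformer_encoder_fuse_qkv_pass");
      return paddle_infer::CreatePredictor(config);
    }
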
+TEST(MultiDevicesFusedMultiTransformerEncoderFuseQKVPass, basic) { + // inputs operator output + // -------------------------------------------------------------------- + // (x, ln_scale, ln_bias) layer_norm -> layer_norm_out + // (layer_norm_out) c_identity -> c_identity_out + // (c_identity_out, weights_0) matmul_v2 -> matmul_out0 + // (matmul_out0) elementwise_add -> eltadd_0 + // (eltadd_0) reshape2 -> reshape_0 + // (reshape_0) transpose2 -> transpose_0 + // (transpose_0) split -> split_q, split_k, + // split_v (split_k) assign -> assign_k + // (split_v) assign -> assign_v + // (split_q, split_k) matmul -> matmul_qk + // (matmul_qk, bias_qk) elementwise_add -> eltadd_qk + // (eltadd_qk) softmax -> softmax_qk + // (softmax_qk, transpose_2) matmul_v2 -> matmul_qkv + // (matmul_qkv) transpose -> transpose_qkv + // (transpose_qkv) reshape -> reshape_qkv + // (reshape_qkv) matmul_v2 -> matmul_linear + // (matmul_linear) c_all_reduce -> c_all_reduce_out + // (c_all_reduce_out) elementwise_add -> eltadd_linear + // (eltadd_out) elementwise_add -> attention_out + // + // (attention_out, scale, bias) layer_norm -> ffn_layer_norm_out + // (ffn_layer_norm_out) c_identity -> ffn_c_identity_out + // (ffn_c_identity_out, ffn_matmul0_w)matmul_v2 -> ffn_matmul0 + // (ffn_matmul0, ffn_bias0) elementwise_add -> ffn_eltadd0 + // (ffn_eltadd0) gelu -> ffn_gelu + // (ffn_gelu) matmul_v2 -> ffn_matmul1 + // (ffn_matmul1) c_all_reduce -> ffn_c_all_reduce_out + // (ffn_c_all_reduce_out, ffn_bias1)elementwise_add -> ffn_eltadd1 + // (attention_out, ffn_eltadd1) elementwise_add -> ffn_output + // + // (transpose_1, transpose_2) while -> decoder block + + Layers layers; + // MHA: pre LayerNorm + auto* x = layers.data("x", {1, 128, 1024}); + auto* ln_scale = layers.data("ln_scale", {1024}, true); + auto* ln_bias = layers.data("ln_bias", {1024}, true); + auto* ln_out = layers.layer_norm(x, ln_scale, ln_bias)[0]; + auto* c_identity_out = layers.c_identity(ln_out); + + // MHA: QKV fc + auto* weights_0 = layers.data("weights0", {1024, 3072}, true); + auto* matmul_out_0 = + layers.matmul_v2(c_identity_out, weights_0, nullptr, false, true); + + auto* b0 = layers.data("bias_0", {3072}, true); + auto* elementwise_out_0 = + layers.elementwise_add(matmul_out_0, b0, nullptr, 2); + + std::vector shape = {1, 128, 16, 64}; + auto* reshape_0 = layers.reshape2(elementwise_out_0, shape, true); + + std::vector axis = {0, 2, 1, 3}; + auto* transpose_0 = layers.transpose2(reshape_0, axis, true); + + auto split_outs = layers.split(transpose_0, 3, 3); + auto* split_q = split_outs[0]; + auto* split_k = split_outs[1]; + auto* split_v = split_outs[2]; + layers.assign(split_k); + layers.assign(split_v); + + // Link to decoder while block + layers.while_loop({split_k, split_v}); + + // MHA: QK matmul + auto* matmul_qk = layers.matmul(split_q, split_k, nullptr, false, true); + + auto* bqk = layers.data("biasqk", {1, 12, 128, 128}, true); + auto* elementwise_qk = layers.elementwise_add(matmul_qk, bqk); + auto* softmax_qk = layers.softmax(elementwise_qk, -1); + + // MHA: QKV matmul + auto* matmul_qkv = layers.matmul_v2(softmax_qk, split_v); + + auto* transpose_qkv = layers.transpose2(matmul_qkv, {0, 2, 1, 3}, true); + auto* reshape_qkv_out = layers.reshape2(transpose_qkv, {1, 128, 1024}, true); + + // MHA: out Linear + auto* weights_l = layers.data("weights_l", {1024, 1024}, true); + auto* bias_l = layers.data("weightsl", {1024, 1024}, true); + auto* linear_matmut_out = + layers.matmul_v2(reshape_qkv_out, weights_l, nullptr, false, true); + 
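+  // Tensor-parallel note: in the multi-devices graph the out-linear result is
+  // partial per device, so it is reduced with c_allreduce_sum below; the fused
+  // op preserves this by taking its "ring_id" attribute from c_identity (see
+  // fuse_creater above).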
auto* c_allreduce_out = layers.c_allreduce_sum(linear_matmut_out); + auto* linear_eltadd_out = + layers.elementwise_add(c_allreduce_out, bias_l, nullptr, 2); + + auto* attention_out = layers.elementwise_add(x, linear_eltadd_out); + + // FFN: pre LayerNorm + auto* ffn_ln_scale = layers.data("ffn_ln_scale", {1024}, true); + auto* ffn_ln_bias = layers.data("ffn_ln_bias", {1024}, true); + auto* ffn_ln_out = + layers.layer_norm(attention_out, ffn_ln_scale, ffn_ln_bias)[0]; + auto* ffn_c_identity_out = layers.c_identity(ffn_ln_out); + + // FFN: fc1 -> gelu -> fc2 + auto* ffn_weights0 = layers.data("ffn_weights0", {1024, 4096}, true); + auto* ffn_weights1 = layers.data("ffn_weights1", {4096, 1024}, true); + auto* ffn_bias0 = layers.data("ffn_bias0", {4096}, true); + auto* ffn_bias1 = layers.data("ffn_bias1", {1024}, true); + auto* ffn_matmul0_out = + layers.matmul_v2(ffn_c_identity_out, ffn_weights0, nullptr, false, true); + auto* ffn_eltadd0_out = + layers.elementwise_add(ffn_matmul0_out, ffn_bias0, nullptr, 2); + auto* ffn_gelu_out = layers.gelu(ffn_eltadd0_out); + auto* ffn_matmul1_out = + layers.matmul_v2(ffn_gelu_out, ffn_weights1, nullptr, false, true); + auto* ffn_allreduce_out = layers.c_allreduce_sum(ffn_matmul1_out); + auto* ffn_eltadd1_out = + layers.elementwise_add(ffn_allreduce_out, ffn_bias1, nullptr, 2); + + layers.elementwise_add(attention_out, ffn_eltadd1_out); + + std::unique_ptr graph(new ir::Graph(layers.main_program())); + graph->Set("__param_scope__", CreateParamScope()); + + auto pass = PassRegistry::Instance().Get( + "multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass"); + if (pass.get() == nullptr) + LOG(INFO) + << "get multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass " + "failed"; + int num_nodes_before = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + + graph.reset(pass->Apply(graph.release())); + int num_nodes_after = graph->Nodes().size(); + VLOG(3) << DebugString(graph); + int num_fused_nodes_after = GetNumOpNodes(graph, "fused_multi_transformer"); + + PADDLE_ENFORCE_EQ( + num_nodes_before, + num_nodes_after + 52, + platform::errors::InvalidArgument( + "After the fused_multi_transformer_encoder_fuse_qkv_pass, " + "The node num in graph should be %d, but the result is %d", + num_nodes_before - 52, + num_nodes_after)); + PADDLE_ENFORCE_EQ(num_fused_nodes_after, + 1, + platform::errors::InvalidArgument( + "After the fused_multi_transformer_encoder_fuse_qkv " + "multi-devices pass, there should be one " + "fused_multi_transformer op, but the result is %d", + num_fused_nodes_after)); +} + +TEST(MultiDevicesFusedMultiTransformerEncoderFuseQKVPass, + pass_op_version_check) { + ASSERT_TRUE( + paddle::framework::compatible::PassVersionCheckerRegistrar::GetInstance() + .IsPassCompatible( + "multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass")); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(fused_multi_transformer_encoder_pass); +USE_PASS(fused_multi_transformer_encoder_fuse_qkv_pass); +USE_PASS(multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass); diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 6946fb6d7d9eee..5143ccfe4531ce 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -75,7 +75,6 @@ Graph::Graph(const ProgramDesc &program, } } else { auto var_nodes = InitFromProgram(program_, start_op_index, end_op_index); - ResolveHazard(var_nodes); } } @@ -88,7 +87,6 @@ Graph::Graph(const BlockDesc &block, const 
int64_t end_op_index) : main_graph_(main_graph) { auto var_nodes = InitFromBlock(block, start_op_index, end_op_index); - ResolveHazard(var_nodes); } // TODO(levi): delete this interface after when we can convert all diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index f800a1eba89e1a..32edbb6176c4d3 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -682,9 +682,14 @@ void GraphToProgram(const Graph &graph, // avoid kRootBlockIndex not 0 if (idx == kRootBlockIndex) continue; - block = program_pb.add_blocks(); - block->set_idx(idx); - block->set_parent_idx(kRootBlockIndex); + if (static_cast(idx) < program_pb.blocks_size()) { + block = program_pb.mutable_blocks(idx); + } else { + block = program_pb.add_blocks(); + block->set_idx(idx); + block->set_parent_idx(kRootBlockIndex); + } + GraphToBlock(*graph.GetSubGraph(idx), block, sort_kind, diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 0d63ce21211318..fbf65ca47f3456 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -112,6 +112,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { if (graph.Nodes().empty()) return false; for (auto &node : GraphTraits::DFS(graph)) { + if (node.Name().rfind("__control_var") == 0) continue; for (const auto &pdnode : pattern_.nodes()) { if (pdnode->Tell(&node)) { VLOG(4) << "Node " << node.Name() << " marked as " << pdnode->name(); @@ -383,7 +384,6 @@ std::string PDPattern::DotString() const { // Create Edges for (const auto &edge : edges()) { if (!node2dot.count(edge.first) || !node2dot.count(edge.second)) { - LOG(ERROR) << "no node " << edge.first << " " << edge.second; continue; } auto &src = node2dot.at(edge.first); @@ -453,7 +453,8 @@ PDNode *PDNode::assert_var_not_persistable() { PDNode *PDNode::assert_is_persistable_var() { assert_is_var(); - asserts_.emplace_back([=](Node *x) { return x->Var()->Persistable(); }); + asserts_.emplace_back( + [=](Node *x) { return x->Var() && x->Var()->Persistable(); }); return this; } @@ -957,6 +958,44 @@ PDNode *patterns::OperatorActivation::operator()( return activation_out; } +PDNode *patterns::OperatorUnsqueeze2::operator()( + const std::string &operator_type, const int num_of_operator_outs) { + auto *preceding_op = pattern->NewNode(preceding_op_repr()) + ->assert_is_op(operator_type) + ->assert_has_n_outputs(num_of_operator_outs); + auto *preceding_op_out = pattern->NewNode(preceding_op_out_repr()) + ->AsIntermediate() + ->assert_is_op_output(operator_type, "Out") + ->assert_is_op_input("unsqueeze2"); + auto *unsqueeze2_op = + pattern->NewNode(unsqueeze2_op_repr())->assert_is_op("unsqueeze2"); + auto *unsqueeze2_out = pattern->NewNode(unsqueeze2_out_repr()) + ->AsOutput() + ->assert_is_op_output("unsqueeze2"); + preceding_op->LinksTo({preceding_op_out}); + unsqueeze2_op->LinksFrom({preceding_op_out}).LinksTo({unsqueeze2_out}); + return unsqueeze2_out; +} + +PDNode *patterns::OperatorReshape2::operator()(const std::string &operator_type, + const int num_of_operator_outs) { + auto *preceding_op = pattern->NewNode(preceding_op_repr()) + ->assert_is_op(operator_type) + ->assert_has_n_outputs(num_of_operator_outs); + auto *preceding_op_out = pattern->NewNode(preceding_op_out_repr()) + ->AsIntermediate() + ->assert_is_op_output(operator_type, "Out") + ->assert_is_op_input("reshape2"); + auto 
*reshape2_op = + pattern->NewNode(reshape2_op_repr())->assert_is_op("reshape2"); + auto *reshape2_out = pattern->NewNode(reshape2_out_repr()) + ->AsOutput() + ->assert_is_op_output("reshape2"); + preceding_op->LinksTo({preceding_op_out}); + reshape2_op->LinksFrom({preceding_op_out}).LinksTo({reshape2_out}); + return reshape2_out; +} + PDNode *patterns::SeqConvEltAddRelu::operator()( paddle::framework::ir::PDNode *seqconv_input) { // Create Operators @@ -1003,6 +1042,26 @@ PDNode *patterns::SeqConvEltAddRelu::operator()( return relu_out_var; } +PDNode *patterns::Squeeze2Transpose2::operator()() { + auto *squeeze2_op_in = pattern->NewNode(squeeze2_op_in_repr()) + ->AsInput() + ->assert_has_n_outputs(1) + ->assert_is_op_input("squeeze2", "X"); + auto *squeeze2_op = pattern->NewNode(squeeze2_op_repr()) + ->assert_is_op("squeeze2") + ->assert_has_n_outputs(2); + auto *squeeze2_op_out = pattern->NewNode(squeeze2_op_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("squeeze2", "Out") + ->assert_is_op_input("transpose2", "X"); + auto *transpose2_op = + pattern->NewNode(transpose2_op_repr())->assert_is_op("transpose2"); + + squeeze2_op->LinksFrom({squeeze2_op_in}).LinksTo({squeeze2_op_out}); + transpose2_op->LinksFrom({squeeze2_op_out}); + return transpose2_op; +} + PDNode *patterns::FC::operator()(paddle::framework::ir::PDNode *x, bool with_bias, bool with_relu) { @@ -3535,8 +3594,20 @@ PDNode *patterns::LayernormShiftPartitionPattern::operator()() { }); auto reshape1_out = pattern->NewNode(reshape1_out_repr()) ->AsIntermediate() - ->assert_is_op_input("reshape2", "X") ->assert_is_op_output("reshape2", "Out"); + PDNode *roll1_op = nullptr; + PDNode *roll1_out = nullptr; + + if (!with_roll_) { + reshape1_out->assert_is_op_input("reshape2", "X"); + } else { + reshape1_out->assert_is_op_input("roll", "X"); + roll1_op = pattern->NewNode(roll1_op_repr())->assert_is_op("roll"); + roll1_out = pattern->NewNode(roll1_out_repr()) + ->AsIntermediate() + ->assert_is_op_output("roll", "Out") + ->assert_is_op_input("reshape2", "X"); + } auto reshape2_op = pattern->NewNode(reshape2_op_repr()) ->assert_is_op("reshape2") @@ -3546,6 +3617,7 @@ PDNode *patterns::LayernormShiftPartitionPattern::operator()() { node->Op()->GetAttr("shape")) .size() == 6); }); + auto reshape2_out = pattern->NewNode(reshape2_out_repr()) ->AsIntermediate() ->assert_is_op_input("transpose2", "X") @@ -3594,7 +3666,12 @@ PDNode *patterns::LayernormShiftPartitionPattern::operator()() { layer_norm_op->LinksFrom({layer_norm_in, layer_norm_bias, layer_norm_scale}) .LinksTo({layer_norm_out}); reshape1_op->LinksFrom({layer_norm_out}).LinksTo({reshape1_out}); - reshape2_op->LinksFrom({reshape1_out}).LinksTo({reshape2_out}); + if (!with_roll_) { + reshape2_op->LinksFrom({reshape1_out}).LinksTo({reshape2_out}); + } else { + roll1_op->LinksFrom({reshape1_out}).LinksTo({roll1_out}); + reshape2_op->LinksFrom({roll1_out}).LinksTo({reshape2_out}); + } transpose_op->LinksFrom({reshape2_out}).LinksTo({transpose_out}); reshape3_op->LinksFrom({transpose_out}).LinksTo({reshape3_out}); reshape4_op->LinksFrom({reshape3_out}).LinksTo({reshape4_out}); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index b2eb740b9acaf7..af09ce0b86a510 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -539,6 +539,32 @@ struct OperatorActivation : public PatternBase { PATTERN_DECL_NODE(activation_out); }; +struct OperatorUnsqueeze2 : 
public PatternBase { + OperatorUnsqueeze2(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "operator_unsqueeze2") {} + + PDNode* operator()(const std::string& operator_type, + const int num_of_outputs); + + PATTERN_DECL_NODE(preceding_op); + PATTERN_DECL_NODE(preceding_op_out); + PATTERN_DECL_NODE(unsqueeze2_op); + PATTERN_DECL_NODE(unsqueeze2_out); +}; + +struct OperatorReshape2 : public PatternBase { + OperatorReshape2(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "operator_reshape2") {} + + PDNode* operator()(const std::string& operator_type, + const int num_of_outputs); + + PATTERN_DECL_NODE(preceding_op); + PATTERN_DECL_NODE(preceding_op_out); + PATTERN_DECL_NODE(reshape2_op); + PATTERN_DECL_NODE(reshape2_out); +}; + // SEQCONV with Elementwise_Add ReLU // op: seqconv + elementwise_add + relu // named nodes: @@ -608,6 +634,20 @@ struct FCMKLDNN : public PatternBase { PATTERN_DECL_NODE(output); }; +// Squeeze2 + Transpose2 +// Forward pass +struct Squeeze2Transpose2 : public PatternBase { + Squeeze2Transpose2(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "squeeze2_transpose2") {} + + PDNode* operator()(); + + PATTERN_DECL_NODE(squeeze2_op_in); + PATTERN_DECL_NODE(squeeze2_op); + PATTERN_DECL_NODE(squeeze2_op_out); + PATTERN_DECL_NODE(transpose2_op); +}; + // Embedding struct Embedding : public PatternBase { Embedding(PDPattern* pattern, const std::string& name_scope) @@ -1073,7 +1113,7 @@ struct ResidualElementwise : public PatternBase { }; // General struct for immutable ops: -// reshape, transpose, slice, shape, nearest-interp +// reshape, transpose, slice, nearest-interp // Forward pass for no weights-op. // immutable_out is a result of the operator. struct Immutable : public PatternBase { @@ -1917,11 +1957,13 @@ struct LayerNorm : public PatternBase { // struct LayernormShiftPartitionPattern : public PatternBase { LayernormShiftPartitionPattern(PDPattern* pattern, - const std::string& name_scope) - : PatternBase(pattern, name_scope, "layernorm_shift_partition") {} + const std::string& name_scope, + bool with_roll) + : PatternBase(pattern, name_scope, "layernorm_shift_partition"), + with_roll_(with_roll) {} PDNode* operator()(); - + bool with_roll_; PATTERN_DECL_NODE(layer_norm_in); PATTERN_DECL_NODE(layer_norm_op); PATTERN_DECL_NODE(layer_norm_bias); @@ -1929,6 +1971,10 @@ struct LayernormShiftPartitionPattern : public PatternBase { PATTERN_DECL_NODE(layer_norm_out); PATTERN_DECL_NODE(reshape1_op); PATTERN_DECL_NODE(reshape1_out); + // optional op roll + PATTERN_DECL_NODE(roll1_op); + PATTERN_DECL_NODE(roll1_out); + PATTERN_DECL_NODE(reshape2_op); PATTERN_DECL_NODE(reshape2_out); PATTERN_DECL_NODE(transpose_op); @@ -1956,12 +2002,26 @@ struct AddSupportInt8 : public PatternBase { a->outputs.push_back(b); \ b->inputs.push_back(a); +// UnLink 2 ir::Nodes from each other. 
+#define IR_NODE_UNLINK(a, b) \ + a->outputs.erase( \ + std::remove(std::begin(a->outputs), std::end(a->outputs), b), \ + std::end(a->outputs)); \ + b->inputs.erase(std::remove(std::begin(b->inputs), std::end(b->inputs), a), \ + std::end(b->inputs)); + // Set the out_var as the output of the op #define IR_OP_VAR_LINK(op, out_var) \ op->outputs.push_back(out_var); \ out_var->inputs.clear(); \ out_var->inputs.push_back(op); +// Set the in_var as the input of the op +#define IR_VAR_OP_LINK(in_var, op) \ + in_var->outputs.clear(); \ + in_var->outputs.push_back(op); \ + op->inputs.push_back(in_var); + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_test.cc b/paddle/fluid/framework/ir/graph_test.cc index 84a9a3b74c0a24..b1d550c54b4e03 100644 --- a/paddle/fluid/framework/ir/graph_test.cc +++ b/paddle/fluid/framework/ir/graph_test.cc @@ -130,86 +130,6 @@ TEST(GraphTest, Basic) { ASSERT_EQ(nodes.size(), 5UL); } -TEST(GraphTest, WriteAfterRead) { - // void Test() { - ProgramDesc prog; - auto *op = prog.MutableBlock(0)->AppendOp(); - op->SetType("sum"); - op->SetInput("X", {"a"}); - op->SetOutput("Out", {"b"}); - op->SetAttr("op_role", 1); - - op = prog.MutableBlock(0)->AppendOp(); - op->SetType("dummy"); - op->SetInput("X", {"c"}); - op->SetOutput("Out", {"a"}); - op->SetAttr("op_role", 1); - - prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); - - std::unique_ptr g(new ir::Graph(prog)); - ir::Node *control_dep1 = nullptr; - ir::Node *control_dep2 = nullptr; - for (ir::Node *n : g->Nodes()) { - if (n->Name() == "sum") { - ASSERT_EQ(n->outputs[0]->Name(), "b"); - ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1])); - control_dep1 = n->outputs[1]; - ASSERT_EQ(n->outputs.size(), 2UL); - } - if (n->Name() == "dummy") { - ASSERT_EQ(n->inputs[0]->Name(), "c"); - ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1])); - control_dep2 = n->inputs[1]; - ASSERT_EQ(n->inputs.size(), 2UL); - } - } - ASSERT_EQ(control_dep1, control_dep2); -} - -TEST(GraphTest, WriteAfterWrite) { - // void Test() { - ProgramDesc prog; - auto *op = prog.MutableBlock(0)->AppendOp(); - op->SetType("sum"); - op->SetInput("X", {"a"}); - op->SetOutput("Out", {"b"}); - op->SetAttr("op_role", 1); - - op = prog.MutableBlock(0)->AppendOp(); - op->SetType("dummy"); - op->SetInput("X", {"c"}); - op->SetOutput("Out", {"b"}); - op->SetAttr("op_role", 1); - - prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); - prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); - - std::unique_ptr g(new ir::Graph(prog)); - ir::Node *control_dep1 = nullptr; - ir::Node *control_dep2 = nullptr; - for (ir::Node *n : g->Nodes()) { - if (n->Name() == "sum") { - ASSERT_EQ(n->outputs[0]->Name(), "b"); - ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1])); - ASSERT_EQ(n->outputs.size(), 2UL); - control_dep1 = n->outputs[1]; - } - if (n->Name() == "dummy") { - ASSERT_EQ(n->inputs[0]->Name(), "c"); - ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1])); - control_dep2 = n->inputs[1]; - ASSERT_EQ(n->inputs.size(), 2UL); - } - } - ASSERT_NE(control_dep1, nullptr); - ASSERT_NE(control_dep2, nullptr); - ASSERT_EQ(control_dep1, control_dep2); -} - TEST(GraphTest, TestException) { ProgramDesc prog; std::unique_ptr g(new ir::Graph(prog)); @@ -350,12 +270,13 @@ 
TEST(GraphTest, TestMultiBlock) { op = prog.MutableBlock(1)->AppendOp(); op->SetType("dummy"); op->SetInput("X", {"c"}); - op->SetOutput("Out", {"a"}); + op->SetOutput("Out", {"d"}); op->SetAttr("op_role", 1); prog.MutableBlock(1)->Var("a")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(1)->Var("b")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(1)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(1)->Var("d")->SetType(proto::VarType::LOD_TENSOR); // Set contents in block_2. op = prog.MutableBlock(2)->AppendOp(); @@ -367,12 +288,13 @@ TEST(GraphTest, TestMultiBlock) { op = prog.MutableBlock(2)->AppendOp(); op->SetType("dummy"); op->SetInput("X", {"c"}); - op->SetOutput("Out", {"b"}); + op->SetOutput("Out", {"d"}); op->SetAttr("op_role", 1); prog.MutableBlock(2)->Var("a")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(2)->Var("b")->SetType(proto::VarType::LOD_TENSOR); prog.MutableBlock(2)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(2)->Var("d")->SetType(proto::VarType::LOD_TENSOR); // Step2: Convert program into graph, 3 blocks corresponding 3 sub_graphs. std::unique_ptr<ir::Graph> g(new ir::Graph(prog)); @@ -399,45 +321,29 @@ TEST(GraphTest, TestMultiBlock) { // Check contents in sub_graph_1. const ir::Graph *g1 = g->GetSubGraph(1); - ir::Node *control_dep1 = nullptr; - ir::Node *control_dep2 = nullptr; for (ir::Node *n : g1->Nodes()) { if (n->Name() == "sum") { ASSERT_EQ(n->outputs[0]->Name(), "b"); - ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1])); - control_dep1 = n->outputs[1]; - ASSERT_EQ(n->outputs.size(), 2UL); + ASSERT_EQ(n->outputs.size(), 1UL); } if (n->Name() == "dummy") { ASSERT_EQ(n->inputs[0]->Name(), "c"); - ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1])); - control_dep2 = n->inputs[1]; - ASSERT_EQ(n->inputs.size(), 2UL); + ASSERT_EQ(n->inputs.size(), 1UL); } } - ASSERT_EQ(control_dep1, control_dep2); // Check contents in sub_graph_2. const ir::Graph *g2 = g->GetSubGraph(2); - control_dep1 = nullptr; - control_dep2 = nullptr; for (ir::Node *n : g2->Nodes()) { if (n->Name() == "sum") { ASSERT_EQ(n->outputs[0]->Name(), "b"); - ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1])); - ASSERT_EQ(n->outputs.size(), 2UL); - control_dep1 = n->outputs[1]; + ASSERT_EQ(n->outputs.size(), 1UL); } if (n->Name() == "dummy") { ASSERT_EQ(n->inputs[0]->Name(), "c"); - ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1])); - control_dep2 = n->inputs[1]; - ASSERT_EQ(n->inputs.size(), 2UL); + ASSERT_EQ(n->inputs.size(), 1UL); } } - ASSERT_NE(control_dep1, nullptr); - ASSERT_NE(control_dep2, nullptr); - ASSERT_EQ(control_dep1, control_dep2); // Step3: Clone graph.
std::shared_ptr clone_g = g->Clone(); diff --git a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc index 9b2793c3034969..ed5ac1a4c09385 100644 --- a/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc +++ b/paddle/fluid/framework/ir/ipu/infer_shape_pass.cc @@ -53,6 +53,10 @@ void InferShapePass::ApplyImpl(ir::Graph* graph) const { if (node->Var()->GetDataType() == proto::VarType::INT64) { node->Var()->SetDataType(proto::VarType::INT32); } + // float64->float32 + if (node->Var()->GetDataType() == proto::VarType::FP64) { + node->Var()->SetDataType(proto::VarType::FP32); + } } } diff --git a/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.cc b/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.cc index a0820afc2d8ee7..76b2792e3dfec0 100644 --- a/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.cc +++ b/paddle/fluid/framework/ir/ipu/inference_dtype_transfer_pass.cc @@ -37,11 +37,30 @@ void InferenceDtypeTransferPass::ApplyImpl(ir::Graph* graph) const { VLOG(10) << "Transfer var to fp16..."; auto* scope = ipu_backend->GetScope(); + // Record specific vars to skip + std::set skip_var_lists; + for (auto* node : graph->Nodes()) { + if (node->IsOp()) { + // clip op' attrs `max` and `min` only support FP32 + if (node->Op()->Type() == "popart_clip") { + auto min_tensor_name = node->Op()->InputArgumentNames()[1]; + auto max_tensor_name = node->Op()->InputArgumentNames()[2]; + skip_var_lists.insert(min_tensor_name); + skip_var_lists.insert(max_tensor_name); + } + } + } + std::unordered_set used_var_names; for (auto* node : graph->Nodes()) { if (node->IsVar()) { auto var_desc = node->Var(); if (var_desc->GetDataType() == proto::VarType::FP32) { + // Skip specific vars + if (skip_var_lists.find(var_desc->Name()) != skip_var_lists.end()) { + continue; + } + // Transfer the dtypes of var_desc var_desc->SetDataType(proto::VarType::FP16); VLOG(10) << "Transfer the VarDesc of " << var_desc->Name() << " to " @@ -81,6 +100,12 @@ void InferenceDtypeTransferPass::ApplyImpl(ir::Graph* graph) const { } } if (op_desc->Type() == "popart_constant") { + // Skip specific constant + auto output_var_name = node->outputs[0]->Var()->Name(); + if (skip_var_lists.find(output_var_name) != skip_var_lists.end()) { + continue; + } + // Transfer the dtype of fill_constant Op if (op_desc->GetAttrIfExists("dtype") == 1) { op_desc->SetAttr("dtype", 10); diff --git a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc index 55a4e320ea274e..11679c95b1133f 100644 --- a/paddle/fluid/framework/ir/ipu/inference_process_pass.cc +++ b/paddle/fluid/framework/ir/ipu/inference_process_pass.cc @@ -93,6 +93,33 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { // Set tiles_per_ipu for IPUMODEL ipu_strategy_instance_->tiles_per_ipu = 128; + // Set Cache path + auto* ipu_cache_path = getenv("IPU_CACHE_PATH"); + if (ipu_cache_path) { + ipu_strategy_instance_->popart_options.enableEngineCaching = true; + ipu_strategy_instance_->popart_options.cachePath = + std::string{ipu_cache_path}; + } + + // custom ops and patterns + std::unordered_set custom_op_names; + auto custom_ops_info = + graph->Get>>("custom_ops_info"); + for (auto custom_op : custom_ops_info) { + ipu_strategy_instance_->AddCustomOp( + custom_op[0], custom_op[1], custom_op[2], atoi(custom_op[3].c_str())); + custom_op_names.insert(custom_op[0]); + } + auto patterns = + graph->Get>>("custom_patterns"); + for (auto pattern : 
patterns) { + if (pattern[1] == "True") { + ipu_strategy_instance_->EnablePattern(pattern[0]); + } else if (pattern[1] == "False") { + ipu_strategy_instance_->DisablePattern(pattern[0]); + } + } + ipu_backend->SetIpuStrategy(*(ipu_strategy_instance_.get())); // Get feed_list and fetch list @@ -140,6 +167,11 @@ void InferenceProcessPass::ApplyImpl(ir::Graph* graph) const { "feed_list", new std::vector(feed_list.begin(), feed_list.end())); } + if (pass_name == "popart_canonicalization_pass") { + pass->Set("custom_ops", + new std::unordered_set(custom_op_names.begin(), + custom_op_names.end())); + } pass->Apply(graph); } diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index a97873e82f4554..47a3e46d076c31 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -25,17 +25,18 @@ class Graph; void IsTestPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Sets is_test attrbiute to true and if it is missing, inserts it " "for activations and pooling."; - auto op_list = {"pool2d", "sigmoid", "logsigmoid", - "softshrink", "exp", "brelu", - "pow", "leaky_relu", "stanh", - "relu", "tanh", "tanh_shrink", - "sqrt", "abs", "ceil", - "elu", "floor", "cos", - "sin", "round", "reciprocal", - "hard_shrink", "hard_sigmoid", "relu6", - "soft_relu", "swish", "thresholded_relu", - "log", "square", "softplus", - "softsign", "silu", "mish"}; + auto op_list = {"pool2d", "sigmoid", "logsigmoid", + "softshrink", "exp", "brelu", + "pow", "leaky_relu", "stanh", + "relu", "tanh", "tanh_shrink", + "sqrt", "abs", "ceil", + "elu", "floor", "cos", + "sin", "round", "reciprocal", + "hard_shrink", "hard_sigmoid", "relu6", + "soft_relu", "swish", "thresholded_relu", + "log", "square", "softplus", + "softsign", "silu", "mish", + "gumbel_softmax"}; for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); diff --git a/paddle/fluid/framework/ir/layernorm_shift_partition_fuse_pass.cc b/paddle/fluid/framework/ir/layernorm_shift_partition_fuse_pass.cc index 9353f4b3efd848..dbe990f636372c 100644 --- a/paddle/fluid/framework/ir/layernorm_shift_partition_fuse_pass.cc +++ b/paddle/fluid/framework/ir/layernorm_shift_partition_fuse_pass.cc @@ -16,6 +16,7 @@ #include #include +#include #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_proto_maker.h" @@ -85,22 +86,33 @@ LayerNormShiftPartitionFusePass::LayerNormShiftPartitionFusePass() { .AddAttr("axis") .IsType>() .End(); + AddOpCompat(OpCompat("roll")) + .AddInput("X") + .IsTensor() + .End() + .AddOutput("Out") + .IsTensor() + .End() + .AddAttr("axis") + .IsType>() + .End() + .AddAttr("shifts") + .IsType>() + .End(); } -void LayerNormShiftPartitionFusePass::ApplyImpl(ir::Graph* graph) const { +int LayerNormShiftPartitionFusePass::ApplyPattern(ir::Graph* graph, + bool with_roll) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument( "The input graph of LayerNormShiftPartitionFusePass should not be " "nullptr.")); - FusePassBase::Init(scope_name_, graph); - GraphPatternDetector gpd; patterns::LayernormShiftPartitionPattern shift_patition_pattern( - gpd.mutable_pattern(), scope_name_); + gpd.mutable_pattern(), scope_name_, with_roll); shift_patition_pattern(); - int found_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -108,8 +120,13 @@ void LayerNormShiftPartitionFusePass::ApplyImpl(ir::Graph* graph) const { LOG(WARNING) << "layernorm_shift_partition_fuse in 
op compat failed."; return; } - - VLOG(4) << "layernorm_shift_partition_fuse pass"; + if (with_roll) { + VLOG(4) + << "layernorm_shift_partition_fuse pass, shift_size>0, with roll op"; + } else { + VLOG(4) << "layernorm_shift_partition_fuse pass, shift_size=0, without " + "roll op"; + } GET_IR_NODE_FROM_SUBGRAPH( layer_norm_in, layer_norm_in, shift_patition_pattern); GET_IR_NODE_FROM_SUBGRAPH( @@ -123,6 +140,15 @@ void LayerNormShiftPartitionFusePass::ApplyImpl(ir::Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(reshape1_op, reshape1_op, shift_patition_pattern); GET_IR_NODE_FROM_SUBGRAPH( reshape1_out, reshape1_out, shift_patition_pattern); + Node* roll1_op = nullptr; + Node* roll1_out = nullptr; + if (with_roll) { + GET_IR_NODE_FROM_SUBGRAPH(tmp_roll1_op, roll1_op, shift_patition_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + tmp_roll1_out, roll1_out, shift_patition_pattern); + roll1_op = tmp_roll1_op; + roll1_out = tmp_roll1_out; + } GET_IR_NODE_FROM_SUBGRAPH(reshape2_op, reshape2_op, shift_patition_pattern); GET_IR_NODE_FROM_SUBGRAPH( reshape2_out, reshape2_out, shift_patition_pattern); @@ -136,6 +162,21 @@ void LayerNormShiftPartitionFusePass::ApplyImpl(ir::Graph* graph) const { GET_IR_NODE_FROM_SUBGRAPH(reshape4_op, reshape4_op, shift_patition_pattern); GET_IR_NODE_FROM_SUBGRAPH( reshape4_out, reshape4_out, shift_patition_pattern); + std::unordered_set del_node_set = {layer_norm_op, + layer_norm_out, + reshape1_op, + reshape1_out, + reshape2_op, + reshape2_out, + transpose_op, + transpose_out, + reshape3_op, + reshape3_out, + reshape4_op}; + if (with_roll) { + del_node_set.insert(roll1_op); + del_node_set.insert(roll1_out); + } std::vector shape_atr1 = PADDLE_GET_CONST(std::vector, reshape1_op->Op()->GetAttr("shape")); @@ -165,7 +206,20 @@ void LayerNormShiftPartitionFusePass::ApplyImpl(ir::Graph* graph) const { if (window_size < 0 || input_resolution < 0) { return; } - + int shift_size = 0; + if (with_roll) { + std::vector roll_axis = PADDLE_GET_CONST( + std::vector, roll1_op->Op()->GetAttr("axis")); + std::vector roll_shifts = PADDLE_GET_CONST( + std::vector, roll1_op->Op()->GetAttr("shifts")); + if (roll_axis.size() != 2 || roll_axis[0] != 1 || roll_axis[1] != 2) { + return; + } + if (roll_shifts.size() != 2 || roll_shifts[0] != roll_shifts[1]) { + return; + } + shift_size = static_cast(-roll_shifts[0]); + } OpDesc new_op_desc; new_op_desc.SetType("layernorm_shift_partition"); new_op_desc.SetInput("X", {layer_norm_in->Name()}); @@ -176,6 +230,7 @@ void LayerNormShiftPartitionFusePass::ApplyImpl(ir::Graph* graph) const { new_op_desc.SetAttr("begin_norm_axis", layer_norm_op->Op()->GetAttr("begin_norm_axis")); new_op_desc.SetAttr("window_size", window_size); + new_op_desc.SetAttr("shift_size", shift_size); new_op_desc.SetAttr("input_resolution", input_resolution); new_op_desc.Flush(); @@ -185,22 +240,19 @@ void LayerNormShiftPartitionFusePass::ApplyImpl(ir::Graph* graph) const { IR_NODE_LINK_TO(layer_norm_bias, layernorm_shift_partition); IR_NODE_LINK_TO(layer_norm_scale, layernorm_shift_partition); IR_NODE_LINK_TO(layernorm_shift_partition, reshape4_out); - GraphSafeRemoveNodes(graph, - {layer_norm_op, - layer_norm_out, - reshape1_op, - reshape1_out, - reshape2_op, - reshape2_out, - transpose_op, - transpose_out, - reshape3_op, - reshape3_out, - reshape4_op}); + GraphSafeRemoveNodes(graph, del_node_set); ++found_count; }; gpd(graph, handler); + + return found_count; +} + +void LayerNormShiftPartitionFusePass::ApplyImpl(ir::Graph* graph) const { + int found_count = 0; + found_count += 
ApplyPattern(graph, true); + found_count += ApplyPattern(graph, false); AddStatis(found_count); } diff --git a/paddle/fluid/framework/ir/layernorm_shift_partition_fuse_pass.h b/paddle/fluid/framework/ir/layernorm_shift_partition_fuse_pass.h index 7c3d435ef43044..6bbcd64e30a27f 100644 --- a/paddle/fluid/framework/ir/layernorm_shift_partition_fuse_pass.h +++ b/paddle/fluid/framework/ir/layernorm_shift_partition_fuse_pass.h @@ -37,6 +37,26 @@ namespace ir { // reshape2 // | // other_op +// +// or +// +// | +// layer_norm +// | +// reshape2 +// | +// roll +// | +// reshape2 | +// | fuse layernorm_shift_patition +// transpose2 -> | +// | other_op +// reshape2 +// | +// reshape2 +// | +// other_op + class LayerNormShiftPartitionFusePass : public FusePassBase { public: LayerNormShiftPartitionFusePass(); @@ -44,6 +64,7 @@ class LayerNormShiftPartitionFusePass : public FusePassBase { protected: void ApplyImpl(ir::Graph *graph) const override; + int ApplyPattern(ir::Graph *graph, bool with_roll) const; private: const std::string scope_name_{"layernorm_shift_partition_fuse"}; diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc index 399ad4a3ca5231..223d944c83a853 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/recurrent_op_eager_deletion_pass.cc @@ -36,6 +36,29 @@ void RecurrentOpEagerDeletionPass::ApplyImpl(Graph *graph) const { std::unordered_map target_ops = DeviceIdToRecurrentAndRecurrentGradOp(*graph); + if (graph->IsConstructedByPartialProgram()) { + PADDLE_ENFORCE_LE(target_ops.size(), + 1, + platform::errors::InvalidArgument( + "Unsupported multi devices if graph is constructed " + "with partial program.")); + size_t scope_idx = 0; + auto &recur_ops = target_ops[scope_idx].first; + auto &recur_grad_ops = target_ops[scope_idx].second; + + auto all_ops = graph->OriginProgram().Block(0).AllOps(); + if (recur_ops.empty()) { + operators::AppendOpVariantByOpName( + all_ops, std::string("recurrent"), &recur_ops); + } else if (recur_grad_ops.empty()) { + operators::AppendOpVariantByOpName( + all_ops, std::string("recurrent_grad"), &recur_grad_ops); + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "One of recur_ops or recur_grad_ops should be empty.")); + } + } + for (auto &entry : target_ops) { // Prepare safe eager deletion on different devices because the garbage // collection may be different across devices diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc old mode 100644 new mode 100755 index 394c1ae797e4c7..2c807b411b607d --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.cc @@ -19,7 +19,6 @@ #include #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" #include "paddle/fluid/framework/op_version_registry.h" namespace paddle { @@ -68,7 +67,7 @@ std::vector ComputePropagateScalesMkldnnPass::GetScales(Tensor* tensor, for (int i = 0; i < columns; i++) { float max_value = FLT_MIN; for (int j = 0; j < rows; j++) { - max_value = std::max(max_value, std::abs(data[i + j * columns])); + max_value = std::max(max_value, std::abs(data[j + i * rows])); } max_value = 1.0 / max_value; if 
(std::isinf(max_value) || std::isnan(max_value)) { @@ -337,27 +336,45 @@ void ComputePropagateScalesMkldnnPass::ComputeWeightScales( ComputeLstmWeightScales(graph, scope, "WeightX", "WeightH", var_quant_scales); } -void ComputePropagateScalesMkldnnPass::UpdateScaleOpInScale( +void ComputePropagateScalesMkldnnPass::UpdateScaleOpInOutScales( Node* op_node, const std::string& input_name, const std::string& output_name, StringPairMap* var_quant_scales) const { - auto iter = var_quant_scales->find(output_name); - if (iter != var_quant_scales->end()) { - auto pair = iter->second; - const auto tensor = pair.second; - - const auto scale = PADDLE_GET_CONST(float, op_node->Op()->GetAttr("scale")); - Tensor tmp_tensor; - tmp_tensor.Resize(tensor.dims()); - auto* data = tmp_tensor.mutable_data(platform::CPUPlace()); - for (int i = 0; i < tensor.numel(); i++) { - data[i] = data[i] * scale; - } + auto out_iter = var_quant_scales->find(output_name); + auto input_iter = var_quant_scales->find(input_name); + // All the input and output have scales + if (out_iter != var_quant_scales->end() && + input_iter != var_quant_scales->end()) { + return; + } + const auto scale = PADDLE_GET_CONST(float, op_node->Op()->GetAttr("scale")); + if (std::abs(scale) < 1e-6 && out_iter != var_quant_scales->end()) { + return; + } - auto new_pair = std::make_pair(pair.first, tmp_tensor); - var_quant_scales->insert(std::make_pair(input_name, new_pair)); + std::string name = input_name; + auto iter = out_iter; + if (input_iter != var_quant_scales->end()) { + iter = input_iter; + name = output_name; + } + + phi::DenseTensor tmp_tensor; + auto pair = iter->second; + const auto tensor = pair.second; + tmp_tensor.Resize(tensor.dims()); + auto* data = tmp_tensor.mutable_data(platform::CPUPlace()); + auto* src_data = tensor.data(); + for (int i = 0; i < tensor.numel(); i++) { + if (out_iter != var_quant_scales->end()) { + data[i] = src_data[i] / scale; + } else { + data[i] = src_data[i] * scale; + } } + auto new_pair = std::make_pair(pair.first, tmp_tensor); + var_quant_scales->insert(std::make_pair(name, new_pair)); } std::unordered_set ComputePropagateScalesMkldnnPass::UpdateScales( @@ -394,21 +411,62 @@ std::unordered_set ComputePropagateScalesMkldnnPass::UpdateScales( auto out_iter = var_quant_scales->find(op_node->Op()->Output("Out")[0]); if (out_iter != var_quant_scales->end()) { std::vector input_names = op_node->Op()->Input("X"); - for (auto input_name : input_names) - (*var_quant_scales)[input_name] = out_iter->second; + for (auto input_name : input_names) { + auto concat_in_iter = var_quant_scales->find(input_name); + if (concat_in_iter == var_quant_scales->end()) + (*var_quant_scales)[input_name] = out_iter->second; + else + (*var_quant_scales)[input_name].second = out_iter->second.second; + } } } else if (op_name == "scale") { const std::string output_name = op_node->Op()->Output("Out")[0]; + const std::string input_name = op_node->Op()->Input("X")[0]; auto out_iter = var_quant_scales->find(output_name); - if (out_iter != var_quant_scales->end()) { - const std::string input_name = op_node->Op()->Input("X")[0]; - UpdateScaleOpInScale( + auto input_iter = var_quant_scales->find(input_name); + if (out_iter != var_quant_scales->end() || + input_iter != var_quant_scales->end()) { + UpdateScaleOpInOutScales( op_node, input_name, output_name, var_quant_scales); } } } return waiting_for_scale; } +void ComputePropagateScalesMkldnnPass::UpdateReluOutputScales( + ir::Graph* graph, StringPairMap* var_quant_scales) const { + for (auto* 
op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp()) continue; + auto op = op_node->Op(); + bool is_unsigned = false; + std::string output_name = "Out"; + std::string act_name; + if (op->Type() == "relu") { + is_unsigned = true; + } else { + if (op->Type() == "conv2d") { + act_name = "fuse_activation"; + output_name = "Output"; + } else if (op->Type() == "fc") { + act_name = "activation_type"; + } + if (!act_name.empty()) { + auto act = op->GetAttrIfExists(act_name); + if (act == "relu" || act == "relu6") { + is_unsigned = true; + } + } + } + if (is_unsigned) { + std::string output_var_name = op->Output(output_name)[0]; + auto out_iter = var_quant_scales->find(output_var_name); + if (out_iter != var_quant_scales->end()) { + (*var_quant_scales)[output_var_name].first = true; + } + } + } +} void ComputePropagateScalesMkldnnPass::PropagateScales( ir::Graph* graph, @@ -427,21 +485,6 @@ void ComputePropagateScalesMkldnnPass::PropagateScales( } } -void ComputePropagateScalesMkldnnPass::ConvertStringPairMap( - const StringPairMap& var_quant_scales, - std::unordered_map>* info_map) const { - for (auto iter = var_quant_scales.begin(); iter != var_quant_scales.end(); - iter++) { - auto* data = iter->second.second.data(); - std::vector data_v; - for (int i = 0; i < iter->second.second.numel(); i++) { - data_v.push_back(data[i]); - } - - info_map->insert(std::make_pair(iter->first, data_v)); - } -} - void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Convert paddle model to mkldnn quantized model."; const std::string pattern_name = "compute_propagate_scales_mkldnn_pass"; @@ -461,13 +504,13 @@ void ComputePropagateScalesMkldnnPass::ApplyImpl(ir::Graph* graph) const { auto* scope = param_scope(); GetQuantInfo(graph, &var_quant_scales); ComputeWeightScales(graph, scope, &var_quant_scales); + UpdateReluOutputScales(graph, &var_quant_scales); PropagateScales(graph, &var_quant_scales, scale_immutable_ops); // save var_quant_scales in the first op's attr // for cpu_quantize_pass - std::unordered_map> info_map; - ConvertStringPairMap(var_quant_scales, &info_map); - SaveInfoInTheFirstOp(graph, "has_quant_info", "var_quant_scales", info_map); + SaveInfoInTheFirstOp( + graph, "has_quant_info", "var_quant_scales", var_quant_scales); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h old mode 100644 new mode 100755 index 09863fdc768b22..001969887da653 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h @@ -17,13 +17,12 @@ #include #include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h" namespace paddle { namespace framework { namespace ir { -using StringPairMap = std::unordered_map>; - class ComputePropagateScalesMkldnnPass : public FusePassBase { public: ComputePropagateScalesMkldnnPass() = default; @@ -77,10 +76,13 @@ class ComputePropagateScalesMkldnnPass : public FusePassBase { Scope* scope, StringPairMap* var_quant_scales) const; - void UpdateScaleOpInScale(Node* op_node, - const std::string& input_name, - const std::string& output_name, - StringPairMap* var_quant_scales) const; + void UpdateReluOutputScales(ir::Graph* graph, + StringPairMap* var_quant_scales) const; + + void UpdateScaleOpInOutScales(Node* op_node, + const std::string& input_name, + 
const std::string& output_name, + StringPairMap* var_quant_scales) const; std::unordered_set UpdateScales( ir::Graph* graph, @@ -91,10 +93,6 @@ class ComputePropagateScalesMkldnnPass : public FusePassBase { ir::Graph* graph, StringPairMap* var_quant_scales, const std::unordered_set& scale_immutable_ops) const; - - void ConvertStringPairMap( - const StringPairMap& var_quant_scales, - std::unordered_map>* info_map) const; }; } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc index 38c6fb57d58e38..543394549e2492 100644 --- a/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass_tester.cc @@ -13,6 +13,7 @@ // limitations under the License. #include +#include #include "paddle/fluid/framework/ir/mkldnn/compute_propagate_scales_mkldnn_pass.h" #include "paddle/fluid/framework/naive_executor.h" @@ -91,11 +92,16 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { graph, scope, wx_name, wh_name, var_quant_scales); } + void UpdateReluOutputScales(ir::Graph* graph, + StringPairMap* var_quant_scales) const { + pass->UpdateReluOutputScales(graph, var_quant_scales); + } + void InitTensorHolder(Scope* scope, const paddle::platform::Place& place, const std::string& var_name) { auto x = scope->Var(var_name); - auto tensor = x->GetMutable(); + auto tensor = x->GetMutable(); auto tensor_size = 1; if (var_name == "filter") { tensor_size = positive_and_negative_values.size(); @@ -124,7 +130,6 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { } void ComputeRnnWeightScalesTest(const std::string& type, - const std::initializer_list& ops, const framework::ProgramDesc& prog, std::vector scales) { ir::Graph* graph(new ir::Graph(prog)); @@ -140,7 +145,7 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { StringPairMap var_quant_scales; auto* wx_var = scope.FindVar(wx_var_names); - auto* wx_tensor = wx_var->GetMutable(); + auto* wx_tensor = wx_var->GetMutable(); wx_tensor->Resize(phi::make_dim(wx.size(), wx[0].size())); for (size_t i = 0; i < wx.size(); i++) std::copy(begin(wx[i]), @@ -149,7 +154,7 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { i * wx[0].size()); auto* wh_var = scope.FindVar(wh_var_names); - auto* wh_tensor = wh_var->GetMutable(); + auto* wh_tensor = wh_var->GetMutable(); wh_tensor->Resize(phi::make_dim(wh.size(), wh[0].size())); for (size_t i = 0; i < wh.size(); i++) std::copy(begin(wh[i]), @@ -174,6 +179,24 @@ class ComputePropagateScalesMkldnnPassTest : public testing::Test { } } + void UpdateReluOutputScaleTest( + const framework::ProgramDesc& prog, + StringPairMap* var_quant_scales, + const std::initializer_list& variable_names) { + ir::Graph* graph(new ir::Graph(prog)); + Scope scope; + + PrepareGraph(graph, prog, &scope, conv_variable_names); + + UpdateReluOutputScales(graph, var_quant_scales); + + for (auto& var_name : variable_names) { + auto iter = var_quant_scales->find(var_name); + ASSERT_NE(iter, var_quant_scales->end()); + ASSERT_EQ((*var_quant_scales)[var_name].first, true); + } + } + private: std::unique_ptr pass; }; @@ -182,11 +205,15 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, const std::vector& inputs, - const std::vector& outputs) { + const std::vector& outputs, + const std::unordered_map& attrs = 
{}) { auto* op = prog->MutableBlock(0)->AppendOp(); op->SetType(type); op->SetAttr("use_mkldnn", true); op->SetAttr("name", name); + if (!attrs.empty()) + for (auto& attr : attrs) op->SetAttr(attr.first, attr.second); + if (type == "conv2d") { op->SetInput("Input", {inputs[0]}); if (inputs.size() > 1) op->SetInput("Filter", {inputs[1]}); @@ -211,6 +238,23 @@ ProgramDesc BuildConv2dProgramDesc() { return prog; } +ProgramDesc BuildConv2dReluProgramDesc() { + ProgramDesc prog; + for (auto& v : conv_variable_names) { + prog.MutableBlock(0)->Var(v); + } + std::unordered_map attrs = { + {"fuse_activation", "relu"}}; + SetOp(&prog, + "conv2d", + "Conv2d", + {"conv_in", "filter", "bias"}, + {"conv_out"}, + attrs); + + return prog; +} + ProgramDesc BuildFusionGruProgramDesc() { ProgramDesc prog; for (auto& v : rnn_variable_names) { @@ -262,7 +306,7 @@ TEST_F(ComputePropagateScalesMkldnnPassTest, compute_var_scales) { StringPairMap var_quant_scales; auto* var = scope.FindVar(weight_var_name); - auto* weight_tensor = var->GetMutable(); + auto* weight_tensor = var->GetMutable(); weight_tensor->Resize(phi::make_dim(1, values.size())); std::copy(begin(values), end(values), @@ -283,15 +327,24 @@ TEST_F(ComputePropagateScalesMkldnnPassTest, compute_var_scales) { } TEST_F(ComputePropagateScalesMkldnnPassTest, compute_gru_weight_scales) { - ComputeRnnWeightScalesTest("gru", - {"fusion_gru", "multi_gru"}, - BuildFusionGruProgramDesc(), - gru_scales); + ComputeRnnWeightScalesTest("gru", BuildFusionGruProgramDesc(), gru_scales); } TEST_F(ComputePropagateScalesMkldnnPassTest, compute_lstm_weight_scales) { - ComputeRnnWeightScalesTest( - "lstm", {"fusion_lstm"}, BuildFusionLstmProgramDesc(), lstm_scales); + ComputeRnnWeightScalesTest("lstm", BuildFusionLstmProgramDesc(), lstm_scales); +} + +TEST_F(ComputePropagateScalesMkldnnPassTest, update_relu_output_scales) { + StringPairMap var_quant_scales; + for (auto& var_name : conv_variable_names) { + phi::DenseTensor tensor; + auto* data = tensor.mutable_data({1}, platform::CPUPlace()); + data[0] = 10; + auto pair = std::make_pair(false, tensor); + var_quant_scales.insert(std::make_pair(var_name, pair)); + } + UpdateReluOutputScaleTest( + BuildConv2dReluProgramDesc(), &var_quant_scales, {"conv_out"}); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index 92351d5067f6b9..b8eddad1ce026d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -229,6 +229,7 @@ void CPUQuantizePass::DequantizeOutput(Graph* g, std::vector({dequantize_in_node->Name()})); deq_desc.SetOutput("Output", std::vector({output->Name()})); deq_desc.SetAttr("Scale", scale); + deq_desc.SetAttr("is_negative_input", !is_unsigned); auto dequantize_op = g->CreateOpNode(&deq_desc); // OpDesc will be copied. 
// update op's output @@ -332,24 +333,14 @@ bool CPUQuantizePass::IsOpQuantized(const Node* node) const { } void CPUQuantizePass::GetQuantInfo(Graph* graph) const { - std::unordered_map> info_map{}; - GetInfoFromTheFirstOp(graph, "has_quant_info", "var_quant_scales", &info_map); - - for (auto iter = info_map.begin(); iter != info_map.end(); iter++) { - LoDTensor tensor; - const int size = static_cast(iter->second.size()); - auto* data = tensor.mutable_data({size}, platform::CPUPlace()); - for (int i = 0; i < size; i++) { - data[i] = static_cast(iter->second[i]); - } - - auto pair = std::make_pair(false, tensor); - var_quant_scales_->insert(std::make_pair(iter->first, pair)); - } + GetInfoFromTheFirstOp( + graph, "has_quant_info", "var_quant_scales", var_quant_scales_); } -void CPUQuantizePass::QuantizeConv(Graph* graph, - bool with_residual_data) const { +void CPUQuantizePass::QuantizeConv( + Graph* graph, + bool with_residual_data, + std::vector* changed_weight) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); patterns::ConvResidual conv_pattern{pattern, name_scope_}; @@ -422,7 +413,16 @@ void CPUQuantizePass::QuantizeConv(Graph* graph, auto filter_scale_tensor = GetScaleTensorForNode(conv_filter); EigenVectorArrayMap eigen_tensor{filter_scale_tensor.data(), filter_scale_tensor.numel()}; - eigen_tensor *= static_cast(S8_MAX); + + // If the scale value of a weight is already multiplied by S8_MAX, it does + // not need to be multiplied again + if (std::find(changed_weight->begin(), + changed_weight->end(), + conv_filter->Name()) == changed_weight->end()) { + eigen_tensor *= static_cast(S8_MAX); + changed_weight->push_back(conv_filter->Name()); + } + std::vector filter_scale{ filter_scale_tensor.data(), filter_scale_tensor.data() + filter_scale_tensor.numel()}; @@ -597,6 +597,20 @@ void CPUQuantizePass::QuantizeConcat(Graph* graph) const { return; } + bool are_all_inputs_unsigned{true}; + // if all inputs were unsigned, then the output was set to unsigned + // during the scale calculation step + auto inputs = concat_op->inputs; + for (size_t i = 0; i < inputs.size(); i++) { + if (AreScalesPresentForVarNames({inputs[i]->Name()})) { + auto scale_data = GetScaleDataByName(inputs[i]->Name()); + if (scale_data.first == false) { + are_all_inputs_unsigned = false; + break; + } + } + } + GET_IR_NODE_FROM_SUBGRAPH(concat_out, concat_out, concat_pattern); if (!AreScalesPresentForNodes({concat_out})) { @@ -605,17 +619,12 @@ void CPUQuantizePass::QuantizeConcat(Graph* graph) const { return; } - // if all inputs were unsigned, then the output was set to unsigned - // during the scale calculation step - bool are_all_inputs_unsigned{false}; - auto output_scale = - GetScaleValueForNode(concat_out, &are_all_inputs_unsigned); + auto output_scale = GetScaleValueForNode(concat_out); QuantizeInputs(g, concat_op, "X", are_all_inputs_unsigned); DequantizeOutput( g, concat_op, concat_out, "Out", output_scale, are_all_inputs_unsigned); - ++quantize_concat_count; }; @@ -699,6 +708,13 @@ void CPUQuantizePass::QuantizeImmutable(Graph* graph, return; } + // skip if the dtype of immutable_in is not float32 + auto dtype = immutable_in->Var()->GetDataType(); + if (dtype != proto::VarType::FP32) { + MarkAndLogCannotQuantizeOp(immutable_op, "The input dtype is not float."); + return; + } + if (!AreScalesPresentForNodes({immutable_out})) { MarkAndLogCannotQuantizeOp(immutable_op, "No scale available for the operator"); @@ -1158,9 +1174,12 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { 
param_scope(), platform::errors::InvalidArgument("Scope cannot be nullptr.")); + // Save the scale values of which weights have been processed to avoid + // secondary processing + std::vector changed_weight = {}; GetQuantInfo(graph); - QuantizeConv(graph, false /* with_residual_data */); - QuantizeConv(graph, true /* with_residual_data */); + QuantizeConv(graph, false /* with_residual_data */, &changed_weight); + QuantizeConv(graph, true /* with_residual_data */, &changed_weight); QuantizePool(graph); QuantizeConcat(graph); QuantizePriorBox(graph); @@ -1170,7 +1189,6 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeImmutable(graph, "reshape2", "X"); QuantizeImmutable(graph, "transpose2", "X"); QuantizeImmutable(graph, "slice", "Input"); - QuantizeImmutable(graph, "shape", "Input"); QuantizeImmutable(graph, "nearest_interp", "X"); QuantizeImmutable(graph, "nearest_interp_v2", "X"); QuantizeElementwise(graph, "elementwise_add"); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index f26d8bfc84c150..a7470520af1975 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -49,7 +49,9 @@ class CPUQuantizePass : public FusePassBase { protected: void ApplyImpl(ir::Graph* graph) const override; - void QuantizeConv(Graph* graph, bool with_residual_data = false) const; + void QuantizeConv(Graph* graph, + bool with_residual_data = false, + std::vector* changed_weight = nullptr) const; void QuantizeFc(Graph* graph) const; void QuantizePool(Graph* graph) const; void QuantizeConcat(Graph* graph) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc old mode 100644 new mode 100755 index 4dabdd6bed0bd6..70623214503d89 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -66,7 +66,7 @@ void SetOp(ProgramDesc* prog, type == "nearest_interp" || type == "nearest_interp_v2") { op->SetInput("X", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); - } else if (type == "slice" || type == "shape") { + } else if (type == "slice") { op->SetInput("Input", {inputs[0]}); op->SetOutput("Out", {outputs[0]}); } else if (type == "dropout") { @@ -467,7 +467,7 @@ static const std::initializer_list variable_names_immutable_ops = { void TestImmutableOp(const std::string tested_op) { ProgramDesc prog; for (auto& v : variable_names_immutable_ops) { - prog.MutableBlock(0)->Var(v); + prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32); } SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true); SetOp(&prog, tested_op, tested_op, {"b"}, {"c"}, true, "int8"); @@ -520,7 +520,7 @@ void TestImmutableOpBetweenNonQuantizedOp(const std::string tested_op) { void TestImmutableOpWithManyOutputs(const std::string tested_op) { ProgramDesc prog; for (auto& v : variable_names_immutable_ops) { - prog.MutableBlock(0)->Var(v); + prog.MutableBlock(0)->Var(v)->SetDataType(proto::VarType::FP32); } SetOp(&prog, "dropout", "Dropout1", {"a"}, {"b"}, true, "float32"); @@ -556,12 +556,8 @@ void TestImmutableOpWithManyOutputs(const std::string tested_op) { SCALE * S8_MAX); } -const std::vector immutables = {"reshape2", - "transpose2", - "slice", - "shape", - "nearest_interp", - "nearest_interp_v2"}; +const std::vector immutables = { + "reshape2", "transpose2", "slice", "nearest_interp", "nearest_interp_v2"}; 
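A note on the `changed_weight` argument threaded through `QuantizeConv` above: `ApplyImpl` now runs `QuantizeConv` twice over the same graph (without and then with residual data), so without the bookkeeping a shared conv filter's scale tensor could be multiplied by `S8_MAX` twice. Below is a minimal standalone sketch of that guard only; it uses an `unordered_set` where the pass itself keeps a `std::vector` of names, and all identifiers are illustrative rather than taken from the pass.

// Sketch only: scale each named filter's quantization scales exactly once.
#include <iostream>
#include <string>
#include <unordered_set>
#include <vector>

constexpr float S8_MAX = 127.0f;

void ScaleFilterOnce(const std::string& filter_name,
                     std::vector<float>* filter_scales,
                     std::unordered_set<std::string>* already_scaled) {
  // A second visit to the same filter (e.g. the with-residual-data run after
  // the without-residual-data run) must not multiply by S8_MAX again.
  if (!already_scaled->insert(filter_name).second) return;
  for (auto& s : *filter_scales) s *= S8_MAX;
}

int main() {
  std::unordered_set<std::string> already_scaled;
  std::vector<float> scales{0.5f, 0.25f};
  ScaleFilterOnce("conv_filter", &scales, &already_scaled);
  ScaleFilterOnce("conv_filter", &scales, &already_scaled);  // no-op
  std::cout << scales[0] << " " << scales[1] << "\n";        // 63.5 31.75
  return 0;
}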
class TestImmutables : public testing::TestWithParam {}; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc index 7c23976d3c6e28..964b879b6d6e7e 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.cc @@ -104,51 +104,24 @@ void CPUQuantizeSquashPass::FindNodesToKeep( AddStatis(found_count); } -bool CPUQuantizeSquashPass::IsDequantizeInputUint8( - const Node* dequant_in) const { - PADDLE_ENFORCE_EQ( - dequant_in->inputs.size(), - 1, - platform::errors::InvalidArgument( - "Dequantize (id: %f) should have only one input.", dequant_in->id())); - if (dequant_in->inputs[0]->IsOp()) { - auto prev_op = dequant_in->inputs[0]->Op(); - std::string act_name; - if (prev_op->Type() == "relu") { - return true; - } else { - if (prev_op->Type() == "conv2d") { - act_name = "fuse_activation"; - } else if (prev_op->Type() == "fc") { - act_name = "activation_type"; - } - if (!act_name.empty()) { - auto act = prev_op->GetAttrIfExists(act_name); - if (act == "relu" || act == "relu6") { - return true; - } - } - } - } - return false; -} - bool CPUQuantizeSquashPass::IsDequantizeQuantizeIncompatible( - Node* quant_op, Node* dequant_in, Node* next_op) const { - bool is_concat_signed = + Node* quant_op, Node* dequant_op, Node* next_op) const { + bool is_next_op_signed = quant_op->Op()->GetAttrIfExists("is_negative_input"); - bool is_input_unsigned = IsDequantizeInputUint8(dequant_in); + bool is_input_signed = + dequant_op->Op()->GetAttrIfExists("is_negative_input"); + /* TODO(sfraczek): remove elementwise from this condition when BinaryMKLDNN kernel will support two different input data types */ bool is_next_op_concat_or_elementwise = next_op->Op()->Type() == "concat" || next_op->Op()->Type().find("elementwise") == 0; - if (is_next_op_concat_or_elementwise && is_concat_signed && - is_input_unsigned) { + if (is_next_op_concat_or_elementwise && + (is_next_op_signed ^ is_input_signed)) { VLOG(4) << "Do not squash dequant-quant, because " << "next_op is: " << next_op->Op()->Type() - << ", is_concat_signed: " << is_concat_signed - << ", is_input_unsigned: " << is_input_unsigned << "."; + << ", is_next_op_signed: " << is_next_op_signed + << ", is_input_signed: " << is_input_signed << "."; return true; } return false; @@ -173,7 +146,7 @@ void CPUQuantizeSquashPass::DequantQuantSquash( GET_IR_NODE_FROM_SUBGRAPH(quant_out, quant_out, squash_pattern); GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, squash_pattern); - if (IsDequantizeQuantizeIncompatible(quant_op, dequant_in, next_op)) { + if (IsDequantizeQuantizeIncompatible(quant_op, dequant_op, next_op)) { return; } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h index 5207cc519c6980..3aed54609d4512 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass.h @@ -43,11 +43,6 @@ class CPUQuantizeSquashPass : public FusePassBase { Graph* graph, std::unordered_map* nodes_keep_counter) const; - /* - * Check if input to dequantize is uint8 - */ - bool IsDequantizeInputUint8(const Node* dequant_in) const; - /* * Don't squash unsigned dequantize with signed quantize. * This is important for concat and elementwise ops. @@ -55,7 +50,7 @@ class CPUQuantizeSquashPass : public FusePassBase { * elementwise assumes first input type. 
*/ bool IsDequantizeQuantizeIncompatible(Node* quant_op, - Node* dequant_in, + Node* dequant_op, Node* next_op) const; /* diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc index 655cc95bf28a05..cd71ff153d6010 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_squash_pass_tester.cc @@ -68,15 +68,11 @@ void SetOp(ProgramDesc* prog, op->SetAttr("padding_algorithm", std::string("EXPLICIT")); op->SetAttr("data_format", std::string("NCHW")); op->SetAttr("force_fp32_output", false); - } else if (type == "quantize") { + } else if (type == "quantize" || type == "dequantize") { op->SetInput("Input", {inputs[0]}); op->SetOutput("Output", {outputs[0]}); op->SetAttr("Scale", scale[0]); op->SetAttr("is_negative_input", is_negative_input); - } else if (type == "dequantize") { - op->SetInput("Input", {inputs[0]}); - op->SetOutput("Output", {outputs[0]}); - op->SetAttr("Scale", scale[0]); } else if (type == "requantize") { op->SetInput("Input", {inputs[0]}); op->SetOutput("Output", {outputs[0]}); @@ -303,31 +299,22 @@ ProgramDesc BuildConvMultiRequantProgramDesc(bool use_mkldnn, return prog; } -/* a->relu->b->Dequant->c(u8)->Quant->d-\ - * e->relu->f->Dequant->g(u8)->Quant->h--Concat1->x - * i->relu->j->Dequant->k(u8)->Quant->l-/ +/* a->relu->b->Dequant(u8)->c->Quant(u8)->d-\ + * e->relu->f->Dequant(u8)->g->Quant(u8)->h--Concat1->i */ -ProgramDesc BuildU8U8U8ConcatProgramDesc(float scale_out, float scale) { +ProgramDesc BuildU8U8ConcatProgramDesc(float scale_out, float scale) { ProgramDesc prog; for (auto& v : variable_names) { prog.MutableBlock(0)->Var(v); } SetOp(&prog, "relu", "Relu1", {"a"}, {"b"}, true, {scale, scale_out}); SetOp(&prog, "relu", "Relu2", {"e"}, {"f"}, true, {scale, scale_out}); - SetOp(&prog, "relu", "Relu3", {"i"}, {"j"}, true, {scale, scale_out}); - - SetOp( - &prog, "dequantize", "Dequant1", {"b"}, {"c"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant2", {"f"}, {"g"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant3", {"j"}, {"k"}, true, {scale, scale_out}); SetOp(&prog, - "quantize", - "Quant1", + "dequantize", + "Dequant1", + {"b"}, {"c"}, - {"d"}, true, {scale, scale_out}, 0.0f, @@ -336,10 +323,23 @@ ProgramDesc BuildU8U8U8ConcatProgramDesc(float scale_out, float scale) { 1, false); // is_negative_input = false SetOp(&prog, - "quantize", - "Quant2", + "dequantize", + "Dequant2", + {"f"}, {"g"}, - {"h"}, + true, + {scale, scale_out}, + 0.0f, + "float32", + false, + 1, + false); // is_negative_input = false + + SetOp(&prog, + "quantize", + "Quant1", + {"c"}, + {"d"}, true, {scale, scale_out}, 0.0f, @@ -349,9 +349,9 @@ ProgramDesc BuildU8U8U8ConcatProgramDesc(float scale_out, float scale) { false); // is_negative_input = false SetOp(&prog, "quantize", - "Quant3", - {"k"}, - {"l"}, + "Quant2", + {"g"}, + {"h"}, true, {scale, scale_out}, 0.0f, @@ -360,27 +360,47 @@ ProgramDesc BuildU8U8U8ConcatProgramDesc(float scale_out, float scale) { 1, false); // is_negative_input = false - SetOp(&prog, "concat", "Concat1", {"d", "h", "l"}, {"x"}, true); + SetOp(&prog, "concat", "Concat1", {"d", "h"}, {"i"}, true); return prog; } -/* a->relu->b->Dequant->c(u8)->Quant->d-\ - * e->relu->f->Dequant->g(u8)->Quant->h--Concat1->x - * i->pool2d->j->Dequant->k(s8)->Quant->l-/ +/* a->relu->b->Dequant(u8)->c->Quant(s8)->d-\ + * e->relu->f->Dequant(u8)->g->Quant(s8)->h--Concat1->x + * 
i->pool2d->j->Dequant(s8)->k->Quant(s8)->l-/ */ ProgramDesc BuildU8U8S8ConcatProgramDesc(float scale_out, float scale) { ProgramDesc prog; for (auto& v : variable_names) { prog.MutableBlock(0)->Var(v); } - SetOp(&prog, "relu", "Pool2d1", {"a"}, {"b"}, true, {scale, scale_out}); - SetOp(&prog, "relu", "Relu1", {"e"}, {"f"}, true, {scale, scale_out}); + SetOp(&prog, "relu", "Relu1", {"a"}, {"b"}, true, {scale, scale_out}); + SetOp(&prog, "relu", "Relu2", {"e"}, {"f"}, true, {scale, scale_out}); SetOp(&prog, "pool2d", "Pool2d2", {"i"}, {"j"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant1", {"b"}, {"c"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant2", {"f"}, {"g"}, true, {scale, scale_out}); + SetOp(&prog, + "dequantize", + "Dequant1", + {"b"}, + {"c"}, + true, + {scale, scale_out}, + 0.0f, + "float32", + false, + 1, + false); // is_negative_input = false + SetOp(&prog, + "dequantize", + "Dequant2", + {"f"}, + {"g"}, + true, + {scale, scale_out}, + 0.0f, + "float32", + false, + 1, + false); // is_negative_input = false SetOp( &prog, "dequantize", "Dequant3", {"j"}, {"k"}, true, {scale, scale_out}); @@ -392,9 +412,9 @@ ProgramDesc BuildU8U8S8ConcatProgramDesc(float scale_out, float scale) { return prog; } -/* a->pool2d->b->Dequant->c(s8)->Quant->d-\ - * e->relu->f->Dequant->g(u8)->Quant->h--Concat1->x - * i->pool2d->j->Dequant->k(s8)->Quant->l-/ +/* a->pool2d->b->Dequant(s8)->c->Quant(s8)->d-\ + * e->relu->f->Dequant(u8)->g->Quant(s8)->h--Concat1->x + * i->pool2d->j->Dequant(s8)->k->Quant(s8)->l-/ */ ProgramDesc BuildS8U8S8ConcatProgramDesc(float scale_out, float scale) { ProgramDesc prog; @@ -407,8 +427,18 @@ ProgramDesc BuildS8U8S8ConcatProgramDesc(float scale_out, float scale) { SetOp( &prog, "dequantize", "Dequant1", {"b"}, {"c"}, true, {scale, scale_out}); - SetOp( - &prog, "dequantize", "Dequant2", {"f"}, {"g"}, true, {scale, scale_out}); + SetOp(&prog, + "dequantize", + "Dequant2", + {"f"}, + {"g"}, + true, + {scale, scale_out}, + 0.0f, + "float32", + false, + 1, + false); // is_negative_input = false SetOp( &prog, "dequantize", "Dequant3", {"j"}, {"k"}, true, {scale, scale_out}); @@ -1141,13 +1171,12 @@ TEST(CpuQuantizeSquashPass, squash_all_s8_input_to_concat1) { } TEST(CpuQuantizeSquashPass, squash_all_u8_input_to_concat2) { - // removed 3 x 4 (dequantize_op, dequantize_out, quantize, quantize_out) - auto remove_nodes = 12; + // removed 2 x 4 (dequantize_op, dequantize_out, quantize, quantize_out) + auto remove_nodes = 8; std::unordered_map expected_operators = { - {"concat", 1}, {"quantize", 0}, {"dequantize", 0}, {"relu", 3}}; - CheckNodesTest(BuildU8U8U8ConcatProgramDesc(1.2f, 1.2f), - expected_operators, - remove_nodes); + {"concat", 1}, {"quantize", 0}, {"dequantize", 0}, {"relu", 2}}; + CheckNodesTest( + BuildU8U8ConcatProgramDesc(1.2f, 1.2f), expected_operators, remove_nodes); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc index cdb0f70a56667d..f4ac65a9ab1993 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
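Looking back at the `IsDequantizeQuantizeIncompatible` rework in cpu_quantize_squash_pass.cc above: instead of inferring unsignedness from the op that feeds the dequantize, both the dequantize and the quantize now carry an `is_negative_input` attribute, and the dequant-quant pair is not squashed when their signedness disagrees and the consumer is concat or an elementwise op. The following is a minimal sketch of that predicate under those assumptions; the names are illustrative and this is not the pass code itself.

#include <iostream>
#include <string>

// Sketch only: refuse to squash a dequantize->quantize pair feeding
// concat/elementwise when one side is signed (s8) and the other unsigned (u8).
bool IsDequantQuantIncompatible(bool quant_is_negative_input,
                                bool dequant_is_negative_input,
                                const std::string& next_op_type) {
  const bool next_is_concat_or_elementwise =
      next_op_type == "concat" || next_op_type.rfind("elementwise", 0) == 0;
  // Signedness disagrees when exactly one of the two flags is set.
  return next_is_concat_or_elementwise &&
         (quant_is_negative_input != dequant_is_negative_input);
}

int main() {
  std::cout << IsDequantQuantIncompatible(true, false, "concat") << "\n";  // 1
  std::cout << IsDequantQuantIncompatible(true, true, "concat") << "\n";   // 0
  std::cout << IsDequantQuantIncompatible(true, false, "pool2d") << "\n";  // 0
  return 0;
}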
@@ -14,9 +14,8 @@ #include "paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { @@ -26,20 +25,20 @@ namespace ir { using string::PrettyLogDetail; void FuseFCActOneDNNPass::ApplyImpl(Graph *graph) const { - std::vector act_types = { - "gelu", "tanh", "sigmoid", "mish", "hard_swish"}; + auto act_types = paddle::platform::GetSupportedActivations(); - for (std::string act_type : act_types) FuseFCAct(graph, act_type); + for (auto act_type : act_types) FuseFCAct(graph, act_type); } void FuseFCActOneDNNPass::FuseFCAct(Graph *graph, const std::string &act_type) const { PADDLE_ENFORCE_NOT_NULL( graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init("fc_act", graph); + FusePassBase::Init("fc_" + act_type + "_mkldnn_fuse_pass", graph); GraphPatternDetector gpd; - patterns::OperatorActivation fc_act_pattern(gpd.mutable_pattern(), "fc_act"); + patterns::OperatorActivation fc_act_pattern( + gpd.mutable_pattern(), "fc_" + act_type + "_mkldnn_fuse_pass"); fc_act_pattern("fc", act_type); int found_fc_act_count = 0; @@ -62,15 +61,23 @@ void FuseFCActOneDNNPass::FuseFCAct(Graph *graph, "is used.")); } + auto attr_map = paddle::platform::GetAttributeMap(act_type); + for (const auto &attr : attr_map) { + if (act_op->HasAttr(attr.first)) { + fc_op->SetAttr(attr.second, act_op->GetAttr(attr.first)); + } + } + if (act_type == "gelu" && act_op->HasAttr("approximate")) { - bool approximate = PADDLE_GET_CONST(bool, act_op->GetAttr("approximate")); - std::string type = approximate ? "_tanh" : "_erf"; - fc_op->SetAttr("activation_type", act_type + type); + std::string gelu_act_type = + PADDLE_GET_CONST(bool, act_op->GetAttr("approximate")) ? 
"gelu_tanh" + : "gelu_erf"; + fc_op->SetAttr("fuse_activation", gelu_act_type); } else { - fc_op->SetAttr("activation_type", act_type); + fc_op->SetAttr("fuse_activation", act_type); } - fc_op->SetAttr("use_mkldnn", true); + fc_op->SetAttr("use_mkldnn", true); fc_op->SetOutput("Out", {act_out->Name()}); IR_OP_VAR_LINK(fc, act_out); @@ -80,7 +87,8 @@ void FuseFCActOneDNNPass::FuseFCAct(Graph *graph, gpd(graph, handler); AddStatis(found_fc_act_count); - if (!Has("disable_logs") || !Get("disable_logs")) + if ((!Has("disable_logs") || !Get("disable_logs")) && + found_fc_act_count > 0) PrettyLogDetail( "--- fused %d fc with %s activation", found_fc_act_count, act_type); } @@ -95,8 +103,16 @@ REGISTER_PASS_CAPABILITY(fc_act_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .LE("fc", 0) - .LE("gelu", 0) - .LE("sigmoid", 0) - .LE("mish", 1) + .EQ("abs", 0) + .LE("clip", 1) + .EQ("gelu", 0) + .EQ("hard_sigmoid", 0) .LE("hard_swish", 0) - .LE("tanh", 0)); + .LE("leaky_relu", 1) + .LE("mish", 1) + .EQ("relu", 0) + .EQ("relu6", 0) + .EQ("sigmoid", 0) + .EQ("sqrt", 0) + .EQ("swish", 0) + .EQ("tanh", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h index 23f4296b98bcab..7e4032d4a13529 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass.h @@ -23,21 +23,14 @@ namespace paddle { namespace framework { namespace ir { -/* - * \brief Fuse the FC and activation operators into single OneDNN's - * FC with post-op. - * - * \note Currently only GeLU, hardswish, sigmoid, mish and tanh are supported - * as an activation function. - */ class FuseFCActOneDNNPass : public FusePassBase { public: virtual ~FuseFCActOneDNNPass() {} protected: - void ApplyImpl(ir::Graph *graph) const override; + void ApplyImpl(Graph *graph) const override; - void FuseFCAct(ir::Graph *graph, const std::string &act_types) const; + void FuseFCAct(Graph *graph, const std::string &act_types) const; }; } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc index 38f253703ceeec..2951e2522d0f5c 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_act_mkldnn_fuse_pass_tester.cc @@ -78,9 +78,9 @@ TEST(FuseFCActOneDNNPass, FuseWithGeluTanh) { const auto* op = node->Op(); ASSERT_TRUE(op->HasAttr("use_mkldnn")); EXPECT_TRUE(PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))); - ASSERT_TRUE(op->HasAttr("activation_type")); + ASSERT_TRUE(op->HasAttr("fuse_activation")); auto act_type = - PADDLE_GET_CONST(std::string, op->GetAttr("activation_type")); + PADDLE_GET_CONST(std::string, op->GetAttr("fuse_activation")); EXPECT_EQ(act_type.compare("gelu_tanh"), 0); } } @@ -113,9 +113,9 @@ TEST(FuseFCActOneDNNPass, FuseWithGeluErf) { const auto* op = node->Op(); ASSERT_TRUE(op->HasAttr("use_mkldnn")); EXPECT_TRUE(PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))); - ASSERT_TRUE(op->HasAttr("activation_type")); + ASSERT_TRUE(op->HasAttr("fuse_activation")); auto act_type = - PADDLE_GET_CONST(std::string, op->GetAttr("activation_type")); + PADDLE_GET_CONST(std::string, op->GetAttr("fuse_activation")); EXPECT_EQ(act_type.compare("gelu_erf"), 0); } } @@ -146,9 +146,9 @@ TEST(FuseFCActOneDNNPass, FuseWithGeluAuto) { const auto* op = node->Op(); 
ASSERT_TRUE(op->HasAttr("use_mkldnn")); EXPECT_TRUE(PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))); - ASSERT_TRUE(op->HasAttr("activation_type")); + ASSERT_TRUE(op->HasAttr("fuse_activation")); auto act_type = - PADDLE_GET_CONST(std::string, op->GetAttr("activation_type")); + PADDLE_GET_CONST(std::string, op->GetAttr("fuse_activation")); EXPECT_EQ(act_type.compare("gelu"), 0); } } @@ -179,9 +179,9 @@ TEST(FuseFCActOneDNNPass, FuseWithTanh) { const auto* op = node->Op(); ASSERT_TRUE(op->HasAttr("use_mkldnn")); EXPECT_TRUE(PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))); - ASSERT_TRUE(op->HasAttr("activation_type")); + ASSERT_TRUE(op->HasAttr("fuse_activation")); auto act_type = - PADDLE_GET_CONST(std::string, op->GetAttr("activation_type")); + PADDLE_GET_CONST(std::string, op->GetAttr("fuse_activation")); EXPECT_EQ(act_type.compare("tanh"), 0); } } @@ -213,9 +213,9 @@ TEST(FuseFCActOneDNNPass, FuseWithSigmoid) { const auto* op = node->Op(); ASSERT_TRUE(op->HasAttr("use_mkldnn")); EXPECT_TRUE(PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))); - ASSERT_TRUE(op->HasAttr("activation_type")); + ASSERT_TRUE(op->HasAttr("fuse_activation")); auto act_type = - PADDLE_GET_CONST(std::string, op->GetAttr("activation_type")); + PADDLE_GET_CONST(std::string, op->GetAttr("fuse_activation")); EXPECT_EQ(act_type.compare("sigmoid"), 0); } } @@ -246,9 +246,9 @@ TEST(FuseFCActOneDNNPass, FuseWithMish) { const auto* op = node->Op(); ASSERT_TRUE(op->HasAttr("use_mkldnn")); EXPECT_TRUE(PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))); - ASSERT_TRUE(op->HasAttr("activation_type")); + ASSERT_TRUE(op->HasAttr("fuse_activation")); auto act_type = - PADDLE_GET_CONST(std::string, op->GetAttr("activation_type")); + PADDLE_GET_CONST(std::string, op->GetAttr("fuse_activation")); EXPECT_EQ(act_type.compare("mish"), 0); } } @@ -280,9 +280,9 @@ TEST(FuseFCActOneDNNPass, FuseWithHardSwish) { const auto* op = node->Op(); ASSERT_TRUE(op->HasAttr("use_mkldnn")); EXPECT_TRUE(PADDLE_GET_CONST(bool, op->GetAttr("use_mkldnn"))); - ASSERT_TRUE(op->HasAttr("activation_type")); + ASSERT_TRUE(op->HasAttr("fuse_activation")); auto act_type = - PADDLE_GET_CONST(std::string, op->GetAttr("activation_type")); + PADDLE_GET_CONST(std::string, op->GetAttr("fuse_activation")); EXPECT_EQ(act_type.compare("hard_swish"), 0); } } diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc index 6f7bb614cc79f5..49db8b8f7f8e54 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc @@ -38,7 +38,7 @@ USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN); USE_OP_ITSELF(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN); USE_OP_ITSELF(gelu); -USE_OP_DEVICE_KERNEL(gelu, MKLDNN); +PD_DECLARE_KERNEL(gelu, OneDNN, ALL_LAYOUT); PD_DECLARE_ARG_MAPPING_FN(gelu); namespace paddle { diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h index a714f236c46165..6899a7202da9cc 100644 --- a/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h +++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_pass_util.h @@ -22,6 +22,9 @@ namespace paddle { namespace framework { namespace ir { +using StringPairMap = + std::unordered_map>; + static void SaveInfoInTheFirstOp( ir::Graph* graph, const std::string& flag, @@ -44,6 +47,31 @@ static void SaveInfoInTheFirstOp( } } +static void SaveInfoInTheFirstOp(ir::Graph* graph, + 
const std::string& flag, + const std::string& key_suffix, + const StringPairMap& info_map) { + VLOG(3) << "save variables in the first op's attr"; + + const std::string suffix = "_" + key_suffix + "_" + flag; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + op_node->Op()->SetAttr(flag, true); + for (auto iter = info_map.begin(); iter != info_map.end(); ++iter) { + auto* data = iter->second.second.data(); + std::vector data_v(data, data + iter->second.second.numel()); + op_node->Op()->SetAttr(iter->first + suffix + "_unsigned", + iter->second.first); + op_node->Op()->SetAttr(iter->first + suffix, data_v); + } + break; + } +} + static void GetInfoFromTheFirstOp( ir::Graph* graph, const std::string& flag, @@ -77,6 +105,54 @@ static void GetInfoFromTheFirstOp( } } +static void GetInfoFromTheFirstOp(ir::Graph* graph, + const std::string& flag, + const std::string& key_suffix, + StringPairMap* info_map) { + VLOG(3) << "get variables from the first op's attr"; + const std::string unsigned_flag = "_unsigned"; + const std::string suffix = "_" + key_suffix + "_" + flag; + const std::string suffix_is_unsigned = suffix + unsigned_flag; + for (auto* op_node : + ir::TopologyVarientSort(*graph, static_cast(0))) { + if (!op_node->IsOp() || op_node->Op()->Type() == "feed" || + op_node->Op()->Type() == "fetch") + continue; + + auto* op_desc = op_node->Op(); + if (op_desc->GetAttrIfExists(flag)) { + op_desc->RemoveAttr(flag); + std::vector attr_names = op_desc->AttrNames(); + for (auto fake_name : attr_names) { + auto is_unsigned = false; + size_t pos = fake_name.find(suffix_is_unsigned); + + if (pos != std::string::npos) { + std::string unsigned_var_name = fake_name; + is_unsigned = + PADDLE_GET_CONST(bool, op_desc->GetAttr(unsigned_var_name)); + + std::string var_name = fake_name.substr(0, pos); + size_t unsigned_pos = fake_name.find(unsigned_flag); + std::string vector_name = + fake_name.erase(unsigned_pos, unsigned_flag.length()); + auto scales_vector = PADDLE_GET_CONST(std::vector, + op_desc->GetAttr(vector_name)); + phi::DenseTensor tensor; + const int size = static_cast(scales_vector.size()); + auto data = tensor.mutable_data({size}, platform::CPUPlace()); + std::copy(scales_vector.begin(), scales_vector.end(), data); + auto pair = std::make_pair(is_unsigned, tensor); + info_map->insert(std::make_pair(var_name, pair)); + op_desc->RemoveAttr(unsigned_var_name); + op_desc->RemoveAttr(vector_name); + } + } + break; + } + } +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/operator_reshape2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/operator_reshape2_onednn_fuse_pass.cc new file mode 100644 index 00000000000000..0f8d0452aa17ba --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/operator_reshape2_onednn_fuse_pass.cc @@ -0,0 +1,144 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/mkldnn/operator_reshape2_onednn_fuse_pass.h" + +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void FuseOperatorReshape2OneDNNPass::ApplyImpl(Graph *graph) const { + // THIS FUSE WILL WORK ONLY WITH OPERATORS THAT OUTPUTS PLAIN MEMORY, F.E. + // ABCD FOR 4D! BE AWARE OF THAT! + std::vector> ops_and_outputs = { + {"fc", 1}, {"transpose2", 2}}; + + for (const auto &op_and_outputs : ops_and_outputs) + FuseReshape2(graph, op_and_outputs.first, op_and_outputs.second); +} + +void FuseOperatorReshape2OneDNNPass::FuseReshape2(Graph *graph, + const std::string &op_type, + int num_of_outputs) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init(op_type + "_reshape2_onednn_fuse_pass", graph); + + GraphPatternDetector gpd; + patterns::OperatorReshape2 op_reshape2_pattern( + gpd.mutable_pattern(), op_type + "_reshape2_onednn_fuse_pass"); + op_reshape2_pattern(op_type, num_of_outputs); + + int found_operator_reshape2_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + GET_IR_NODE_FROM_SUBGRAPH(operator_op, preceding_op, op_reshape2_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + operator_out, preceding_op_out, op_reshape2_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_op, reshape2_op, op_reshape2_pattern); + GET_IR_NODE_FROM_SUBGRAPH(reshape2_out, reshape2_out, op_reshape2_pattern); + + if (!operator_op->Op()->HasAttr("use_mkldnn") || + (operator_op->Op()->HasAttr("use_mkldnn") && + !(PADDLE_GET_CONST(bool, operator_op->Op()->GetAttr("use_mkldnn"))))) { + VLOG(4) << "Only oneDNN version of " << op_type + << "can be fused with reshape2."; + return; + } + + if (operator_op->Op()->HasAttr("fused_unsqueeze2_axes")) { + VLOG(4) << "Cannot do " << op_type << " + reshape2 fuse, because " + << op_type << " is already fused with unsqueeze2!"; + return; + } + + std::vector reshape2_shape = + PADDLE_GET_CONST(std::vector, reshape2_op->Op()->GetAttr("shape")); + + int num_of_minus_ones = 0; + + for (size_t i = 0; i < reshape2_shape.size(); ++i) { + if (reshape2_shape[i] == 0) { + VLOG(4) << "OneDNN op+reshape2 fuse pass does not support zero dims, " + "skipping"; + return; + } else if (reshape2_shape[i] == -1) { + ++num_of_minus_ones; + } + } + + if (num_of_minus_ones > 1) { + VLOG(4) << "Number of -1 values inside of reshape2 shouldn't be greater " + "than one in op+reshape2 oneDNN fuse pass, skipping"; + return; + } + + auto const &names = reshape2_op->Op()->InputNames(); + + bool has_shape_tensor = + std::find(names.begin(), names.end(), "ShapeTensor") != names.end(); + bool has_shape_tensor_list = + std::find(names.begin(), names.end(), "ShapeTensorList") != names.end(); + + if (has_shape_tensor && + reshape2_op->Op()->Input("ShapeTensor").size() > 0) { + VLOG(4) << "Cannot fuse " << op_type + << " and reshape2 because reshape2 dims are specified by " + "ShapeTensor!"; + return; + } + + if (has_shape_tensor_list && + reshape2_op->Op()->Input("ShapeTensorList").size() > 0) { + VLOG(4) << "Cannot fuse " << op_type + << " and reshape2 because reshape2 dims are specified by " + "ShapeTensorList!"; + return; + } + + 
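// At this point every fuse-blocking condition has been ruled out: the
// preceding op runs through oneDNN, it has not already been fused with
// unsqueeze2, the target shape contains no zeros and at most one -1, and the
// shape is not supplied at runtime through ShapeTensor/ShapeTensorList. The
// statements below record the shape on the preceding op as
// "fused_reshape2_shape", redirect its "Out" to reshape2's output variable,
// and drop the now-redundant reshape2 node and intermediate variable.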
operator_op->Op()->SetAttr("fused_reshape2_shape", reshape2_shape); + operator_op->Op()->SetOutput("Out", {reshape2_out->Name()}); + + IR_OP_VAR_LINK(operator_op, reshape2_out); + GraphSafeRemoveNodes(g, {reshape2_op, operator_out}); + found_operator_reshape2_count++; + }; + + gpd(graph, handler); + AddStatis(found_operator_reshape2_count); + if ((!Has("disable_logs") || !Get("disable_logs")) && + found_operator_reshape2_count > 0) + PrettyLogDetail("--- fused %d %s with reshape2", + found_operator_reshape2_count, + op_type); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(operator_reshape2_onednn_fuse_pass, + paddle::framework::ir::FuseOperatorReshape2OneDNNPass); +REGISTER_PASS_CAPABILITY(operator_reshape2_onednn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .GE("reshape2", 0) + .GE("fc", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/operator_reshape2_onednn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/operator_reshape2_onednn_fuse_pass.h new file mode 100644 index 00000000000000..a3369b453deefa --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/operator_reshape2_onednn_fuse_pass.h @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +class FuseOperatorReshape2OneDNNPass : public FusePassBase { + public: + virtual ~FuseOperatorReshape2OneDNNPass() {} + + protected: + void ApplyImpl(Graph *graph) const override; + void FuseReshape2(Graph *graph, + const std::string &op_type, + int num_of_outputs) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc new file mode 100644 index 00000000000000..2759c79b7a7d93 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
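Note: the operator+reshape2 oneDNN fuse pass added above only records the
"fused_reshape2_shape" attribute; resolving that shape is left to whatever
consumes the attribute. The sketch below is a minimal, hypothetical helper
(the function name and its use are assumptions, not part of this patch) that
shows why the pass rejects shapes containing zeros or more than one -1: a
single -1 entry can be inferred from the input element count, while 0
("copy this dim") entries would need extra bookkeeping.

#include <cstdint>
#include <stdexcept>
#include <vector>

// Hypothetical helper: turn a "fused_reshape2_shape" attribute into concrete
// output dims, inferring at most one -1 entry from the input element count.
std::vector<int64_t> ResolveFusedReshape2Shape(const std::vector<int>& shape,
                                               int64_t input_numel) {
  std::vector<int64_t> dims(shape.begin(), shape.end());
  int64_t known = 1;
  int infer_idx = -1;
  for (size_t i = 0; i < dims.size(); ++i) {
    if (dims[i] == -1) {
      if (infer_idx != -1)
        throw std::invalid_argument("at most one -1 entry is supported");
      infer_idx = static_cast<int>(i);
    } else if (dims[i] <= 0) {
      throw std::invalid_argument("zero entries are not supported");
    } else {
      known *= dims[i];
    }
  }
  if (infer_idx >= 0) dims[infer_idx] = input_numel / known;
  return dims;
}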
+ +#include "paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.h" + +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void FuseOperatorScaleOneDNNPass::ApplyImpl(Graph *graph) const { + const std::vector fusable_ops{"fc", "matmul", "matmul_v2"}; + for (const auto &op : fusable_ops) FuseScale(graph, op); +} + +void FuseOperatorScaleOneDNNPass::FuseScale(Graph *graph, + const std::string &op_type) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init(op_type + "_scale_onednn_fuse_pass", graph); + + GraphPatternDetector gpd; + patterns::OperatorActivation op_scale_pattern( + gpd.mutable_pattern(), op_type + "_scale_onednn_fuse_pass"); + op_scale_pattern(op_type, "scale"); + + int found_operator_scale_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + GET_IR_NODE_FROM_SUBGRAPH(operator_op, preceding_op, op_scale_pattern); + GET_IR_NODE_FROM_SUBGRAPH(operator_out, preceding_op_out, op_scale_pattern); + GET_IR_NODE_FROM_SUBGRAPH(scale_op, activation, op_scale_pattern); + GET_IR_NODE_FROM_SUBGRAPH(scale_out, activation_out, op_scale_pattern); + + if (operator_op->Op()->HasAttr("use_mkldnn") && + !(PADDLE_GET_CONST(bool, operator_op->Op()->GetAttr("use_mkldnn")))) { + VLOG(4) << "Only oneDNN version of " << op_type + << "can be fused with scale."; + return; + } + + if (scale_op->Op()->GetAttrIfExists("bias") != 0.0) { + VLOG(4) << op_type << " can be fused only with unbiased scale."; + return; + } + + float scale = PADDLE_GET_CONST(float, scale_op->Op()->GetAttr("scale")); + + auto *scope = param_scope(); + auto const &names = scale_op->Op()->InputNames(); + bool has_scale_tensor = + std::find(names.begin(), names.end(), "ScaleTensor") != names.end(); + + if (has_scale_tensor && scale_op->Op()->Input("ScaleTensor").size() > 0) { + std::string scale_var_name = scale_op->Op()->Input("ScaleTensor").front(); + auto *scale_var = scope->FindVar(scale_var_name); + // ScaleTensor must be weight + if (scale_var == nullptr) return; + auto *scale_tensor = scale_var->GetMutable(); + scale = *(scale_tensor->data()); + } + + operator_op->Op()->SetAttr("fused_output_scale", scale); + operator_op->Op()->SetOutput("Out", {scale_out->Name()}); + + IR_OP_VAR_LINK(operator_op, scale_out); + GraphSafeRemoveNodes(g, {scale_op, operator_out}); + found_operator_scale_count++; + }; + + gpd(graph, handler); + AddStatis(found_operator_scale_count); + if ((!Has("disable_logs") || !Get("disable_logs")) && + found_operator_scale_count > 0) + PrettyLogDetail( + "--- fused %d %s with scale", found_operator_scale_count, op_type); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(operator_scale_onednn_fuse_pass, + paddle::framework::ir::FuseOperatorScaleOneDNNPass); +REGISTER_PASS_CAPABILITY(operator_scale_onednn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("fc", 0) + .LE("matmul", 1) + .EQ("matmul_v2", 0) + .EQ("scale", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.h new file mode 100644 index 00000000000000..e4e0295bf5604b --- /dev/null +++ 
b/paddle/fluid/framework/ir/mkldnn/operator_scale_onednn_fuse_pass.h @@ -0,0 +1,36 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +class FuseOperatorScaleOneDNNPass : public FusePassBase { + public: + virtual ~FuseOperatorScaleOneDNNPass() {} + + protected: + void ApplyImpl(Graph *graph) const override; + + void FuseScale(Graph *graph, const std::string &op_type) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc new file mode 100644 index 00000000000000..80f49613c63aca --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
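Note: the operator+scale oneDNN fuse pass added above folds a trailing scale
op into fc/matmul/matmul_v2 only when the scale's "bias" attribute is zero,
because a non-zero bias adds an offset on top of the multiplication and only
the purely multiplicative case can be pushed into the preceding op as
"fused_output_scale". A minimal illustrative sketch of that equivalence (the
function is an assumption for illustration, not part of this patch):

#include <vector>

// Illustrative only: multiplying the producing op's output by the recorded
// "fused_output_scale" matches the removed scale op exactly when bias == 0.
void ApplyFusedOutputScale(float fused_output_scale, std::vector<float>* out) {
  for (float& v : *out) v *= fused_output_scale;
}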
+ +#include "paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.h" + +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void FuseOperatorUnsqueeze2OneDNNPass::ApplyImpl(Graph *graph) const { + std::vector> ops_and_outputs = { + {"transpose2", 2}, {"elementwise_mul", 1}}; + + for (const auto &op_and_outputs : ops_and_outputs) + FuseUnsqueeze2(graph, op_and_outputs.first, op_and_outputs.second); +} + +void FuseOperatorUnsqueeze2OneDNNPass::FuseUnsqueeze2( + Graph *graph, const std::string &op_type, int num_of_outputs) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); + FusePassBase::Init(op_type + "_unsqueeze2_onednn_fuse_pass", graph); + + GraphPatternDetector gpd; + patterns::OperatorUnsqueeze2 op_unsqueeze2_pattern( + gpd.mutable_pattern(), op_type + "_unsqueeze2_onednn_fuse_pass"); + op_unsqueeze2_pattern(op_type, num_of_outputs); + + int found_operator_unsqueeze2_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + GET_IR_NODE_FROM_SUBGRAPH(operator_op, preceding_op, op_unsqueeze2_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + operator_out, preceding_op_out, op_unsqueeze2_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + unsqueeze2_op, unsqueeze2_op, op_unsqueeze2_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + unsqueeze2_out, unsqueeze2_out, op_unsqueeze2_pattern); + + if (!operator_op->Op()->HasAttr("use_mkldnn") || + (operator_op->Op()->HasAttr("use_mkldnn") && + !(PADDLE_GET_CONST(bool, operator_op->Op()->GetAttr("use_mkldnn"))))) { + VLOG(4) << "Only oneDNN version of " << op_type + << "can be fused with unsqueeze2."; + return; + } + + std::vector unsqueeze2_axes = PADDLE_GET_CONST( + std::vector, unsqueeze2_op->Op()->GetAttr("axes")); + + auto const &names = unsqueeze2_op->Op()->InputNames(); + + bool has_axes_tensor = + std::find(names.begin(), names.end(), "AxesTensor") != names.end(); + bool has_axes_tensor_list = + std::find(names.begin(), names.end(), "AxesTensorList") != names.end(); + + if (has_axes_tensor && + unsqueeze2_op->Op()->Input("AxesTensor").size() > 0) { + VLOG(4) << "Cannot fuse " << op_type + << " and unsqueeze2 because unsqueeze2 dims are specified by " + "AxesTensor!"; + return; + } + + if (has_axes_tensor_list && + unsqueeze2_op->Op()->Input("AxesTensorList").size() > 0) { + VLOG(4) << "Cannot fuse " << op_type + << " and unsqueeze2 because unsqueeze2 dims are specified by " + "AxesTensorList!"; + return; + } + + operator_op->Op()->SetAttr("fused_unsqueeze2_axes", unsqueeze2_axes); + operator_op->Op()->SetOutput("Out", {unsqueeze2_out->Name()}); + + IR_OP_VAR_LINK(operator_op, unsqueeze2_out); + GraphSafeRemoveNodes(g, {unsqueeze2_op, operator_out}); + found_operator_unsqueeze2_count++; + }; + + gpd(graph, handler); + AddStatis(found_operator_unsqueeze2_count); + if ((!Has("disable_logs") || !Get("disable_logs")) && + found_operator_unsqueeze2_count > 0) + PrettyLogDetail("--- fused %d %s with unsqueeze2", + found_operator_unsqueeze2_count, + op_type); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(operator_unsqueeze2_onednn_fuse_pass, + paddle::framework::ir::FuseOperatorUnsqueeze2OneDNNPass); +REGISTER_PASS_CAPABILITY(operator_unsqueeze2_onednn_fuse_pass) + .AddCombination( + 
paddle::framework::compatible::OpVersionComparatorCombination() + .GE("unsqueeze2", 0) + .GE("transpose2", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.h new file mode 100644 index 00000000000000..eddd62e6106287 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/operator_unsqueeze2_onednn_fuse_pass.h @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace ir { + +class FuseOperatorUnsqueeze2OneDNNPass : public FusePassBase { + public: + virtual ~FuseOperatorUnsqueeze2OneDNNPass() {} + + protected: + void ApplyImpl(Graph *graph) const override; + void FuseUnsqueeze2(Graph *graph, + const std::string &op_type, + int num_of_outputs) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc old mode 100644 new mode 100755 index 177309376e825b..e2065648ac5d04 --- a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass.cc @@ -52,35 +52,23 @@ bool HasBias(ir::Node* conv_op) { conv_op->Op()->Input("Bias").size() > 0; } -bool ShouldSkipConv(ir::Node* conv_op, Scope* scope, ir::Node* conv_filter) { - if (!platform::HasOpINT8DataType(conv_op->Op())) { - VLOG(4) << "Skipping non-int8 convolution (id: " << conv_op->id() << ")."; - return true; - } - - auto filter_var = scope->GetVar(conv_filter->Name()); - if (filter_var->Get().dtype() != phi::DataType::FLOAT32) { - VLOG(4) << "Skipping convolution (id: " << conv_op->id() - << ") because it's a bug that it is detected again."; - return true; - } - - VLOG(4) << "Not skipping convolution (id: " << conv_op->id() << ")"; - return false; -} - template void QuantizeConvInput(Scope* scope, ir::Graph* g, ir::Node* conv_op, const std::string& input_name, const std::string& scales_attr_name) { - const auto scales = - conv_op->Op()->GetAttrIfExists>(scales_attr_name); - - auto* tensor = scope->GetVar(input_name)->GetMutable(); - QuantizeParams(tensor, scales); - + auto var = scope->GetVar(input_name); + if (var->Get().dtype() != phi::DataType::FLOAT32) { + VLOG(1) << "Skipping quantize the input: " << input_name + << " of convolution because it is detected again."; + } else { + const auto scales = + conv_op->Op()->GetAttrIfExists>(scales_attr_name); + + auto* tensor = scope->GetVar(input_name)->GetMutable(); + QuantizeParams(tensor, scales); + } conv_op->Op()->SetAttr(scales_attr_name, std::vector(1, 1)); } @@ -151,7 +139,8 @@ void ParamsQuantizationMkldnnPass::QuantizeConv(ir::Graph* graph, PADDLE_ENFORCE_NOT_NULL( scope, 
platform::errors::InvalidArgument("Scope cannot be nullptr.")); - if (ShouldSkipConv(conv_op, scope, conv_filter)) { + // If not a quantized OP + if (!platform::HasOpINT8DataType(conv_op->Op())) { return; } diff --git a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc old mode 100644 new mode 100755 index 507f25d92d8bc5..e04cf388ac0d74 --- a/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/params_quantization_mkldnn_pass_tester.cc @@ -89,8 +89,14 @@ struct ProgramStrategy { virtual void CheckOp(const OpDesc& op) const = 0; - VarDesc* AddInput(OpDesc* op, std::string input_name, const Data& data) { - const std::string var_name = input_name + "_var"; + VarDesc* AddInput(OpDesc* op, + std::string input_name, + const Data& data, + const std::string user_var_name = "") { + std::string var_name = user_var_name; + if (var_name.empty()) { + var_name = input_name + "_var"; + } op->SetInput(input_name, {var_name}); auto var = program.MutableBlock(0)->Var(var_name); var->SetShape(data.getShape()); @@ -98,8 +104,14 @@ struct ProgramStrategy { return var; } - void AddOutput(OpDesc* op, std::string output_name, const Data& data) { - const std::string var_name = output_name + "_var"; + void AddOutput(OpDesc* op, + std::string output_name, + const Data& data, + const std::string user_var_name = "") { + std::string var_name = user_var_name; + if (var_name.empty()) { + var_name = output_name + "_var"; + } op->SetOutput(output_name, {var_name}); program.MutableBlock(0)->Var(var_name); test_scope.CreateTensor(var_name, data); @@ -117,21 +129,23 @@ struct ConvProgramStrategy : public ProgramStrategy { std::vector&& scale_weights, int groups = 1, Data&& bias = Data(), - std::vector&& scale_bias = {}) + std::vector&& scale_bias = {}, + bool share_weight = false) : input(std::move(input)), filter(std::move(filter)), output(std::move(output)), scale_weights(std::move(scale_weights)), groups(std::move(groups)), bias(std::move(bias)), - scale_bias(std::move(scale_bias)) {} + scale_bias(std::move(scale_bias)), + share_weight(std::move(share_weight)) {} protected: - OpDesc* CreateBasicConvOp() { + OpDesc* CreateBasicConvOp(const std::string conv_name = "Conv1") { auto op = program.MutableBlock(0)->AppendOp(); op->SetType("conv2d"); op->SetAttr("use_mkldnn", true); - op->SetAttr("name", std::string{"Conv1"}); + op->SetAttr("name", conv_name); op->SetAttr("mkldnn_data_type", std::string{"int8"}); op->SetAttr("data_format", std::string{"NCHW"}); op->SetAttr("dilations", std::vector({1, 1})); @@ -155,6 +169,20 @@ struct ConvProgramStrategy : public ProgramStrategy { AddInput(op, "Bias", bias); op->SetAttr("Bias_scales", scale_bias); } + + if (share_weight) { + OpDesc* op2 = CreateBasicConvOp("Conv2"); + AddInput(op2, "Input", input); + AddInput(op2, "Filter", filter)->SetPersistable(true); + AddOutput(op2, "Output", output, "output2"); + op2->SetAttr("Scale_weights", scale_weights); + op2->SetAttr("Scale_in", 1.0f); + op2->SetAttr("groups", groups); + if (HasBias()) { + AddInput(op2, "Bias", bias, "Bias2"); + op2->SetAttr("Bias_scales", scale_bias); + } + } } void CheckOp(const OpDesc& op) const override { @@ -210,9 +238,9 @@ struct ConvProgramStrategy : public ProgramStrategy { const Data output; const std::vector scale_weights; const int groups; - const Data bias; const std::vector scale_bias; + const bool share_weight; }; struct 
ParamsQuantizationMkldnnPassTestFixture : public ::testing::Test { @@ -340,6 +368,19 @@ TEST_F(ParamsQuantizationMkldnnPassTestFixture, conv_with_bias_2g2o2i1h1w) { RunPassTest(std::move(program)); } +TEST_F(ParamsQuantizationMkldnnPassTestFixture, conv_with_bias_2g2o2i1h1ws) { + auto program = std::make_unique( + GenericInput(), + Data({2, 2, 2, 1, 1}, {1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f, 1.5f}), + GenericOutput(), + std::vector{2.f, 2.f, 4.f, 4.f}, + 2, + Data({2, 2, 1, 1, 1}, {1.5f, 1.5f, 1.5f, 1.5f}), + std::vector{2.f, 2.f, 4.f, 4.f}, + true); + RunPassTest(std::move(program)); +} + } // namespace } // namespace ir } // namespace framework diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc index b674ef52183c00..f0577bab7fb7ae 100644 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.cc @@ -109,27 +109,34 @@ void QuantDequantMkldnnPass::CollectWeightScalesInfoFromONNXFormatDequantize( if (op_node->Name() == "dequantize_linear") { auto* op_desc = op_node->Op(); + + auto scale_name = op_desc->Input("Scale")[0]; + auto* var = scope->FindVar(scale_name); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::NotFound( + "The Scales variable [%s] of dequantize op is not found.", var)); + + auto* scale_tensor = var->GetMutable(); + auto* scale_data = scale_tensor->data(); + auto x_var_name = op_desc->Input("X")[0]; auto* weight_var = scope->FindVar(x_var_name); if (!weight_var) { auto out_var_name = op_desc->Output("Y")[0]; - if (var_quant_scales->count(x_var_name) && - !var_quant_scales->count(out_var_name)) { - std::vector scale_v = var_quant_scales->at(x_var_name); + float scale = 1.0 / scale_data[0]; + if (std::isinf(scale) || std::isnan(scale)) { + scale = 0.0; + } + std::vector scale_v = {scale}; + if (!var_quant_scales->count(out_var_name)) { var_quant_scales->insert(std::make_pair(out_var_name, scale_v)); } + if (!var_quant_scales->count(x_var_name)) { + var_quant_scales->insert(std::make_pair(x_var_name, scale_v)); + } } else { *onnx_format_quantize_model = true; - auto scale_name = op_desc->Input("Scale")[0]; - auto* var = scope->FindVar(scale_name); - PADDLE_ENFORCE_NOT_NULL( - var, - platform::errors::NotFound( - "The Scales variable [%s] of dequantize op is not found.", - var)); - - auto* scale_tensor = var->GetMutable(); - auto* scale_data = scale_tensor->data(); std::vector thresholds(scale_data, scale_data + scale_tensor->numel()); weight_thresholds->insert(std::make_pair(x_var_name, thresholds)); @@ -182,7 +189,7 @@ void QuantDequantMkldnnPass::CollectInputScalesFromQuantize( auto* scale_data = scale_tensor->data(); float scale = 1.0 / scale_data[0]; if (std::isinf(scale) || std::isnan(scale)) { - scale = 0.0; + continue; } if (!var_quant_scales->count(x_var_name)) { @@ -520,12 +527,10 @@ void QuantDequantMkldnnPass::ConvertFromINT8ToFP32( int step_c = step_n / size; for (int i = 0; i < weight_dims[0]; i++) { int begin_n = i * step_n; - for (int j = begin_n; j < begin_n + step_n; j++) { - for (int k = 0; k < size; k++) { - int begin_c = k * step_c; - for (int m = begin_c; m < begin_c + step_c; m++) { - weight_data[m] *= scales[k]; - } + for (int j = 0; j < size; j++) { + int begin_c = begin_n + j * step_c; + for (int k = 0; k < step_c; k++) { + weight_data[begin_c + k] *= scales[j]; } } } @@ -588,7 +593,8 @@ void QuantDequantMkldnnPass::DequantizeOpWeightsFromONNXFormat( Scope* scope, const std::string& 
weight_name, const std::unordered_map>& - weight_thresholds) const { + weight_thresholds, + std::vector* dequantized_weights_names) const { auto* op_desc = op_node->Op(); std::string weight_var_name = op_desc->Input(weight_name)[0]; @@ -596,6 +602,13 @@ void QuantDequantMkldnnPass::DequantizeOpWeightsFromONNXFormat( auto iter = weight_thresholds.find(weight_var_name); if (iter != weight_thresholds.end()) { scales = iter->second; + auto name_iter = std::find(dequantized_weights_names->begin(), + dequantized_weights_names->end(), + weight_var_name); + // Has been dequantized + if (name_iter != dequantized_weights_names->end()) { + return; + } } else { if (!IsInt8Weight(op_node, scope, weight_name)) { return; @@ -605,7 +618,7 @@ void QuantDequantMkldnnPass::DequantizeOpWeightsFromONNXFormat( "the model is correct.", weight_var_name)); } - + dequantized_weights_names->push_back(weight_var_name); auto* var = scope->FindVar(weight_var_name); PADDLE_ENFORCE_NOT_NULL( var, @@ -634,14 +647,17 @@ void QuantDequantMkldnnPass::DequantizeWeights( << "No need to dequantize weights because weight_thresholds is empty."; return; } - + std::vector dequantized_weights_names; for (auto* op_node : ir::TopologyVarientSort(*graph, static_cast(0))) { if (!op_node->IsOp()) continue; if (op_node->Name() == "conv2d" || op_node->Name() == "depthwise_conv2d") { if (onnx_format_quantize_model) { - DequantizeOpWeightsFromONNXFormat( - op_node, scope, "Filter", weight_thresholds); + DequantizeOpWeightsFromONNXFormat(op_node, + scope, + "Filter", + weight_thresholds, + &dequantized_weights_names); } else if (IsInt8Weight(op_node, scope, "Filter")) { DequantizeOpWeights( op_node, scope, "Filter", "Output", weight_thresholds); @@ -650,7 +666,7 @@ void QuantDequantMkldnnPass::DequantizeWeights( op_node->Name() == "matmul_v2") { if (onnx_format_quantize_model) { DequantizeOpWeightsFromONNXFormat( - op_node, scope, "Y", weight_thresholds); + op_node, scope, "Y", weight_thresholds, &dequantized_weights_names); } else if (IsInt8Weight(op_node, scope, "Y")) { DequantizeOpWeights(op_node, scope, "Y", "Out", weight_thresholds); } diff --git a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h old mode 100644 new mode 100755 index eee7fc96ed1d4e..b89b393cc329c9 --- a/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/quant_dequant_mkldnn_pass.h @@ -125,7 +125,8 @@ class QuantDequantMkldnnPass : public FusePassBase { Scope* scope, const std::string& weight_name, const std::unordered_map>& - weight_thresholds) const; + weight_thresholds, + std::vector* dequantized_weights_names) const; void DequantizeWeights( ir::Graph* graph, diff --git a/paddle/fluid/framework/ir/mkldnn/squeeze2_transpose2_onednn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/squeeze2_transpose2_onednn_fuse_pass.cc new file mode 100644 index 00000000000000..00c077cc84d504 --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/squeeze2_transpose2_onednn_fuse_pass.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#include "paddle/fluid/framework/ir/mkldnn/squeeze2_transpose2_onednn_fuse_pass.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void FuseSqueeze2Transpose2OneDNNPass::ApplyImpl(Graph *graph) const { + PADDLE_ENFORCE_NOT_NULL(graph, + platform::errors::InvalidArgument( + "Pointer to graph argument should not be NULL.")); + + FusePassBase::Init("squeeze2_transpose2_onednn_fuse_pass", graph); + + GraphPatternDetector gpd; + patterns::Squeeze2Transpose2 squeeze2_transpose2_pattern( + gpd.mutable_pattern(), "squeeze2_transpose2_onednn_fuse_pass"); + squeeze2_transpose2_pattern(); + + int found_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + GET_IR_NODE_FROM_SUBGRAPH( + squeeze2_op_in, squeeze2_op_in, squeeze2_transpose2_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + squeeze2_op, squeeze2_op, squeeze2_transpose2_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + squeeze2_op_out, squeeze2_op_out, squeeze2_transpose2_pattern); + GET_IR_NODE_FROM_SUBGRAPH( + transpose2_op, transpose2_op, squeeze2_transpose2_pattern); + + if (!transpose2_op->Op()->HasAttr("use_mkldnn") || + (transpose2_op->Op()->HasAttr("use_mkldnn") && + !(PADDLE_GET_CONST(bool, + transpose2_op->Op()->GetAttr("use_mkldnn"))))) { + VLOG(4) << "Only oneDNN version of transpose2 can be fused after with " + "squeeze2."; + return; + } + + std::vector squeeze2_axes = + PADDLE_GET_CONST(std::vector, squeeze2_op->Op()->GetAttr("axes")); + transpose2_op->Op()->SetAttr("fused_squeeze2_axes", squeeze2_axes); + transpose2_op->Op()->SetInput("X", {squeeze2_op_in->Name()}); + + IR_VAR_OP_LINK(squeeze2_op_in, transpose2_op); + GraphSafeRemoveNodes(g, {squeeze2_op, squeeze2_op_out}); + found_count++; + }; + + gpd(graph, handler); + AddStatis(found_count); + if ((!Has("disable_logs") || !Get("disable_logs"))) { + PrettyLogDetail("--- fused %d squeeze2 with transpose2", found_count); + } +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(squeeze2_transpose2_onednn_fuse_pass, + paddle::framework::ir::FuseSqueeze2Transpose2OneDNNPass); +REGISTER_PASS_CAPABILITY(squeeze2_transpose2_onednn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .GE("squeeze2", 0) + .GE("transpose2", 0)); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h b/paddle/fluid/framework/ir/mkldnn/squeeze2_transpose2_onednn_fuse_pass.h similarity index 58% rename from paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h rename to paddle/fluid/framework/ir/mkldnn/squeeze2_transpose2_onednn_fuse_pass.h index a4d60e91e8455c..be3871bdfe2fbc 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/squeeze2_transpose2_onednn_fuse_pass.h @@ -1,4 +1,4 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -14,24 +14,22 @@ #pragma once -#include -#include - -#include "paddle/fluid/inference/analysis/analysis_pass.h" +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" namespace paddle { -namespace inference { -namespace analysis { - -struct Argument; +namespace framework { +namespace ir { -class IrInferCleanGraphPass : public AnalysisPass { +class FuseSqueeze2Transpose2OneDNNPass : public FusePassBase { public: - void RunImpl(Argument *argument) override; + virtual ~FuseSqueeze2Transpose2OneDNNPass() {} - std::string repr() const override { return "ir_graph_clean_pass"; } + protected: + void ApplyImpl(Graph *graph) const override; }; -} // namespace analysis -} // namespace inference +} // namespace ir +} // namespace framework + } // namespace paddle diff --git a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc index 7b203125681c54..13465610e47fd6 100644 --- a/paddle/fluid/framework/ir/multi_batch_merge_pass.cc +++ b/paddle/fluid/framework/ir/multi_batch_merge_pass.cc @@ -331,8 +331,6 @@ void BatchMergePass::ApplyImpl(ir::Graph* graph) const { copy_node(node); } } - - result.ResolveHazard(created); } } // namespace ir diff --git a/paddle/fluid/framework/ir/pass.cc b/paddle/fluid/framework/ir/pass.cc index 35f72deab89057..4ad93183996fa5 100644 --- a/paddle/fluid/framework/ir/pass.cc +++ b/paddle/fluid/framework/ir/pass.cc @@ -22,6 +22,7 @@ limitations under the License. */ namespace paddle { namespace framework { +class Scope; namespace ir { class Graph; } // namespace ir @@ -35,6 +36,18 @@ namespace paddle { namespace framework { namespace ir { +static const char kParamScopeAttr[] = "__param_scope__"; + +static const std::vector support_subgraph_passes = { + "simplify_with_basic_ops_pass", + "fused_multi_transformer_encoder_pass", + "fused_multi_transformer_decoder_pass", + "fused_multi_transformer_encoder_fuse_qkv_pass", + "fused_multi_transformer_decoder_fuse_qkv_pass", + "multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass", + "multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass", + "fuse_multi_transformer_layer_pass"}; + Graph *Pass::Apply(Graph *graph) const { VLOG(10) << "start to apply pass " << Type() << " to graph"; CheckPrevPass(); @@ -65,11 +78,41 @@ Graph *Pass::Apply(Graph *graph) const { true, platform::errors::InvalidArgument( "The VarDescs of persistable variable are not consistency.")); - applied_ = true; if (!graph->Has(kPassRecorder)) { graph->Set(kPassRecorder, new PassRecorder); } graph->Get(kPassRecorder).insert(Type()); + + if (graph->IsMainGraph() && std::count(support_subgraph_passes.begin(), + support_subgraph_passes.end(), + Type())) { + for (size_t i = 1; i < graph->SubGraphsSize(); i++) { + auto *sub_graph = graph->GetSubGraph(i); + if (!sub_graph->Has(framework::ir::kParamScopeAttr)) { + sub_graph->SetNotOwned( + framework::ir::kParamScopeAttr, + &graph->Get(framework::ir::kParamScopeAttr)); + } + + ApplyImpl(sub_graph); + PADDLE_ENFORCE_EQ( + HasCircle(*sub_graph), + false, + platform::errors::InvalidArgument( + "Illegal pass %s. 
Generated graph shouldn't contain cycle.", + Type())); + PADDLE_ENFORCE_EQ( + VarDescIsConsistency(*sub_graph), + true, + platform::errors::InvalidArgument( + "The VarDescs of persistable variable are not consistency.")); + if (!sub_graph->Has(kPassRecorder)) { + sub_graph->Set(kPassRecorder, new PassRecorder); + } + sub_graph->Get(kPassRecorder).insert(Type()); + } + } + applied_ = true; #ifdef PADDLE_WITH_MKLDNN // Clear mkl-dnn cache, // Passes can change params, tensors, so caching need to be discarded diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 37a28bec16da2d..e0315f0b5b7414 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -47,6 +47,22 @@ constexpr char kPassRecorder[] = "pass_recorder"; constexpr char kEmbEltwiseLayernormPass[] = "embedding_eltwise_layernorm_fuse_pass_flag"; constexpr char kMultiheadMatmulPass[] = "multihead_matmul_fuse_pass_flag"; +constexpr char kFusedMultiTransformerEncoderPass[] = + "fused_multi_transformer_encoder_pass_flag"; +constexpr char kFusedMultiTransformerDecoderPass[] = + "fused_multi_transformer_decoder_pass_flag"; +constexpr char kFusedMultiTransformerEncoderFuseQKVPass[] = + "fused_multi_transformer_encoder_fuse_qkv_pass_flag"; +constexpr char kFusedMultiTransformerDecoderFuseQKVPass[] = + "fused_multi_transformer_decoder_fuse_qkv_pass_flag"; +constexpr char kMultiDevicesFusedMultiTransformerEncoderFuseQKVPass[] = + "multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass_flag"; +constexpr char kMultiDevicesFusedMultiTransformerDecoderFuseQKVPass[] = + "multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass_flag"; +constexpr char kFusedMultiTransformerEncoderFusionCount[] = + "fused_multi_transformer_encoder_fusion_count"; +constexpr char kFusedMultiTransformerDecoderFusionCount[] = + "fused_multi_transformer_decoder_fusion_count"; constexpr char kPrelnEmbEltwiseLayernormPass[] = "preln_embedding_eltwise_layernorm_fuse_pass_flag"; diff --git a/paddle/fluid/framework/ir/pass_tester_helper.h b/paddle/fluid/framework/ir/pass_tester_helper.h index 4d34e9e0900a14..48f8cb37d60a98 100644 --- a/paddle/fluid/framework/ir/pass_tester_helper.h +++ b/paddle/fluid/framework/ir/pass_tester_helper.h @@ -146,6 +146,12 @@ struct Layers { return unary_op("relu", x, out); } + VarDesc* gelu(VarDesc* x, VarDesc* out = nullptr, bool approximate = true) { + AttributeMap attrs; + attrs["approximate"] = approximate; + return unary_op("gelu", x, out, &attrs); + } + VarDesc* sigmoid(VarDesc* x, VarDesc* out = nullptr) { return unary_op("sigmoid", x, out); } @@ -154,6 +160,20 @@ struct Layers { return unary_op("tanh", x, out); } + VarDesc* c_identity(VarDesc* x, VarDesc* out = nullptr, int ring_id = -1) { + AttributeMap attrs; + attrs["ring_id"] = ring_id; + return unary_op("c_identity", x, out, &attrs); + } + + VarDesc* c_allreduce_sum(VarDesc* x, + VarDesc* out = nullptr, + int ring_id = -1) { + AttributeMap attrs; + attrs["ring_id"] = ring_id; + return unary_op("c_allreduce_sum", x, out, &attrs); + } + VarDesc* fc(VarDesc* input, VarDesc* w, VarDesc* bias, @@ -332,6 +352,37 @@ struct Layers { return outs; } + std::vector split(VarDesc* x, int num_or_section, int axis = 0) { + std::vector outs(num_or_section); + for (int i = 0; i < num_or_section; i++) { + outs[i] = lod_tensor(unique_name()); + } + std::vector out_names(num_or_section); + for (int i = 0; i < num_or_section; i++) { + out_names[i] = outs[i]->Name(); + } + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + 
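// The split op appended just above is configured below: its single input "X"
// fans out to every per-section output through "Out", the num_or_section and
// axis attributes are set, and the op is tagged with the forward role like
// the other helpers in this file.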
op->SetType("split"); + op->SetInput("X", {x->Name()}); + op->SetOutput("Out", out_names); + op->SetAttr("num_or_section", num_or_section); + op->SetAttr("axis", axis); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + return outs; + } + + VarDesc* assign(VarDesc* x) { + VarDesc* out = lod_tensor(unique_name()); + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {x->Name()}); + op->SetOutput("Out", {out->Name()}); + op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), + static_cast(OpRole::kForward)); + return out; + } + VarDesc* matmul(VarDesc* x, VarDesc* y, VarDesc* alpha = nullptr, @@ -459,6 +510,137 @@ struct Layers { return out; } + VarDesc* while_loop(std::vector xs, VarDesc* cond = nullptr) { + VarDesc* out = lod_tensor(unique_name()); + VarDesc* step_scopes = lod_tensor(unique_name()); + if (cond == nullptr) cond = lod_tensor(unique_name()); + + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType("while"); + std::vector xs_names; + for (auto& x : xs) xs_names.emplace_back(x->Name()); + op->SetInput("X", xs_names); + op->SetInput("Condition", {cond->Name()}); + op->SetOutput("Out", {out->Name()}); + op->SetOutput("StepScopes", {step_scopes->Name()}); + op->SetAttr("sub_block", {program_.MutableBlock(0)}); + op->SetAttr("is_test", true); + return out; + } + + VarDesc* shape(VarDesc* input) { + VarDesc* out = lod_tensor(unique_name()); + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType("shape"); + op->SetInput("Input", {input->Name()}); + op->SetOutput("Out", {out->Name()}); + return out; + } + + VarDesc* slice(VarDesc* input, + std::vector axes, + std::vector starts, + std::vector ends) { + VarDesc* out = lod_tensor(unique_name()); + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType("slice"); + op->SetInput("Input", {input->Name()}); + op->SetOutput("Out", {out->Name()}); + op->SetAttr("axes", axes); + op->SetAttr("starts", starts); + op->SetAttr("ends", ends); + return out; + } + + VarDesc* fill_constant_batch_size_like(VarDesc* x, + int dtype, + int input_dim_idx, + int output_dim_idx, + std::vector shape, + float value) { + VarDesc* out = lod_tensor(unique_name()); + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + op->SetType("fill_constant_batch_size_like"); + op->SetInput("Input", {x->Name()}); + op->SetAttr("dtype", dtype); + op->SetAttr("input_dim_idx", input_dim_idx); + op->SetAttr("output_dim_idx", output_dim_idx); + op->SetAttr("shape", shape); + op->SetAttr("value", value); + op->SetOutput("Out", {out->Name()}); + return out; + } + + VarDesc* fused_multi_transformer(VarDesc* x, + VarDesc* cache_kv, + VarDesc* src_mask, + VarDesc* qkv_w, + VarDesc* qkv_bias, + VarDesc* out_linear_w, + VarDesc* out_linear_bias, + VarDesc* ffn1_w, + VarDesc* ffn1_bias, + VarDesc* ffn2_w, + VarDesc* ffn2_bias, + VarDesc* ln_scale, + VarDesc* ln_bias, + VarDesc* ffn_ln_scale, + VarDesc* ffn_ln_bias, + float epsilon, + float dropout_rate, + VarDesc* time_stamp = nullptr, + VarDesc* qkv_out_scale = nullptr, + VarDesc* out_linear_out_scale = nullptr, + VarDesc* ffn1_out_scale = nullptr, + VarDesc* ffn2_out_scale = nullptr, + std::vector qkv_in_scale = {}, + std::vector out_linear_in_scale = {}, + std::vector ffn1_in_scale = {}, + std::vector ffn2_in_scale = {}) { + VarDesc* out = lod_tensor(unique_name()); + OpDesc* op = program_.MutableBlock(0)->AppendOp(); + std::string op_type = qkv_out_scale ? 
"fused_multi_transformer_int8" + : "fused_multi_transformer"; + op->SetType(op_type); + op->SetInput("X", {x->Name()}); + op->SetInput("CacheKV", {cache_kv->Name()}); + op->SetInput("SrcMask", {src_mask->Name()}); + op->SetInput("QKVW", {qkv_w->Name()}); + op->SetInput("QKVBias", {qkv_bias->Name()}); + op->SetInput("OutLinearW", {out_linear_w->Name()}); + op->SetInput("OutLinearBias", {out_linear_bias->Name()}); + op->SetInput("FFN1Weight", {ffn1_w->Name()}); + op->SetInput("FFN1Bias", {ffn1_bias->Name()}); + op->SetInput("FFN2Weight", {ffn2_w->Name()}); + op->SetInput("FFN2Bias", {ffn2_bias->Name()}); + op->SetInput("LnScale", {ln_scale->Name()}); + op->SetInput("LnBias", {ln_bias->Name()}); + op->SetInput("FFNLnScale", {ffn_ln_scale->Name()}); + op->SetInput("FFNLnBias", {ffn_ln_bias->Name()}); + op->SetAttr("pre_layer_norm", true); + op->SetAttr("is_test", true); + op->SetAttr("dropout_implementation", "upscale_in_train"); + op->SetAttr("dropout_rate", dropout_rate); + op->SetAttr("epsilon", epsilon); + op->SetOutput("Out", {out->Name()}); + + if (time_stamp) { + op->SetInput("TimeStep", {time_stamp->Name()}); + } + + if (qkv_out_scale) { + op->SetInput("QKVOutScale", {qkv_out_scale->Name()}); + op->SetInput("OutLinearOutScale", {out_linear_out_scale->Name()}); + op->SetInput("FFN1OutScale", {ffn1_out_scale->Name()}); + op->SetInput("FFN2OutScale", {ffn2_out_scale->Name()}); + op->SetAttr("qkv_in_scale", qkv_in_scale); + op->SetAttr("out_linear_in_scale", out_linear_in_scale); + op->SetAttr("ffn1_in_scale", ffn1_in_scale); + op->SetAttr("ffn2_in_scale", ffn2_in_scale); + } + return out; + } + void backward(std::vector targets) { // This function is designed to simulate the structure of training program, // but is constructed differently as the actual program. @@ -523,7 +705,10 @@ struct Layers { return var; } - VarDesc* unary_op(std::string type, VarDesc* x, VarDesc* out = nullptr) { + VarDesc* unary_op(std::string type, + VarDesc* x, + VarDesc* out = nullptr, + const AttributeMap* attrs = nullptr) { if (!out) { out = lod_tensor(unique_name()); } @@ -531,6 +716,11 @@ struct Layers { op->SetType(type); op->SetInput("X", {x->Name()}); op->SetOutput("Out", {out->Name()}); + if (attrs) { + for (auto& iter : *attrs) { + op->SetAttr(iter.first, iter.second); + } + } op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), static_cast(OpRole::kForward)); return out; diff --git a/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc b/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc index 43f48f87c09a51..79d27948954278 100644 --- a/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc +++ b/paddle/fluid/framework/ir/preln_residual_bias_fuse_pass.cc @@ -143,10 +143,7 @@ void PrelnResidualBiasFusePass::ApplyImpl(ir::Graph *graph) const { LOG(WARNING) << "The subgraph is empty."; return; } - if (!IsCompat(subgraph, graph)) { - LOG(WARNING) << "preln_residual_bias pass in op compat failed."; - return; - } + VLOG(4) << "handle PrelnResidualBias fuse"; GET_IR_NODE_FROM_SUBGRAPH( elementwise_bias, elementwise_bias, fused_pattern); @@ -164,6 +161,21 @@ void PrelnResidualBiasFusePass::ApplyImpl(ir::Graph *graph) const { GET_IR_NODE_FROM_SUBGRAPH(layer_norm_mean, layer_norm_mean, fused_pattern); GET_IR_NODE_FROM_SUBGRAPH( layer_norm_variance, layer_norm_variance, fused_pattern); + + // We can not accept that two or more layer_norm is connected to + // elementwise1_out. 
This will lead to two or more PrelnResidualBias + // patterns is found near elementwise1_out, and these patterns will interact + // on each other, so we make below check to ensure only one + // PrelnResidualBias pattern is delalted with. + for (auto op : elementwise1_out->inputs) { + if (op->Name() == "preln_residual_bias") return; + } + + if (!IsCompat(subgraph, graph)) { + LOG(WARNING) << "preln_residual_bias pass in op compat failed."; + return; + } + std::unordered_set del_node_set; // Create an PrelnResidualBias op node OpDesc new_desc; diff --git a/paddle/fluid/framework/naive_executor.cc b/paddle/fluid/framework/naive_executor.cc index 651ac23e52fe1c..3a85d30386cd50 100644 --- a/paddle/fluid/framework/naive_executor.cc +++ b/paddle/fluid/framework/naive_executor.cc @@ -183,5 +183,6 @@ void NaiveExecutor::ResetTrtOps(int num) { } #endif } + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index 6be8aa776a839c..e3ae9034c44562 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" #include "paddle/fluid/framework/new_executor/interpretercore_util.h" +#include "paddle/fluid/framework/new_executor/threadpool_config.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/os_info.h" @@ -29,6 +30,7 @@ #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif +#include "paddle/phi/backends/device_manager.h" PADDLE_DEFINE_EXPORTED_bool(new_executor_use_inplace, false, @@ -46,9 +48,6 @@ constexpr const char* kTaskCompletion = "TaskCompletion"; namespace paddle { namespace framework { -// NOTE(Aurelius84): Need a better strategy to determine it. 
-static constexpr size_t kHostNumThreads = 4; -static constexpr size_t kDeviceNumThreads = 1; InterpreterCore::InterpreterCore(const platform::Place& place, const BlockDesc& block, @@ -292,8 +291,14 @@ bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) { std::shared_ptr InterpreterCore::GetWorkQueue() { if (async_work_queue_ == nullptr) { - async_work_queue_ = std::make_shared( - kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_); + int host_num_threads = 1, deivce_num_threads = 1, prepare_num_threads = 1; + std::tie(host_num_threads, deivce_num_threads, prepare_num_threads) = + interpreter::GetThreadPoolConfig(place_, vec_instruction_.size()); + async_work_queue_ = + std::make_shared(host_num_threads, + deivce_num_threads, + prepare_num_threads, + &main_thread_blocker_); } return async_work_queue_; } @@ -595,16 +600,61 @@ void InterpreterCore::BuildSkipShareLoDInfo() { } } +inline void SetDeviceId(const platform::Place& place) { + // TODO(zhiqiu): reduce the cost + if (platform::is_gpu_place(place)) { +#if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) + PADDLE_THROW(platform::errors::Unavailable( + "Cannot run operator on place %s, please recompile paddle or " + "reinstall Paddle with CUDA support.", + place)); +#else + auto dev_id = place.device; + platform::SetDeviceId(dev_id); +#endif + } else if (platform::is_xpu_place(place)) { +#ifndef PADDLE_WITH_XPU + PADDLE_THROW(platform::errors::Unavailable( + "Cannot run operator on place %s, please recompile paddle or " + "reinstall Paddle with XPU support.", + place)); +#else + auto dev_id = place.device; + platform::SetXPUDeviceId(dev_id); +#endif + } else if (platform::is_npu_place(place)) { +#ifndef PADDLE_WITH_ASCEND_CL + PADDLE_THROW(platform::errors::Unavailable( + "Cannot run operator on place %s, please recompile paddle or " + "reinstall Paddle with NPU support.", + place)); +#else + auto dev_id = place.device; + platform::SetNPUDeviceId(dev_id); +#endif + } else if (platform::is_custom_place(place)) { +#ifndef PADDLE_WITH_CUSTOM_DEVICE + PADDLE_THROW(platform::errors::Unavailable( + "Cannot run operator on place %s, please recompile paddle or " + "reinstall Paddle with CustomDevice support.", + place)); +#else + phi::DeviceManager::SetDevice(place); +#endif + } +} + void InterpreterCore::RunInstruction(const Instruction& instr_node) { auto* op = instr_node.OpBase(); auto place = instr_node.DeviceContext().GetPlace(); Scope* local_scope = create_local_scope_ ? var_scope_.GetMutableLocalScope() : var_scope_.GetMutableScope(); + VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope_); + + SetDeviceId(place); #ifdef PADDLE_WITH_ASCEND_CL if (platform::is_npu_place(place)) { - auto dev_id = place.device; - platform::SetNPUDeviceId(dev_id); // NOTE(wangxi): nan/inf cannot be detected on NPU by checking the variable // values, but only through special `float_status` to checks whether // the operation is overflow. 
More about `float_status`, see: @@ -727,14 +777,23 @@ void InterpreterCore::ExecuteInstructionList( platform::RecordEvent record_prepare( "PrepareAtomic", platform::TracerEventType::UserDefined, 1); - // NOTE(zhiqiu): get the prepared deps from std::future, and async prepare - // those for the next step - auto atomic_deps = atomic_deps_.get(); - auto atomic_var_ref = atomic_var_ref_.get(); - - atomic_deps_ = async_work_queue_->PrepareAtomicDeps(dependecy_count_); - atomic_var_ref_ = - async_work_queue_->PrepareAtomicVarRef(var_scope_.VecMetaInfo()); + + std::unique_ptr>> atomic_deps = nullptr; + std::unique_ptr>> atomic_var_ref = nullptr; + + if (async_work_queue_->QueueNumThreads(kPrepareWorkQueueIdx)) { + // NOTE(zhiqiu): get the prepared deps from std::future, and async prepare + // those for the next step + atomic_deps = atomic_deps_.get(); + atomic_var_ref = atomic_var_ref_.get(); + + atomic_deps_ = async_work_queue_->PrepareAtomicDeps(dependecy_count_); + atomic_var_ref_ = + async_work_queue_->PrepareAtomicVarRef(var_scope_.VecMetaInfo()); + } else { + atomic_deps = interpreter::PrepareAtomicDeps(dependecy_count_); + atomic_var_ref = interpreter::PrepareAtomicVarRef(var_scope_.VecMetaInfo()); + } record_prepare.End(); exception_holder_.Clear(); diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h index 2d60b0231a5c08..86f3768c54d403 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.h +++ b/paddle/fluid/framework/new_executor/interpretercore.h @@ -129,7 +129,7 @@ class InterpreterCore { std::vector vec_instruction_; // deconstruct before OpFuncNode - // last_live_ops_[i] contains the id of operatos that last access var[i] + // last_live_ops_[i] contains the id of operators that last access var[i] std::map> last_live_ops_; std::vector dependecy_count_; diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc index 99b3b68cc2a892..0c1370edd0ef5d 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.cc +++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc @@ -42,11 +42,12 @@ namespace framework { namespace interpreter { using VariableIdMap = std::map>; -constexpr size_t kPrepareWorkQueueIdx = 2; -const char blocking_queue_prefix[] = "lod_tensor_blocking_queue"; const std::vector ConstructWorkQueueOptions( - size_t host_num_threads, size_t device_num_threads, EventsWaiter* waiter) { + size_t host_num_threads, + size_t device_num_threads, + size_t prepare_num_threads, + EventsWaiter* waiter) { std::vector group_options; // for execute host Kernel group_options.emplace_back(/*name*/ "HostTasks", @@ -60,13 +61,13 @@ const std::vector ConstructWorkQueueOptions( group_options.emplace_back(/*name*/ "DeviceKernelLaunch", /*num_threads*/ device_num_threads, /*allow_spinning*/ true, - /*always_spinning*/ true, + /*always_spinning*/ false, /*track_task*/ false, /*detached*/ true, /*events_waiter*/ waiter); // for prepare deps and others group_options.emplace_back(/*name*/ "Prepare", - /*num_threads*/ 1, + /*num_threads*/ prepare_num_threads, /*allow_spinning*/ true, /*always_spinning*/ false, /*track_task*/ false, @@ -77,10 +78,11 @@ const std::vector ConstructWorkQueueOptions( AsyncWorkQueue::AsyncWorkQueue(size_t host_num_threads, size_t device_num_threads, + size_t prepare_num_threads, EventsWaiter* waiter) : host_num_thread_(host_num_threads) { - queue_group_ = CreateWorkQueueGroup( - 
ConstructWorkQueueOptions(host_num_threads, device_num_threads, waiter)); + queue_group_ = CreateWorkQueueGroup(ConstructWorkQueueOptions( + host_num_threads, device_num_threads, prepare_num_threads, waiter)); } void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type, @@ -281,11 +283,12 @@ void create_all_ops(const framework::BlockDesc& block, } } -std::tuple build_variable_map( +std::tuple BuildVariableMap( const VariableNameMap& var_name_map, VariableScope* var_scope, Scope* local_scope, - bool enforce_exist = true) { + bool allow_var_not_in_program = false, + bool allow_var_not_in_scope = false) { VariableValueMap name2var; VariableIdMap name2id; for (auto& item : var_name_map) { @@ -295,14 +298,10 @@ std::tuple build_variable_map( for (auto& var_name : item.second) { if (!var_scope->HasVar(var_name)) { - // Hot fix for variables used in dataloader, like - // 'lod_tensor_blocking_queue_0' These variables may be created in - // scope, and it is not existed as variable in program. - if (var_name.find(blocking_queue_prefix) != std::string::npos && - local_scope->FindVar(var_name)) { + if (allow_var_not_in_program && local_scope->FindVar(var_name)) { + VLOG(3) << "Add " << var_name << " to var_scope"; var_scope->AddVar(var_name, nullptr); - } else if (!enforce_exist) { - // skip the non-exist variable: such as recurrent_grad + } else if (allow_var_not_in_scope) { VLOG(4) << var_name << " don't exist in variable scope, skip it!"; continue; } @@ -422,7 +421,6 @@ void build_op_func_list(const platform::Place& place, : var_scope->GetMutableScope(); std::vector> ops_unique; // its elements will be moved to vec_func_list - bool flag_log_is_printed = false; // Step 1: create all ops for current block. create_all_ops(block, &ops_unique); @@ -449,35 +447,44 @@ void build_op_func_list(const platform::Place& place, for (size_t i = 0; i < ops.size(); ++i) { auto op = ops[i].get(); - VLOG(6) << "Build OpFuncNode from : " << op->Type(); - - // Print new executor log if grad op is used. - // It's only for test and will be removed later. - if (!flag_log_is_printed && op->Type().find("_grad") != std::string::npos) { - VLOG(0) << "Standalone Executor is Used."; - flag_log_is_printed = true; - } - - auto inputs_names = op->Inputs(); - auto outputs_names = op->Outputs(); - + const std::string& op_type = op->Type(); + + VLOG(6) << "Build OpFuncNode from : " << op_type; + + // Hot fix for variables used in dataloader, like + // 'lod_tensor_blocking_queue_0'. These variables may be created in scope, + // and it is not existed as variable in program. 
+ const std::set ops_with_var_not_in_program = { + "create_py_reader"}; + const std::set ops_with_var_not_in_scope = { + "conditional_block", + "conditional_block_grad", + "recurrent_grad", + "rnn_memory_helper", + "rnn_memory_helper_grad", + "while", + "while_grad"}; + bool allow_var_not_in_program = ops_with_var_not_in_program.count(op_type); + bool allow_var_not_in_scope = ops_with_var_not_in_scope.count(op_type); + + framework::VariableNameMap& input_name_map = op->Inputs(); VariableValueMap ins_map; VariableIdMap ins_name2id; - bool enforce_exist = true; - if (op->Type() == "recurrent_grad" || op->Type() == "rnn_memory_helper" || - op->Type() == "rnn_memory_helper_grad" || - op->Type() == "conditional_block" || - op->Type() == "conditional_block_grad" || op->Type() == "while" || - op->Type() == "while_grad") { - enforce_exist = false; - } - std::tie(ins_map, ins_name2id) = - build_variable_map(inputs_names, var_scope, local_scope, enforce_exist); + std::tie(ins_map, ins_name2id) = BuildVariableMap(input_name_map, + var_scope, + local_scope, + allow_var_not_in_program, + allow_var_not_in_scope); + framework::VariableNameMap& output_name_map = op->Outputs(); VariableValueMap outs_map; VariableIdMap outs_name2id; - std::tie(outs_map, outs_name2id) = build_variable_map( - outputs_names, var_scope, local_scope, enforce_exist); + std::tie(outs_map, outs_name2id) = + BuildVariableMap(output_name_map, + var_scope, + local_scope, + /*allow_var_not_in_program=*/false, + allow_var_not_in_scope); // step 1: build OpFuncNode OpFuncNode op_func_node; @@ -634,7 +641,7 @@ void build_op_func_list(const platform::Place& place, if (framework::IsComplexType(kernel_type.data_type_)) { interpreter::HandleComplexGradToRealGrad(op_func_node, place, - outputs_names, + output_name_map, &runtime_context.outputs, var_scope, vec_func_list, @@ -665,17 +672,17 @@ void build_op_func_list(const platform::Place& place, } } } catch (platform::EnforceNotMet& ex) { - framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex); + framework::InsertCallStackInfo(op_type, op->Attrs(), &ex); throw std::move(ex); } catch (platform::EOFException&) { std::rethrow_exception(std::current_exception()); } catch (std::exception& ex) { - LOG(WARNING) << op->Type() << " raises an exception " + LOG(WARNING) << op_type << " raises an exception " << platform::demangle(typeid(ex).name()) << ", " << ex.what(); std::rethrow_exception(std::current_exception()); } catch (...) 
{ - LOG(WARNING) << op->Type() << " raises an unknown exception"; + LOG(WARNING) << op_type << " raises an unknown exception"; std::rethrow_exception(std::current_exception()); } diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h index 1860b19b1ca420..8fc0dcb266e3da 100644 --- a/paddle/fluid/framework/new_executor/interpretercore_util.h +++ b/paddle/fluid/framework/new_executor/interpretercore_util.h @@ -39,6 +39,7 @@ #include "paddle/fluid/platform/init.h" using AtomicVectorSizeT = std::vector>; +constexpr size_t kPrepareWorkQueueIdx = 2; namespace paddle { namespace framework { @@ -48,6 +49,7 @@ class AsyncWorkQueue { public: AsyncWorkQueue(size_t host_num_threads, size_t deivce_num_threads, + size_t prepare_num_threads, EventsWaiter* waiter); std::future> PrepareAtomicDeps( @@ -61,6 +63,10 @@ class AsyncWorkQueue { void Cancel() { queue_group_->Cancel(); } + size_t QueueNumThreads(size_t idx) { + return queue_group_->QueueNumThreads(idx); + } + private: size_t host_num_thread_; std::unique_ptr queue_group_; diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.cc b/paddle/fluid/framework/new_executor/new_executor_defs.cc index 8ee7065368c218..c40a80ce0752c5 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.cc +++ b/paddle/fluid/framework/new_executor/new_executor_defs.cc @@ -21,17 +21,8 @@ #include "paddle/phi/core/utils/rw_lock.h" -// When in inference scenario, the scopes will not be written by two threads in -// a mean time, but a scope may be read by multiple threads concurrently, and -// the mutex will cause serious performance issue. -// So the mutex is disabled when `ON_INFER`. -#ifdef PADDLE_ON_INFERENCE -#define SCOPE_VARS_READER_LOCK -#define SCOPE_VARS_WRITER_LOCK -#else #define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); #define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); -#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/new_executor_defs.h b/paddle/fluid/framework/new_executor/new_executor_defs.h index 82eb237e73d18f..2df8892f5bd8aa 100644 --- a/paddle/fluid/framework/new_executor/new_executor_defs.h +++ b/paddle/fluid/framework/new_executor/new_executor_defs.h @@ -24,17 +24,8 @@ #include "paddle/fluid/platform/event.h" #include "paddle/phi/core/utils/rw_lock.h" -// When in inference scenario, the scopes will not be written by two threads in -// a mean time, but a scope may be read by multiple threads concurrently, and -// the mutex will cause serious performance issue. -// So the mutex is disabled when `ON_INFER`. 
*/ -#ifdef PADDLE_ON_INFERENCE -#define SCOPE_VARS_READER_LOCK -#define SCOPE_VARS_WRITER_LOCK -#else #define SCOPE_VARS_READER_LOCK AutoRDLock auto_lock(&vars_lock_); #define SCOPE_VARS_WRITER_LOCK AutoWRLock auto_lock(&vars_lock_); -#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc index 2531a8e7cd3d3b..67e7293877846f 100644 --- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc +++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc @@ -50,7 +50,7 @@ USE_OP_ITSELF(concat_grad); USE_OP_ITSELF(elementwise_mul_grad); USE_OP_ITSELF(sigmoid_grad); USE_OP_ITSELF(tanh_grad); -USE_OP(sum); +USE_OP_ITSELF(sum); USE_OP_ITSELF(slice_grad); USE_OP_ITSELF(lookup_table_grad); USE_OP_ITSELF(sqrt); @@ -101,6 +101,7 @@ PD_DECLARE_KERNEL(slice_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(cross_entropy_with_softmax, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(cross_entropy_with_softmax_grad, GPU, ALL_LAYOUT); PD_DECLARE_KERNEL(sqrt, GPU, ALL_LAYOUT); +PD_DECLARE_KERNEL(add_n, GPU, ALL_LAYOUT); namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/new_executor/threadpool_config.h b/paddle/fluid/framework/new_executor/threadpool_config.h new file mode 100644 index 00000000000000..0270aa7d1e86e5 --- /dev/null +++ b/paddle/fluid/framework/new_executor/threadpool_config.h @@ -0,0 +1,136 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/platform/device/ipu/ipu_info.h" +#include "paddle/fluid/platform/device/npu/npu_info.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/phi/backends/device_manager.h" +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/backends/xpu/xpu_info.h" + +DECLARE_bool(new_executor_serial_run); + +namespace paddle { +namespace framework { +namespace interpreter { + +static constexpr size_t kHostNumThreads = 4; +static constexpr size_t kDeviceNumThreads = 1; +static constexpr size_t kNumGcThreads = 1; +static constexpr size_t kNumPrepareThreads = 0; + +static constexpr size_t kMinOpNumForAsyncPrepare = 1000; + +// By default, one interpretercore contains: +// 1-size thread pool for device kernel launch (or 0 for cpu execution), +// 1-size thread pool for host kernel launch (or more if the system contains +// enough processors). + +// And it may contain: +// 1-size thread pool for gc if it cannot use FastGC, +// 1-size thread pool for preparation if the program contains too many ops +// (1000+). + +// Note that the purpose of the config is to limit the total 'possible' +// threads introduced by interpretercore to avoid hurting performance.
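+// For illustration (hypothetical numbers): on a host with 64 logical
+// processors and 8 visible GPUs, 64 / 8 / 2 = 4; after subtracting
+// kNumGcThreads (1), kNumPrepareThreads (0) and num_device_threads (1),
+// GetThreadPoolConfig below would size the host pool to
+// min(kHostNumThreads, 2) = 2 threads.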
+ +inline std::tuple GetThreadPoolConfig(const phi::Place place, + size_t op_num) { + int num_device_threads = kDeviceNumThreads, + num_host_threads = kHostNumThreads, + num_prepare_threads = kNumPrepareThreads; + + if (op_num > kMinOpNumForAsyncPrepare) { + num_prepare_threads = 1; + } + + int device_count = 0, processor_count = 0; + if (platform::is_cpu_place(place)) { + num_device_threads = 0; + num_host_threads = 4; + } else { + processor_count = std::thread::hardware_concurrency(); + if (processor_count) { + if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + device_count = phi::backends::gpu::GetGPUDeviceCount(); +#endif + } + if (platform::is_xpu_place(place)) { +#if defined(PADDLE_WITH_XPU) + device_count = phi::backends::xpu::GetXPUDeviceCount(); +#endif + } + if (platform::is_npu_place(place)) { +#if defined(PADDLE_WITH_ASCEND_CL) + device_count = platform::GetNPUDeviceCount(); +#endif + } + if (platform::is_ipu_place(place)) { +#if defined(PADDLE_WITH_IPU) + device_count = platform::GetIPUDeviceCount(); +#endif + } + if (platform::is_custom_place(place)) { +#if defined(PADDLE_WITH_CUSTOM_DEVICE) + device_count = + phi::DeviceManager::GetDeviceCount(place.GetDeviceType()); +#endif + } + + // Tricky implementation. + // In multi-card training, each process may set an env like + // CUDA_VISIBLE_DEVICES=0, so device_count reads as 1; in that case, + // device_count is reset to 8 below. + if (device_count == 1) { + device_count = 8; // in many cases, the accelerator has 8 cards. + } + + // We expect processor_count = 2 * (the possible total threads when doing + // multi-card training), to make sure that the system will not slow down + // because of too many threads. Here, 2 is an empirical value. Since each + // device has one interpretercore, the possible total threads when doing + // multi-card training = device_count * (the possible total threads in one + // interpretercore). + + if (device_count) { + auto num = processor_count / device_count / 2 - + (kNumGcThreads + kNumPrepareThreads + num_device_threads); + num_host_threads = + num > 0 ? (num > kHostNumThreads ?
kHostNumThreads : num) : 1; + } + } + } + + // In serial run, only one 1-size thread pool is used + if (FLAGS_new_executor_serial_run) { + num_host_threads = 0; + num_device_threads = 1; + } + + VLOG(4) << "place:" << place << ", processor_count:" << processor_count + << ", device_count:" << device_count + << ", serial_run:" << FLAGS_new_executor_serial_run + << ", num_host_threads:" << num_host_threads + << ", num_device_threads:" << num_device_threads + << ", num_prepare_threads:" << num_prepare_threads; + return std::make_tuple( + num_host_threads, num_device_threads, num_prepare_threads); +} + +} // namespace interpreter +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc index 0055a8f055ca29..ad4a51152c4728 100644 --- a/paddle/fluid/framework/new_executor/workqueue/workqueue.cc +++ b/paddle/fluid/framework/new_executor/workqueue/workqueue.cc @@ -121,8 +121,13 @@ WorkQueueGroupImpl::WorkQueueGroupImpl( queues_.resize(num_queues); void* buffer = malloc(sizeof(NonblockingThreadPool) * num_queues); queues_storage_ = reinterpret_cast(buffer); + for (size_t idx = 0; idx < num_queues; ++idx) { const auto& options = queues_options_[idx]; + if (options.num_threads == 0) { + queues_[idx] = nullptr; + continue; + } if (options.track_task && tracker_ == nullptr && options.events_waiter != nullptr) { empty_notifier_ = options.events_waiter->RegisterEvent(kQueueEmptyEvent); @@ -144,7 +149,9 @@ WorkQueueGroupImpl::WorkQueueGroupImpl( WorkQueueGroupImpl::~WorkQueueGroupImpl() { for (auto queue : queues_) { - queue->~NonblockingThreadPool(); + if (queue) { + queue->~NonblockingThreadPool(); + } } if (tracker_ != nullptr) { tracker_->~TaskTracker(); @@ -161,6 +168,10 @@ void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function fn) { platform::TracerEventType::UserDefined, 10 /*level*/); assert(queue_idx < queues_.size()); + PADDLE_ENFORCE_NOT_NULL( + queues_.at(queue_idx), + platform::errors::NotFound("Workqueue of index %d is not initialized.", + queue_idx)); if (queues_options_.at(queue_idx).track_task) { fn = [task = std::move(fn), raii = CounterGuard(tracker_)]() mutable { task(); }; @@ -170,6 +181,9 @@ void WorkQueueGroupImpl::AddTask(size_t queue_idx, std::function fn) { size_t WorkQueueGroupImpl::QueueNumThreads(size_t queue_idx) const { assert(queue_idx < queues_.size()); + if (!queues_.at(queue_idx)) { + return 0; + } return queues_.at(queue_idx)->NumThreads(); } @@ -183,10 +197,14 @@ size_t WorkQueueGroupImpl::QueueGroupNumThreads() const { void WorkQueueGroupImpl::Cancel() { for (auto queue : queues_) { - queue->Cancel(); + if (queue) { + queue->Cancel(); + } } for (auto queue : queues_) { - queue->WaitThreadsExit(); + if (queue) { + queue->WaitThreadsExit(); + } } } diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index fca4ff253d6e1c..f2474cda0a9a14 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -790,16 +790,20 @@ Attribute OpDesc::GetAttr(const std::string &name, bool with_attr_var) const { auto it = attrs_.find(name); if (it == attrs_.end()) { it = runtime_attrs_.find(name); + PADDLE_ENFORCE_NE( + it, + runtime_attrs_.end(), + platform::errors::NotFound("Attribute %s is not found.", name)); } - PADDLE_ENFORCE_NE( - it, - attrs_.end(), - platform::errors::NotFound("Attribute %s is not found.", name)); if (!with_attr_var) { PADDLE_ENFORCE_EQ( HasAttrVar(it->second), false, - 
platform::errors::NotFound("Attribute %s is not found.", name)); + platform::errors::NotFound( + "Attribute %s with constant value is not found, but found it with " + "Variable(s) type, which may not be supported in some scenarios " + "currently, such as TensorRT, etc.", + name)); } return it->second; } @@ -994,16 +998,25 @@ void OpDesc::Flush() { std::vector> sorted_attrs{attrs_.begin(), attrs_.end()}; + + std::vector> sorted_runtime_attrs{ + runtime_attrs_.begin(), runtime_attrs_.end()}; + std::sort( sorted_attrs.begin(), sorted_attrs.end(), [](std::pair a, std::pair b) { return a.first < b.first; }); + std::sort( + sorted_runtime_attrs.begin(), + sorted_runtime_attrs.end(), + [](std::pair a, + std::pair b) { return a.first < b.first; }); for (auto &attr : sorted_attrs) { set_attr_desc(attr.first, attr.second); } - for (auto &attr : runtime_attrs_) { + for (auto &attr : sorted_runtime_attrs) { set_attr_desc(attr.first, attr.second); } diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 2befc70b2d5ed5..535480602916a7 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -171,6 +171,17 @@ inline void RegisterKernelClass(const char* op_type, if (library == "MKLDNN") { data_layout = "MKLDNNLAYOUT"; } +#ifdef PADDLE_WITH_CUSTOM_DEVICE + if (std::is_same::value) { + OpKernelType key(ToDataType(std::type_index(typeid(T))), + platform::CustomPlace(library_type), + StringToDataLayout(data_layout), + LibraryType::kPlain, + customized_type_value); + OperatorWithKernel::AllOpKernels()[op_type][key] = func; + return; + } +#endif OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), StringToDataLayout(data_layout), diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index fe64f81ddf0014..613cd4989276d2 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -1890,7 +1890,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const { PADDLE_ENFORCE_NE( kernels_iter, all_op_kernels.end(), - platform::errors::Unavailable( + platform::errors::Unimplemented( "There are no kernels which are registered in the %s operator.", type_)); @@ -2382,6 +2382,17 @@ void OperatorWithKernel::ParseInputDataType( t = &var->Get(); } else if (var->IsType()) { t = &(var->Get().value()); + } else if (var->IsType()) { + const phi::SparseCooTensor* sp_t = &(var->Get()); + PADDLE_ENFORCE_EQ( + sp_t->initialized(), + true, + platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " + "contains uninitialized Tensor.", + Type(), + name)); + *data_type = paddle::framework::TransToProtoVarType(sp_t->dtype()); + return; } else if (var->IsType()) { auto t_arr = &var->Get(); for (size_t j = 0; j < t_arr->size(); j++) { @@ -2419,6 +2430,29 @@ void OperatorWithKernel::ParseMultiInputDataType( t = &var->Get(); } else if (var->IsType()) { t = &(var->Get().value()); + } else if (var->IsType()) { + const phi::SparseCooTensor* sp_t = &(var->Get()); + PADDLE_ENFORCE_EQ( + sp_t->initialized(), + true, + platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " + "contains uninitialized Tensor.", + Type(), + name)); + proto::VarType::Type tmp = + paddle::framework::TransToProtoVarType(sp_t->dtype()); + PADDLE_ENFORCE(tmp == *data_type || *data_type == default_data_type, + platform::errors::InvalidArgument( + "The DataType of %s Op's duplicable or different " + "slot Variable %s must be " + "consistent or register GetExpectedKernelType.
The " + "current variable type is (%s), but the " + "previous variable type is (%s).", + Type(), + name, + DataTypeToString(tmp), + DataTypeToString(*data_type))); + *data_type = tmp; } else if (var->IsType()) { auto t_arr = &var->Get(); for (size_t j = 0; j < t_arr->size(); j++) { @@ -2663,6 +2697,9 @@ void OperatorWithKernel::BuildPhiKernelContext( } else if (var->IsType()) { tensor_in = &(var->Get()); phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); + } else if (var->IsType()) { + tensor_in = &(var->Get()); + phi_kernel_context->EmplaceBackInputWithoutSetRange(tensor_in); } else if (var->IsType()) { need_prepare_phi_data_ = true; tensor_in = &(var->Get()); @@ -2708,6 +2745,9 @@ void OperatorWithKernel::BuildPhiKernelContext( } else if (var->template IsType()) { tensor_out = var->template GetMutable(); phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); + } else if (var->template IsType()) { + tensor_out = var->template GetMutable(); + phi_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } else if (var->template IsType()) { tensor_out = var->template GetMutable(); // Note: If the input LoDTensorArray size is 0, the output diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 43a5b7a0bb9086..7044cee80d1ab9 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -182,11 +182,17 @@ class OperatorBase { } template inline const T& Attr(const std::string& name) const { - PADDLE_ENFORCE_NE( - attrs_.find(name), - attrs_.end(), - platform::errors::NotFound("(%s) is not found in AttributeMap.", name)); - return PADDLE_GET_CONST(T, attrs_.at(name)); + auto it = attrs_.find(name); + if (it == attrs_.end()) { + it = runtime_attrs_.find(name); + PADDLE_ENFORCE_NE( + it, + runtime_attrs_.end(), + platform::errors::NotFound( + "(%s) is not found in AttributeMap and RuntimeAttributeMap.", + name)); + } + return PADDLE_GET_CONST(T, it->second); } void SetAttr(const std::string& name, const Attribute& v) { PADDLE_ENFORCE_EQ( @@ -506,6 +512,13 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { }); } + bool IsSelectedRowsInputs(const std::string& name) const override { + auto vars = ctx_.MultiInputVar(name); + return std::all_of(vars.begin(), vars.end(), [](const Variable* var) { + return var->IsType(); + }); + } + bool IsSelectedRowsInput(const std::string& name) const override { const auto* var = ctx_.InputVar(name); return var->IsType(); @@ -518,6 +531,16 @@ class ExecutionArgumentMappingContext : public phi::ArgumentMappingContext { }); } + bool IsSparseCooTensorInput(const std::string& name) const override { + const auto* var = ctx_.InputVar(name); + return var->IsType(); + } + + bool IsSparseCsrTensorInput(const std::string& name) const override { + const auto* var = ctx_.InputVar(name); + return var->IsType(); + } + bool IsDenseTensorOutput(const std::string& name) const override { auto vars = ctx_.MultiOutputVar(name); return std::all_of(vars.begin(), vars.end(), [](const Variable* var) { @@ -680,10 +703,9 @@ class OperatorWithKernel : public OperatorBase { /** * Transfer data from scope to a transferred scope. If there is no data need - * to - * be tranfered, it returns nullptr. + * to be transferred, it returns nullptr. * - * * transfered_inplace_vars is a output vector. + * transfered_inplace_vars is a output vector. 
*/ Scope* PrepareData(const Scope& scope, const OpKernelType& expected_kernel_key, diff --git a/paddle/fluid/framework/operator_kernel_configs.h b/paddle/fluid/framework/operator_kernel_configs.h index dc798868dc6c93..6a6419042f1c18 100644 --- a/paddle/fluid/framework/operator_kernel_configs.h +++ b/paddle/fluid/framework/operator_kernel_configs.h @@ -24,6 +24,16 @@ limitations under the License. */ namespace paddle { namespace framework { +template +struct SearchFuseResult { + SearchFuseResult() {} + explicit SearchFuseResult(AlgoT a) : algo(a) {} + + AlgoT algo = static_cast(0); + float time = -1.f; + size_t workspace_size = 0; +}; + // thread-safe. template class AlgorithmsCache { diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 87312cbfde2b95..88ffeb59503d35 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -25,21 +25,10 @@ PADDLE_DEFINE_EXPORTED_bool( "Delete local scope eagerly. It will reduce GPU memory usage but " "slow down the destruction of variables.(around 1% performance harm)"); -// When in inference scenario, the scopes will not be written by two threads in -// a mean time, but a scope may be read by multiple threads concurrently, and -// the mutex will cause serious performance issue. -// So the mutex is disabled when `ON_INFER`. -#ifdef PADDLE_ON_INFERENCE -#define SCOPE_KIDS_READER_LOCK -#define SCOPE_KIDS_WRITER_LOCK -#define SCOPE_VARS_READER_LOCK -#define SCOPE_VARS_WRITER_LOCK -#else #define SCOPE_KIDS_READER_LOCK phi::AutoRDLock auto_lock(&kids_lock_); #define SCOPE_KIDS_WRITER_LOCK phi::AutoWRLock auto_lock(&kids_lock_); #define SCOPE_VARS_READER_LOCK phi::AutoRDLock auto_lock(&vars_lock_); #define SCOPE_VARS_WRITER_LOCK phi::AutoWRLock auto_lock(&vars_lock_); -#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 647972e61872b6..7f08fc9b4e22c3 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -132,6 +132,11 @@ class Scope : public ScopeBase { // Rename variable to a new name and return the new name std::string Rename(const std::string& origin_name) const; + // only for dygraph_to_static + bool CanReuesd() const { return can_reused_; } + + void SetCanReuesd(bool can_reused) { can_reused_ = can_reused; } + protected: struct KeyHasher { std::size_t operator()(const std::string& key) const { @@ -169,14 +174,14 @@ class Scope : public ScopeBase { mutable std::list kids_; const Scope* parent_{nullptr}; - DISABLE_COPY_AND_ASSIGN(Scope); + // only for dygraph_to_static + bool can_reused_{false}; -#ifndef PADDLE_ON_INFERENCE + DISABLE_COPY_AND_ASSIGN(Scope); private: mutable phi::RWLock kids_lock_; mutable phi::RWLock vars_lock_; -#endif }; // Generate some debug string about the inherience structure of scope, quite diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index fcb061aa93288f..4c80f3f7ab404e 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -17,6 +17,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" +#include "paddle/phi/core/sparse_csr_tensor.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h index f70f3e3fb4aea4..0780976b2c6f0f 100644 --- a/paddle/fluid/framework/tensor_util.h +++ b/paddle/fluid/framework/tensor_util.h @@ -22,7 +22,6 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/dlpack_tensor.h" -#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/string_array.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/allocation/allocator_facade.h" diff --git a/paddle/fluid/framework/var_desc.cc b/paddle/fluid/framework/var_desc.cc index 6f9574b3f2c46c..d4a53c4135a08b 100644 --- a/paddle/fluid/framework/var_desc.cc +++ b/paddle/fluid/framework/var_desc.cc @@ -237,6 +237,8 @@ const proto::VarType::TensorDesc &VarDesc::tensor_desc() const { return desc_.type().strings(); case proto::VarType::VOCAB: return desc_.type().vocab(); + case proto::VarType::SPARSE_COO: + return desc_.type().sparse_coo(); default: PADDLE_THROW(platform::errors::Unavailable( "Getting 'tensor_desc' is not supported by the %s type variable.", @@ -284,6 +286,8 @@ proto::VarType::TensorDesc *VarDesc::mutable_tensor_desc() { return desc_.mutable_type()->mutable_strings(); case proto::VarType::VOCAB: return desc_.mutable_type()->mutable_vocab(); + case proto::VarType::SPARSE_COO: + return desc_.mutable_type()->mutable_sparse_coo(); default: PADDLE_THROW( platform::errors::Unavailable("Getting 'mutable_tensor_desc' is not " diff --git a/paddle/fluid/framework/var_type.h b/paddle/fluid/framework/var_type.h index d0d26d59923366..bab027868c42f7 100644 --- a/paddle/fluid/framework/var_type.h +++ b/paddle/fluid/framework/var_type.h @@ -33,6 +33,7 @@ inline proto::VarType::Type ToVarType(int type) { switch (type) { case proto::VarType::LOD_TENSOR: case proto::VarType::SELECTED_ROWS: + case proto::VarType::SPARSE_COO: case proto::VarType::LOD_RANK_TABLE: case proto::VarType::LOD_TENSOR_ARRAY: case proto::VarType::FETCH_LIST: @@ -59,6 +60,9 @@ inline void VisitVarType(const framework::Variable& var, Visitor visitor) { case proto::VarType::SELECTED_ROWS: visitor(var.Get()); return; + case proto::VarType::SPARSE_COO: + visitor(var.Get()); + return; case proto::VarType::READER: visitor(var.Get()); return; diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index ea7ebce2dae6be..3d78638312ae74 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -54,6 +54,8 @@ namespace phi { class DenseTensor; class SelectedRows; +class SparseCooTensor; +class SparseCsrTensor; } // namespace phi // Users should add forward declarations here @@ -180,6 +182,8 @@ struct VarTypeRegistryImpl { using VarTypeRegistry = detail::VarTypeRegistryImpl< Tensor, phi::SelectedRows, + phi::SparseCooTensor, + phi::SparseCsrTensor, std::vector, LoDRankTable, Strings, @@ -252,6 +256,7 @@ REG_PROTO_VAR_TYPE_TRAIT(float, proto::VarType::FP32); REG_PROTO_VAR_TYPE_TRAIT(Vocab, proto::VarType::VOCAB); REG_PROTO_VAR_TYPE_TRAIT(String, proto::VarType::STRING); REG_PROTO_VAR_TYPE_TRAIT(Strings, proto::VarType::STRINGS); +REG_PROTO_VAR_TYPE_TRAIT(phi::SparseCooTensor, proto::VarType::SPARSE_COO); /** End 
of variable type registration */ diff --git a/paddle/fluid/framework/variable_helper.cc b/paddle/fluid/framework/variable_helper.cc index 471efc02078357..90dac6191bd989 100644 --- a/paddle/fluid/framework/variable_helper.cc +++ b/paddle/fluid/framework/variable_helper.cc @@ -52,6 +52,8 @@ void InitializeVariable(Variable *var, proto::VarType::Type var_type) { var->GetMutable(); } else if (var_type == proto::VarType::RAW) { // GetMutable will be called in operator + } else if (var_type == proto::VarType::SPARSE_COO) { + var->GetMutable(); } else { PADDLE_THROW(platform::errors::Unavailable( "Variable type %d is not in " diff --git a/paddle/fluid/imperative/infer_shape_context.h b/paddle/fluid/imperative/infer_shape_context.h index b7345cf397356f..5702bcfca73296 100644 --- a/paddle/fluid/imperative/infer_shape_context.h +++ b/paddle/fluid/imperative/infer_shape_context.h @@ -169,6 +169,7 @@ class DygraphInferShapeContext : public framework::InferShapeContext { return vec_res; } + std::string GetInputNameByIdx(size_t idx) const override { auto& op_proto = paddle::framework::OpInfoMap::Instance().Get(op_type_).proto_; diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index e1dd78cb9f2288..c7c6eb104f5e6a 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -517,7 +517,7 @@ static void OpBaseRunImpl(const framework::OperatorBase& op, * `transfer_scope` is created before PrepareData, the data after * transform is stored in the temporary scope, and then discarded * after the execution of op, but the original input is directly - * overwritten in the previous dynamic graph implemention. + * overwritten in the previous dynamic graph implementation. */ auto prepared_op = PreparedOp::Prepare(ins, outs, *op_kernel, place, attrs, default_attrs); diff --git a/paddle/fluid/imperative/layout_autotune.cc b/paddle/fluid/imperative/layout_autotune.cc index d5a9ba6901087b..1a5a0d9c5d8485 100644 --- a/paddle/fluid/imperative/layout_autotune.cc +++ b/paddle/fluid/imperative/layout_autotune.cc @@ -14,23 +14,15 @@ #include "paddle/fluid/imperative/layout_autotune.h" +#include "paddle/fluid/eager/api/utils/global_utils.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/imperative/layout_transformer.h" #include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" - namespace paddle { namespace imperative { -bool LayoutAutoTune::UseLayoutAutoTune() const { -#if defined(PADDLE_WITH_CUDA) - return use_layout_autotune_; -#else - return false; -#endif -} - LayoutAutoTune::LayoutAutoTune() { const auto& op_info = paddle::framework::OpInfoMap::Instance().map(); for (auto it = op_info.begin(); it != op_info.end(); it++) { @@ -140,6 +132,26 @@ paddle::imperative::NameVarMap DealLightlyLayoutSensitive( return transposer->Apply(ins, outs, attrs, tracer); } +LayoutAutotuneGuard::LayoutAutotuneGuard(std::shared_ptr tracer, + bool use_autotune) + : tracer_(tracer) { + pre_layout_autotune_ = tracer_->UseLayoutAutoTune(); + if (pre_layout_autotune_ != use_autotune) { + tracer_->EnableLayoutAutoTune(); + if (!use_autotune) { + tracer_->DisableLayoutAutoTune(); + } + } +} + +LayoutAutotuneGuard::~LayoutAutotuneGuard() { + if (pre_layout_autotune_) { + tracer_->EnableLayoutAutoTune(); + } else { + tracer_->DisableLayoutAutoTune(); + } +} + template paddle::imperative::NameVarMap AutoTuneLayout( const std::string& op_type, @@ -147,7 +159,7 @@ paddle::imperative::NameVarMap AutoTuneLayout( const 
paddle::imperative::NameVarMap& outs, paddle::framework::AttributeMap* attrs, const std::shared_ptr& tracer) { - if (!LayoutAutoTune::Instance().UseLayoutAutoTune()) { + if (!tracer->UseLayoutAutoTune()) { return ins; } // When layout autotuning is enabled, the tuner will check the desired layout. @@ -165,7 +177,7 @@ paddle::imperative::NameVarMap AutoTuneLayout( } else { #if defined(PADDLE_WITH_CUDA) if (!phi::backends::gpu::TensorCoreAvailable()) { - LayoutAutoTune::Instance().DisableLayoutAutoTune(); + tracer->DisableLayoutAutoTune(); return ins; } #endif @@ -182,10 +194,12 @@ paddle::imperative::NameVarMap AutoTuneLayout( (conv_in_type == framework::proto::VarType::FP16); if (is_tune_fp32) { LayoutAutoTune::Instance().SetDesiredLayout(DataLayout::NCHW); + LayoutAutoTune::Instance().SetDefaultLayout(DataLayout::NHWC); } else if (is_tune_fp16) { LayoutAutoTune::Instance().SetDesiredLayout(DataLayout::NHWC); + LayoutAutoTune::Instance().SetDefaultLayout(DataLayout::NCHW); } else { - LayoutAutoTune::Instance().DisableLayoutAutoTune(); + tracer->DisableLayoutAutoTune(); return ins; } VLOG(3) << "Tune the layout from " diff --git a/paddle/fluid/imperative/layout_autotune.h b/paddle/fluid/imperative/layout_autotune.h index af7a89123efe8f..0e4c5f1d4e19c9 100644 --- a/paddle/fluid/imperative/layout_autotune.h +++ b/paddle/fluid/imperative/layout_autotune.h @@ -19,8 +19,8 @@ #include #include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/imperative/tracer.h" #include "paddle/phi/common/layout.h" - namespace paddle { namespace imperative { @@ -35,12 +35,6 @@ class LayoutAutoTune { return layout_autoTune; } - bool UseLayoutAutoTune() const; - - void EnableLayoutAutoTune() { use_layout_autotune_ = true; } - - void DisableLayoutAutoTune() { use_layout_autotune_ = false; } - bool IsHeavilyLayoutSensitive(const std::string& op_type) const { return heavily_layout_sensitive_ops_.count(op_type) != 0; } @@ -64,8 +58,6 @@ class LayoutAutoTune { private: LayoutAutoTune(); - bool use_layout_autotune_{false}; - std::unordered_set layout_agnostic_ops_{}; std::unordered_set heavily_layout_sensitive_ops_{"batch_norm"}; @@ -73,11 +65,29 @@ class LayoutAutoTune { std::unordered_set lightly_layout_sensitive_ops_{ "instance_norm", "softmax", "transpose", "transpose2", "reshape2"}; + // Best Layout in this platform DataLayout desired_layout_{DataLayout::UNDEFINED}; + // Default Layout in this model DataLayout default_layout_{DataLayout::UNDEFINED}; }; +// LayoutAutotuneGuard is used for RAII. 
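+// Illustrative usage (hypothetical call site):
+//   {
+//     LayoutAutotuneGuard guard(tracer, /*use_autotune=*/false);
+//     // ops traced in this scope run with layout autotune disabled
+//   }  // the tracer's previous autotune state is restored here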
+class LayoutAutotuneGuard { + public: + LayoutAutotuneGuard(std::shared_ptr tracer, bool use_autotune); + + ~LayoutAutotuneGuard(); + + // forbid copy and operator= + LayoutAutotuneGuard(const LayoutAutotuneGuard& guard) = delete; + LayoutAutotuneGuard& operator=(const LayoutAutotuneGuard& guard) = delete; + + private: + std::shared_ptr tracer_; + bool pre_layout_autotune_; +}; + template paddle::imperative::NameVarMap AutoTuneLayout( const std::string& op_type, diff --git a/paddle/fluid/imperative/layout_transformer.h b/paddle/fluid/imperative/layout_transformer.h index 3e857c4ec26f25..555ec4170c8a8a 100644 --- a/paddle/fluid/imperative/layout_transformer.h +++ b/paddle/fluid/imperative/layout_transformer.h @@ -19,8 +19,24 @@ #include "paddle/fluid/imperative/var_helper.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/errors.h" +#include "paddle/phi/core/tensor_utils.h" namespace paddle { namespace imperative { +template +void SetOutDataLayout(std::shared_ptr var, + const paddle::experimental::DataLayout layout) { + if (var != nullptr && var->Var().IsInitialized()) { + paddle::imperative::SetDataLayout(var, layout); + // set out_tensor's layout + if (var->MutableVar()->IsInitialized()) { + paddle::framework::Variable* tmp_var = var->MutableVar(); + auto* out = tmp_var->GetMutable(); + phi::DenseTensorUtils::GetMutableMeta( + static_cast(out)) + ->layout = layout; + } + } +} template std::shared_ptr TraceTransposeOp( @@ -118,7 +134,7 @@ class LayoutTransformer { auto out_vars = outs.at(name); for (auto& var : out_vars) { if (var != nullptr) { - paddle::imperative::SetDataLayout(var, layout); + paddle::imperative::SetOutDataLayout(var, layout); } } not_in_out = false; @@ -130,7 +146,7 @@ class LayoutTransformer { for (auto& pair : outs) { for (auto& var : pair.second) { if (var != nullptr) { - paddle::imperative::SetDataLayout(var, layout); + paddle::imperative::SetOutDataLayout(var, layout); } } } diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc index 62bbf77a2df1d7..345f3af0a6d80a 100644 --- a/paddle/fluid/imperative/prepared_operator.cc +++ b/paddle/fluid/imperative/prepared_operator.cc @@ -146,6 +146,48 @@ PreparedOp::PreparedOp(const framework::OperatorBase& op, kernel_signature_(std::move(kernel_signature)), phi_kernel_(phi_kernel) {} +#ifdef PADDLE_WITH_MLU + +static void tokenize(const std::string& ops, + char delim, + std::unordered_set* op_set) { + std::string::size_type beg = 0; + for (uint64_t end = 0; (end = ops.find(delim, end)) != std::string::npos; + ++end) { + op_set->insert(ops.substr(beg, end - beg)); + beg = end + 1; + } + + op_set->insert(ops.substr(beg)); +} + +static bool is_in_mlu_black_list(const std::string& op_name) { + static bool inited = false; + static std::unordered_set mlu_black_list; + static std::mutex s_mtx; + if (!inited) { + std::lock_guard guard(s_mtx); + if (!inited) { + if (std::getenv("MLU_BLACK_LIST") != nullptr) { + std::string ops(std::getenv("MLU_BLACK_LIST")); + tokenize(ops, ',', &mlu_black_list); + } + inited = true; + VLOG(3) << "MLU Black List: "; + for (auto iter = mlu_black_list.begin(); iter != mlu_black_list.end(); + ++iter) { + VLOG(3) << *iter << " "; + } + } + } + if (mlu_black_list.find(op_name) != mlu_black_list.end()) { + return true; + } + return false; +} + +#endif + template PreparedOp PrepareImpl( const NameVarMap& ins, @@ -194,6 +236,12 @@ PreparedOp PrepareImpl( #endif +#ifdef PADDLE_WITH_MLU + if (is_in_mlu_black_list(op.Type())) { + 
expected_kernel_key.place_ = platform::CPUPlace(); + } +#endif + bool has_phi_kernel = false; const auto* arg_map_fn = phi_op_utils_map.GetArgumentMappingFn(op.Type()); diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 35eb3e9384200d..400c0021d6d7e7 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -42,6 +42,8 @@ thread_local bool Tracer::enable_program_desc_tracing_ = false; thread_local bool Tracer::has_grad_ = true; +thread_local bool Tracer::use_layout_autotune_ = false; + thread_local AmpLevel Tracer::amp_level_ = AmpLevel::O0; thread_local phi::DataType Tracer::amp_dtype_ = phi::DataType::FLOAT32; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 686c367335326d..9a93d299c002a5 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -28,9 +28,9 @@ #include "paddle/fluid/imperative/basic_engine.h" #include "paddle/fluid/imperative/jit/program_desc_tracer.h" #include "paddle/fluid/imperative/layer.h" +#include "paddle/fluid/imperative/layout_autotune.h" #include "paddle/fluid/platform/macros.h" #include "paddle/phi/core/compat/arg_map_context.h" - namespace paddle { namespace imperative { @@ -139,7 +139,7 @@ class Tracer { // intermediate var both in imperative and static mode. But the // `UniqueNameGenerator` in C++ and `unique_name.py` in Python doesn't share // the same auto-increment id. It will create a variable repeatedly with same - // name like `tmp_0` in some cases when transform dygraph into static layers. + // name like `tmp_0` in some cases when transform dygraph into static layers. // So we modify the default prefix key into `eager_tmp` to distinguish with // static graph. std::string GenerateUniqueName(std::string key = "dygraph_tmp") { @@ -184,6 +184,20 @@ class Tracer { } } + void DisableLayoutAutoTune() { use_layout_autotune_ = false; } + + void EnableLayoutAutoTune() { use_layout_autotune_ = true; } + + bool UseLayoutAutoTune() { +#if defined(PADDLE_WITH_CUDA) + if (phi::backends::gpu::TensorCoreAvailable()) { + return use_layout_autotune_; + } +#endif + use_layout_autotune_ = false; + return false; + } + phi::KernelSignature GetExpectedKernelSignature( const std::string& type, const NameTensorMap& ins, @@ -199,8 +213,8 @@ class Tracer { std::unique_ptr generator_; platform::Place expected_place_; GarbageCollectorMap gcs_; - static thread_local bool enable_program_desc_tracing_; + static thread_local bool use_layout_autotune_; static thread_local bool has_grad_; static thread_local AmpLevel amp_level_; static thread_local phi::DataType amp_dtype_; diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 7f2daa942b057d..9e130170a3abfe 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -58,6 +58,14 @@ set(STATIC_INFERENCE_API phi ${mkldnn_quantizer_cfg}) +set(OP_LIST + "" + CACHE STRING "The list of operators that will be compiled") + +set(KERNEL_LIST + "" + CACHE STRING "The list of phi kernels that will be compiled") + #windows GPU static library over the limit, so not create_static_lib, and cc_library is dummy if(WIN32 AND WITH_GPU) cc_library(paddle_inference DEPS ${fluid_modules} phi ${STATIC_INFERENCE_API} diff --git a/paddle/fluid/inference/analysis/analyzer.cc b/paddle/fluid/inference/analysis/analyzer.cc index 9615100f32ad39..4aadb34d7b354e 100644 --- a/paddle/fluid/inference/analysis/analyzer.cc +++ 
b/paddle/fluid/inference/analysis/analyzer.cc @@ -38,8 +38,7 @@ void Analyzer::RunAnalysis(Argument *argument) { if (!disable_logs) { string::PrettyLogH1("--- Running analysis [%s]", pass); } - if (!argument->enable_analysis_optim() && pass == "ir_analysis_pass") - continue; + if (!argument->enable_ir_optim() && pass == "ir_analysis_pass") continue; auto *ptr = PassRegistry::Global().Retreive(pass); PADDLE_ENFORCE_NOT_NULL(ptr, diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 1df8d06dd89cac..3f5be92f5a3e6f 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -31,7 +31,7 @@ TEST(Analyzer, analysis_without_tensorrt) { Argument argument; argument.SetDisableLogs(false); argument.SetModelDir(FLAGS_inference_model_dir); - argument.SetEnableAnalysisOptim(false); + argument.SetEnableIrOptim(false); argument.SetUseGPU(false); argument.SetAnalysisPasses({"ir_graph_build_pass", "ir_analysis_pass", @@ -44,7 +44,7 @@ TEST(Analyzer, analysis_without_tensorrt) { TEST(Analyzer, analysis_with_tensorrt) { Argument argument; argument.SetDisableLogs(false); - argument.SetEnableAnalysisOptim(false); + argument.SetEnableIrOptim(false); argument.SetTensorRtMaxBatchSize(3); argument.SetTensorRtWorkspaceSize(1 << 20); argument.SetModelDir(FLAGS_inference_model_dir); diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h old mode 100755 new mode 100644 index b0ed905bfc69f4..2a4ce0d6492b06 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -42,8 +42,6 @@ namespace paddle { namespace inference { namespace analysis { -using framework::ir::Graph; - #ifdef PADDLE_WITH_MKLDNN using VarQuantScale = std::unordered_map>; @@ -148,7 +146,10 @@ struct Argument { DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); DECL_ARGUMENT_FIELD(optim_cache_dir, OptimCacheDir, std::string); - DECL_ARGUMENT_FIELD(enable_analysis_optim, EnableAnalysisOptim, bool); + DECL_ARGUMENT_FIELD(enable_ir_optim, EnableIrOptim, bool); + + // For JITLayer + DECL_ARGUMENT_FIELD(skip_load_params, SkipLoadParams, bool); // The overall graph to work on. 
DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); @@ -343,6 +344,12 @@ struct Argument { IpuAvailableMemoryProportion, float); DECL_ARGUMENT_FIELD(ipu_enable_half_partial, IpuEnableHalfPartial, bool); + DECL_ARGUMENT_FIELD(ipu_custom_ops_info, + IpuCustomOpsInfo, + std::vector>); + DECL_ARGUMENT_FIELD(ipu_custom_patterns, + IpuCustomPatterns, + std::vector>); // npu related DECL_ARGUMENT_FIELD(use_npu, UseNpu, bool); @@ -353,6 +360,8 @@ struct Argument { DECL_ARGUMENT_FIELD(mixed_black_list, MixedBlackList, std::unordered_set); + DECL_ARGUMENT_FIELD(enable_gpu_mixed, EnableGPUMixed, bool); + DECL_ARGUMENT_FIELD(mixed_precision_mode, MixedPrecisionMode, int); private: std::unordered_set valid_fields_; diff --git a/paddle/fluid/inference/analysis/helper.h b/paddle/fluid/inference/analysis/helper.h index e8d719ddb659dc..e891da8e6d19fc 100644 --- a/paddle/fluid/inference/analysis/helper.h +++ b/paddle/fluid/inference/analysis/helper.h @@ -153,25 +153,6 @@ T &GetFromScope(const framework::Scope &scope, const std::string &name) { return *var->GetMutable(); } -static framework::proto::ProgramDesc LoadProgramDesc( - const std::string &model_path) { - std::ifstream fin(model_path, std::ios::in | std::ios::binary); - PADDLE_ENFORCE_EQ( - fin.is_open(), - true, - platform::errors::NotFound( - "Cannot open file %s, please confirm whether the file exists", - model_path)); - fin.seekg(0, std::ios::end); - std::string buffer(fin.tellg(), ' '); - fin.seekg(0, std::ios::beg); - fin.read(&buffer[0], buffer.size()); - fin.close(); - framework::proto::ProgramDesc program_desc; - program_desc.ParseFromString(buffer); - return program_desc; -} - static bool FileExists(const std::string &filepath) { std::ifstream file(filepath); bool exists = file.is_open(); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index f86a22e3db9e1d..f994667df80bbc 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -27,6 +27,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/inference/analysis/argument.h" #include "paddle/fluid/string/pretty_log.h" +#include "paddle/phi/core/errors.h" namespace paddle { namespace inference { @@ -36,15 +37,6 @@ using string::PrettyLogEndl; using string::Style; IRPassManager::IRPassManager(Argument *argument) { - ARGUMENT_CHECK_FIELD(argument, main_program); - graph_ = std::unique_ptr(new Graph(argument->main_program())); - if (argument->Has("scope")) { - auto *scope_ptr = argument->scope_ptr(); - PADDLE_ENFORCE_NOT_NULL(scope_ptr, - platform::errors::PreconditionNotMet( - "The scope ptr should not be nullptr.")); - graph_->SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr); - } disable_logs_ = argument->disable_logs(); ARGUMENT_CHECK_FIELD(argument, ir_analysis_passes); @@ -77,6 +69,15 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("optim_input_shape", new std::map>( argument->optim_input_shape())); + // Now, shape tensor value is not explicit set by user, + // it is collected through API CollectShapeRangeInfo. 
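+  // For example (illustrative values only), a collected entry may look like
+  // {"reshape2_0.shape_tensor": {1, 3, 224, 224}}, mapping a shape tensor's
+  // name to the element values observed at runtime.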
+ pass->Set("max_shape_tensor", + new std::map>()); + pass->Set("min_shape_tensor", + new std::map>()); + pass->Set("optim_shape_tensor", + new std::map>()); + // tuned trt dynamic_shape pass->Set("trt_tuned_dynamic_shape", new bool(argument->tensorrt_tuned_dynamic_shape())); @@ -86,10 +87,14 @@ void IRPassManager::CreatePasses(Argument *argument, argument->tensorrt_tuned_dynamic_shape(); pass->Set("with_dynamic_shape", new bool(with_dynamic_shape)); + // mixed precision related pass->Set("model_precision", new int(argument->model_precision())); pass->Set( "mixed_black_list", new std::unordered_set(argument->mixed_black_list())); + pass->Set("enable_gpu_mixed", new bool(argument->enable_gpu_mixed())); + pass->Set("mixed_precision_mode", + new int(argument->mixed_precision_mode())); if (pass_name == "graph_viz_pass") { std::string optim_cache_dir = argument->optim_cache_dir(); @@ -293,42 +298,18 @@ void IRPassManager::CreatePasses(Argument *argument, } std::unique_ptr IRPassManager::Apply(std::unique_ptr graph) { - if (passes_.empty()) { - return graph; - } PADDLE_ENFORCE_NOT_NULL( - graph.get(), - platform::errors::PreconditionNotMet("Graph cannot be NULL.")); + graph.get(), platform::errors::InvalidArgument("Graph cannot be null.")); // Apply all the passes for (const auto &pass : passes_) { if (pass->Type() != "graph_viz_pass" && !disable_logs_) { PrettyLogEndl(Style::H2(), "--- Running IR pass [%s]", pass->Type()); } - // delete_fill_constant_op_pass is not apply under trt dynamic shape - if (pass->Type() == "delete_fill_constant_op_pass") { - bool use_dynamic = pass->Get("with_dynamic_shape"); - if (use_dynamic) continue; - } graph.reset(pass->Apply(graph.release())); } return graph; } -framework::proto::ProgramDesc IRPassManager::AcquireProgram( - std::unique_ptr *graph, ProgramDesc *program) const { - auto pass = - framework::ir::PassRegistry::Instance().Get("graph_to_program_pass"); - - // Direct using ProgramDesc desc(argument->main_program()) may cause - // incomplete copies of information. 
- ProgramDesc desc; - desc.CopyFrom(*program->Proto()); - pass->SetNotOwned("program", &desc); - auto *the_graph = graph->release(); - graph->reset(pass->Apply(the_graph)); - return *desc.Proto(); -} - } // namespace analysis } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.h b/paddle/fluid/inference/analysis/ir_pass_manager.h index 9f9a5fc347123b..c56d3d40f54def 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.h +++ b/paddle/fluid/inference/analysis/ir_pass_manager.h @@ -48,15 +48,9 @@ class IRPassManager final { std::unique_ptr Apply(std::unique_ptr graph); - framework::proto::ProgramDesc AcquireProgram(std::unique_ptr *graph, - ProgramDesc *program) const; - - framework::ir::Graph &graph() const { return *graph_; } - private: void CreatePasses(Argument *argument, const std::vector &passes); - std::unique_ptr graph_; std::vector> passes_; bool disable_logs_{false}; }; diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index cd79b3fcde0ef8..5021336df490db 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -32,6 +32,7 @@ #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" #include "paddle/fluid/inference/tensorrt/op_teller.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" #include "paddle/fluid/inference/utils/io_utils.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" @@ -93,14 +94,14 @@ void OutputProcess(framework::ir::Graph *graph, backend, precision, blacklist)) { - AddCastOp(graph, - var_node, - next_op, - framework::proto::VarType::FP32, - to_type, - &suffix, - block_desc, - &var_to_cast_op_map); + InsertCastOp(graph, + var_node, + next_op, + framework::proto::VarType::FP32, + to_type, + block_desc, + &suffix, + &var_to_cast_op_map); var_node->Var()->SetDataType(framework::proto::VarType::FP32); } } @@ -117,6 +118,11 @@ void analysis::TensorRtSubgraphPass::ApplyImpl( framework::ir::Graph *graph) const { framework::ir::FusePassBase::Init("tensorrt_subgraph_pass", graph); + static std::once_flag trt_plugin_registered; + std::call_once(trt_plugin_registered, []() { + tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); + }); + auto model_precision = static_cast(Get("model_precision")); if (model_precision == phi::DataType::BFLOAT16) { @@ -313,6 +319,13 @@ void TensorRtSubgraphPass::CreateTensorRTOp( auto opt_input_shape = Get>>("optim_input_shape"); + auto min_shape_tensor = + Get>>("min_shape_tensor"); + auto max_shape_tensor = + Get>>("max_shape_tensor"); + auto opt_shape_tensor = + Get>>("optim_shape_tensor"); + auto allow_build_at_runtime = Get("trt_allow_build_at_runtime"); auto shape_range_info_path = Get("trt_shape_range_info_path"); auto trt_tuned_dynamic_shape = Get("trt_tuned_dynamic_shape"); @@ -322,7 +335,10 @@ void TensorRtSubgraphPass::CreateTensorRTOp( inference::DeserializeShapeRangeInfo(shape_range_info_path, &min_input_shape, &max_input_shape, - &opt_input_shape); + &opt_input_shape, + &min_shape_tensor, + &max_shape_tensor, + &opt_shape_tensor); } // The following procedure is used to rename all the intermediate @@ -507,6 +523,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp( min_input_shape, max_input_shape, opt_input_shape, + min_shape_tensor, + max_shape_tensor, + opt_shape_tensor, 
disable_trt_plugin_fp16, static_cast(Get("model_precision"))); trt_engine->SetUseOSS(Get("use_varseqlen")); diff --git a/paddle/fluid/inference/analysis/passes/CMakeLists.txt b/paddle/fluid/inference/analysis/passes/CMakeLists.txt index 126e2500c48900..96121601cb6fdb 100644 --- a/paddle/fluid/inference/analysis/passes/CMakeLists.txt +++ b/paddle/fluid/inference/analysis/passes/CMakeLists.txt @@ -13,7 +13,7 @@ cc_library( cc_library( convert_to_mixed_precision SRCS convert_to_mixed_precision.cc - DEPS analysis_pass ir_graph_build_pass) + DEPS analysis_pass ir_graph_build_pass auto_mixed_precision_pass) cc_library( ir_params_sync_among_devices_pass SRCS ir_params_sync_among_devices_pass.cc @@ -30,17 +30,6 @@ cc_library( inference_op_replace_pass SRCS inference_op_replace_pass.cc DEPS analysis_pass graph_to_program_pass) -if(WITH_TESTING) - cc_library( - ir_graph_clean_pass - SRCS ir_graph_clean_pass.cc - DEPS analysis_pass gtest) -else() - cc_library( - ir_graph_clean_pass - SRCS ir_graph_clean_pass.cc - DEPS analysis_pass) -endif() cc_library( analysis_passes @@ -52,8 +41,7 @@ cc_library( memory_optim_pass convert_to_mixed_precision inference_op_replace_pass - ir_graph_to_program_pass - ir_graph_clean_pass) + ir_graph_to_program_pass) set(analysis_deps ${analysis_deps} analysis_passes subgraph_detector diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index 87750d713c6d45..f1939fc8b328b8 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -14,897 +14,149 @@ #include "paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h" -#include -#include -#include -#include -#include -#include - -#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/framework.pb.h" -#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/auto_mixed_precision_pass.h" #include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/node.h" -#include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/inference/io.h" -#include "paddle/phi/common/data_type.h" -#include "paddle/phi/common/layout.h" -#include "paddle/phi/core/tensor_meta.h" - -using namespace paddle::framework; // NOLINT +#include "paddle/phi/common/backend.h" namespace paddle { namespace inference { namespace analysis { -namespace { - -inline std::string SerializeParams(framework::Scope* scope, - const std::vector& params) { - std::ostringstream os; - phi::CPUContext ctx; - for (const auto& param : params) { - VLOG(3) << "Serialize param: " << param; - PADDLE_ENFORCE_NOT_NULL( - scope->FindVar(param), - platform::errors::NotFound("Block should already have a '%s' variable", - param)); - auto* tensor = scope->FindVar(param)->GetMutable(); - framework::SerializeToStream(os, *tensor, ctx); - } - return os.str(); -} - -inline void StrToBinary(const std::string& path, const std::string& str) { - std::ofstream file(path.c_str(), std::ios::binary); - file.write(str.c_str(), str.size()); - file.close(); -} - -inline bool NodeVarHasDtype(framework::ir::Node* node) { - if (node->IsCtrlVar()) return false; - - if (node->IsVar() && - (node->Var()->GetType() == - 
paddle::framework::proto::VarType::SELECTED_ROWS || - node->Var()->GetType() == - paddle::framework::proto::VarType::LOD_TENSOR || - node->Var()->GetType() == - paddle::framework::proto::VarType::LOD_TENSOR_ARRAY || - node->Var()->GetType() == paddle::framework::proto::VarType::STRINGS || - node->Var()->GetType() == paddle::framework::proto::VarType::VOCAB)) { - return true; - } - - return false; -} - -// Return Node* which first appers in block. -framework::ir::Node* GetRealNode( - const std::vector& graphes, - int block_idx, - framework::ir::Node* node, - std::unordered_map>* - vars_in_multi_block_map) { - if (vars_in_multi_block_map->count(node->Name())) { - int var_origin_block_id = vars_in_multi_block_map->at(node->Name()).second; - if (block_idx != var_origin_block_id) { - auto graph = graphes[var_origin_block_id]; - for (auto nd : graph->Nodes()) { - if (nd->Name() == node->Name()) { - return nd; - } - } - } - } - - return node; -} - -inline bool VarIsMultiOpsOut( - const std::vector& graphes, - int block_idx, - framework::ir::Node* op_node, - std::unordered_map>* - vars_in_multi_block_map, - const std::vector>& vars_appear_multi_in_one_block) { - CHECK_EQ(op_node->IsOp(), true); - for (auto* out : op_node->outputs) { - if (out->IsCtrlVar()) continue; - auto* real_node = - GetRealNode(graphes, block_idx, out, vars_in_multi_block_map); - if (!real_node->Var()->Persistable() && - vars_appear_multi_in_one_block[block_idx].count(out->Name())) { - VLOG(2) << out->Name() - << " is multi op's out, so we skip convert to fp16"; - return true; - } - } - return false; -} - -void SaveMixedModel( - framework::ir::Graph* graph, - framework::Scope* scope, - framework::ProgramDesc* mixed_program_desc, +ConvertToMixedPrecisionPass::ConvertToMixedPrecisionPass( + const std::string& model_file, + const std::string& params_file, const std::string& mixed_model_file, const std::string& mixed_params_file, phi::DataType mixed_precision, - const std::unordered_map>& - vars_in_multi_block_map) { - paddle::CPUPlace place; - auto parameters = scope->LocalVarNames(); - std::sort(parameters.begin(), parameters.end()); - - std::unordered_set weights_should_be_fp32; - for (auto* node : graph->Nodes()) { - if (!(node->IsVar() && !node->IsCtrlVar())) continue; - if (NodeVarHasDtype(node)) { - if (node->Var()->Persistable() && - node->Var()->GetDataType() == - paddle::framework::proto::VarType::FP32) { - VLOG(2) << "weights keep to fp32: " << node->Name(); - weights_should_be_fp32.insert(node->Name()); - } - } - } - - for (const auto& param_name : parameters) { - auto* var = scope->FindLocalVar(param_name); - if (var->IsType() || - var->IsType()) { - auto* t = var->GetMutable(); - if (t->dtype() != phi::DataType::FLOAT32) continue; - - framework::Tensor mixed_tensor; - mixed_tensor.Resize(t->dims()); - auto* data = t->mutable_data(platform::CPUPlace()); - - if (mixed_precision == phi::DataType::FLOAT16 && - !weights_should_be_fp32.count(param_name)) { - mixed_tensor.set_type(paddle::experimental::DataType::FLOAT16); - auto* mixed_data = - mixed_tensor.mutable_data(platform::CPUPlace()); - for (int i = 0; i < t->numel(); i++) { - mixed_data[i] = static_cast(data[i]); - } - t->clear(); - paddle::framework::TensorCopySync(mixed_tensor, place, t); - } else if (mixed_precision == phi::DataType::BFLOAT16 && - !weights_should_be_fp32.count(param_name)) { - mixed_tensor.set_type(paddle::experimental::DataType::BFLOAT16); - auto* mixed_data = - mixed_tensor.mutable_data(platform::CPUPlace()); - for (int i = 0; i < t->numel(); 
i++) { - mixed_data[i] = static_cast(data[i]); - } - t->clear(); - paddle::framework::TensorCopySync(mixed_tensor, place, t); - } - } - } - - StrToBinary(mixed_model_file, - mixed_program_desc->Proto()->SerializeAsString()); - StrToBinary(mixed_params_file, SerializeParams(scope, parameters)); -} - -bool PhiKernelSupportPrecision( - const std::string& op_type, phi::Backend backend, - phi::DataType data_type, - phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) { - auto kernels = phi::KernelFactory::Instance().kernels(); - if (kernels.find(op_type) == kernels.end()) { - return false; - } - phi::KernelKey kernel_key(backend, layout, data_type); - return phi::KernelFactory::Instance().HasKernel(op_type, kernel_key); -} - -bool GpuKernelSupportPrecision( - const std::string& op_type, - phi::DataType data_type, - phi::DataLayout layout = phi::DataLayout::ALL_LAYOUT) { - auto phi_op_type = phi::TransToPhiKernelName(op_type); - bool res = PhiKernelSupportPrecision( - phi_op_type, phi::Backend::GPU, data_type, layout); - res |= PhiKernelSupportPrecision( - phi_op_type, phi::Backend::GPUDNN, data_type, layout); - - if (!res) { - auto& all_kernels = OperatorWithKernel::AllOpKernels(); - auto it = all_kernels.find(op_type); - if (it != all_kernels.end()) { - for (auto& kern_pair : it->second) { - if (platform::is_gpu_place(kern_pair.first.place_) && - kern_pair.first.data_type_ == framework::proto::VarType::FP16) { - res = true; - } - } - } - } - return res; -} - -// Just process special cases. -bool OutShouldNotConvert(ir::Node* var_node) { - auto op_node = var_node->inputs[0]; - auto* op_desc = op_node->Op(); - - // batch_norm's input and output (variance and mean) are the same. - if (op_desc->Type() == "batch_norm") { - auto vecs = op_desc->Output("MeanOut"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Output("VarianceOut"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Output("SavedMean"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Output("SavedVariance"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - } - - return false; -} -void ProcessOutputNode( - const std::vector& graphes, - int block_idx, - ir::Node* var_node, - framework::proto::VarType::Type to_type, - std::unordered_map>* - vars_in_multi_block_map) { - auto* real_node = - GetRealNode(graphes, block_idx, var_node, vars_in_multi_block_map); - if (!NodeVarHasDtype(real_node)) return; - auto* out_var = real_node->Var(); - if (out_var->GetDataType() == framework::proto::VarType::FP32) { - if (OutShouldNotConvert(var_node)) return; - out_var->SetDataType(to_type); - } - VLOG(3) << " out_node name " << var_node->Name() << " data_type " - << out_var->GetDataType(); -} - -// Just process special cases for weights conversion. -bool WeightsShouldNotConvert(ir::Node* var_node) { - auto op_nodes = var_node->outputs; - for (auto* op_node : op_nodes) { - auto* op_desc = op_node->Op(); - // batch_norm op's bias, mean, scale and variance just be float32, so we can - // not convert the dtype. 
- if (op_desc->Type() == "batch_norm") { - auto vecs = op_desc->Input("Bias"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Input("Mean"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Input("Scale"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - vecs = op_desc->Input("Variance"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - } else if (op_desc->Type() == "fused_multi_transformer") { - auto vecs = op_desc->Input("LnScale"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - - vecs = op_desc->Input("LnBias"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - - vecs = op_desc->Input("FFNLnScale"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - - vecs = op_desc->Input("FFNLnBias"); - if (std::find(vecs.begin(), vecs.end(), var_node->Name()) != vecs.end()) { - return true; - } - } - } - - return false; -} -inline bool IsFloatVarType(framework::proto::VarType::Type type) { - if (type == framework::proto::VarType::FP16 || - type == framework::proto::VarType::FP32 || - type == framework::proto::VarType::BF16) - return true; - return false; -} -void ProcessInputNode( - bool support_precision, - std::vector graphes, - ir::Node* in_node, - ir::Node* op_node, - int* suffix, - framework::BlockDesc* block_desc, - std::unordered_map* cast_map, - framework::proto::VarType::Type to_type, - int block_idx, - std::unordered_map>* - vars_in_multi_block_map) { - auto* real_node = - GetRealNode(graphes, block_idx, in_node, vars_in_multi_block_map); - if (!NodeVarHasDtype(real_node)) return; - auto graph = graphes[block_idx]; - bool is_main_block = block_idx == 0; - auto* in_var = real_node->Var(); - auto in_var_type = in_var->GetDataType(); - bool is_in_multi_block = vars_in_multi_block_map->count(in_var->Name()); - - if (!is_main_block && is_in_multi_block) { - in_var_type = vars_in_multi_block_map->at(in_var->Name()).first; - } - if (support_precision) { - if (in_var->Persistable() && - in_var_type == framework::proto::VarType::FP32) { - if (WeightsShouldNotConvert(in_node)) return; - in_var->SetDataType(to_type); - in_var_type = to_type; - } else if (!in_var->Persistable() && IsFloatVarType(in_var_type) && - in_var_type != to_type) { - AddCastOp(graph, - in_node, - op_node, - in_var_type, - to_type, - suffix, - block_desc, - cast_map); - } - } else { - if (!in_var->Persistable() && IsFloatVarType(in_var_type) && - in_var_type != to_type) { - AddCastOp(graph, - in_node, - op_node, - in_var_type, - to_type, - suffix, - block_desc, - cast_map); - } - } - VLOG(3) << " in_node name " << in_var->Name() << " data_type " << in_var_type; -} - -void ConvertAllFp64ToFp32(framework::ir::Graph* graph) { - auto op_nodes = framework::ir::TopologySortOperations(*graph); - for (auto* op_node : op_nodes) { - if (!op_node->IsOp()) continue; - auto op_type = op_node->Op()->Type(); - if (op_type == "feed" || op_type == "fetch") continue; - - if (op_type == "fill_constant") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(framework::proto::VarType::FP64)) - op_node->Op()->SetAttr( - "dtype", static_cast(framework::proto::VarType::FP32)); - } else if (op_type == "assign_value") { - if (PADDLE_GET_CONST(int, 
op_node->Op()->GetAttr("dtype")) == - static_cast(framework::proto::VarType::FP64)) - op_node->Op()->SetAttr( - "dtype", static_cast(framework::proto::VarType::FP32)); - } else if (op_type == "eye") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(framework::proto::VarType::FP64)) - op_node->Op()->SetAttr( - "dtype", static_cast(framework::proto::VarType::FP32)); - } else if (op_type == "fill_any_like") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("dtype")) == - static_cast(framework::proto::VarType::FP64)) - op_node->Op()->SetAttr( - "dtype", static_cast(framework::proto::VarType::FP32)); - } else if (op_type == "cast") { - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("in_dtype")) == - static_cast(framework::proto::VarType::FP64)) - op_node->Op()->SetAttr( - "in_dtype", static_cast(framework::proto::VarType::FP32)); - if (PADDLE_GET_CONST(int, op_node->Op()->GetAttr("out_dtype")) == - static_cast(framework::proto::VarType::FP64)) - op_node->Op()->SetAttr( - "out_dtype", static_cast(framework::proto::VarType::FP32)); - } - - auto inputs = op_node->inputs; - for (auto* in_node : inputs) { - if (in_node->IsCtrlVar()) continue; - auto* in_var = in_node->Var(); - if (!in_var->Persistable() && - in_var->GetDataType() == framework::proto::VarType::FP64) { - in_var->SetDataType(framework::proto::VarType::FP32); - } - } - } -} - -// Handle special ops which contains dtype attribute. e.g., fill_constant, -// assign_value. -void HandleSpecialOps(framework::OpDesc* op_desc) { - if (op_desc->Type() == "fill_constant") { - if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) == - static_cast(framework::proto::VarType::FP32)) - op_desc->SetAttr("dtype", - static_cast(framework::proto::VarType::FP16)); - } else if (op_desc->Type() == "assign_value") { - if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) == - static_cast(framework::proto::VarType::FP32)) - op_desc->SetAttr("dtype", - static_cast(framework::proto::VarType::FP16)); - } else if (op_desc->Type() == "eye") { - if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) == - static_cast(framework::proto::VarType::FP32)) - op_desc->SetAttr("dtype", - static_cast(framework::proto::VarType::FP16)); - } else if (op_desc->Type() == "fill_any_like") { - if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) == - static_cast(framework::proto::VarType::FP32)) - op_desc->SetAttr("dtype", - static_cast(framework::proto::VarType::FP16)); - } else if (op_desc->Type() == "fill_constant_batch_size_like") { - if (PADDLE_GET_CONST(int, op_desc->GetAttr("dtype")) == - static_cast(framework::proto::VarType::FP32)) - op_desc->SetAttr("dtype", - static_cast(framework::proto::VarType::FP16)); - } -} - -// We modify op's input output precision, and we need to fix cast op in_dtype -// and out_dtype attribute. 
-void FixCastAttr(framework::ir::Graph* graph) { - auto op_nodes = framework::ir::TopologySortOperations(*graph); - for (auto* op_node : op_nodes) { - if (!op_node->IsOp()) continue; - auto op_type = op_node->Op()->Type(); - if (op_type != "cast") continue; - - auto input = op_node->inputs[0]; - auto output = op_node->outputs[0]; - op_node->Op()->SetAttr("in_dtype", - static_cast(input->Var()->GetDataType())); - op_node->Op()->SetAttr("out_dtype", - static_cast(output->Var()->GetDataType())); - } -} - -void FindVarsInMultiBlock( - framework::ProgramDesc* program_desc, - std::unordered_map>* - vars_in_multi_block_map, - std::vector>* vars_appear_multi_in_one_block) { - std::vector> block_var_names_set(program_desc->Size()); - for (size_t i = 0; i < program_desc->Size(); ++i) { - for (auto op : program_desc->Block(i).AllOps()) { - auto in_names = op->InputArgumentNames(); - block_var_names_set[i].insert(in_names.begin(), in_names.end()); - auto out_names = op->OutputArgumentNames(); - if (op->HasAttr("sub_block") == false) { - for (auto& n : out_names) { - if (block_var_names_set[i].count(n)) { - (*vars_appear_multi_in_one_block)[i].insert(n); - } - } - } - block_var_names_set[i].insert(out_names.begin(), out_names.end()); - } - } - - for (size_t i = 0; i < program_desc->Size() - 1; ++i) { - for (size_t j = i + 1; j < program_desc->Size(); ++j) { - std::set vars_in_multi_block; - std::set_intersection( - block_var_names_set[i].begin(), - block_var_names_set[i].end(), - block_var_names_set[j].begin(), - block_var_names_set[j].end(), - std::inserter(vars_in_multi_block, vars_in_multi_block.begin())); - - for (auto name : vars_in_multi_block) { - vars_in_multi_block_map->emplace( - name, std::make_pair(framework::proto::VarType::FP32, i)); - } - } - } -} - -bool OpInOutHasTensorArray( - std::vector graphes, - int block_idx, - framework::ir::Node* op_node, - std::unordered_map>* - vars_in_multi_block_map) { - CHECK_EQ(op_node->IsOp(), true); - for (auto in : op_node->inputs) { - auto* real_node = - GetRealNode(graphes, block_idx, in, vars_in_multi_block_map); - if (!NodeVarHasDtype(real_node)) continue; - if (real_node->Var()->GetType() == - framework::proto::VarType::LOD_TENSOR_ARRAY) - return true; - } - - for (auto out : op_node->outputs) { - auto* real_node = - GetRealNode(graphes, block_idx, out, vars_in_multi_block_map); - if (!NodeVarHasDtype(real_node)) continue; - - if (real_node->Var()->GetType() == - framework::proto::VarType::LOD_TENSOR_ARRAY) - return true; - } - return false; -} - -void ConvertTensorDtype( - framework::ProgramDesc* program_desc, - std::vector graphes, - const std::unordered_set& blacklist, bool keep_io_types, - phi::Backend backend, - phi::DataType tensor_dtype, - int block_idx, - std::unordered_map>* - vars_in_multi_block_map, - const std::vector>& vars_appear_multi_in_one_block) { - auto graph = graphes[block_idx]; - framework::proto::VarType::Type to_type; - if (tensor_dtype == phi::DataType::FLOAT16) { - to_type = framework::proto::VarType::FP16; - } else if (tensor_dtype == phi::DataType::BFLOAT16) { - to_type = framework::proto::VarType::BF16; - } else { + const std::unordered_set& black_list) + : model_file_(model_file), + params_file_(params_file), + mixed_model_file_(mixed_model_file), + mixed_params_file_(mixed_params_file), + mixed_precision_(mixed_precision), + backend_(backend), + keep_io_types_(keep_io_types), + black_list_(black_list) { + if (mixed_precision_ != phi::DataType::FLOAT16 && + mixed_precision_ != phi::DataType::BFLOAT16) { 
PADDLE_THROW(paddle::platform::errors::InvalidArgument( "mixed_precision currently not supported dtype %d, we now only " "support fp16 and bf16.", - static_cast(tensor_dtype))); + static_cast(mixed_precision_))); } + if (backend_ != phi::Backend::GPU) { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "mixed_precision currently not supported place %d, we now only " + "support gpu.", + static_cast(backend_))); + } +} - auto* block_desc = - framework::ir::TopologySortOperations(*graph)[0]->Op()->Block(); - - int num_low_precision = 0; - int suffix = 0; - std::vector output_nodes; - std::unordered_map cast_map; - auto op_nodes = framework::ir::TopologySortOperations(*graph); - for (auto* op_node : op_nodes) { - if (!op_node->IsOp()) continue; - auto op_type = op_node->Op()->Type(); - VLOG(3) << "-------------------- op_type " << op_type << ", phi_type " - << phi::TransToPhiKernelName(op_type); - // 1. set input dtype. - if (op_type == "feed") { - auto feed_var = op_node->outputs[0]->Var(); - if (!keep_io_types && - feed_var->GetDataType() == framework::proto::VarType::FP32) { - feed_var->SetDataType(to_type); - } - } else if (op_type == "fetch") { - auto* fetch_var = op_node->inputs[0]; - output_nodes.push_back(fetch_var); - continue; - } else if (op_type == "cast") { - continue; - } - - else if (op_node->Op()->HasAttr("sub_block")) { // NOLINT - // sub_block op's output dtype should be same as input dtype, if have the - // same name. - std::unordered_map in_name_to_node; - for (auto* in : op_node->inputs) { - auto* real_node = - GetRealNode(graphes, block_idx, in, vars_in_multi_block_map); - if (NodeVarHasDtype(real_node)) { - in_name_to_node[in->Name()] = in; - } - } - - for (auto out : op_node->outputs) { - auto* real_node = - GetRealNode(graphes, block_idx, out, vars_in_multi_block_map); - if (NodeVarHasDtype(real_node)) { - if (in_name_to_node.count(out->Name())) - real_node->Var()->SetDataType( - in_name_to_node[out->Name()]->Var()->GetDataType()); - } - } +void ConvertToMixedPrecisionPass::LoadModel() { + framework::Executor exe{platform::CPUPlace{}}; - continue; - } + auto program_desc = inference::Load(&exe, &scope_, model_file_, params_file_); + main_graph_ = std::unique_ptr( + new framework::ir::Graph(*program_desc)); + main_graph_->SetNotOwned(framework::ir::kParamScopeAttr, &scope_); +} - // A strange case found in multi block. - else if (op_type == "assign" && // NOLINT - op_node->inputs[0]->Name() == op_node->outputs[0]->Name()) { - VLOG(2) << " in out are same, continue"; - continue; - } +void ConvertToMixedPrecisionPass::Run() { + LoadModel(); - // Handle tensor array. - else if (OpInOutHasTensorArray( // NOLINT - graphes, - block_idx, - op_node, - vars_in_multi_block_map)) { - VLOG(2) << " in or out has tensor array, continue"; - continue; - } + framework::ir::AutoMixedPrecisionPass pass; + pass.Set("mixed_precision_mode", new int{static_cast(mixed_precision_)}); + pass.Set("mixed_black_list", + new std::unordered_set{black_list_}); + pass.Set("enable_gpu_mixed", new bool{true}); + pass.Set("keep_io_types", new bool{keep_io_types_}); - // 2. if op support fp16/bf16 and not in blacklist. - // - cast weight to fp16/bf16. - // - add cast op if the input dtype is not fp16/bf16. - // - set output dtype. - // - // If a var(op's out var) appears multiple times in a block, we should not - // convert to fp16. 
- else if (blacklist.count(op_type) == 0 && // NOLINT - !VarIsMultiOpsOut(graphes, - block_idx, - op_node, - vars_in_multi_block_map, - vars_appear_multi_in_one_block)) { - bool support_precision = - OpSupportPrecision(op_type, backend, tensor_dtype, blacklist); - VLOG(2) << " support low precision " << support_precision; + pass.Apply(main_graph_.get()); - if (support_precision) { - HandleSpecialOps(op_node->Op()); - ++num_low_precision; - auto inputs = op_node->inputs; - // Process inputs. - for (auto* in_node : inputs) { - ProcessInputNode(true, - graphes, - in_node, - op_node, - &suffix, - block_desc, - &cast_map, - to_type, - block_idx, - vars_in_multi_block_map); - } - // Process outputs. - for (auto* out_node : op_node->outputs) { - ProcessOutputNode( - graphes, block_idx, out_node, to_type, vars_in_multi_block_map); - } - } else { - auto inputs = op_node->inputs; - for (auto* in_node : inputs) { - ProcessInputNode(false, - graphes, - in_node, - op_node, - &suffix, - block_desc, - &cast_map, - framework::proto::VarType::FP32, - block_idx, - vars_in_multi_block_map); - } - } - } + SaveMixedModel(); +} - // 3. check op not support fp16/bf16 or in blacklist. - // - add cast op if the input dtype is not fp32. - else { // NOLINT - auto ins = op_node->inputs; - for (auto* in_node : ins) { - if (in_node->IsCtrlVar()) continue; - auto* in_var = in_node->Var(); - if (in_var->GetDataType() == to_type) { - AddCastOp(graph, - in_node, - op_node, - to_type, - framework::proto::VarType::FP32, - &suffix, - block_desc, - &cast_map); - } - } - } - } +void ConvertToMixedPrecisionPass::SaveMixedModel() { + framework::ProgramDesc mixed_program_desc; + framework::ir::GraphToProgram(*main_graph_, &mixed_program_desc); - // 4. if output_op's dtype is not compatible to output dtype, then just - // insert cast. - for (auto* node : output_nodes) { - if (node->IsCtrlVar()) continue; - auto var = node->Var(); - if (keep_io_types && var->GetDataType() == to_type) { - // fp16/bf16 -> fp32. 
- AddCastOp(graph, - node, - node->outputs[0], - to_type, - framework::proto::VarType::FP32, - &suffix, - block_desc, - &cast_map); - } else if (!keep_io_types && - var->GetDataType() == framework::proto::VarType::FP32) { - // fp32 -> fp16/bf16 - AddCastOp(graph, - node, - node->outputs[0], - framework::proto::VarType::FP32, - to_type, - &suffix, - block_desc, - &cast_map); - } - } + auto parameters = scope_.LocalVarNames(); + std::sort(parameters.begin(), parameters.end()); - for (auto node : graph->Nodes()) { - auto* real_node = - GetRealNode(graphes, block_idx, node, vars_in_multi_block_map); - if (!NodeVarHasDtype(real_node)) continue; + auto SerializeParams = [&]() -> std::string { + std::ostringstream os; + phi::CPUContext ctx; + for (const auto& param : parameters) { + PADDLE_ENFORCE_NOT_NULL( + scope_.FindVar(param), + platform::errors::NotFound( + "Block should already have a '%s' variable", param)); + auto* tensor = scope_.FindVar(param)->GetMutable(); + framework::SerializeToStream(os, *tensor, ctx); + } + return os.str(); + }; - if (vars_in_multi_block_map->count(real_node->Name()) && - vars_in_multi_block_map->at(real_node->Name()).second == block_idx) { - vars_in_multi_block_map->at(real_node->Name()).first = - real_node->Var()->GetDataType(); - } - } + auto StrToBinary = [](const std::string& path, const std::string& str) { + std::ofstream file(path.c_str(), std::ios::binary); + file.write(str.c_str(), str.size()); + file.close(); + }; - if (num_low_precision) - LOG(INFO) << "--- detected " << num_low_precision - << " low precision ops in " << block_idx << " subgraph"; + StrToBinary(mixed_model_file_, + mixed_program_desc.Proto()->SerializeAsString()); + StrToBinary(mixed_params_file_, SerializeParams()); } -} // namespace bool OpSupportPrecision(const std::string& op_type, phi::Backend backend, phi::DataType precision, - const std::unordered_set& blacklist) { - auto phi_op_type = phi::TransToPhiKernelName(op_type); - bool support_precision = false; - if (blacklist.count(op_type) == 0) { - if (backend == phi::Backend::GPU) - support_precision = GpuKernelSupportPrecision(op_type, precision); - else - support_precision = - PhiKernelSupportPrecision(phi_op_type, backend, precision); - } - return support_precision; + const std::unordered_set& black_list) { + return framework::ir::OpSupportPrecision( + op_type, backend, precision, black_list); } -void AddCastOp( +void InsertCastOp( framework::ir::Graph* graph, - framework::ir::Node* node, - framework::ir::Node* next_op, + framework::ir::Node* var_node, + framework::ir::Node* op_node, framework::proto::VarType::Type from_type, framework::proto::VarType::Type to_type, - int* suffix, framework::BlockDesc* block_desc, - std::unordered_map* map) { - auto update_cast_desc = [&](framework::OpDesc& desc, - const std::string& x_name, - const std::string& out_name, - const int in_dtype, - const int out_dtype) { - desc.SetType("cast"); - desc.SetInput("X", {x_name}); - desc.SetOutput("Out", {out_name}); - desc.SetAttr("in_dtype", in_dtype); - desc.SetAttr("out_dtype", out_dtype); - desc.SetAttr("use_mkldnn", false); - desc.SetAttr("with_quant_attr", false); - desc.Flush(); - }; - - if (map->count(node) == 0) { - // insert cast op before node. 
- std::string cast_input_name = node->Var()->Name(); - std::string cast_output_name = - node->Var()->Name() + "_cast.tmp_" + std::to_string((*suffix)++); - CHECK_NOTNULL(block_desc); - framework::OpDesc cast_op_desc(block_desc); - update_cast_desc(cast_op_desc, - cast_input_name, - cast_output_name, - static_cast(from_type), - static_cast(to_type)); - auto* cast_op_node = graph->CreateOpNode(&cast_op_desc); - auto* cast_output_vardesc = block_desc->Var(cast_output_name); - cast_output_vardesc->SetPersistable(false); - cast_output_vardesc->SetDataType(to_type); - cast_output_vardesc->SetShape(node->Var()->GetShape()); - auto* cast_output_node = graph->CreateVarNode(cast_output_vardesc); - IR_NODE_LINK_TO(cast_op_node, cast_output_node); - (*map)[node] = cast_output_node; - } - next_op->Op()->RenameInput(node->Name(), map->at(node)->Name()); - IR_NODE_LINK_TO(node, map->at(node)->inputs[0]); - IR_NODE_LINK_TO(map->at(node), next_op); -} - -void ConvertToMixedPrecision(const std::string& model_file, - const std::string& params_file, - const std::string& mixed_model_file, - const std::string& mixed_params_file, - phi::DataType mixed_precision, - phi::Backend backend, - bool keep_io_types, - std::unordered_set black_list) { - paddle::CPUPlace place; - framework::Executor executor(place); - framework::Scope scope; - auto program_desc = - inference::Load(&executor, &scope, model_file, params_file); - auto main_graph = std::unique_ptr( - new framework::ir::Graph(*program_desc)); - - std::unordered_map> - vars_in_multi_block_map; - std::vector> vars_appear_multi_in_one_block( - program_desc->Size()); - FindVarsInMultiBlock(program_desc.get(), - &vars_in_multi_block_map, - &vars_appear_multi_in_one_block); - - std::vector graphes; - for (size_t i = 0; i < main_graph->SubGraphsSize(); ++i) { - auto graph = main_graph->GetSubGraph(i); - graphes.push_back(graph); - VLOG(2) << " -------- handle subgraph " << i << ", has " - << graph->Nodes().size() << " nodes --------"; - - ConvertAllFp64ToFp32(graph); - ConvertTensorDtype(program_desc.get(), - graphes, - black_list, - keep_io_types, - backend, - mixed_precision, - i, - &vars_in_multi_block_map, - vars_appear_multi_in_one_block); - FixCastAttr(graph); - } - - framework::ProgramDesc mixed_program_desc; - framework::ir::GraphToProgram(*main_graph, &mixed_program_desc); - - SaveMixedModel(main_graph.get(), - &scope, - &mixed_program_desc, - mixed_model_file, - mixed_params_file, - mixed_precision, - vars_in_multi_block_map); + int* suffix, + std::unordered_map* visited) { + framework::ir::DoInsertCastOp(graph, + var_node, + op_node, + from_type, + to_type, + block_desc, + suffix, + visited); +} + +void ConvertToMixedPrecision( + const std::string& model_file, + const std::string& params_file, + const std::string& mixed_model_file, + const std::string& mixed_params_file, + phi::DataType mixed_precision, + phi::Backend backend, + bool keep_io_types, + const std::unordered_set& black_list) { + ConvertToMixedPrecisionPass pass(model_file, + params_file, + mixed_model_file, + mixed_params_file, + mixed_precision, + backend, + keep_io_types, + black_list); + pass.Run(); } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h index 2a19453b02a011..3a1e5fbb30a21d 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.h @@ -15,14 +15,12 @@ #pragma 
once #include -#include #include #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_helper.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/phi/common/backend.h" #include "paddle/phi/common/data_type.h" @@ -30,20 +28,52 @@ namespace paddle { namespace inference { namespace analysis { -bool OpSupportPrecision(const std::string& phi_op_type, +class ConvertToMixedPrecisionPass { + public: + explicit ConvertToMixedPrecisionPass( + const std::string& model_file, + const std::string& params_file, + const std::string& mixed_model_file, + const std::string& mixed_params_file, + phi::DataType mixed_precision, + phi::Backend backend, + bool keep_io_types, + const std::unordered_set& black_list); + + void Run(); + + private: + void LoadModel(); + void SaveMixedModel(); + + private: + std::string model_file_; + std::string params_file_; + std::string mixed_model_file_; + std::string mixed_params_file_; + phi::DataType mixed_precision_; + phi::Backend backend_; + bool keep_io_types_; + std::unordered_set black_list_; + + framework::Scope scope_; + std::unique_ptr main_graph_{nullptr}; +}; + +bool OpSupportPrecision(const std::string& op_type, phi::Backend backend, phi::DataType precision, - const std::unordered_set& blacklist); + const std::unordered_set& black_list); -void AddCastOp( +void InsertCastOp( framework::ir::Graph* graph, - framework::ir::Node* node, - framework::ir::Node* next_op, + framework::ir::Node* var_node, + framework::ir::Node* op_node, framework::proto::VarType::Type from_type, framework::proto::VarType::Type to_type, - int* suffix, framework::BlockDesc* block_desc, - std::unordered_map* map); + int* suffix, + std::unordered_map* visited); void ConvertToMixedPrecision(const std::string& model_file, const std::string& params_file, @@ -51,8 +81,8 @@ void ConvertToMixedPrecision(const std::string& model_file, const std::string& mixed_params_file, phi::DataType mixed_precision, phi::Backend backend, - bool keep_io_types = true, - std::unordered_set black_list = {}); + bool keep_io_types, + const std::unordered_set& black_list); } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc index ed45ec3301d1d2..126d16933fd820 100644 --- a/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc +++ b/paddle/fluid/inference/analysis/passes/inference_op_replace_pass.cc @@ -40,7 +40,7 @@ void InferenceOpReplacePass::RunImpl(Argument* argument) { } std::string InferenceOpReplacePass::repr() const { - return "inference-op-replace-pass"; + return "inference_op_replace_pass"; } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc index 53398a69536b97..12b18ac53e3687 100644 --- a/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_analysis_pass.cc @@ -105,7 +105,7 @@ void IrAnalysisPass::CollectFusionStatis(Argument* argument) { framework::ir::kFuseStatisAttr)); } -std::string IrAnalysisPass::repr() const { return "ir-analysis-pass"; } +std::string IrAnalysisPass::repr() const { return "ir_analysis_pass"; } } // namespace analysis } // namespace inference diff --git 
a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index 5070328394b170..df0ffc534b71c9 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -55,7 +55,8 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { argument->model_params_path(), argument->scope_ptr(), place, - argument->model_from_memory_valid() && argument->model_from_memory()); + argument->model_from_memory_valid() && argument->model_from_memory(), + argument->skip_load_params()); argument->SetMainProgram(program.release()); } else { PADDLE_THROW(platform::errors::PreconditionNotMet( @@ -63,7 +64,8 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { "set.")); } - auto graph = std::unique_ptr(new Graph(argument->main_program())); + auto graph = std::unique_ptr( + new framework::ir::Graph(argument->main_program())); argument->SetMainGraph(graph.release()); auto *scope_ptr = argument->scope_ptr(); PADDLE_ENFORCE_NOT_NULL(scope_ptr, @@ -92,6 +94,10 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { &argument->ipu_available_memory_proportion()); argument->main_graph().SetNotOwned("enable_half_partial", &argument->ipu_enable_half_partial()); + argument->main_graph().SetNotOwned("custom_ops_info", + &argument->ipu_custom_ops_info()); + argument->main_graph().SetNotOwned("custom_patterns", + &argument->ipu_custom_patterns()); } } #endif @@ -110,16 +116,17 @@ std::unique_ptr IrGraphBuildPass::LoadModel( const std::string ¶ms_path, framework::Scope *scope, const platform::Place &place, - bool model_from_memory) { + bool model_from_memory, + bool skip_load_params) { framework::Executor exe(place); if (!model_from_memory) { - return Load(&exe, scope, program_path, params_path); + return Load(&exe, scope, program_path, params_path, !skip_load_params); } else { return LoadFromMemory(&exe, scope, program_path, params_path); } } -std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; } +std::string IrGraphBuildPass::repr() const { return "ir_graph_build_pass"; } } // namespace analysis } // namespace inference diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h index 32902ef0667303..69047b73ea02a6 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h @@ -43,7 +43,8 @@ class IrGraphBuildPass : public AnalysisPass { const std::string ¶ms_path, framework::Scope *scope, const platform::Place &place, - bool model_from_memory); + bool model_from_memory, + bool skip_load_params); std::string model_binary_str_; }; diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc deleted file mode 100644 index 6c18c625637166..00000000000000 --- a/paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h" - -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" -#include "paddle/fluid/framework/ir/node.h" - -namespace paddle { -namespace inference { -namespace analysis { - -void IrInferCleanGraphPass::RunImpl(Argument* argument) { - auto& graph = argument->main_graph(); - auto is_valid_node = [](framework::ir::Node* x) { - return x && IsControlDepVar(*x) && x->IsVar() && !x->Var(); - }; - - std::unordered_set invalid_nodes; - int valid_op = 0; - for (auto* node : graph.Nodes()) { - PADDLE_ENFORCE_NOT_NULL(node, - platform::errors::PreconditionNotMet( - "The node should not be nullptr.")); - if (is_valid_node(node)) { - invalid_nodes.insert(node); - } else if (node->IsOp()) { - ++valid_op; - } - } - - GraphSafeRemoveNodes(&graph, invalid_nodes); -} - -} // namespace analysis -} // namespace inference -} // namespace paddle diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc index 999fb4ad8d7642..3d86f7bf399a99 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.cc @@ -31,7 +31,7 @@ void IrGraphToProgramPass::RunImpl(Argument *argument) { new int(argument->memory_optim_sort_kind())); } - std::unique_ptr graph(argument->main_graph_ptr()); + std::unique_ptr graph(argument->main_graph_ptr()); // Direct using ProgramDesc desc(argument->main_program()) may cause // incomplete copies of information. 
diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h index 5b20667d62ab60..8e90eb0e20d57d 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h @@ -28,7 +28,7 @@ class IrGraphToProgramPass : public AnalysisPass { public: void RunImpl(Argument *argument) override; - std::string repr() const override { return "ir-graph-to-param-pass"; } + std::string repr() const override { return "ir_graph_to_param_pass"; } }; } // namespace analysis diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc index 168b99f3d7649a..8961cbb5b6e473 100644 --- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc @@ -21,12 +21,9 @@ #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/inference/api/helper.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/place.h" #include "paddle/phi/common/data_type.h" namespace paddle { @@ -117,28 +114,6 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { reserve_cpu_weights = true; } - int64_t params_total_bytes{0}; - for (auto *node : paddle::framework::ir::TopologySortOperations(graph)) { - if (!node->IsOp()) continue; - if (node->Op()->Type() == "feed" || node->Op()->Type() == "fetch") continue; - for (auto *var_node : node->inputs) { - if (!var_node->Var()->Persistable()) continue; - auto var_name = var_node->Var()->Name(); - auto *var = scope->FindLocalVar(var_name); - if (var->IsType() || - var->IsType()) { - auto *t = var->GetMutable(); - params_total_bytes += t->numel() * experimental::SizeOf(t->dtype()); - } - } - } - - { - // Alloc memory in pool to store all parameters. 
- framework::Tensor ts; - ts.mutable_data(place, params_total_bytes); - } - std::unordered_set visited; for (auto *node : paddle::framework::ir::TopologySortOperations(graph)) { if (!node->IsOp()) continue; @@ -165,38 +140,12 @@ void IrParamsSyncAmongDevicesPass::CopyParamsToGpu(Argument *argument) { auto var_data_type = var_node->Var()->GetDataType(); VLOG(5) << "var_name is " << var_name << ", data type is " << var_data_type; - if (var_data_type == paddle::framework::proto::VarType::FP16) { - framework::Tensor half_tensor; - half_tensor.set_type(paddle::experimental::DataType::FLOAT16); - half_tensor.Resize(t->dims()); - auto *half_data = - half_tensor.mutable_data(platform::CPUPlace()); - for (int i = 0; i < t->numel(); i++) { - auto *data = t->mutable_data(platform::CPUPlace()); - half_data[i] = static_cast(data[i]); - } - t->clear(); - paddle::framework::TensorCopySync(half_tensor, place, t); - } else if (var_data_type == paddle::framework::proto::VarType::BF16) { - framework::Tensor bf16_tensor; - bf16_tensor.set_type(paddle::experimental::DataType::BFLOAT16); - bf16_tensor.Resize(t->dims()); - auto *bf16_data = bf16_tensor.mutable_data( - platform::CPUPlace()); - for (int i = 0; i < t->numel(); i++) { - auto *data = t->mutable_data(platform::CPUPlace()); - bf16_data[i] = static_cast(data[i]); - } - t->clear(); - paddle::framework::TensorCopySync(bf16_tensor, place, t); - } else { - platform::CPUPlace cpu_place; - framework::LoDTensor temp_tensor; - temp_tensor.Resize(t->dims()); - paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); - t->clear(); - paddle::framework::TensorCopySync(temp_tensor, place, t); - } + platform::CPUPlace cpu_place; + framework::LoDTensor temp_tensor; + temp_tensor.Resize(t->dims()); + paddle::framework::TensorCopySync(*t, cpu_place, &temp_tensor); + t->clear(); + paddle::framework::TensorCopySync(temp_tensor, place, t); } } } @@ -220,7 +169,7 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) { } std::string IrParamsSyncAmongDevicesPass::repr() const { - return "ir-params-sync-among-devices-pass"; + return "ir_params_sync_among_devices_pass"; } } // namespace analysis diff --git a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc index bfe9e1e4b26626..63aaa7d97967a4 100644 --- a/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc +++ b/paddle/fluid/inference/analysis/passes/memory_optimize_pass.cc @@ -76,6 +76,7 @@ void MemoryOptimizePass::CollectLifeCycle( } else { // Normal operators. for (const Node* node : requires) { + if (!node->Var()) continue; if (node->Var()->Persistable()) continue; std::string var = node->Name(); if (!lifecycles->count(var)) { @@ -133,7 +134,7 @@ void MemoryOptimizePass::CollectVarMemorySize( // between performance and underlying principle. std::unordered_set black_list; for (auto* node : graph->Nodes()) { - if (node->IsVar() && + if (node->IsVar() && node->Var() && node->Var()->GetType() == framework::proto::VarType::Type::VarType_Type_LOD_TENSOR) { if (!valid_var(node)) { @@ -144,7 +145,7 @@ void MemoryOptimizePass::CollectVarMemorySize( // Collect tensors from graph. 
for (auto* node : graph->Nodes()) { - if (node->IsVar() && + if (node->IsVar() && node->Var() && node->Var()->GetType() == framework::proto::VarType::Type::VarType_Type_LOD_TENSOR && !black_list.count(node->Var()->Name())) { @@ -294,7 +295,7 @@ void UpdateOpDescsByReuse( } } -std::string MemoryOptimizePass::repr() const { return "memory optimize pass"; } +std::string MemoryOptimizePass::repr() const { return "memory_optimize_pass"; } void MemoryOptimizePass::RunImpl(Argument* argument) { // Memory optimization. diff --git a/paddle/fluid/inference/analysis/passes/passes.cc b/paddle/fluid/inference/analysis/passes/passes.cc index 19aab1a948dd2d..cd65757d08f3fb 100644 --- a/paddle/fluid/inference/analysis/passes/passes.cc +++ b/paddle/fluid/inference/analysis/passes/passes.cc @@ -18,7 +18,6 @@ #include "paddle/fluid/inference/analysis/passes/inference_op_replace_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h" -#include "paddle/fluid/inference/analysis/passes/ir_graph_clean_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_graph_to_program_pass.h" #include "paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.h" #include "paddle/fluid/inference/analysis/passes/memory_optimize_pass.h" @@ -34,8 +33,6 @@ PassRegistry::PassRegistry() { std::unique_ptr(new IrAnalysisPass)); passes_.emplace("ir_graph_build_pass", std::unique_ptr(new IrGraphBuildPass)); - passes_.emplace("ir_graph_clean_pass", - std::unique_ptr(new IrInferCleanGraphPass)); passes_.emplace("memory_optimize_pass", std::unique_ptr(new MemoryOptimizePass)); passes_.emplace( diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index f3fbf1c344d65e..87c622cf509056 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/utils/string/split.h" #ifdef PADDLE_WITH_TENSORRT #include "paddle/fluid/inference/tensorrt/helper.h" @@ -84,15 +85,29 @@ void AnalysisConfig::SetModel(const std::string &prog_file_path, Update(); } + void AnalysisConfig::EnableUseGpu(uint64_t memory_pool_init_size_mb, - int device_id) { + int device_id, + Precision precision_mode) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) use_gpu_ = true; memory_pool_init_size_mb_ = memory_pool_init_size_mb; FLAGS_initial_gpu_memory_in_mb = memory_pool_init_size_mb_; gpu_device_id_ = device_id; + mixed_precision_mode_ = precision_mode; + if (precision_mode == Precision::kFloat32) { + // default + } else if (precision_mode == Precision::kHalf || + precision_mode == Precision::kBf16) { + enable_gpu_mixed_ = true; + } else { + LOG(ERROR) + << "The Paddle-GPU inference currently only supports " + "float32/float16/bfloat16 precision. 
Please check the parameters " + "you specified in EnableUseGpu or enable_use_gpu function."; + } #else - LOG(ERROR) << "Please compile with gpu to EnableGpu()"; + LOG(ERROR) << "Please use PaddlePaddle with GPU version."; use_gpu_ = false; #endif @@ -208,6 +223,120 @@ void AnalysisConfig::SetIpuConfig(bool ipu_enable_fp16, Update(); } +void AnalysisConfig::SetIpuCustomInfo( + const std::vector> &ipu_custom_ops_info, + const std::map &ipu_custom_patterns) { + ipu_custom_ops_info_ = ipu_custom_ops_info; + for (auto iter = ipu_custom_patterns.begin(); + iter != ipu_custom_patterns.end(); + iter++) { + if (iter->second == true) { + ipu_custom_patterns_.push_back( + std::vector{iter->first, "True"}); + } else if (iter->second == false) { + ipu_custom_patterns_.push_back( + std::vector{iter->first, "False"}); + } + } + + Update(); +} + +void AnalysisConfig::LoadIpuConfig(const std::string &config_path) { + std::ifstream fin(config_path, std::ios::in); + PADDLE_ENFORCE_EQ( + static_cast(fin.is_open()), + true, + platform::errors::NotFound( + "Cannot open file %s, please confirm whether the file is normal.", + config_path)); + std::string line; + while (std::getline(fin, line)) { + // remove all space + line.erase(std::remove(line.begin(), line.end(), ' '), line.end()); + + std::string key; + std::string value; + std::istringstream stream(line); + // Split string to key and value based on the first `,` + std::getline(stream, key, ','); + std::getline(stream, value); + + auto string2bool = [](std::string s) { + std::transform(s.begin(), s.end(), s.begin(), [](unsigned char c) { + return ::tolower(c); + }); + return s == "true" || s == "1"; + }; + + // ipu_custom_ops_info: + // [[paddle_op_name, popart_op_name, domain, version], [paddle_op_name, + // popart_op_name, domain, version]...] + // ipu_custom_patterns: + // [[paddle_op_name, enable_pattern], [paddle_op_name, enable_pattern]...] 
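+    // For example, a hypothetical config line such as
+    //   ipu_custom_ops_info,[[custom_relu,Relu,custom.ops,1]]
+    // would be parsed by the helper below into
+    //   {{"custom_relu", "Relu", "custom.ops", "1"}}.
+    // (The op name, domain and version here are illustrative values only.)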
+ auto string2vector = [](std::string s) { + std::vector> custom_info; + s.erase(0, 1); + s.pop_back(); + + std::string one; + std::istringstream s_stream(s); + while (std::getline(s_stream, one, ']')) { + if (!one.empty()) { + // remove `[` + one.erase(0, 1); + custom_info.push_back(paddle::string::Split(one, ',')); + } + } + return custom_info; + }; + + if (ipu_config_mapper_.find(key) == ipu_config_mapper_.end()) { + PADDLE_THROW(platform::errors::InvalidArgument( + "invalid key %s in IPU config: ", key)); + } + switch (ipu_config_mapper_.at(key)) { + case ipu_config_code::ipu_device_num: + ipu_device_num_ = std::stoi(value); + break; + case ipu_config_code::ipu_micro_batch_size: + ipu_micro_batch_size_ = std::stoi(value); + break; + case ipu_config_code::ipu_enable_pipelining: + ipu_enable_pipelining_ = string2bool(value); + break; + case ipu_config_code::ipu_batches_per_step: + ipu_batches_per_step_ = std::stoi(value); + break; + case ipu_config_code::ipu_enable_fp16: + ipu_enable_fp16_ = string2bool(value); + break; + case ipu_config_code::ipu_replica_num: + ipu_replica_num_ = std::stoi(value); + break; + case ipu_config_code::ipu_available_memory_proportion: + ipu_available_memory_proportion_ = std::stof(value); + break; + case ipu_config_code::ipu_enable_half_partial: + ipu_enable_half_partial_ = string2bool(value); + break; + case ipu_config_code::ipu_custom_ops_info: + ipu_custom_ops_info_ = string2vector(value); + break; + case ipu_config_code::ipu_custom_patterns: + ipu_custom_patterns_ = string2vector(value); + break; + + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "invalid key %s in IPU config", key)); + break; + } + } + + Update(); +} + void AnalysisConfig::EnableONNXRuntime() { #ifdef PADDLE_WITH_ONNXRUNTIME use_onnxruntime_ = true; @@ -257,8 +386,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(gpu_device_id_); CP_MEMBER(memory_pool_init_size_mb_); - // Mixed related. + // Mixed precision related. CP_MEMBER(mixed_black_list_); + CP_MEMBER(enable_gpu_mixed_); + CP_MEMBER(mixed_precision_mode_); CP_MEMBER(enable_memory_optim_); // TensorRT related. @@ -358,6 +489,8 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(ipu_replica_num_); CP_MEMBER(ipu_available_memory_proportion_); CP_MEMBER(ipu_enable_half_partial_); + CP_MEMBER(ipu_custom_ops_info_); + CP_MEMBER(ipu_custom_patterns_); // fleet exe related CP_MEMBER(dist_config_); @@ -367,6 +500,10 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(custom_device_type_); CP_MEMBER(custom_device_id_); + // JITLayer relate + CP_MEMBER(apply_optim_); + CP_MEMBER(skip_load_params_); + if (use_gpu_) { PADDLE_ENFORCE_EQ(use_xpu_, false, @@ -619,13 +756,7 @@ void AnalysisConfig::Update() { ((use_custom_device() ^ pass_builder_->use_custom_device()))) { if (use_gpu()) { pass_builder_.reset(new GpuPassStrategy); - - if (use_tensorrt_) { - // Append after the Affine_channel_conv_fuse pass. 
- pass_builder()->InsertPass(3, "tensorrt_subgraph_pass"); - } } else if (use_ipu()) { - VLOG(1) << "IpuPassStrategy has been used for new."; pass_builder_.reset(new IpuPassStrategy); } else if (use_xpu()) { PADDLE_ENFORCE_EQ( @@ -825,9 +956,6 @@ void AnalysisConfig::Update() { "but did not have the option -DWITH_CUSTOM_DEVICE compiled.")); #endif } - if (ir_debug_) { - pass_builder()->TurnOnDebug(); - } } std::string AnalysisConfig::SerializeInfoCache() { @@ -839,6 +967,7 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << calibration_file_path_; ss << use_gpu_; + ss << enable_gpu_mixed_; ss << use_external_stream_; ss << exec_stream_; ss << use_fc_padding_; @@ -910,7 +1039,12 @@ std::string AnalysisConfig::SerializeInfoCache() { ss << ipu_replica_num_; ss << ipu_available_memory_proportion_; ss << ipu_enable_half_partial_; - + for (auto custom_op : ipu_custom_ops_info_) + for (auto attr : custom_op) ss << attr; + ss << ";"; + for (auto pattern : ipu_custom_patterns_) + for (auto attr : pattern) ss << attr; + ss << ";"; for (auto &op : mixed_black_list_) ss << op.c_str(); return ss.str(); } @@ -1041,6 +1175,7 @@ std::string AnalysisConfig::Summary() { os.InsertRow({"use_gpu", use_gpu_ ? "true" : "false"}); if (use_gpu_) { os.InsertRow({"gpu_device_id", std::to_string(gpu_device_id_)}); + os.InsertRow({"enable_gpu_mixed", std::to_string(enable_gpu_mixed_)}); os.InsertRow({"memory_pool_init_size", std::to_string(memory_pool_init_size_mb_) + "MB"}); os.InsertRow( @@ -1234,7 +1369,7 @@ bool AnalysisConfig::trt_allow_build_at_runtime() { return trt_allow_build_at_runtime_; } -void AnalysisConfig::Exp_SetBlackListOpsForMixedModel( +void AnalysisConfig::Exp_DisableMixedPrecisionOps( const std::unordered_set &black_list) { mixed_black_list_ = black_list; } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc old mode 100755 new mode 100644 index fbc2830aff6148..7bd14ca05ecdde --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -129,17 +129,19 @@ phi::DataType ConvertPrecision(AnalysisConfig::Precision precision) { } } -phi::Backend ConvertBackend(AnalysisConfig::Backend backend) { +phi::Backend ConvertBackend(paddle_infer::PlaceType backend) { switch (backend) { - case AnalysisConfig::Backend::kGPU: + case paddle_infer::PlaceType::kGPU: // NOTE: phi also support phi::Backend::GPUDNN. return phi::Backend::GPU; - case AnalysisConfig::Backend::kNPU: + case paddle_infer::PlaceType::kNPU: return phi::Backend::NPU; - case AnalysisConfig::Backend::kXPU: + case paddle_infer::PlaceType::kXPU: return phi::Backend::XPU; - case AnalysisConfig::Backend::kCPU: + case paddle_infer::PlaceType::kCPU: return phi::Backend::CPU; + case paddle_infer::PlaceType::kIPU: + return phi::Backend::IPU; default: PADDLE_THROW(paddle::platform::errors::InvalidArgument( "Paddle Inference not support backend, we now only support GPU, XPU, " @@ -166,20 +168,28 @@ bool PaddleTensorToLoDTensor(const PaddleTensor &pt, LOG(ERROR) << "unsupported feed type " << pt.dtype; return false; } - - PADDLE_ENFORCE_NOT_NULL( - input_ptr, - paddle::platform::errors::Fatal( - "Cannot convert to LoDTensor because LoDTensor creation failed.")); - PADDLE_ENFORCE_NOT_NULL( - pt.data.data(), - paddle::platform::errors::InvalidArgument( - "The data contained in the input PaddleTensor is illegal.")); + // NOTE(Aurelius84): Some kernels support zero shape input + // without memory holder, we should skip enforce logic. 
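+  // For example (hypothetical case), an input declared with shape [0, 256]
+  // has phi::product(ddim) == 0, so it may carry no allocated buffer and the
+  // null-pointer checks below are skipped for it.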
+ bool has_zero_dim = (phi::product(ddim) == 0); + VLOG(3) << "Found zero dim: " << has_zero_dim + << " from input with ddim: " << ddim; + if (!has_zero_dim) { + PADDLE_ENFORCE_NOT_NULL( + input_ptr, + paddle::platform::errors::Fatal( + "Cannot convert to LoDTensor because LoDTensor creation failed.")); + PADDLE_ENFORCE_NOT_NULL( + pt.data.data(), + paddle::platform::errors::InvalidArgument( + "The data contained in the input PaddleTensor is illegal.")); + } if (platform::is_cpu_place(place)) { // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. - std::memcpy( - static_cast(input_ptr), pt.data.data(), pt.data.length()); + if (input_ptr != nullptr) { + std::memcpy( + static_cast(input_ptr), pt.data.data(), pt.data.length()); + } } else if (platform::is_ipu_place(place)) { #ifdef PADDLE_WITH_IPU std::memcpy( @@ -527,6 +537,11 @@ bool AnalysisPredictor::PrepareProgram( // If the program is passed from external, no need to optimize it, this // logic is used in the clone scenario. inference_program_ = program; + if (config_.apply_optim_) { + VLOG(3) + << "apply_optim is enabled, will call OptimizeInferenceProgram()."; + OptimizeInferenceProgram(); + } } executor_->CreateVariables(*inference_program_, 0, false, sub_scope_); @@ -1050,7 +1065,7 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetUseGPU(config_.use_gpu()); argument_.SetUseFcPadding(config_.use_fc_padding()); argument_.SetGPUDeviceId(config_.gpu_device_id()); - argument_.SetEnableAnalysisOptim(config_.enable_ir_optim_); + argument_.SetEnableIrOptim(config_.enable_ir_optim_); argument_.SetEnableMemoryOptim(config_.enable_memory_optim()); argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program @@ -1063,11 +1078,12 @@ void AnalysisPredictor::PrepareArgument() { false, platform::errors::PreconditionNotMet( "Either model_dir or prog_file should be set.")); - std::string dir = inference::analysis::GetDirRoot(config_.prog_file()); argument_.SetModelProgramPath(config_.prog_file()); argument_.SetModelParamsPath(config_.params_file()); } + // For JITLayer + argument_.SetSkipLoadParams(config_.skip_load_params_); argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_); argument_.SetTensorRtUseOSS(config_.trt_use_varseqlen_); @@ -1160,6 +1176,8 @@ void AnalysisPredictor::PrepareArgument() { argument_.SetIpuAvailableMemoryProportion( config_.ipu_available_memory_proportion_); argument_.SetIpuEnableHalfPartial(config_.ipu_enable_half_partial_); + argument_.SetIpuCustomOpsInfo(config_.ipu_custom_ops_info_); + argument_.SetIpuCustomPatterns(config_.ipu_custom_patterns_); #endif argument_.SetUseNpu(config_.use_npu_); @@ -1192,58 +1210,72 @@ void AnalysisPredictor::PrepareArgument() { } #endif - auto passes = config_.pass_builder()->AllPasses(); + auto *pass_builder = config_.pass_builder(); if (model_precision_ != phi::DataType::FLOAT32) { LOG(INFO) << "Model is mixed precision type with " << model_precision_ << ", we will use a new PassStrategy. 
Note that only the GPU " "backend is supported for now."; - passes.clear(); + pass_builder->ClearPasses(); + const auto &deleted_passes = pass_builder->GetAllDeletedPasses(); if (config_.tensorrt_engine_enabled()) { for (const auto &pass : kTrtLowerPrecisionPasses) { - passes.push_back(pass); + if (deleted_passes.count(pass)) continue; + pass_builder->AppendPass(pass); } } else if (config_.use_gpu()) { for (const auto &pass : kGpuLowerPrecisionPasses) { - passes.push_back(pass); + if (deleted_passes.count(pass)) continue; + pass_builder->AppendPass(pass); } } + } - const auto &deleted_passes = config_.pass_builder()->GetAllDeletedPasses(); - for (const auto &it : deleted_passes) { - auto iterator = std::find(passes.begin(), passes.end(), it); - if (iterator != passes.end()) { - passes.erase(iterator); - } + if (!config_.ir_optim()) { + argument_.SetEnableIrOptim(false); + if (config_.enable_gpu_mixed_) { + argument_.SetEnableIrOptim(true); + pass_builder->ClearPasses(); + pass_builder->AppendPass("auto_mixed_precision_pass"); + LOG(INFO) + << "This model run in Paddle-GPU mixed precision mode with no ir " + "optimization."; + } else { + LOG(INFO) << "ir_optim is turned off, no IR pass will be executed."; } - + } else { if (config_.ir_debug_) { - auto it = std::begin(passes); - while (it != std::end(passes)) { - if (*it != "graph_viz_pass") { - it = passes.insert(it + 1, "graph_viz_pass"); - } else { - ++it; - } - } + pass_builder->TurnOnDebug(); + } + if (config_.enable_gpu_mixed_) { + LOG(INFO) << "This model run in Paddle-GPU mixed precision mode."; } - } - if (!config_.ir_optim()) { - passes.clear(); - LOG(INFO) << "ir_optim is turned off, no IR pass will be executed"; } argument_.SetDisableLogs(config_.glog_info_disabled()); - argument_.SetIrAnalysisPasses(passes); - argument_.SetAnalysisPasses(config_.pass_builder()->AnalysisPasses()); + argument_.SetIrAnalysisPasses(pass_builder->AllPasses()); + argument_.SetAnalysisPasses(pass_builder->AnalysisPasses()); argument_.SetScopeNotOwned(scope_.get()); // mixed precison. argument_.SetModelPrecision(static_cast(model_precision_)); argument_.SetMixedBlackList(config_.mixed_black_list_); + argument_.SetEnableGPUMixed(config_.enable_gpu_mixed_); + argument_.SetMixedPrecisionMode(static_cast( + paddle::ConvertPrecision(config_.mixed_precision_mode_))); } // NOTE All the members in AnalysisConfig should be copied to Argument. void AnalysisPredictor::OptimizeInferenceProgram() { PrepareArgument(); + +#ifdef PADDLE_WITH_TENSORRT + if (config_.tensorrt_engine_enabled()) { + inference::tensorrt::TensorRTEngine::predictor_id_per_thread = + predictor_id_; + VLOG(3) << "thread_local var predictor_id in TensorRTEngine is set to: " + << inference::tensorrt::TensorRTEngine::predictor_id_per_thread; + } +#endif + Analyzer().Run(&argument_); PADDLE_ENFORCE_EQ( @@ -1735,10 +1767,39 @@ void AnalysisPredictor::CollectShapeRangeInfo() { if (!var->IsType()) { continue; } - framework::DDim dim = var->Get().dims(); + auto tensor = var->Get(); + framework::DDim dim = tensor.dims(); std::vector shape(dim.size()); for (size_t i = 0; i < shape.size(); ++i) shape[i] = dim[i]; shape_info_[name].emplace_back(shape); + + // We need collect value range for shape tensor for Paddle-TRT's use. + // To be noticed, this method to identify all shape tensors is based on + // assumption that all shape tensors in the model have numbers <= 7. + // This is a simple method to identify all shape tensors with some + // mistakes, but it doesn't matter. 
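Editorial aside (not part of the patch): the note above explains why value ranges of small INT32 tensors are now recorded as candidate shape tensors for Paddle-TRT. A hedged sketch of how this collection is typically driven end to end, assuming the pre-existing CollectShapeRangeInfo and EnableTunedTensorRtDynamicShape entry points; all file and model paths are hypothetical.

#include "paddle_inference_api.h"

// Pass 1: record shape (and shape-tensor value) ranges while running
// representative inputs; the predictor writes them to shape_range.pbtxt.
void RecordShapeRanges() {
  paddle_infer::Config cfg;
  cfg.SetModel("model.pdmodel", "model.pdiparams");   // hypothetical paths
  cfg.EnableUseGpu(256, 0);
  cfg.CollectShapeRangeInfo("shape_range.pbtxt");
  auto recorder = paddle_infer::CreatePredictor(cfg);
  // ... feed representative inputs through `recorder` here ...
}

// Pass 2: reuse the recorded ranges when building the TensorRT subgraphs.
void DeployWithTunedShapes() {
  paddle_infer::Config cfg;
  cfg.SetModel("model.pdmodel", "model.pdiparams");
  cfg.EnableUseGpu(256, 0);
  cfg.EnableTensorRtEngine(1 << 30, 1, 3,
                           paddle_infer::PrecisionType::kFloat32,
                           /*use_static=*/false, /*use_calib_mode=*/false);
  cfg.EnableTunedTensorRtDynamicShape("shape_range.pbtxt",
                                      /*allow_build_at_runtime=*/true);
  auto predictor = paddle_infer::CreatePredictor(cfg);
}

End of aside.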
+ auto is_shape_tensor = tensor.numel() <= 7 && tensor.numel() >= 1; + if (tensor.dtype() == paddle::experimental::DataType::INT32 && + is_shape_tensor) { + std::vector int32_host(tensor.numel()); + if (tensor.place() == platform::CPUPlace()) { + paddle::memory::Copy(platform::CPUPlace(), + int32_host.data(), + platform::CPUPlace(), + tensor.data(), + tensor.numel() * sizeof(int)); + } else if (tensor.place() == platform::CUDAPlace()) { +#if defined(PADDLE_WITH_CUDA) + paddle::memory::Copy(platform::CPUPlace(), + int32_host.data(), + platform::CUDAPlace(), + tensor.data(), + tensor.numel() * sizeof(int), + nullptr); +#endif + } + shape_tensor_value_[name].emplace_back(int32_host); + } } } @@ -1746,43 +1807,61 @@ void AnalysisPredictor::StatisticShapeRangeInfo() { std::map> min_shapes; std::map> max_shapes; std::map> opt_shapes; - for (auto it : shape_info_) { - auto name = it.first; - auto shapes = it.second; - - std::vector min_shape(shapes[0].begin(), shapes[0].end()); - std::vector max_shape(shapes[0].begin(), shapes[0].end()); - std::vector opt_shape(shapes[0].begin(), shapes[0].end()); - - auto ShapeMaxFreq = [](const std::map &m) -> int32_t { - std::vector> counter; - for (auto &it : m) counter.push_back(it); - std::sort( - counter.begin(), - counter.end(), - [](std::pair &a, std::pair &b) { - return a.second > b.second; - }); - return counter[0].first; - }; - - for (size_t d = 0; d < shapes[0].size(); ++d) { - std::map counter; - for (size_t i = 0; i < shapes.size(); ++i) { - counter[shapes[i][d]] += 1; - if (shapes[i][d] < min_shape[d]) min_shape[d] = shapes[i][d]; - if (shapes[i][d] > max_shape[d]) max_shape[d] = shapes[i][d]; - } - opt_shape[d] = ShapeMaxFreq(counter); - } + std::map> min_values; + std::map> max_values; + std::map> opt_values; + + auto extract_min_max_opt = + [](std::map> &min_data, + decltype(min_data) max_data, + decltype(min_data) opt_data, + decltype(shape_info_) shape_data) { + for (auto it : shape_data) { + auto name = it.first; + auto shapes = it.second; + + std::vector min_shape(shapes[0].begin(), shapes[0].end()); + std::vector max_shape(shapes[0].begin(), shapes[0].end()); + std::vector opt_shape(shapes[0].begin(), shapes[0].end()); + + auto ShapeMaxFreq = + [](const std::map &m) -> int32_t { + std::vector> counter; + for (auto &it : m) counter.push_back(it); + std::sort(counter.begin(), + counter.end(), + [](std::pair &a, + std::pair &b) { + return a.second > b.second; + }); + return counter[0].first; + }; + + for (size_t d = 0; d < shapes[0].size(); ++d) { + std::map counter; + for (size_t i = 0; i < shapes.size(); ++i) { + counter[shapes[i][d]] += 1; + if (shapes[i][d] < min_shape[d]) min_shape[d] = shapes[i][d]; + if (shapes[i][d] > max_shape[d]) max_shape[d] = shapes[i][d]; + } + opt_shape[d] = ShapeMaxFreq(counter); + } - min_shapes[name] = min_shape; - max_shapes[name] = max_shape; - opt_shapes[name] = opt_shape; - } + min_data[name] = min_shape; + max_data[name] = max_shape; + opt_data[name] = opt_shape; + } + }; + extract_min_max_opt(min_shapes, max_shapes, opt_shapes, shape_info_); + extract_min_max_opt(min_values, max_values, opt_values, shape_tensor_value_); - inference::SerializeShapeRangeInfo( - config_.shape_range_info_path(), min_shapes, max_shapes, opt_shapes); + inference::SerializeShapeRangeInfo(config_.shape_range_info_path(), + min_shapes, + max_shapes, + opt_shapes, + min_values, + max_values, + opt_values); } bool AnalysisPredictor::LoadProgramDesc() { @@ -2032,7 +2111,9 @@ std::unique_ptr AnalysisPredictor::Clone(void *stream) { 
} x->predictor_stream_ = stream; x->Init(scope_, inference_program_); +#ifdef PADDLE_WITH_TENSORRT x->executor_->ResetTrtOps(++AnalysisPredictor::clone_num_); +#endif return std::unique_ptr(x); } @@ -2094,6 +2175,8 @@ USE_TRT_CONVERTER(elementwise_add_weight); USE_TRT_CONVERTER(elementwise_sub_weight); USE_TRT_CONVERTER(elementwise_mul_weight); USE_TRT_CONVERTER(elementwise_div_weight); +USE_TRT_CONVERTER(elementwise_min_weight); +USE_TRT_CONVERTER(elementwise_max_weight); USE_TRT_CONVERTER(elementwise_pow_weight); USE_TRT_CONVERTER(elementwise_add_tensor); USE_TRT_CONVERTER(elementwise_sub_tensor); @@ -2107,6 +2190,8 @@ USE_TRT_CONVERTER(transpose2); USE_TRT_CONVERTER(flatten); USE_TRT_CONVERTER(flatten_contiguous_range); USE_TRT_CONVERTER(matmul); +USE_TRT_CONVERTER(matmul_v2); +USE_TRT_CONVERTER(bmm); USE_TRT_CONVERTER(conv2d); USE_TRT_CONVERTER(relu); USE_TRT_CONVERTER(exp); @@ -2185,6 +2270,8 @@ USE_TRT_CONVERTER(shape) USE_TRT_CONVERTER(fill_constant) USE_TRT_CONVERTER(fused_token_prune) USE_TRT_CONVERTER(layernorm_shift_partition) +USE_TRT_CONVERTER(generic_plugin_creater) +USE_TRT_CONVERTER(custom_plugin_creater) #if PADDLE_WITH_CUSPARSELT && IS_TRT_VERSION_GE(8000) USE_TRT_CONVERTER(sparse_fc) USE_TRT_CONVERTER(sparse_multihead_matmul) @@ -2304,7 +2391,7 @@ void ConvertToMixedPrecision(const std::string &model_file, const std::string &mixed_model_file, const std::string &mixed_params_file, PrecisionType mixed_precision, - BackendType backend, + paddle_infer::PlaceType backend, bool keep_io_types, std::unordered_set black_list) { auto phi_backend = paddle::ConvertBackend(backend); diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 235714257558aa..e5e0185b059b3a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -515,6 +515,7 @@ class AnalysisPredictor : public PaddlePredictor { bool status_is_cloned_{false}; std::map>> shape_info_; + std::map>> shape_tensor_value_; static int clone_num_; bool private_context_{false}; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc old mode 100644 new mode 100755 index cef7402e6c061c..293236b111630d --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -142,7 +142,7 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateScalesForOpOutputs( scales_[var_name] = scales_[input_var_name]; } compute_scale = false; - } else if (op->Type() == "slice" || op->Type() == "shape") { + } else if (op->Type() == "slice") { auto input_var_name = op->Input("Input")[0]; PADDLE_ENFORCE_NE(scales_.find(input_var_name), scales_.end(), @@ -604,10 +604,8 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const { if (predictor_.config_.ir_debug_) builder->TurnOnDebug(); auto passes = builder->AllPasses(); predictor_.argument_.SetIrAnalysisPasses(passes); - predictor_.argument_.SetAnalysisPasses({"ir_graph_clean_pass", - "ir_analysis_pass", - "memory_optimize_pass", - "ir_graph_to_program_pass"}); + predictor_.argument_.SetAnalysisPasses( + {"ir_analysis_pass", "memory_optimize_pass", "ir_graph_to_program_pass"}); predictor_.argument_.SetQuantVarScales(scales_); } diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 7bf5dc2cfe35db..7dfc8d1df41ded 100755 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ 
b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -170,13 +170,6 @@ struct PD_INFER_DECL AnalysisConfig { kBf16, ///< bf16 }; - enum class Backend { - kCPU = 0, - kGPU, - kXPU, - kNPU, - }; - /// /// \brief Set the no-combined model dir path. /// @@ -254,8 +247,12 @@ struct PD_INFER_DECL AnalysisConfig { /// /// \param memory_pool_init_size_mb initial size of the GPU memory pool in MB. /// \param device_id device_id the GPU card to use (default is 0). + /// \param precision the precision used in Paddle-GPU inference. /// - void EnableUseGpu(uint64_t memory_pool_init_size_mb, int device_id = 0); + void EnableUseGpu(uint64_t memory_pool_init_size_mb, + int device_id = 0, + Precision precision_mode = Precision::kFloat32); + /// /// \brief Turn off GPU. /// @@ -289,6 +286,22 @@ struct PD_INFER_DECL AnalysisConfig { const std::string& precision = "int16", bool adaptive_seqlen = false); + /// + /// \brief configs of IPU + /// + enum class ipu_config_code { + ipu_device_num, + ipu_micro_batch_size, + ipu_enable_pipelining, + ipu_batches_per_step, + ipu_enable_fp16, + ipu_replica_num, + ipu_available_memory_proportion, + ipu_enable_half_partial, + ipu_custom_ops_info, + ipu_custom_patterns + }; + /// /// \brief Turn on IPU. /// @@ -318,6 +331,25 @@ struct PD_INFER_DECL AnalysisConfig { float ipu_available_memory_proportion = 1.0, bool ipu_enable_half_partial = false); + /// + /// \brief Set IPU custom ops and patterns. + /// + /// \param custom_ops_info the mapper of paddle custom ops and popart ops. + /// e.g. {{paddle_op_name, popart_op_name, op_domain, op_version}}. + /// \param custom_patterns the names of popart patterns. e.g. {{pattern_name, + /// enable_pattern}}} + /// + void SetIpuCustomInfo( + const std::vector>& ipu_custom_ops_info = {}, + const std::map& ipu_custom_patterns = {}); + + /// + /// \brief Load IPU config from configuration file. + /// + /// \param config_path configure file path for ipu. + /// + void LoadIpuConfig(const std::string& config_path); + /// /// \brief Set XPU device id. /// @@ -939,9 +971,13 @@ struct PD_INFER_DECL AnalysisConfig { /// interface is in the experimental stage and may change in the future. Note /// that the blacklist must be the same as the model conversion blacklist. /// - void Exp_SetBlackListOpsForMixedModel( + void Exp_DisableMixedPrecisionOps( const std::unordered_set& black_list); + void SetApplyOptim(bool value) { apply_optim_ = value; } + + void SetSkipLoadParams(bool value) { skip_load_params_ = value; } + protected: // Update the config. void Update(); @@ -955,13 +991,15 @@ struct PD_INFER_DECL AnalysisConfig { mutable std::string params_file_; mutable std::string calibration_file_path_; - // Mixed precision. + // Mixed precision related. + Precision mixed_precision_mode_{Precision::kFloat32}; std::unordered_set mixed_black_list_; // GPU related. bool use_gpu_{false}; int gpu_device_id_{0}; uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. 
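Editorial aside (not part of the patch): the AnalysisConfig changes above remove the Backend enum in favour of paddle_infer::PlaceType, add a precision argument to EnableUseGpu, and rename Exp_SetBlackListOpsForMixedModel to Exp_DisableMixedPrecisionOps. A minimal usage sketch under the new signatures; the model paths and the blacklisted op name are illustrative, not taken from the patch.

#include "paddle_inference_api.h"

// Run GPU inference in half precision with the extended EnableUseGpu,
// keeping the listed ops out of the low-precision conversion via the
// renamed blacklist setter.
void BuildMixedPrecisionPredictor() {
  paddle_infer::Config config;
  config.SetModel("model.pdmodel", "model.pdiparams");        // hypothetical paths
  config.EnableUseGpu(/*memory_pool_init_size_mb=*/256, /*device_id=*/0,
                      paddle_infer::PrecisionType::kHalf);    // new 3rd argument
  config.Exp_DisableMixedPrecisionOps({"softmax"});           // op name illustrative
  auto predictor = paddle_infer::CreatePredictor(config);
}

// The offline converter now takes a PlaceType instead of the removed Backend enum.
void ConvertOffline() {
  paddle_infer::ConvertToMixedPrecision(
      "model.pdmodel", "model.pdiparams",
      "mixed.pdmodel", "mixed.pdiparams",
      paddle_infer::PrecisionType::kHalf,
      paddle_infer::PlaceType::kGPU);
}

End of aside.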
+ bool enable_gpu_mixed_{false}; bool thread_local_stream_{false}; bool use_cudnn_{false}; @@ -1118,6 +1156,22 @@ struct PD_INFER_DECL AnalysisConfig { float ipu_available_memory_proportion_{1.0}; bool ipu_enable_half_partial_{false}; + std::vector> ipu_custom_ops_info_; + std::vector> ipu_custom_patterns_; + + const std::unordered_map ipu_config_mapper_ = { + {"ipu_device_num", ipu_config_code::ipu_device_num}, + {"ipu_micro_batch_size", ipu_config_code::ipu_micro_batch_size}, + {"ipu_enable_pipelining", ipu_config_code::ipu_enable_pipelining}, + {"ipu_batches_per_step", ipu_config_code::ipu_batches_per_step}, + {"ipu_enable_fp16", ipu_config_code::ipu_enable_fp16}, + {"ipu_replica_num", ipu_config_code::ipu_replica_num}, + {"ipu_available_memory_proportion", + ipu_config_code::ipu_available_memory_proportion}, + {"ipu_enable_half_partial", ipu_config_code::ipu_enable_half_partial}, + {"ipu_custom_ops_info", ipu_config_code::ipu_custom_ops_info}, + {"ipu_custom_patterns", ipu_config_code::ipu_custom_patterns}}; + // If the config is already used on a predictor, it becomes invalid. // Any config can only be used with one predictor. // Variables held by config can take up a lot of memory in some cases. @@ -1128,6 +1182,13 @@ struct PD_INFER_DECL AnalysisConfig { // fleet exe related DistConfig dist_config_{}; + + // jit engine related + // NOTE(Aureliue84): In case of Predictor in JITLayer, program is from outer + // which means Predictor should apply optimization by calling + // PrepareProgram(). So we add this flag to control the process. + bool apply_optim_{false}; + bool skip_load_params_{false}; }; } // namespace paddle diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index ae844f138b0f68..055cf3a13fbaf7 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -47,7 +47,6 @@ namespace paddle_infer { using PrecisionType = paddle::AnalysisConfig::Precision; using Config = paddle::AnalysisConfig; using DistConfig = paddle::DistConfig; -using BackendType = paddle::AnalysisConfig::Backend; /// /// \class Predictor @@ -198,7 +197,7 @@ PD_INFER_DECL void ConvertToMixedPrecision( const std::string& mixed_model_file, const std::string& mixed_params_file, PrecisionType mixed_precision, - BackendType backend, + PlaceType backend, bool keep_io_types = true, std::unordered_set black_list = {}); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc old mode 100644 new mode 100755 index 2b5cb6dd050a6e..c5b2cd6e201a58 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -84,8 +84,7 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::ClearPasses() { passes_.clear(); } const std::vector kTRTSubgraphPasses({ - "identity_scale_op_clean_pass", // - "adaptive_pool2d_convert_global_pass", // + "adaptive_pool2d_convert_global_pass", // "shuffle_channel_detect_pass", // "quant_conv2d_dequant_fuse_pass", // "delete_fill_constant_op_pass", // @@ -93,6 +92,7 @@ const std::vector kTRTSubgraphPasses({ "delete_quant_dequant_filter_op_pass", // "delete_weight_dequant_linear_op_pass", // "delete_quant_dequant_linear_op_pass", // + "identity_scale_op_clean_pass", // "add_support_int8_pass", // // "fc_fuse_pass", // "simplify_with_basic_ops_pass", // @@ -110,7 +110,6 @@ const std::vector kTRTSubgraphPasses({ 
"conv_bn_fuse_pass", // "unsqueeze2_eltwise_fuse_pass", // "trt_squeeze2_matmul_fuse_pass", // - "trt_reshape2_matmul_fuse_pass", // "trt_flatten2_matmul_fuse_pass", // "trt_map_matmul_v2_to_mul_pass", // "trt_map_matmul_v2_to_matmul_pass", // @@ -164,10 +163,17 @@ const std::vector kGpuLowerPrecisionPasses{ "conv_elementwise_add2_act_fuse_pass", "conv_elementwise_add_fuse_pass", "multihead_matmul_fuse_pass_v2", + "fused_multi_transformer_encoder_pass", + "fused_multi_transformer_decoder_pass", + "fused_multi_transformer_encoder_fuse_qkv_pass", + "fused_multi_transformer_decoder_fuse_qkv_pass", + "multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass", + "multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass", "gpu_cpu_map_matmul_v2_to_mul_pass", "gpu_cpu_map_matmul_v2_to_matmul_pass", + "gpu_cpu_map_matmul_to_mul_pass", "fc_fuse_pass", - "fc_elementwise_layernorm_fuse_pass", + // "fc_elementwise_layernorm_fuse_pass", "embedding_eltwise_layernorm_fuse_pass", "runtime_context_cache_pass", }; @@ -188,22 +194,29 @@ const std::vector kTrtLowerPrecisionPasses{ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ // "identity_scale_op_clean_pass", // - "is_test_pass", // - "simplify_with_basic_ops_pass", // - "conv_bn_fuse_pass", // - "conv_eltwiseadd_bn_fuse_pass", // - "embedding_eltwise_layernorm_fuse_pass", // - "multihead_matmul_fuse_pass_v2", // - "gpu_cpu_squeeze2_matmul_fuse_pass", // - "gpu_cpu_reshape2_matmul_fuse_pass", // - "gpu_cpu_flatten2_matmul_fuse_pass", // - "gpu_cpu_map_matmul_v2_to_mul_pass", // - "gpu_cpu_map_matmul_v2_to_matmul_pass", // - "matmul_scale_fuse_pass", // - "multihead_matmul_fuse_pass_v3", // - "gpu_cpu_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "fc_elementwise_layernorm_fuse_pass", // + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + "conv_eltwiseadd_bn_fuse_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "fused_multi_transformer_encoder_pass", // + "fused_multi_transformer_decoder_pass", // + "fused_multi_transformer_encoder_fuse_qkv_pass", // + "fused_multi_transformer_decoder_fuse_qkv_pass", // + "multi_devices_fused_multi_transformer_encoder_fuse_qkv_pass", // + "multi_devices_fused_multi_transformer_decoder_fuse_qkv_pass", // + "fuse_multi_transformer_layer_pass", // + "gpu_cpu_squeeze2_matmul_fuse_pass", // + "gpu_cpu_reshape2_matmul_fuse_pass", // + "gpu_cpu_flatten2_matmul_fuse_pass", // + "gpu_cpu_map_matmul_v2_to_mul_pass", // + "gpu_cpu_map_matmul_v2_to_matmul_pass", // + "matmul_scale_fuse_pass", // + "multihead_matmul_fuse_pass_v3", // + "gpu_cpu_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 // cudnn8.0 has memory leak problem in conv + eltwise + act, so we @@ -215,9 +228,10 @@ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { "conv_elementwise_add_fuse_pass", // #endif // "transpose_flatten_concat_fuse_pass", // - "constant_folding_pass", + "constant_folding_pass", // // following pass should be located in the last, since it will // work on all fused ops. 
+ "auto_mixed_precision_pass", // "runtime_context_cache_pass" }); @@ -296,6 +310,7 @@ void CpuPassStrategy::EnableMKLDNN() { passes_.insert(passes_.begin(), "mkldnn_placement_pass"); for (auto &pass : std::vector({ + "squeeze2_transpose2_onednn_fuse_pass", "depthwise_conv_mkldnn_pass", // "conv_bn_fuse_pass", // Execute BN passes again to "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order @@ -321,6 +336,9 @@ void CpuPassStrategy::EnableMKLDNN() { "softplus_activation_mkldnn_fuse_pass", // "shuffle_channel_mkldnn_detect_pass", // "elt_act_mkldnn_fuse_pass", // + "operator_scale_onednn_fuse_pass", // + "operator_unsqueeze2_onednn_fuse_pass", // + "operator_reshape2_onednn_fuse_pass", // // TODO(intel): Please fix the bug on windows. // https://github.com/PaddlePaddle/Paddle/issues/29710 // "mkldnn_inplace_pass", // This pass should be activated after @@ -371,6 +389,8 @@ void CpuPassStrategy::EnableMkldnnInt8() { passes_.push_back("quant_dequant_mkldnn_pass"); passes_.push_back("mkldnn_placement_pass"); passes_.push_back("simplify_with_basic_ops_pass"); + passes_.push_back("constant_folding_pass"); + passes_.push_back("squeeze2_transpose2_onednn_fuse_pass"); passes_.push_back("layer_norm_fuse_pass"); passes_.push_back("attention_lstm_fuse_pass"); passes_.push_back("seqconv_eltadd_relu_fuse_pass"); @@ -413,6 +433,9 @@ void CpuPassStrategy::EnableMkldnnInt8() { passes_.push_back("scale_matmul_fuse_pass"); passes_.push_back("reshape_transpose_matmul_mkldnn_fuse_pass"); passes_.push_back("matmul_elementwise_add_mkldnn_fuse_pass"); + passes_.push_back("operator_scale_onednn_fuse_pass"); + passes_.push_back("operator_unsqueeze2_onednn_fuse_pass"); + passes_.push_back("operator_reshape2_onednn_fuse_pass"); passes_.push_back("cpu_quantize_placement_pass"); passes_.push_back("cpu_quantize_pass"); passes_.push_back("cpu_quantize_squash_pass"); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index cd97382785395f..c8083e87dd8f0d 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -115,7 +115,6 @@ class PD_INFER_DECL PaddlePassBuilder { /// \cond Protected std::vector analysis_passes_{ {"ir_graph_build_pass", - "ir_graph_clean_pass", "ir_analysis_pass", "ir_params_sync_among_devices_pass", "adjust_cudnn_workspace_size_pass", diff --git a/paddle/fluid/inference/api/resource_manager.h b/paddle/fluid/inference/api/resource_manager.h index 1b375efaf3b5f9..6655324d305b6b 100644 --- a/paddle/fluid/inference/api/resource_manager.h +++ b/paddle/fluid/inference/api/resource_manager.h @@ -23,6 +23,7 @@ #include "paddle/phi/api/include/tensor.h" #include "paddle/phi/backends/cpu/forwards.h" #include "paddle/phi/common/place.h" +#include "unsupported/Eigen/CXX11/Tensor" #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) #include "paddle/fluid/platform/device/gpu/gpu_types.h" diff --git a/paddle/fluid/inference/capi_exp/CMakeLists.txt b/paddle/fluid/inference/capi_exp/CMakeLists.txt index 089a766b91cfe4..26d76c280bd6ba 100644 --- a/paddle/fluid/inference/capi_exp/CMakeLists.txt +++ b/paddle/fluid/inference/capi_exp/CMakeLists.txt @@ -31,6 +31,18 @@ cc_library( DEPS paddle_inference) set_target_properties(paddle_inference_c_shared PROPERTIES OUTPUT_NAME paddle_inference_c) + +if(APPLE) + target_link_libraries( + paddle_inference_c_shared + xxhash + utf8proc + cryptopp + protobuf + gflags + cblas) +endif() + if(WIN32) 
target_link_libraries(paddle_inference_c_shared shlwapi.lib) endif() diff --git a/paddle/fluid/inference/goapi/config.go b/paddle/fluid/inference/goapi/config.go index 0aca2a1075fd3f..d156252985eb25 100644 --- a/paddle/fluid/inference/goapi/config.go +++ b/paddle/fluid/inference/goapi/config.go @@ -332,9 +332,9 @@ func (config *Config) IrOptim() bool { /// \param useCalibMode Use TRT int8 calibration(post training /// quantization). /// -func (config *Config) EnableTensorRtEngine(workspaceSize int32, maxBatchSize int32, minSubgraphSize int32, +func (config *Config) EnableTensorRtEngine(workspaceSize int64, maxBatchSize int32, minSubgraphSize int32, precision Precision, useStatic bool, useCalibMode bool) { - C.PD_ConfigEnableTensorRtEngine(config.c, C.int32_t(workspaceSize), C.int32_t(maxBatchSize), C.int32_t(minSubgraphSize), C.int32_t(precision), cvtGoBoolToPD(useStatic), cvtGoBoolToPD(useCalibMode)) + C.PD_ConfigEnableTensorRtEngine(config.c, C.int64_t(workspaceSize), C.int32_t(maxBatchSize), C.int32_t(minSubgraphSize), C.int32_t(precision), cvtGoBoolToPD(useStatic), cvtGoBoolToPD(useCalibMode)) } /// diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index cad5903540b907..253df63763329a 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -160,11 +160,11 @@ std::unique_ptr Load(framework::Executor* executor, return main_program; } -std::unique_ptr Load( - framework::Executor* executor, - framework::Scope* scope, - const std::string& prog_filename, - const std::string& param_filename) { +std::unique_ptr Load(framework::Executor* executor, + framework::Scope* scope, + const std::string& prog_filename, + const std::string& param_filename, + bool load_params) { std::string program_desc_str; ReadBinaryFile(prog_filename, &program_desc_str); @@ -175,13 +175,14 @@ std::unique_ptr Load( true, platform::errors::Unavailable("Model version %ld is not supported.", main_program->Version())); - - LoadPersistables(executor, - scope, - *main_program, - "", - param_filename, - false /* model_from_memory */); + if (load_params) { + LoadPersistables(executor, + scope, + *main_program, + "", + param_filename, + false /* model_from_memory */); + } return main_program; } diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index 31ed29e425dd96..36e21f8f36e137 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -42,7 +42,8 @@ std::unique_ptr Load(framework::Executor* executor, std::unique_ptr Load(framework::Executor* executor, framework::Scope* scope, const std::string& prog_filename, - const std::string& param_filename); + const std::string& param_filename, + bool load_params = true); std::unique_ptr LoadFromMemory( framework::Executor* executor, diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt index 7239b506d33f6f..d4a4c8c06af756 100644 --- a/paddle/fluid/inference/tensorrt/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/CMakeLists.txt @@ -12,10 +12,18 @@ else() SRCS engine.cc trt_int8_calibrator.cc DEPS ${GLOB_OPERATOR_DEPS} framework_proto device_context) endif() +nv_library( + tensorrt_dynamic_shape_infermeta_factory + SRCS dynamic_shape_infermeta.cc + DEPS framework_proto) +nv_library( + tensorrt_plugin_arg_mapping_context + SRCS plugin_arg_mapping_context.cc + DEPS framework_proto) nv_library( tensorrt_op_teller SRCS op_teller.cc - DEPS framework_proto device_context) + DEPS framework_proto device_context tensorrt_dynamic_shape_infermeta_factory) 
nv_test( test_tensorrt SRCS test_tensorrt.cc @@ -24,6 +32,10 @@ nv_test( test_tensorrt_engine SRCS test_engine.cc test_dynamic_engine.cc DEPS dynload_cuda tensorrt_engine tensorrt_plugin) +nv_test( + test_arg_mapping_context + SRCS test_arg_mapping_context.cc + DEPS framework_proto tensorrt_plugin_arg_mapping_context) if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index ce95363b72d0b3..a40f2bd58d582c 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -3,6 +3,8 @@ list( APPEND CONVERT_FILES matmul_op.cc + matmul_v2_op.cc + bmm_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc @@ -76,7 +78,8 @@ list( shape_op.cc fill_constant_op.cc fused_token_prune_op.cc - layernorm_shift_partition_op.cc) + layernorm_shift_partition_op.cc + generic_and_custom_plugin_creater.cc) if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND CONVERT_FILES sparse_fc_op.cc sparse_multihead_matmul_op.cc) @@ -85,7 +88,12 @@ endif() nv_library( tensorrt_converter SRCS ${CONVERT_FILES} - DEPS tensorrt_engine tensorrt_plugin operator scope framework_proto + DEPS tensorrt_engine + tensorrt_plugin + operator + scope + framework_proto + tensorrt_op_teller op_registry) nv_test( @@ -94,6 +102,11 @@ nv_test( DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_converter) +nv_test( + test_custom_plugin_creater + SRCS test_custom_plugin_creater.cc + DEPS paddle_framework tensorrt_converter op_meta_info custom_operator) + if(WITH_ONNXRUNTIME AND WIN32) # Copy onnxruntime for some c++ test in Windows, since the test will # be build only in CI, so suppose the generator in Windows is Ninja. diff --git a/paddle/fluid/inference/tensorrt/convert/bmm_op.cc b/paddle/fluid/inference/tensorrt/convert/bmm_op.cc new file mode 100644 index 00000000000000..4f4751d8ca977d --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/bmm_op.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +class BMMOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + nvinfer1::ILayer* layer = nullptr; + + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); + auto output_name = op_desc.Output("Out")[0]; + + layer = TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input1, + nvinfer1::MatrixOperation::kNONE, + *input2, + nvinfer1::MatrixOperation::kNONE); + + RreplenishLayerAndOutput(layer, "bmm", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(bmm, BMMOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/cast_op.cc b/paddle/fluid/inference/tensorrt/convert/cast_op.cc index ab62c43d851eb8..b2b06744d984ab 100644 --- a/paddle/fluid/inference/tensorrt/convert/cast_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/cast_op.cc @@ -43,13 +43,13 @@ class CastOpConverter : public OpConverter { switch (out_dtype) { case 2: // INT32 = 2 - layer->getOutput(0)->setType(nvinfer1::DataType::kINT32); + layer->setOutputType(0, nvinfer1::DataType::kINT32); break; case 4: // FP16 = 4 - layer->getOutput(0)->setType(nvinfer1::DataType::kHALF); + layer->setOutputType(0, nvinfer1::DataType::kHALF); break; case 5: // FP32 = 5 - layer->getOutput(0)->setType(nvinfer1::DataType::kFLOAT); + layer->setOutputType(0, nvinfer1::DataType::kFLOAT); break; default: LOG(ERROR) << "Unable to convert a fluid data type(" << out_dtype diff --git a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc index e873ad4f624fc0..7f2c400bfec2ac 100644 --- a/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/elementwise_op.cc @@ -216,6 +216,10 @@ REGISTER_TRT_OP_CONVERTER(elementwise_sub_weight, ElementwiseTensorSubOpConverter); REGISTER_TRT_OP_CONVERTER(elementwise_div_weight, ElementwiseTensorDivOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_max_weight, + ElementwiseTensorMaxOpConverter); +REGISTER_TRT_OP_CONVERTER(elementwise_min_weight, + ElementwiseTensorMinOpConverter); REGISTER_TRT_OP_CONVERTER(elementwise_pow_weight, ElementwiseTensorPowOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc b/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc new file mode 100644 index 00000000000000..b5d9a50f06d7df --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/generic_and_custom_plugin_creater.cc @@ -0,0 +1,233 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/tensorrt/plugin/generic_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +/* + * Stack converter from fluid to tensorRT. + */ +class CustomPluginCreater : public OpConverter { + public: + void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + VLOG(3) << "convert " << op_desc.Type() << " op to custom pluign layer"; + + std::string plugin_name; + + if (engine_->with_dynamic_shape()) { + plugin_name = op_desc.Type() + "_paddle_trt_dynamic_plugin"; + } else { + plugin_name = op_desc.Type() + "_paddle_trt_plugin"; + } + + nvinfer1::ILayer *layer = nullptr; + std::vector inputs; + + auto &op_meta_info_map = OpMetaInfoMap::Instance(); + const auto &meta_info_map = op_meta_info_map.GetMap(); + auto &op_info = meta_info_map.at(op_desc.Type()).front(); + + // set inputs + auto &op_input_names = framework::OpMetaInfoHelper::GetInputs(op_info); + for (auto ¶m_name : op_input_names) { + for (auto &arg_name : op_desc.Input(param_name)) { + inputs.push_back(engine_->GetITensor(arg_name)); + } + } + auto creator = + GetPluginRegistry()->getPluginCreator(plugin_name.c_str(), "1"); + CHECK(creator); + + // set attrs + std::vector plugindatas; + auto &op_attrs_names = framework::OpMetaInfoHelper::GetAttrs(op_info); + auto &attrs = op_desc.GetAttrMap(); + + std::list int_attrs; + std::list float_attrs; + std::list bool_attrs; + std::list string_attrs; + std::list> ints_attrs; + std::list> floats_attrs; + + for (auto &attr_name : op_attrs_names) { + nvinfer1::PluginField plugindata; + plugindata.name = attr_name.c_str(); + if (op_desc.GetAttrType(attr_name) == framework::proto::AttrType::INT) { + int_attrs.push_back(PADDLE_GET_CONST(int, attrs.at(attr_name))); + plugindata.data = &int_attrs.back(); + plugindata.type = nvinfer1::PluginFieldType::kINT32; + plugindata.length = 1; + } else if (op_desc.GetAttrType(attr_name) == + framework::proto::AttrType::FLOAT) { + float_attrs.push_back(PADDLE_GET_CONST(float, attrs.at(attr_name))); + plugindata.data = &float_attrs.back(); + plugindata.type = nvinfer1::PluginFieldType::kFLOAT32; + plugindata.length = 1; + } else if (op_desc.GetAttrType(attr_name) == + framework::proto::AttrType::BOOLEAN) { + int_attrs.push_back(PADDLE_GET_CONST(bool, attrs.at(attr_name))); + plugindata.data = &int_attrs.back(); + plugindata.type = nvinfer1::PluginFieldType::kINT32; + plugindata.length = 1; + } else if (op_desc.GetAttrType(attr_name) == + framework::proto::AttrType::STRING) { + string_attrs.push_back( + PADDLE_GET_CONST(std::string, attrs.at(attr_name))); + plugindata.data = string_attrs.back().data(); + plugindata.type = nvinfer1::PluginFieldType::kCHAR; + plugindata.length = + string_attrs.back().size() + 1; // string ends with ‘\0’ + } else if 
(op_desc.GetAttrType(attr_name) == + framework::proto::AttrType::INTS) { + ints_attrs.push_back( + PADDLE_GET_CONST(std::vector, attrs.at(attr_name))); + plugindata.data = ints_attrs.back().data(); + plugindata.type = nvinfer1::PluginFieldType::kINT32; + plugindata.length = ints_attrs.back().size(); + } else if (op_desc.GetAttrType(attr_name) == + framework::proto::AttrType::FLOATS) { + floats_attrs.push_back( + PADDLE_GET_CONST(std::vector, attrs.at(attr_name))); + plugindata.data = floats_attrs.back().data(); + plugindata.type = nvinfer1::PluginFieldType::kFLOAT32; + plugindata.length = floats_attrs.back().size(); + } else if (op_desc.GetAttrType(attr_name) == + framework::proto::AttrType::BOOLEANS) { + auto bools_attr = + PADDLE_GET_CONST(std::vector, attrs.at(attr_name)); + std::vector convert_to_ints_attr; + for (bool i : bools_attr) convert_to_ints_attr.push_back(i); + ints_attrs.push_back(convert_to_ints_attr); + plugindata.data = ints_attrs.back().data(); + plugindata.type = nvinfer1::PluginFieldType::kINT32; + plugindata.length = ints_attrs.back().size(); + } else { + CHECK(false) << "UNKNOWN PluginFieldType."; + } + plugindatas.push_back(plugindata); + } + + nvinfer1::PluginFieldCollection plugin_fc{(int32_t)plugindatas.size(), + plugindatas.data()}; + + auto *plugin = creator->createPlugin(op_desc.Type().c_str(), &plugin_fc); + CHECK(plugin); + + if (engine_->with_dynamic_shape()) { + layer = + engine_->AddDynamicPlugin(inputs.data(), + inputs.size(), + (plugin::DynamicPluginTensorRT *)plugin); + } else { + layer = engine_->AddPlugin( + inputs.data(), inputs.size(), (plugin::PluginTensorRT *)plugin); + } + + CHECK(layer); + + // set outputs + auto &op_output_names = framework::OpMetaInfoHelper::GetOutputs(op_info); + std::vector output_names; + for (auto ¶m_name : op_output_names) { + for (auto &arg_name : op_desc.Output(param_name)) + output_names.push_back(arg_name); + } + + RreplenishLayerAndOutput(layer, op_desc.Type(), output_names, test_mode); + } +}; + +class GenericPluginCreater : public OpConverter { + public: + void operator()(const framework::proto::OpDesc &op, + const framework::Scope &scope, + bool test_mode) override { + framework::OpDesc op_desc(op, nullptr); + CHECK(block_); + const framework::BlockDesc block_desc( + nullptr, const_cast(block_)); + + nvinfer1::ILayer *layer = nullptr; + std::vector inputs; + + phi::KernelSignature phi_kernel_signature; + if (phi::OpUtilsMap::Instance().HasArgumentMappingFn(op_desc.Type())) { + const phi::ArgumentMappingFn *argument_mapping_func = + phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_desc.Type()); + PluginArgumentMappingContext argument_mapping_context(&op_desc); + phi_kernel_signature = (*argument_mapping_func)(argument_mapping_context); + } else { + phi_kernel_signature = + phi::DefaultKernelSignatureMap::Instance().Get(op_desc.Type()); + } + + plugin::GenericPlugin::InputOutPutVarInfo in_out_info; + + for (auto ¶m_name : phi_kernel_signature.input_names) { + for (auto &arg_name : op_desc.Input(param_name)) { + inputs.push_back(engine_->GetITensor(arg_name)); + auto *var = block_desc.FindVar(arg_name); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::NotFound( + "There is no variable called %s in block.", arg_name.c_str())); + PADDLE_ENFORCE_EQ( + var->GetType(), + FluidDT::VarType_Type_LOD_TENSOR, + platform::errors::InvalidArgument("TensorRT engine only takes " + "LoDTensor as input")); + in_out_info.inputs_data_type.push_back(var->GetDataType()); + } + } + + std::vector output_names; + for (auto ¶m_name 
: phi_kernel_signature.output_names) { + for (auto &arg_name : op_desc.Output(param_name)) { + output_names.push_back(arg_name); + auto *var = block_desc.FindVar(arg_name); + PADDLE_ENFORCE_NOT_NULL( + var, + platform::errors::NotFound( + "There is no variable called %s in block.", arg_name.c_str())); + PADDLE_ENFORCE_EQ( + var->GetType(), + FluidDT::VarType_Type_LOD_TENSOR, + platform::errors::InvalidArgument("TensorRT engine only takes " + "LoDTensor as input")); + in_out_info.outputs_data_type.push_back(var->GetDataType()); + } + } + plugin::GenericPlugin *plugin = new plugin::GenericPlugin(op, in_out_info); + layer = engine_->AddDynamicPlugin(inputs.data(), inputs.size(), plugin); + + RreplenishLayerAndOutput(layer, op_desc.Type(), output_names, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(custom_plugin_creater, CustomPluginCreater); +REGISTER_TRT_OP_CONVERTER(generic_plugin_creater, GenericPluginCreater); diff --git a/paddle/fluid/inference/tensorrt/convert/layernorm_shift_partition_op.cc b/paddle/fluid/inference/tensorrt/convert/layernorm_shift_partition_op.cc index 15f2663ce59bdc..147c9a97316671 100644 --- a/paddle/fluid/inference/tensorrt/convert/layernorm_shift_partition_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/layernorm_shift_partition_op.cc @@ -40,11 +40,9 @@ class LayerNormShiftPartitionOpConverter : public OpConverter { : 1e-5f; const int window_size = PADDLE_GET_CONST(int, op_desc.GetAttr("window_size")); + const int shift_size = PADDLE_GET_CONST(int, op_desc.GetAttr("shift_size")); const int input_resolution = PADDLE_GET_CONST(int, op_desc.GetAttr("input_resolution")); - // int shift_size = window_size / 2; - // shift_size = (input_resolution <= window_size) ? 0 : shift_size; - int shift_size = 0; PADDLE_ENFORCE_NOT_NULL( Bias_v, diff --git a/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc b/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc new file mode 100644 index 00000000000000..e87b2844373bc1 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/matmul_v2_op.cc @@ -0,0 +1,136 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/plugin/matmul_op_int8_plugin.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * MatMulV2Op, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights. 
+ */ +class MatMulV2OpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, + bool test_mode) override { + VLOG(3) << "convert a matmul_v2 op to tensorrt IMatrixMultiplyLayer layer "; + framework::OpDesc op_desc(op, nullptr); + nvinfer1::IMatrixMultiplyLayer* layer = nullptr; + + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); + + nvinfer1::Dims dims_x = input1->getDimensions(); + nvinfer1::Dims dims_y = input2->getDimensions(); + + bool transpose_X = PADDLE_GET_CONST(bool, op_desc.GetAttr("trans_x")); + bool transpose_Y = PADDLE_GET_CONST(bool, op_desc.GetAttr("trans_y")); + + auto output_name = op_desc.Output("Out")[0]; + + nvinfer1::MatrixOperation matrix_operation_X = + transpose_X ? nvinfer1::MatrixOperation::kTRANSPOSE + : nvinfer1::MatrixOperation::kNONE; + nvinfer1::MatrixOperation matrix_operation_Y = + transpose_Y ? nvinfer1::MatrixOperation::kTRANSPOSE + : nvinfer1::MatrixOperation::kNONE; + + int one_num = 0; + bool all_matrix = dims_x.nbDims >= 2 && dims_y.nbDims >= 2; + nvinfer1::ITensor* new_shape_tensor = nullptr; + if (dims_x.nbDims < dims_y.nbDims && all_matrix) { + one_num = dims_y.nbDims - dims_x.nbDims; + new_shape_tensor = Shape(input1); + std::vector one_vec(one_num, 1); + auto* one_tensor = Add1DConstantLayer(one_vec); + new_shape_tensor = + Concat(std::vector{one_tensor, new_shape_tensor}); + + auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input1); + reshape_layer->setInput(1, *new_shape_tensor); + + layer = TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *reshape_layer->getOutput(0), + matrix_operation_X, + *input2, + matrix_operation_Y); + + } else if (dims_x.nbDims > dims_y.nbDims && all_matrix) { + one_num = dims_x.nbDims - dims_y.nbDims; + new_shape_tensor = Shape(input2); + std::vector one_vec(one_num, 1); + auto* one_tensor = Add1DConstantLayer(one_vec); + new_shape_tensor = + Concat(std::vector{one_tensor, new_shape_tensor}); + auto* reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *input2); + reshape_layer->setInput(1, *new_shape_tensor); + + layer = TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input1, + matrix_operation_X, + *reshape_layer->getOutput(0), + matrix_operation_Y); + + } else { + layer = TRT_ENGINE_ADD_LAYER(engine_, + MatrixMultiply, + *input1, + matrix_operation_X, + *input2, + matrix_operation_Y); + } + if (dims_x.nbDims == 1) + layer->setOperation(0, nvinfer1::MatrixOperation::kVECTOR); + if (dims_y.nbDims == 1) + layer->setOperation(1, nvinfer1::MatrixOperation::kVECTOR); + nvinfer1::ILayer* final_layer = static_cast(layer); + // When vec * vec, trt produces a scalar, so to be consistent with paddle, + // we need add a reshape. 
+ if (dims_x.nbDims == 1 && dims_y.nbDims == 1) { + auto reshape_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0)); + nvinfer1::Dims reshape_dim; + reshape_dim.nbDims = 1; + reshape_dim.d[0] = 1; + reshape_layer->setReshapeDimensions(reshape_dim); + final_layer = static_cast(reshape_layer); + } + VLOG(3) << "Convert a matmul_v2_op to TensorRT "; + + RreplenishLayerAndOutput( + final_layer, "matmul_v2_op", {output_name}, test_mode); + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(matmul_v2, MatMulV2OpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc index 53e8ffb1c0ffba..d1720f270e7beb 100644 --- a/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms3_op.cc @@ -54,18 +54,34 @@ class MultiClassNMS3OpConverter : public OpConverter { PADDLE_GET_CONST(float, op_desc.GetAttr("nms_threshold")); int keep_top_k = PADDLE_GET_CONST(int, op_desc.GetAttr("keep_top_k")); bool normalized = PADDLE_GET_CONST(bool, op_desc.GetAttr("normalized")); - int num_classes = scores_tensor->getDimensions().d[0]; + int class_index = engine_->with_dynamic_shape() ? 1 : 0; + int num_classes = scores_tensor->getDimensions().d[class_index]; auto bboxes_dims = bboxes_tensor->getDimensions(); - nvinfer1::Dims3 bboxes_expand_dims(bboxes_dims.d[0], 1, bboxes_dims.d[1]); - auto* bboxes_expand_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *bboxes_tensor); - bboxes_expand_layer->setReshapeDimensions(bboxes_expand_dims); - - nvinfer1::Permutation permutation{1, 0}; - auto* scores_transpose_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scores_tensor); - scores_transpose_layer->setFirstTranspose(permutation); + nvinfer1::IShuffleLayer* bboxes_expand_layer = nullptr; + nvinfer1::IShuffleLayer* scores_transpose_layer = nullptr; + if (engine_->with_dynamic_shape()) { + nvinfer1::Dims4 bboxes_expand_dims( + bboxes_dims.d[0], bboxes_dims.d[1], 1, bboxes_dims.d[2]); + bboxes_expand_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *bboxes_tensor); + bboxes_expand_layer->setReshapeDimensions(bboxes_expand_dims); + + nvinfer1::Permutation permutation{0, 2, 1}; + scores_transpose_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scores_tensor); + scores_transpose_layer->setFirstTranspose(permutation); + } else { + nvinfer1::Dims3 bboxes_expand_dims(bboxes_dims.d[0], 1, bboxes_dims.d[1]); + bboxes_expand_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *bboxes_tensor); + bboxes_expand_layer->setReshapeDimensions(bboxes_expand_dims); + + nvinfer1::Permutation permutation{1, 0}; + scores_transpose_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scores_tensor); + scores_transpose_layer->setFirstTranspose(permutation); + } std::vector batch_nms_inputs; batch_nms_inputs.push_back(bboxes_expand_layer->getOutput(0)); @@ -101,27 +117,41 @@ class MultiClassNMS3OpConverter : public OpConverter { fields.size() * sizeof(nvinfer1::PluginField))); plugin_collections->nbFields = static_cast(fields.size()); plugin_collections->fields = fields.data(); - - auto creator = GetPluginRegistry()->getPluginCreator("BatchedNMS_TRT", "1"); + std::string nms_plugin_name = "BatchedNMS_TRT"; + if (engine_->with_dynamic_shape()) { + nms_plugin_name = "BatchedNMSDynamic_TRT"; + } + auto creator = + GetPluginRegistry()->getPluginCreator(nms_plugin_name.c_str(), "1"); auto batch_nms_plugin = - 
creator->createPlugin("BatchNMSPlugin", plugin_collections); + creator->createPlugin(nms_plugin_name.c_str(), plugin_collections); free(plugin_collections); auto batch_nms_layer = engine_->network()->addPluginV2( batch_nms_inputs.data(), batch_nms_inputs.size(), *batch_nms_plugin); + // static shape: [keep_topk, 4], [keep_topk], [keep_topk] + // dynamic shape: [bs, keep_topk, 4], [bs, keep_topk], [bs, keep_topk] auto nmsed_boxes = batch_nms_layer->getOutput(1); auto nmsed_scores = batch_nms_layer->getOutput(2); auto nmsed_classes = batch_nms_layer->getOutput(3); auto nmsed_scores_transpose_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_scores); - nmsed_scores_transpose_layer->setReshapeDimensions( - nvinfer1::Dims2(keep_top_k, 1)); auto nmsed_classes_reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_classes); - nmsed_classes_reshape_layer->setReshapeDimensions( - nvinfer1::Dims2(keep_top_k, 1)); - + if (engine_->with_dynamic_shape()) { + nmsed_scores_transpose_layer->setReshapeDimensions( + nvinfer1::Dims3(bboxes_dims.d[0], keep_top_k, 1)); + + nmsed_classes_reshape_layer->setReshapeDimensions( + nvinfer1::Dims3(bboxes_dims.d[0], keep_top_k, 1)); + } else { + nmsed_scores_transpose_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + + nmsed_classes_reshape_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + } std::vector concat_inputs; concat_inputs.push_back(nmsed_classes_reshape_layer->getOutput(0)); concat_inputs.push_back(nmsed_scores_transpose_layer->getOutput(0)); @@ -129,7 +159,8 @@ class MultiClassNMS3OpConverter : public OpConverter { auto nms_concat_layer = TRT_ENGINE_ADD_LAYER( engine_, Concatenation, concat_inputs.data(), concat_inputs.size()); - nms_concat_layer->setAxis(1); + int axis_index = engine_->with_dynamic_shape() ? 1 : 0; + nms_concat_layer->setAxis(axis_index + 1); // add fake index as output to be consistent with the outputs of // multiclass_nms3 diff --git a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc index 1266b1b621d56d..bfc12eb3a6ff46 100644 --- a/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multiclass_nms_op.cc @@ -52,18 +52,34 @@ class MultiClassNMSOpConverter : public OpConverter { PADDLE_GET_CONST(float, op_desc.GetAttr("nms_threshold")); int keep_top_k = PADDLE_GET_CONST(int, op_desc.GetAttr("keep_top_k")); bool normalized = PADDLE_GET_CONST(bool, op_desc.GetAttr("normalized")); - int num_classes = scores_tensor->getDimensions().d[0]; + int class_index = engine_->with_dynamic_shape() ? 
1 : 0; + int num_classes = scores_tensor->getDimensions().d[class_index]; auto bboxes_dims = bboxes_tensor->getDimensions(); - nvinfer1::Dims3 bboxes_expand_dims(bboxes_dims.d[0], 1, bboxes_dims.d[1]); - auto* bboxes_expand_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *bboxes_tensor); - bboxes_expand_layer->setReshapeDimensions(bboxes_expand_dims); - - nvinfer1::Permutation permutation{1, 0}; - auto* scores_transpose_layer = - TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scores_tensor); - scores_transpose_layer->setFirstTranspose(permutation); + nvinfer1::IShuffleLayer* bboxes_expand_layer = nullptr; + nvinfer1::IShuffleLayer* scores_transpose_layer = nullptr; + if (engine_->with_dynamic_shape()) { + nvinfer1::Dims4 bboxes_expand_dims( + bboxes_dims.d[0], bboxes_dims.d[1], 1, bboxes_dims.d[2]); + bboxes_expand_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *bboxes_tensor); + bboxes_expand_layer->setReshapeDimensions(bboxes_expand_dims); + + nvinfer1::Permutation permutation{0, 2, 1}; + scores_transpose_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scores_tensor); + scores_transpose_layer->setFirstTranspose(permutation); + } else { + nvinfer1::Dims3 bboxes_expand_dims(bboxes_dims.d[0], 1, bboxes_dims.d[1]); + bboxes_expand_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *bboxes_tensor); + bboxes_expand_layer->setReshapeDimensions(bboxes_expand_dims); + + nvinfer1::Permutation permutation{1, 0}; + scores_transpose_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *scores_tensor); + scores_transpose_layer->setFirstTranspose(permutation); + } std::vector batch_nms_inputs; batch_nms_inputs.push_back(bboxes_expand_layer->getOutput(0)); @@ -100,9 +116,14 @@ class MultiClassNMSOpConverter : public OpConverter { plugin_collections->nbFields = static_cast(fields.size()); plugin_collections->fields = fields.data(); - auto creator = GetPluginRegistry()->getPluginCreator("BatchedNMS_TRT", "1"); + std::string nms_plugin_name = "BatchedNMS_TRT"; + if (engine_->with_dynamic_shape()) { + nms_plugin_name = "BatchedNMSDynamic_TRT"; + } + auto creator = + GetPluginRegistry()->getPluginCreator(nms_plugin_name.c_str(), "1"); auto batch_nms_plugin = - creator->createPlugin("BatchNMSPlugin", plugin_collections); + creator->createPlugin(nms_plugin_name.c_str(), plugin_collections); free(plugin_collections); auto batch_nms_layer = engine_->network()->addPluginV2( @@ -113,12 +134,21 @@ class MultiClassNMSOpConverter : public OpConverter { auto nmsed_scores_transpose_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_scores); - nmsed_scores_transpose_layer->setReshapeDimensions( - nvinfer1::Dims2(keep_top_k, 1)); auto nmsed_classes_reshape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *nmsed_classes); - nmsed_classes_reshape_layer->setReshapeDimensions( - nvinfer1::Dims2(keep_top_k, 1)); + if (engine_->with_dynamic_shape()) { + nmsed_scores_transpose_layer->setReshapeDimensions( + nvinfer1::Dims3(bboxes_dims.d[0], keep_top_k, 1)); + + nmsed_classes_reshape_layer->setReshapeDimensions( + nvinfer1::Dims3(bboxes_dims.d[0], keep_top_k, 1)); + } else { + nmsed_scores_transpose_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + + nmsed_classes_reshape_layer->setReshapeDimensions( + nvinfer1::Dims2(keep_top_k, 1)); + } std::vector concat_inputs; concat_inputs.push_back(nmsed_classes_reshape_layer->getOutput(0)); @@ -127,7 +157,8 @@ class MultiClassNMSOpConverter : public OpConverter { auto nms_concat_layer = TRT_ENGINE_ADD_LAYER( engine_, Concatenation, concat_inputs.data(), concat_inputs.size()); - 
nms_concat_layer->setAxis(1); + int axis_index = engine_->with_dynamic_shape() ? 1 : 0; + nms_concat_layer->setAxis(axis_index + 1); RreplenishLayerAndOutput( nms_concat_layer, "multiclass_nms", {output_name}, test_mode); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index cdd6345c484413..e253b83a739989 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/inference/analysis/helper.h" #include "paddle/fluid/inference/tensorrt/engine.h" #include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/tensorrt/op_teller.h" #include "paddle/fluid/inference/utils/singleton.h" namespace paddle { @@ -49,111 +50,136 @@ class OpConverter { const std::unordered_set& parameters, const framework::Scope& scope, TensorRTEngine* engine, - bool test_mode = false) { + bool test_mode = false, + const framework::proto::BlockDesc* block = nullptr) { framework::OpDesc op_desc(op, nullptr); OpConverter* it{nullptr}; - if (op_desc.Type() == "mul") { - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), - 1UL, - platform::errors::InvalidArgument( - "The input op mul's Input(\"Y\")." - "size() should equal to 1, but reveceid " - "Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); - std::string Y = op_desc.Input("Y")[0]; - if (parameters.count(Y)) { - it = Registry::Global().Lookup("fc"); - } - } - if (op_desc.Type().find("elementwise") != std::string::npos) { - static std::unordered_set add_tensor_op_set{ - "add", "mul", "sub", "div", "max", "min", "pow"}; - static std::unordered_set add_weight_op_set{ - "add", "mul", "sub", "div", "pow"}; - PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), - 1UL, - platform::errors::InvalidArgument( - "The input op's Input(\"Y\")." - "size() should equal to 1, but reveceid " - "Input(\"Y\").size() = %u.", - op_desc.Input("Y").size())); - int op_type_len = op_desc.Type().size(); - std::string op_type = op_desc.Type().substr(op_type_len - 3, op_type_len); - std::string Y = op_desc.Input("Y")[0]; - if (parameters.count(Y)) { - PADDLE_ENFORCE_GT( - add_weight_op_set.count(op_type), - 0, - platform::errors::Unimplemented("Unsupported elementwise type %s", - op_type.c_str())); - it = Registry::Global().Lookup("elementwise_" + op_type + - "_weight"); - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } else { - PADDLE_ENFORCE_GT( - add_tensor_op_set.count(op_type), - 0, - platform::errors::Unimplemented("Unsupported elementwise type %s", - op_type.c_str())); - it = Registry::Global().Lookup("elementwise_" + op_type + - "_tensor"); - } - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } + auto op_converter_type_map = OpTeller::Global().GetOpConverterTypeMap(); + switch (op_converter_type_map.at(op_desc.Type())) { + case OpConverterType::Default: + if (op_desc.Type() == "mul") { + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), + 1UL, + platform::errors::InvalidArgument( + "The input op mul's Input(\"Y\")." 
+ "size() should equal to 1, but reveceid " + "Input(\"Y\").size() = %u.", + op_desc.Input("Y").size())); + std::string Y = op_desc.Input("Y")[0]; + if (parameters.count(Y)) { + it = Registry::Global().Lookup("fc"); + } + } + if (op_desc.Type().find("elementwise") != std::string::npos) { + static std::unordered_set add_tensor_op_set{ + "add", "mul", "sub", "div", "max", "min", "pow"}; + static std::unordered_set add_weight_op_set{ + "add", "mul", "sub", "div", "max", "min", "pow"}; + PADDLE_ENFORCE_EQ(op_desc.Input("Y").size(), + 1UL, + platform::errors::InvalidArgument( + "The input op's Input(\"Y\")." + "size() should equal to 1, but reveceid " + "Input(\"Y\").size() = %u.", + op_desc.Input("Y").size())); + int op_type_len = op_desc.Type().size(); + std::string op_type = + op_desc.Type().substr(op_type_len - 3, op_type_len); + std::string Y = op_desc.Input("Y")[0]; + if (parameters.count(Y)) { + PADDLE_ENFORCE_GT( + add_weight_op_set.count(op_type), + 0, + platform::errors::Unimplemented( + "Unsupported elementwise type %s", op_type.c_str())); + it = Registry::Global().Lookup("elementwise_" + + op_type + "_weight"); + PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented( + "no OpConverter for optype [%s]", op_desc.Type())); + } else { + PADDLE_ENFORCE_GT( + add_tensor_op_set.count(op_type), + 0, + platform::errors::Unimplemented( + "Unsupported elementwise type %s", op_type.c_str())); + it = Registry::Global().Lookup("elementwise_" + + op_type + "_tensor"); + } + PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } - if (op_desc.Type() == "depthwise_conv2d") { - it = Registry::Global().Lookup("conv2d"); - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } - if (op_desc.Type() == "depthwise_conv2d_transpose") { - it = Registry::Global().Lookup("conv2d_transpose"); - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } - if (op_desc.Type() == "transpose2") { - it = Registry::Global().Lookup("transpose"); - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } - if (op_desc.Type() == "flatten2") { - it = Registry::Global().Lookup("flatten"); - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } - // reshape2 == reshape - if (op_desc.Type() == "reshape2") { - it = Registry::Global().Lookup("reshape"); - PADDLE_ENFORCE_NOT_NULL( - it, - platform::errors::Unimplemented("no OpConverter for optype [%s]", - op_desc.Type())); - } - if (!it) { - it = Registry::Global().Lookup(op_desc.Type()); + if (op_desc.Type() == "depthwise_conv2d") { + it = Registry::Global().Lookup("conv2d"); + PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } + if (op_desc.Type() == "depthwise_conv2d_transpose") { + it = Registry::Global().Lookup("conv2d_transpose"); + PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } + if (op_desc.Type() == "transpose2") { + it = Registry::Global().Lookup("transpose"); + PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } + if (op_desc.Type() == "flatten2") { + it = Registry::Global().Lookup("flatten"); + 
PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } + // reshape2 == reshape + if (op_desc.Type() == "reshape2") { + it = Registry::Global().Lookup("reshape"); + PADDLE_ENFORCE_NOT_NULL( + it, + platform::errors::Unimplemented("no OpConverter for optype [%s]", + op_desc.Type())); + } + if (!it) { + it = Registry::Global().Lookup(op_desc.Type()); + } + break; + + case OpConverterType::GenericPluginCreater: + LOG(INFO) << "There is no OpConverter for type " << op_desc.Type() + << ", now use generic_plugin_creater!"; + it = Registry::Global().Lookup("generic_plugin_creater"); + break; + + case OpConverterType::CustomPluginCreater: + LOG(INFO) << "There is no OpConverter for type " << op_desc.Type() + << ", now use custom_plugin_creater!"; + it = Registry::Global().Lookup("custom_plugin_creater"); + break; + + default: + CHECK(false) << "no OpConverter for optype " << op_desc.Type(); } + PADDLE_ENFORCE_NOT_NULL( it, platform::errors::Unimplemented("no OpConverter for optype [%s]", op_desc.Type())); it->SetEngine(engine); + engine->SetScope(scope); + it->SetBlockDesc(block); (*it)(op, scope, test_mode); size_t output_num = op_desc.OutputNames().size(); @@ -230,34 +256,9 @@ class OpConverter { const framework::Scope& scope, TensorRTEngine* engine) { std::unique_lock lk(mut_); - for (int i = 0; i < block.ops_size(); i++) { - SetEngine(engine); - const auto& op = block.ops(i); - framework::OpDesc op_desc(op, nullptr); - framework::Variable* X_v = nullptr; - std::string X_name; - // inputs : string -> std::vector - auto inputs = op_desc.Inputs(); - if (inputs.count("X")) { - X_name = op_desc.Input("X")[0]; - } else if (inputs.count("Input")) { - X_name = op_desc.Input("Input")[0]; - } else if (inputs.count("Y")) { - X_name = op_desc.Input("Y")[0]; - } - X_v = scope.FindVar(X_name); - // If this weight is shared between ops, it needn't to be convtered to - // itensor once again - if (engine->GetITensorMap()->count(X_name)) { - continue; - } - if (X_v) { - ConvertWeight2ITensor(scope, X_name); - } - } for (int i = 0; i < block.ops_size(); i++) { const auto& op = block.ops(i); - ConvertOp(op, parameters, scope, engine); + ConvertOp(op, parameters, scope, engine, false, &block); } for (int i = 0; i < engine->network()->getNbLayers(); i++) { auto layer = engine->network()->getLayer(i); @@ -571,35 +572,6 @@ class OpConverter { return Add1DConstantLayer(input_data, weight_name, scalar); } - // For cases when input is not middle-tensor , but persistable tensor - // you should call this. - nvinfer1::ITensor* ConvertWeight2ITensor(const framework::Scope& scope, - const std::string& name) { - auto* var_v = scope.FindVar(name); - auto* var_t = var_v->GetMutable(); - auto weight = engine_->GetTrtWeight(name, *var_t); - - // Now we have create weights, then we need create a itensor - auto var_dims = var_t->dims(); - nvinfer1::Dims trt_in_shape; - trt_in_shape.nbDims = var_t->dims().size(); - for (int64_t i = 0; i < trt_in_shape.nbDims; i++) { - trt_in_shape.d[i] = var_dims[i]; - } - // In fact , this is not always right, because we can't determine if the 0th - // dimension is batch. 
Just for run chenqu's model - if (!engine_->with_dynamic_shape()) { - trt_in_shape.nbDims--; - for (int i = 0; i < trt_in_shape.nbDims; i++) { - trt_in_shape.d[i] = trt_in_shape.d[i + 1]; - } - } - nvinfer1::ILayer* layer = - TRT_ENGINE_ADD_LAYER(engine_, Constant, trt_in_shape, weight.get()); - engine_->SetITensor(name, layer->getOutput(0)); - return layer->getOutput(0); - } - void RreplenishLayerAndOutput( nvinfer1::ILayer* layer, const std::string& layer_type, @@ -620,10 +592,16 @@ class OpConverter { } void SetEngine(TensorRTEngine* engine) { engine_ = engine; } + void SetBlockDesc(const framework::proto::BlockDesc* block) { + block_ = block; + } + virtual ~OpConverter() {} // TensorRT engine TensorRTEngine* engine_{nullptr}; + // BlockDesc + const framework::proto::BlockDesc* block_{nullptr}; protected: bool test_mode_; diff --git a/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc b/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc index cb67957c79cbf4..deecb913891b16 100644 --- a/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/strided_slice_op.cc @@ -14,33 +14,23 @@ limitations under the License. */ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" -namespace paddle { -namespace framework { -class Scope; -namespace proto { -class OpDesc; -} // namespace proto -} // namespace framework -} // namespace paddle - namespace paddle { namespace inference { namespace tensorrt { -/* - * Stack converter from fluid to tensorRT. - */ class StridedSliceOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, const framework::Scope& scope, bool test_mode) override { - VLOG(4) << "convert fluid StridedSlice op to tensorrt Slice layer"; - + VLOG(4) << "convert strided_slice op to tensorrt layer"; framework::OpDesc op_desc(op, nullptr); + // Declare inputs auto* input = engine_->GetITensor(op_desc.Input("Input")[0]); - nvinfer1::Dims input_dims = input->getDimensions(); auto output_name = op_desc.Output("Out")[0]; + + // phi only allow axes[i] >= 0 && axes = PADDLE_GET_CONST(std::vector, op_desc.GetAttr("axes")); std::vector starts = @@ -49,119 +39,148 @@ class StridedSliceOpConverter : public OpConverter { PADDLE_GET_CONST(std::vector, op_desc.GetAttr("ends")); std::vector strides = PADDLE_GET_CONST(std::vector, op_desc.GetAttr("strides")); - int axes_size = axes.size(); - nvinfer1::Dims start; - nvinfer1::Dims stride; - nvinfer1::Dims size; - start.nbDims = input_dims.nbDims; - stride.nbDims = input_dims.nbDims; - size.nbDims = input_dims.nbDims; - for (int i = 0; i < input_dims.nbDims; i++) { - start.d[i] = 0; - stride.d[i] = 1; - size.d[i] = input_dims.d[i]; - } + std::vector decrease_axises = + PADDLE_GET_CONST(std::vector, op_desc.GetAttr("decrease_axis")); + auto input_dims = input->getDimensions(); if (!engine_->with_dynamic_shape()) { - for (int i = 0; i < axes_size; i++) { - start.d[axes[i] - 1] = starts[i]; + // notice that input shape is [CHW] without batch axis when input has + // static shape + for (size_t i = input_dims.nbDims; i > 0; i--) { + input_dims.d[i] = input_dims.d[i - 1]; } - for (int i = 0; i < axes_size; i++) { - stride.d[axes[i] - 1] = strides[i]; - } - for (int i = 0; i < axes_size; ++i) { - int dim = size.d[axes[i] - 1]; - if (dim > 0) { - int start = starts[i] < 0 ? (starts[i] + dim) : starts[i]; - int end = ends[i] < 0 ? 
(ends[i] + dim) : ends[i]; - int stride = std::abs(strides[i]); - start = std::max(start, 0); - end = std::max(end, 0); - end = std::min(end, dim); - size.d[axes[i] - 1] = (std::abs(end - start) + stride - 1) / stride; + input_dims.d[0] = 1; // fake batchsize, not useful here + for (size_t i = 0; i < axes.size(); i++) { + if (starts[i] < 0) { + starts[i] = std::max(starts[i] + input_dims.d[axes[i]], 0); + } + if (ends[i] < 0) { + ends[i] = std::max(ends[i] + input_dims.d[axes[i]], 0); } + ends[i] = std::min(ends[i], input_dims.d[axes[i]]); + PADDLE_ENFORCE_GT( + ends[i], + starts[i], + platform::errors::InvalidArgument( + "Attr(ends) should be greater than attr(starts) in " + "slice op. But received ends = %d, starts = %d.", + ends[i], + starts[i])); } - auto* layer = - TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, start, size, stride); - RreplenishLayerAndOutput( - layer, "strided_slice", {output_name}, test_mode); - } else { - for (int i = 0; i < axes_size; i++) { - start.d[axes[i]] = starts[i]; + } + + nvinfer1::ILayer* layer = nullptr; + if (engine_->with_dynamic_shape()) { + auto nchw_input_dims = input->getDimensions(); + nvinfer1::Dims trt_start_dims; + trt_start_dims.nbDims = nchw_input_dims.nbDims; + memset(trt_start_dims.d, 0, sizeof(int32_t) * nchw_input_dims.nbDims); + nvinfer1::Dims trt_size_dims = trt_start_dims; + nvinfer1::Dims trt_end_dims = trt_start_dims; + nvinfer1::Dims trt_step_dims = trt_start_dims; + for (int i = 0; i < trt_step_dims.nbDims; i++) trt_step_dims.d[i] = 1; + // input : [N,C,H,W] + bool has_neg_indices = false; + for (size_t i = 0; i < axes.size(); i++) { + int trt_axis = axes[i]; + trt_start_dims.d[trt_axis] = starts[i]; + trt_end_dims.d[trt_axis] = ends[i]; + trt_step_dims.d[axes[i]] = strides[i]; + if (starts[i] < 0 || ends[i] < 0) has_neg_indices = true; } - for (int i = 0; i < axes_size; i++) { - stride.d[axes[i]] = strides[i]; + auto* shape_tensor = Shape(input); + auto* start_tensor = Add1DConstantLayer(trt_start_dims); + if (has_neg_indices) { + start_tensor = FixNegIndices(shape_tensor, start_tensor); } - for (int i = 0; i < axes_size; ++i) { - int dim = size.d[axes[i]]; - if (dim > 0) { - int start = starts[i] < 0 ? (starts[i] + dim) : starts[i]; - int end = ends[i] < 0 ? (ends[i] + dim) : ends[i]; - int stride = std::abs(strides[i]); - start = std::max(start, 0); - end = std::max(end, 0); - end = std::min(end, dim); - size.d[axes[i]] = (std::abs(end - start) + stride - 1) / stride; - } + + std::vector end_vec_tensor; + for (int i = 0; i < trt_end_dims.nbDims; i++) { + end_vec_tensor.push_back(GetEleTensorOfShape(shape_tensor, i)); } - auto create_weights = [&](const std::vector& data, - const std::string& type) -> int* { - std::unique_ptr tmp_tensor(new framework::Tensor()); - int data_size = data.size(); - tmp_tensor->Resize({data_size}); - auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); - for (int i = 0; i < data_size; i++) { - tmp_data[i] = data[i]; + for (size_t i = 0; i < axes.size(); i++) { + int trt_axis = axes[i]; + if (ends[i] >= 0) { + end_vec_tensor[trt_axis] = Add1DConstantLayer(ends[i]); + } else { + end_vec_tensor[trt_axis] = + Sum(end_vec_tensor[trt_axis], Add1DConstantLayer(ends[i])); } - - engine_->SetWeights(output_name + "_add_slice_op_" + type, - std::move(tmp_tensor)); - return tmp_data; - }; - - std::vector const_weight(input_dims.nbDims, 0); - for (int i = 0; i < axes_size; i++) { - int dim = input_dims.d[axes[i]]; - int start = starts[i] < 0 ? (starts[i] + dim) : starts[i]; - int end = ends[i] < 0 ? 
(ends[i] + dim) : ends[i]; - int stride = std::abs(strides[i]); - start = std::max(start, 0); - end = std::max(end, 0); - end = std::min(end, dim); - const_weight[axes[i]] = - dim - ((std::abs(end - start) + stride - 1) / stride); } - int* weight_data = create_weights(const_weight, "size"); - - TensorRTEngine::Weight weight{nvinfer1::DataType::kINT32, - static_cast(weight_data), - static_cast(input_dims.nbDims)}; - - int input_dim_size = input_dims.nbDims; - nvinfer1::Dims input_shape; - input_shape.nbDims = 1; - input_shape.d[0] = input_dim_size; - - auto const_layer = - TRT_ENGINE_ADD_LAYER(engine_, Constant, input_shape, weight.get()); - - auto shape_layer = TRT_ENGINE_ADD_LAYER(engine_, Shape, *input); - // slice layer - auto* layer = - TRT_ENGINE_ADD_LAYER(engine_, Slice, *input, start, size, stride); - // elementwise layer for get size tensor - auto size_layer = - TRT_ENGINE_ADD_LAYER(engine_, - ElementWise, - *shape_layer->getOutput(0), - *const_layer->getOutput(0), - nvinfer1::ElementWiseOperation::kSUB); - layer->setInput(2, *size_layer->getOutput(0)); - RreplenishLayerAndOutput( - layer, "strided_slice", {output_name}, test_mode); + auto* size_tensor = + Sub(start_tensor, Min(Concat(end_vec_tensor), shape_tensor)); + auto zero_t = + Add1DConstantLayer(std::vector(nchw_input_dims.nbDims, 0)); + auto step_tensor = Add1DConstantLayer(trt_step_dims); + size_tensor = Sub(zero_t, FloorDiv(size_tensor, step_tensor)); + + layer = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *input, trt_start_dims, trt_size_dims, trt_step_dims); + layer->setInput(1, *start_tensor); + layer->setInput(2, *size_tensor); + layer->setInput(3, *step_tensor); + + if (decrease_axises.size() > 0) { + std::vector gather_indices; + for (int i = 0; i < trt_size_dims.nbDims; i++) { + if (decrease_axises.end() != + std::find(decrease_axises.begin(), decrease_axises.end(), i)) + continue; + gather_indices.push_back(i); + } + if (gather_indices.empty()) + gather_indices.push_back(decrease_axises[0]); + auto real_size_tensor = Gather(size_tensor, gather_indices); + layer = TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0)); + layer->setInput(1, *real_size_tensor); + } + } else { + auto chw_input_dims = input->getDimensions(); + nvinfer1::Dims trt_start_dims; + trt_start_dims.nbDims = chw_input_dims.nbDims; + memset(trt_start_dims.d, 0, sizeof(int32_t) * chw_input_dims.nbDims); + nvinfer1::Dims trt_size_dims = chw_input_dims; + nvinfer1::Dims trt_step_dims; + trt_step_dims.nbDims = chw_input_dims.nbDims; + for (int i = 0; i < trt_step_dims.nbDims; i++) trt_step_dims.d[i] = 1; + + // input : [C,H,W] + for (size_t i = 0; i < axes.size(); i++) { + int trt_axis = axes[i] - 1; + trt_start_dims.d[trt_axis] = starts[i]; + trt_size_dims.d[trt_axis] = + (ends[i] - starts[i] + strides[i] - 1) / strides[i]; + trt_step_dims.d[trt_axis] = strides[i]; + } + layer = TRT_ENGINE_ADD_LAYER( + engine_, Slice, *input, trt_start_dims, trt_size_dims, trt_step_dims); + nvinfer1::Dims real_trt_size_dims; + real_trt_size_dims.nbDims = 0; + + if (decrease_axises.size() > 0) { + for (size_t i = 0; i < decrease_axises.size(); i++) { + decrease_axises[i]--; + } + for (int i = 0; i < trt_size_dims.nbDims; i++) { + if (decrease_axises.end() != + std::find(decrease_axises.begin(), decrease_axises.end(), i)) + continue; + real_trt_size_dims.d[real_trt_size_dims.nbDims] = trt_size_dims.d[i]; + real_trt_size_dims.nbDims++; + } + if (real_trt_size_dims.nbDims == 0) { + real_trt_size_dims.nbDims = 1; + real_trt_size_dims.d[0] = 1; + } + auto 
reshape_layer = + TRT_ENGINE_ADD_LAYER(engine_, Shuffle, *layer->getOutput(0)); + reshape_layer->setReshapeDimensions(real_trt_size_dims); + layer = static_cast(reshape_layer); + } } + RreplenishLayerAndOutput(layer, "strided_slice", {output_name}, test_mode); } }; diff --git a/paddle/fluid/inference/tensorrt/convert/test_custom_op_plugin.h b/paddle/fluid/inference/tensorrt/convert/test_custom_op_plugin.h new file mode 100644 index 00000000000000..adb41528bae004 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_custom_op_plugin.h @@ -0,0 +1,356 @@ + +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +class custom_op_plugin : public nvinfer1::IPluginV2 { + public: + explicit custom_op_plugin(float float_attr) { float_attr_ = float_attr; } + + custom_op_plugin(const void* buffer, size_t length) { + DeserializeValue(&buffer, &length, &float_attr_); + } + + size_t getSerializationSize() const noexcept override { + return SerializedSize(float_attr_); + } + + void serialize(void* buffer) const noexcept override { + SerializeValue(&buffer, float_attr_); + } + + nvinfer1::IPluginV2* clone() const noexcept override { + return new custom_op_plugin(float_attr_); + } + + ~custom_op_plugin() override = default; + + const char* getPluginType() const noexcept override { + return "custom_op_paddle_trt_plugin"; + } + + const char* getPluginVersion() const noexcept override { return "1"; } + + int getNbOutputs() const noexcept override { return 1; } + + nvinfer1::Dims getOutputDimensions(int index, + const nvinfer1::Dims* inputs, + int nbInputDims) noexcept override { + return inputs[0]; + } + + bool supportsFormat(nvinfer1::DataType type, + nvinfer1::PluginFormat format) const noexcept override { + return true; + } + + void configureWithFormat(nvinfer1::Dims const* inputDims, + int32_t nbInputs, + nvinfer1::Dims const* outputDims, + int32_t nbOutputs, + nvinfer1::DataType type, + nvinfer1::PluginFormat format, + int32_t maxBatchSize) noexcept override {} + + int initialize() noexcept override { return 0; } + + void terminate() noexcept override {} + + size_t getWorkspaceSize(int maxBatchSize) const noexcept override { + return 0; + } + +#if IS_TRT_VERSION_LT(8000) + int enqueue(int batch_size, + const void* const* inputs, + void** outputs, +#else + int enqueue(int batch_size, + const void* const* inputs, + void* const* outputs, +#endif + void* workspace, + cudaStream_t stream) noexcept override { + return 0; + } + + void destroy() noexcept override { delete this; } + + void setPluginNamespace(const char* libNamespace) noexcept override { + namespace_ = libNamespace; + } + + const char* getPluginNamespace() const noexcept override { + return namespace_.c_str(); + } + + private: + float float_attr_; + std::string 
namespace_; +}; + +class custom_op_plugin_creator : public nvinfer1::IPluginCreator { + public: + custom_op_plugin_creator() {} + + ~custom_op_plugin_creator() override = default; + + const char* getPluginName() const noexcept override { + return "custom_op_paddle_trt_plugin"; + } + + const char* getPluginVersion() const noexcept override { return "1"; } + + void setPluginNamespace(const char* pluginNamespace) noexcept override { + plugin_namespace_ = pluginNamespace; + } + + const char* getPluginNamespace() const noexcept override { + return plugin_namespace_.c_str(); + } + + const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override { + return nullptr; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) noexcept override { + CHECK_EQ(fc->nbFields, 7); + // float_attr + auto attr_field = (fc->fields)[0]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kFLOAT32); + CHECK_EQ(attr_field.length, 1); + float float_value = (reinterpret_cast(attr_field.data))[0]; + CHECK_EQ(float_value, 1.0); + + // int_attr + attr_field = (fc->fields)[1]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kINT32); + CHECK_EQ(attr_field.length, 1); + int int_value = (reinterpret_cast(attr_field.data))[0]; + CHECK_EQ(int_value, 1); + + // bool_attr + attr_field = (fc->fields)[2]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kINT32); + CHECK_EQ(attr_field.length, 1); + int bool_value = (reinterpret_cast(attr_field.data))[0]; + CHECK_EQ(bool_value, 1); + + // string_attr + attr_field = (fc->fields)[3]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kCHAR); + std::string expect_string_attr = "test_string_attr"; + CHECK_EQ((size_t)attr_field.length, expect_string_attr.size() + 1); + const char* receive_string_attr = + reinterpret_cast(attr_field.data); + CHECK(expect_string_attr == std::string(receive_string_attr)); + + // ints_attr + attr_field = (fc->fields)[4]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kINT32); + CHECK_EQ(attr_field.length, 3); + const int* ints_value = reinterpret_cast(attr_field.data); + CHECK_EQ(ints_value[0], 1); + CHECK_EQ(ints_value[1], 2); + CHECK_EQ(ints_value[2], 3); + + // floats_attr + attr_field = (fc->fields)[5]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kFLOAT32); + CHECK_EQ(attr_field.length, 3); + const float* floats_value = reinterpret_cast(attr_field.data); + CHECK_EQ(floats_value[0], 1.0); + CHECK_EQ(floats_value[1], 2.0); + CHECK_EQ(floats_value[2], 3.0); + + // bools_attr + attr_field = (fc->fields)[6]; + CHECK(attr_field.type == nvinfer1::PluginFieldType::kINT32); + CHECK_EQ(attr_field.length, 3); + ints_value = reinterpret_cast(attr_field.data); + CHECK_EQ(ints_value[0], true); + CHECK_EQ(ints_value[1], false); + CHECK_EQ(ints_value[2], true); + + return new custom_op_plugin(float_value); + } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) noexcept override { + return new custom_op_plugin(serialData, serialLength); + } + + private: + std::string plugin_namespace_; +}; + +class custom_op_dynamic_plugin : public nvinfer1::IPluginV2DynamicExt { + public: + explicit custom_op_dynamic_plugin(float float_attr) + : float_attr_(float_attr) {} + + custom_op_dynamic_plugin(const void* buffer, size_t length) { + DeserializeValue(&buffer, &length, &float_attr_); + } + + ~custom_op_dynamic_plugin() override = default; + + const char* getPluginType() const noexcept override { + return 
"custom_op_paddle_trt_dynamic_plugin"; + } + + const char* getPluginVersion() const noexcept override { return "1"; } + + int getNbOutputs() const noexcept override { return 1; } + + int initialize() noexcept override { return 0; } + + void terminate() noexcept override {} + + size_t getSerializationSize() const noexcept override { + return SerializedSize(float_attr_); + } + + void serialize(void* buffer) const noexcept override { + SerializeValue(&buffer, float_attr_); + } + + void destroy() noexcept override { delete this; } + + void setPluginNamespace(const char* libNamespace) noexcept override { + namespace_ = libNamespace; + } + + const char* getPluginNamespace() const noexcept override { + return namespace_.c_str(); + } + + /*IPluginV2Ext method*/ + nvinfer1::DataType getOutputDataType( + int32_t index, + nvinfer1::DataType const* inputTypes, + int32_t nbInputs) const noexcept override { + return inputTypes[index]; + } + + /*IPluginV2DynamicExt method*/ + nvinfer1::IPluginV2DynamicExt* clone() const noexcept override { + return new custom_op_dynamic_plugin(float_attr_); + }; + + nvinfer1::DimsExprs getOutputDimensions( + int32_t outputIndex, + const nvinfer1::DimsExprs* inputs, + int32_t nbInputs, + nvinfer1::IExprBuilder& exprBuilder) noexcept override { + return inputs[0]; + } + + bool supportsFormatCombination(int32_t pos, + const nvinfer1::PluginTensorDesc* inOut, + int32_t nbInputs, + int32_t nbOutputs) noexcept override { + return true; + } + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int32_t nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int32_t nbOutputs) noexcept override {} + + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int32_t nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int32_t nbOutputs) const noexcept override { + return 0; + } + + int32_t enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) noexcept override { + return 0; + } + + private: + float float_attr_ = 0; + std::string namespace_; +}; + +class custom_op_dynamic_plugin_creator : public nvinfer1::IPluginCreator { + public: + custom_op_dynamic_plugin_creator() {} + + ~custom_op_dynamic_plugin_creator() override = default; + + const char* getPluginName() const noexcept override { + return "custom_op_paddle_trt_dynamic_plugin"; + } + + const char* getPluginVersion() const noexcept override { return "1"; } + + void setPluginNamespace(char const* pluginNamespace) noexcept override { + plugin_namespace_ = pluginNamespace; + } + + const char* getPluginNamespace() const noexcept override { + return plugin_namespace_.c_str(); + } + + const nvinfer1::PluginFieldCollection* getFieldNames() noexcept override { + return nullptr; + } + + nvinfer1::IPluginV2* createPlugin( + const char* name, + const nvinfer1::PluginFieldCollection* fc) noexcept override { + return new custom_op_dynamic_plugin(1.0); + } + + nvinfer1::IPluginV2* deserializePlugin( + const char* name, + const void* serialData, + size_t serialLength) noexcept override { + return new custom_op_dynamic_plugin(serialData, serialLength); + } + + private: + std::string plugin_namespace_; +}; + +REGISTER_TRT_PLUGIN_V2(custom_op_plugin_creator); +REGISTER_TRT_PLUGIN_V2(custom_op_dynamic_plugin_creator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git 
a/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc b/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc new file mode 100644 index 00000000000000..2a3ead9c8e6843 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/test_custom_plugin_creater.cc @@ -0,0 +1,209 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include // NOLINT + +#include "paddle/extension.h" +#include "paddle/fluid/framework/program_desc.h" +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" +#include "paddle/fluid/inference/tensorrt/convert/test_custom_op_plugin.h" + +PD_BUILD_OP(custom_op) + .Inputs({"Input"}) + .Outputs({"Output"}) + .Attrs({ + "float_attr", + "int_attr", + "bool_attr", + "string_attr", + "ints_attr", + "floats_attr", + "bools_attr", + }); + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(CustomPluginCreater, StaticShapePlugin) { + framework::ProgramDesc prog; + auto *block = prog.MutableBlock(0); + auto *op = block->AppendOp(); + framework::proto::OpDesc *op_desc = op->Proto(); + + op_desc->set_type("custom_op"); + auto *input_var = op_desc->add_inputs(); + input_var->set_parameter("Input"); + *input_var->add_arguments() = "X"; + + auto *output_var = op_desc->add_outputs(); + output_var->set_parameter("Output"); + *output_var->add_arguments() = "Out"; + + auto *attr = op_desc->add_attrs(); + attr->set_name("float_attr"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(1.0); + + attr = op_desc->add_attrs(); + attr->set_name("int_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(1); + + attr = op_desc->add_attrs(); + attr->set_name("bool_attr"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(true); + + attr = op_desc->add_attrs(); + attr->set_name("string_attr"); + attr->set_type(paddle::framework::proto::AttrType::STRING); + attr->set_s("test_string_attr"); + + attr = op_desc->add_attrs(); + attr->set_name("ints_attr"); + attr->set_type(paddle::framework::proto::AttrType::INTS); + attr->add_ints(1); + attr->add_ints(2); + attr->add_ints(3); + + attr = op_desc->add_attrs(); + attr->set_name("floats_attr"); + attr->set_type(paddle::framework::proto::AttrType::FLOATS); + attr->add_floats(1.0); + attr->add_floats(2.0); + attr->add_floats(3.0); + + attr = op_desc->add_attrs(); + attr->set_name("bools_attr"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEANS); + attr->add_bools(true); + attr->add_bools(false); + attr->add_bools(true); + + // init trt engine + std::unique_ptr engine_; + engine_.reset(new TensorRTEngine(5, 1 << 15)); + engine_->InitNetwork(); + + engine_->DeclareInput( + "X", nvinfer1::DataType::kFLOAT, nvinfer1::Dims3(2, 5, 5)); + + framework::Scope scope; + + tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); + + auto &custom_plugin_tell = OpTeller::Global().GetCustomPluginTeller(); + + framework::OpDesc custom_op(*op_desc, nullptr); + 
CHECK_EQ((*custom_plugin_tell)(custom_op, false, false), true); + + OpTeller::Global().SetOpConverterType("custom_op", + OpConverterType::CustomPluginCreater); + + OpConverter converter; + converter.ConvertBlock( + *block->Proto(), {}, scope, engine_.get() /*TensorRTEngine*/); +} + +TEST(CustomPluginCreater, DynamicShapePlugin) { + framework::ProgramDesc prog; + auto *block = prog.MutableBlock(0); + auto *op = block->AppendOp(); + framework::proto::OpDesc *op_desc = op->Proto(); + + op_desc->set_type("custom_op"); + auto *input_var = op_desc->add_inputs(); + input_var->set_parameter("Input"); + *input_var->add_arguments() = "X"; + + auto *output_var = op_desc->add_outputs(); + output_var->set_parameter("Output"); + *output_var->add_arguments() = "Out"; + + auto *attr = op_desc->add_attrs(); + attr->set_name("float_attr"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + + attr = op_desc->add_attrs(); + attr->set_name("int_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + + attr = op_desc->add_attrs(); + attr->set_name("bool_attr"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + + attr = op_desc->add_attrs(); + attr->set_name("string_attr"); + attr->set_type(paddle::framework::proto::AttrType::STRING); + + attr = op_desc->add_attrs(); + attr->set_name("ints_attr"); + attr->set_type(paddle::framework::proto::AttrType::INTS); + + attr = op_desc->add_attrs(); + attr->set_name("floats_attr"); + attr->set_type(paddle::framework::proto::AttrType::FLOATS); + + attr = op_desc->add_attrs(); + attr->set_name("bools_attr"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEANS); + + // init trt engine + std::unique_ptr engine_; + + std::map> min_input_shape = { + {"x", {1, 2, 5, 5}}}; + + std::map> max_input_shape = { + {"x", {1, 2, 5, 5}}}; + + std::map> optim_input_shape = { + {"x", {1, 2, 5, 5}}}; + + engine_.reset(new TensorRTEngine(5, + 1 << 15, + AnalysisConfig::Precision::kFloat32, + nullptr, + 0, + min_input_shape, + max_input_shape, + optim_input_shape)); + engine_->InitNetwork(); + + LOG(INFO) << "with_dynamic_shape " << engine_->with_dynamic_shape(); + engine_->DeclareInput( + "X", nvinfer1::DataType::kFLOAT, nvinfer1::Dims4(-1, 2, 5, 5)); + + framework::Scope scope; + + tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); + + auto &custom_plugin_tell = OpTeller::Global().GetCustomPluginTeller(); + + framework::OpDesc custom_op(*op_desc, nullptr); + CHECK_EQ((*custom_plugin_tell)(custom_op, false, true), true); + + OpTeller::Global().SetOpConverterType("custom_op", + OpConverterType::CustomPluginCreater); + + OpConverter converter; + converter.ConvertBlock( + *block->Proto(), {}, scope, engine_.get() /*TensorRTEngine*/); +} +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +USE_TRT_CONVERTER(custom_plugin_creater) diff --git a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc index 5e748aad2375c2..795f62a3e1e6a3 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_op_converter.cc @@ -57,6 +57,7 @@ TEST(OpConverter, ConvertBlock) { x_tensor->Resize(phi::make_ddim(dim_vec)); x_tensor->mutable_data(platform::CUDAPlace(0)); + OpTeller::Global().SetOpConverterType("conv2d", OpConverterType::Default); OpConverter converter; converter.ConvertBlock( *block->Proto(), {"conv2d-Y"}, scope, engine_.get() /*TensorRTEngine*/); diff --git 
a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc new file mode 100644 index 00000000000000..4c5944e79451cf --- /dev/null +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta.cc @@ -0,0 +1,391 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/kernels/funcs/unfold_functor.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +nvinfer1::DimsExprs GatherNdInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + const nvinfer1::DimsExprs x_dims = inputs[0]; + const int x_dims_size = inputs[0].nbDims; + const nvinfer1::DimsExprs index_dims = inputs[1]; + const int index_dims_size = inputs[1].nbDims; + + std::vector result_dims; + // The result dims is + // Index.shape[:-1] + X.shape[Index.shape[-1]:] + for (int i = 0; i < index_dims_size - 1; ++i) { + result_dims.emplace_back(index_dims.d[i]); + } + + if (index_dims.d[index_dims_size - 1]->isConstant()) { + for (int i = index_dims.d[index_dims_size - 1]->getConstantValue(); + i < x_dims_size; + ++i) { + result_dims.emplace_back(x_dims.d[i]); + } + } + + nvinfer1::DimsExprs output; + output.nbDims = result_dims.size(); + for (int i = 0; i < output.nbDims; i++) { + output.d[i] = result_dims[i]; + } + return output; +} + +nvinfer1::DimsExprs YoloBoxInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + PADDLE_ENFORCE_EQ( + nb_inputs, + 2, + phi::errors::InvalidArgument("inputs of yolo_box should be equal to 2, " + "But received (%s)", + nb_inputs)); + + const nvinfer1::DimsExprs dim_x = inputs[0]; + + auto anchors = PADDLE_GET_CONST(std::vector, op_desc.GetAttr("anchors")); + int anchor_num = anchors.size() / 2; + + // box_num = dim_x[2] * dim_x[3] * anchor_num; + const nvinfer1::IDimensionExpr* box_num = expr_builder.operation( + nvinfer1::DimensionOperation::kPROD, + *expr_builder.operation( + nvinfer1::DimensionOperation::kPROD, *dim_x.d[2], *dim_x.d[3]), + *expr_builder.constant(anchor_num)); + + nvinfer1::DimsExprs output; + output.nbDims = 3; + if (output_index == 0) { + output.d[0] = dim_x.d[0]; + output.d[1] = box_num; + output.d[2] = expr_builder.constant(4); + } else { + auto class_num = PADDLE_GET_CONST(int, op_desc.GetAttr("class_num")); + output.d[0] = dim_x.d[0]; + output.d[1] = box_num; + output.d[2] = expr_builder.constant(class_num); + } + return output; +} + +nvinfer1::DimsExprs InstanceNormInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const 
framework::OpDesc& op_desc) { + nvinfer1::DimsExprs x_dims = inputs[0]; + return x_dims; +} + +inline const nvinfer1::IDimensionExpr* CalcOutputSize( + const nvinfer1::IDimensionExpr* input_size, + const nvinfer1::IDimensionExpr* filter_size, + const nvinfer1::IDimensionExpr* dilation, + const nvinfer1::IDimensionExpr* padding1, + const nvinfer1::IDimensionExpr* padding2, + const nvinfer1::IDimensionExpr* stride, + nvinfer1::IExprBuilder& expr_builder // NOLINT +) { + // dkernel = dilation * (filter_size - 1) + 1; + const nvinfer1::IDimensionExpr* dkernel = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kPROD, + *dilation, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUB, + *filter_size, + *expr_builder.constant(1))), + *expr_builder.constant(1)); + + // output_size = (input_size + padding1 + padding2 - dkernel) / stride + 1; + const nvinfer1::IDimensionExpr* tmp = expr_builder.operation( + nvinfer1::DimensionOperation::kSUB, + *expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, *input_size, *padding1), + *padding2), + *dkernel); + + const nvinfer1::IDimensionExpr* output_size = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation( + nvinfer1::DimensionOperation::kFLOOR_DIV, *tmp, *stride), + *expr_builder.constant(1)); + return output_size; +} + +nvinfer1::DimsExprs UnflodInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + PADDLE_ENFORCE_EQ( + nb_inputs, + 1, + phi::errors::InvalidArgument("inputs of unfold should be equal to 1, " + "But received (%s)", + nb_inputs)); + + const nvinfer1::DimsExprs in_dims = inputs[0]; + std::vector out_dims; + out_dims.push_back(in_dims.d[0]); + + auto kernel_sizes = + PADDLE_GET_CONST(std::vector, op_desc.GetAttr("kernel_sizes")); + auto dilations = + PADDLE_GET_CONST(std::vector, op_desc.GetAttr("dilations")); + auto paddings = + PADDLE_GET_CONST(std::vector, op_desc.GetAttr("paddings")); + auto strides = PADDLE_GET_CONST(std::vector, op_desc.GetAttr("strides")); + + // output_channels = in_dims[1] * kernel_sizes[0] * kernel_sizes[1]; + const nvinfer1::IDimensionExpr* output_channels = expr_builder.operation( + nvinfer1::DimensionOperation::kPROD, + *in_dims.d[1], + *expr_builder.operation(nvinfer1::DimensionOperation::kPROD, + *expr_builder.constant(kernel_sizes[0]), + *expr_builder.constant(kernel_sizes[1]))); + out_dims.push_back(output_channels); + + const nvinfer1::IDimensionExpr* output_height = + CalcOutputSize(in_dims.d[2], + expr_builder.constant(kernel_sizes[0]), + expr_builder.constant(dilations[0]), + expr_builder.constant(paddings[0]), + expr_builder.constant(paddings[2]), + expr_builder.constant(strides[0]), + expr_builder); + const nvinfer1::IDimensionExpr* output_width = + CalcOutputSize(in_dims.d[3], + expr_builder.constant(kernel_sizes[1]), + expr_builder.constant(dilations[1]), + expr_builder.constant(paddings[1]), + expr_builder.constant(paddings[3]), + expr_builder.constant(strides[1]), + expr_builder); + + const nvinfer1::IDimensionExpr* output_col_length = expr_builder.operation( + nvinfer1::DimensionOperation::kPROD, *output_height, *output_width); + + out_dims.push_back(output_col_length); + nvinfer1::DimsExprs output; + output.nbDims = out_dims.size(); + for (size_t i = 0; i < out_dims.size(); i++) output.d[i] = 
out_dims[i]; + return output; +} + +nvinfer1::DimsExprs ScatterNdAddInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + PADDLE_ENFORCE_EQ(nb_inputs, + 3, + phi::errors::InvalidArgument( + "inputs of scatter_nd_add should be equal to 3, " + "But received (%s)", + nb_inputs)); + const nvinfer1::DimsExprs ref_dims = inputs[0]; + return ref_dims; +} + +nvinfer1::DimsExprs UnchangedInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + PADDLE_ENFORCE_EQ(nb_inputs, + 1, + phi::errors::InvalidArgument( + "inputs of UnchangedInferMeta should be equal to 1, " + "But received (%s)", + nb_inputs)); + return inputs[0]; +} + +nvinfer1::DimsExprs Pad3dInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + const nvinfer1::DimsExprs x_dim = inputs[0]; + + nvinfer1::DimsExprs out_dims; + out_dims.nbDims = x_dim.nbDims; + + out_dims.d[0] = x_dim.d[0]; + + auto paddings = + PADDLE_GET_CONST(std::vector, op_desc.GetAttr("paddings")); + auto data_format = + PADDLE_GET_CONST(std::string, op_desc.GetAttr("data_format")); + + if (data_format == "NCDHW") { + out_dims.d[1] = x_dim.d[1]; + } else { + out_dims.d[4] = x_dim.d[4]; + } + + if (data_format == "NCDHW") { + // depth + out_dims.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[2], + *expr_builder.constant(paddings[4])), + *expr_builder.constant(paddings[5])); + // height + out_dims.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[3], + *expr_builder.constant(paddings[2])), + *expr_builder.constant(paddings[3])); + // width + out_dims.d[4] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[4], + *expr_builder.constant(paddings[0])), + *expr_builder.constant(paddings[1])); + } else { // NDHWC + // depth + out_dims.d[1] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[1], + *expr_builder.constant(paddings[4])), + *expr_builder.constant(paddings[5])); + // height + out_dims.d[2] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[2], + *expr_builder.constant(paddings[2])), + *expr_builder.constant(paddings[3])); + // width + out_dims.d[3] = expr_builder.operation( + nvinfer1::DimensionOperation::kSUM, + *expr_builder.operation(nvinfer1::DimensionOperation::kSUM, + *x_dim.d[3], + *expr_builder.constant(paddings[0])), + *expr_builder.constant(paddings[1])); + } + return out_dims; +} + +nvinfer1::DimsExprs PNormInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + const nvinfer1::DimsExprs x_dim = inputs[0]; + std::vector reduce_dims; + std::vector keep_dims; + + bool asvector = PADDLE_GET_CONST(bool, op_desc.GetAttr("asvector")); + bool keepdim = PADDLE_GET_CONST(bool, op_desc.GetAttr("keepdim")); + int axis = PADDLE_GET_CONST(int, 
op_desc.GetAttr("axis")); + + if (asvector) { + reduce_dims.emplace_back(expr_builder.constant(1)); + keep_dims.emplace_back(expr_builder.constant(1)); + if (keepdim) { + for (int i = 1; i < x_dim.nbDims; ++i) { + keep_dims.emplace_back(expr_builder.constant(1)); + } + } + } else { + if (axis < 0) axis = x_dim.nbDims + axis; + for (int i = 0; i < x_dim.nbDims; ++i) { + if (i != axis) reduce_dims.emplace_back(x_dim.d[i]); + } + if (reduce_dims.size() == 0) { + reduce_dims.emplace_back(expr_builder.constant(1)); + } + } + keep_dims[axis] = expr_builder.constant(1); + + nvinfer1::DimsExprs output; + if (keepdim) { + output.nbDims = keep_dims.size(); + for (int i = 0; i < output.nbDims; i++) output.d[i] = keep_dims[i]; + } else { + output.nbDims = reduce_dims.size(); + for (int i = 0; i < output.nbDims; i++) output.d[i] = reduce_dims[i]; + } + return output; +} + +nvinfer1::DimsExprs GridSamplerInferMeta( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc) { + const nvinfer1::DimsExprs x_dims = inputs[0]; + const nvinfer1::DimsExprs grid_dims = inputs[1]; + + nvinfer1::DimsExprs output; + if (grid_dims.nbDims == 4) { + output.nbDims = 4; + output.d[0] = x_dims.d[0]; + output.d[1] = x_dims.d[1]; + output.d[2] = grid_dims.d[1]; + output.d[3] = grid_dims.d[2]; + } else { + output.nbDims = 4; + output.d[0] = x_dims.d[0]; + output.d[1] = x_dims.d[1]; + output.d[2] = grid_dims.d[1]; + output.d[3] = grid_dims.d[2]; + output.d[4] = grid_dims.d[3]; + } + return output; +} + +PD_REGISTER_DYNAMIC_INFER_META_FN(gather_nd, GatherNdInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(yolo_box, YoloBoxInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(instance_norm, InstanceNormInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(unfold, UnflodInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(scatter_nd_add, ScatterNdAddInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(inverse, UnchangedInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(pad3d, Pad3dInferMeta); +PD_REGISTER_DYNAMIC_INFER_META_FN(grid_sampler, GridSamplerInferMeta); +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h new file mode 100644 index 00000000000000..0196d81754fdd9 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h @@ -0,0 +1,99 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/platform/macros.h" +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/macros.h" +#include "paddle/utils/flat_hash_map.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +using DynamicMetaFn = + nvinfer1::DimsExprs (*)(int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder, // NOLINT + const framework::OpDesc& op_desc); + +class DynamicMetaFnFactory { + public: + static DynamicMetaFnFactory& Instance() { + static DynamicMetaFnFactory g_meta_fn_map; + return g_meta_fn_map; + } + + bool Contains(const std::string& op_name) const { + return meta_fn_map_.count(op_name) > 0; + } + + void Insert(std::string op_name, DynamicMetaFn infer_meta_fn) { + PADDLE_ENFORCE_NE( + Contains(op_name), + true, + phi::errors::AlreadyExists( + "`%s` op's DynamicInferMetaFn has been registered.", op_name)); + meta_fn_map_.insert({std::move(op_name), std::move(infer_meta_fn)}); + } + + const DynamicMetaFn& Get(const std::string& op_name) const { + auto it = meta_fn_map_.find(op_name); + PADDLE_ENFORCE_NE( + it, + meta_fn_map_.end(), + phi::errors::NotFound( + "`%s` op's DynamicInferMetaFn has been registered.", op_name)); + return it->second; + } + + private: + DynamicMetaFnFactory() = default; + + paddle::flat_hash_map meta_fn_map_; + + DISABLE_COPY_AND_ASSIGN(DynamicMetaFnFactory); +}; + +struct DynamicMetaFnRegistrar { + DynamicMetaFnRegistrar(const char* op_name, DynamicMetaFn infer_meta_fn) { + DynamicMetaFnFactory::Instance().Insert(op_name, std::move(infer_meta_fn)); + } + + static void Touch() {} +}; + +#define PD_REGISTER_DYNAMIC_INFER_META_FN(op_name, dynamic_infer_meta_fn) \ + static paddle::inference::tensorrt::DynamicMetaFnRegistrar \ + registrar_dynamic_infer_meta_fn_for_##op_name(#op_name, \ + dynamic_infer_meta_fn); \ + int TouchDynamicMetaFnRegistrar_##op_name() { \ + registrar_dynamic_infer_meta_fn_for_##op_name.Touch(); \ + return 0; \ + } + +#define USE_TRT_DYNAMIC_INFER_META_FN(op_name) \ + extern int TouchDynamicMetaFnRegistrar_##op_name(); \ + static int use_op_dynamic_infer_meta##op_name UNUSED = \ + TouchDynamicMetaFnRegistrar_##op_name(); + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h new file mode 100644 index 00000000000000..c0ddaf5d983ef8 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +USE_TRT_DYNAMIC_INFER_META_FN(gather_nd); +USE_TRT_DYNAMIC_INFER_META_FN(yolo_box); +USE_TRT_DYNAMIC_INFER_META_FN(instance_norm); +USE_TRT_DYNAMIC_INFER_META_FN(unfold); +USE_TRT_DYNAMIC_INFER_META_FN(scatter_nd_add); +USE_TRT_DYNAMIC_INFER_META_FN(pad3d); +USE_TRT_DYNAMIC_INFER_META_FN(inverse); +USE_TRT_DYNAMIC_INFER_META_FN(grid_sampler); +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/engine.cc b/paddle/fluid/inference/tensorrt/engine.cc index e70a49c685e70a..315b3f84f7c491 100644 --- a/paddle/fluid/inference/tensorrt/engine.cc +++ b/paddle/fluid/inference/tensorrt/engine.cc @@ -231,6 +231,35 @@ void TensorRTEngine::FreezeNetwork() { nvinfer1::OptProfileSelector::kOPT, Vec2TRT_Dims(optim_input_shape_[input.first], input.first, true)); } + + for (int input_id = 0; input_id < network()->getNbInputs(); input_id++) { + auto input_name = network()->getInput(input_id)->getName(); + if (!itensor_map_.count(input_name)) continue; + if (!GetITensor(input_name)->isShapeTensor()) continue; + PADDLE_ENFORCE_EQ(min_shape_tensor_.count(input_name) && + max_shape_tensor_.count(input_name) && + optim_shape_tensor_.count(input_name), + true, + platform::errors::InvalidArgument( + "Fail to find min/max/optim shape value for TRT " + "network's shape tensor input named %s.", + input_name)); + auto min_vec = min_shape_tensor_.at(input_name); + optim_profiles_[i]->setShapeValues(input_name, + nvinfer1::OptProfileSelector::kMIN, + min_vec.data(), + min_vec.size()); + optim_profiles_[i]->setShapeValues(input_name, + nvinfer1::OptProfileSelector::kMAX, + max_shape_tensor_[input_name].data(), + min_vec.size()); + optim_profiles_[i]->setShapeValues( + input_name, + nvinfer1::OptProfileSelector::kOPT, + optim_shape_tensor_[input_name].data(), + min_vec.size()); + } + infer_builder_config_->addOptimizationProfile(optim_profiles_[i]); } if (WithFp16() && disable_trt_plugin_fp16()) { @@ -369,11 +398,47 @@ void TensorRTEngine::SetITensor(const std::string &name, } nvinfer1::ITensor *TensorRTEngine::GetITensor(const std::string &name) { - PADDLE_ENFORCE_EQ(itensor_map_.count(name), - true, - platform::errors::NotFound( - "Tensor named %s is not found in TRT engine", name)); - return itensor_map_[name]; + if (itensor_map_.count(name)) { + return itensor_map_[name]; + } else { + ConvertWeight2ITensor(name); + return itensor_map_[name]; + } +} + +// For cases when input is not middle-tensor , but persistable tensor +// you should call this. +nvinfer1::ITensor *TensorRTEngine::ConvertWeight2ITensor( + const std::string &name) { + auto *var_v = scope_->FindVar(name); + PADDLE_ENFORCE_NOT_NULL( + var_v, + platform::errors::NotFound("You are converting a persistable weight to a " + "tensor, but there is no " + "persistable variable called %s in scope.", + name)); + auto *var_t = var_v->GetMutable(); + auto weight = this->GetTrtWeight(name, *var_t); + + // Now we have create weights, then we need create a itensor + auto var_dims = var_t->dims(); + nvinfer1::Dims trt_in_shape; + trt_in_shape.nbDims = var_t->dims().size(); + for (int64_t i = 0; i < trt_in_shape.nbDims; i++) { + trt_in_shape.d[i] = var_dims[i]; + } + // In fact , this is not always right, because we can't determine if the 0th + // dimension is batch. 
Just for run chenqu's model + if (!this->with_dynamic_shape()) { + trt_in_shape.nbDims--; + for (int i = 0; i < trt_in_shape.nbDims; i++) { + trt_in_shape.d[i] = trt_in_shape.d[i + 1]; + } + } + nvinfer1::ILayer *layer = + TRT_ENGINE_ADD_LAYER(this, Constant, trt_in_shape, weight.get()); + this->SetITensor(name, layer->getOutput(0)); + return layer->getOutput(0); } std::unordered_map @@ -610,9 +675,8 @@ void TensorRTEngine::GetEngineInfo() { LOG(INFO) << "====== engine info ======"; std::unique_ptr infer_inspector( infer_engine_->createEngineInspector()); - auto infer_context = infer_ptr( - infer_engine_->createExecutionContextWithoutDeviceMemory()); - infer_inspector->setExecutionContext(infer_context.get()); + auto infer_context = context(); + infer_inspector->setExecutionContext(infer_context); LOG(INFO) << infer_inspector->getEngineInformation( nvinfer1::LayerInformationFormat::kONELINE); LOG(INFO) << "====== engine info end ======"; diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 861a2aa8dfbd7d..17d8bb35b29d01 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -24,9 +24,9 @@ limitations under the License. */ #include #include #include - #include "NvInferRuntimeCommon.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/inference/api/paddle_analysis_config.h" @@ -214,6 +214,9 @@ class TensorRTEngine { const ShapeMapType min_input_shape = {}, const ShapeMapType max_input_shape = {}, const ShapeMapType optim_input_shape = {}, + const ShapeMapType min_shape_tensor = {}, + const ShapeMapType max_shape_tensor = {}, + const ShapeMapType optim_shape_tensor = {}, bool disable_trt_plugin_fp16 = false, phi::DataType model_precision = phi::DataType::FLOAT32, nvinfer1::ILogger& logger = NaiveLogger::Global()) @@ -225,6 +228,9 @@ class TensorRTEngine { min_input_shape_(min_input_shape), max_input_shape_(max_input_shape), optim_input_shape_(optim_input_shape), + min_shape_tensor_(min_shape_tensor), + max_shape_tensor_(max_shape_tensor), + optim_shape_tensor_(optim_shape_tensor), disable_trt_plugin_fp16_(disable_trt_plugin_fp16), model_precision_(model_precision), logger_(logger) { @@ -283,19 +289,11 @@ class TensorRTEngine { void SetITensor(const std::string& name, nvinfer1::ITensor* tensor); // Get an ITensor called name. 
nvinfer1::ITensor* GetITensor(const std::string& name); + nvinfer1::ITensor* ConvertWeight2ITensor(const std::string& name); std::unordered_map* GetITensorMap(); nvinfer1::ICudaEngine* engine() { return infer_engine_.get(); } nvinfer1::IExecutionContext* context() { -#ifndef PADDLE_WITH_TESTING - PADDLE_ENFORCE_GT( - predictor_id_per_thread, - -1, - platform::errors::InvalidArgument( - "thread local var predictor_id_per_thread must be " - "initialized to >= 0, but now predictor_id_per_thread = %d", - predictor_id_per_thread)); -#endif std::unique_lock lock(mutex_); if (infer_context_.find(predictor_id_per_thread) == infer_context_.end()) { PADDLE_ENFORCE_NOT_NULL( @@ -322,15 +320,6 @@ class TensorRTEngine { int GetProfileIndex() { if (max_profile_num_ > 1) { -#ifndef PADDLE_WITH_TESTING - PADDLE_ENFORCE_GT( - predictor_id_per_thread, - -1, - platform::errors::InvalidArgument( - "thread local var predictor_id_per_thread must be " - "initialized to >= 0, but now predictor_id_per_thread = %d", - predictor_id_per_thread)); -#endif std::unique_lock lock(mutex_); return profile_index_[predictor_id_per_thread]; } else { @@ -349,15 +338,6 @@ class TensorRTEngine { infer_engine_, platform::errors::InvalidArgument( "You should build engine first and then set the context.")); -#ifndef PADDLE_WITH_TESTING - PADDLE_ENFORCE_GT( - predictor_id_per_thread, - -1, - platform::errors::InvalidArgument( - "thread local var predictor_id_per_thread must be " - "initialized to >= 0, but now predictor_id_per_thread = %d", - predictor_id_per_thread)); -#endif std::unique_lock lock(mutex_); infer_context_[predictor_id_per_thread].reset(nullptr); infer_context_.erase(predictor_id_per_thread); @@ -529,6 +509,9 @@ class TensorRTEngine { ShapeMapType min_input_shape() { return min_input_shape_; } ShapeMapType max_input_shape() { return max_input_shape_; } ShapeMapType optim_input_shape() { return optim_input_shape_; } + ShapeMapType min_shape_tensor() { return min_shape_tensor_; } + ShapeMapType max_shape_tensor() { return max_shape_tensor_; } + ShapeMapType optim_shape_tensor() { return optim_shape_tensor_; } bool AdjustDynamicShapeRange(const ShapeMapType& runtime_input_shape, std::vector* changed) { @@ -691,12 +674,15 @@ class TensorRTEngine { void GetEngineInfo(); void SetUseInspector(bool use_inspector) { use_inspector_ = use_inspector; } + void SetScope(const framework::Scope& scope) { scope_ = &scope; } private: // Each ICudaEngine object is bound to a specific GPU when it is instantiated, // ensure that the thread is associated with the correct device by calling // freshDeviceId(). 
void freshDeviceId(); + // Used for convert weight into Itensor + const framework::Scope* scope_; // the max batch size int max_batch_; @@ -717,6 +703,9 @@ class TensorRTEngine { ShapeMapType min_input_shape_; ShapeMapType max_input_shape_; ShapeMapType optim_input_shape_; + ShapeMapType min_shape_tensor_; + ShapeMapType max_shape_tensor_; + ShapeMapType optim_shape_tensor_; bool disable_trt_plugin_fp16_{false}; phi::DataType model_precision_{phi::DataType::FLOAT32}; bool use_varseqlen_{false}; @@ -808,6 +797,9 @@ class TRTEngineManager { const std::map> min_input_shape = {}, const std::map> max_input_shape = {}, const std::map> optim_input_shape = {}, + const std::map> min_shape_tensor = {}, + const std::map> max_shape_tensor = {}, + const std::map> optim_shape_tensor = {}, bool disable_trt_plugin_fp16 = false, phi::DataType model_precision = phi::DataType::FLOAT32, nvinfer1::ILogger& logger = NaiveLogger::Global()) { @@ -819,6 +811,9 @@ class TRTEngineManager { min_input_shape, max_input_shape, optim_input_shape, + min_shape_tensor, + max_shape_tensor, + optim_shape_tensor, disable_trt_plugin_fp16, model_precision, logger); diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 6286010a03b3cb..ecc089e134a3ff 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -18,6 +18,11 @@ #include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/data_layout.h" +#include "paddle/fluid/framework/op_meta_info_helper.h" +#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_factory.h" +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/core/kernel_factory.h" namespace paddle { namespace framework { @@ -33,11 +38,17 @@ namespace tensorrt { struct SimpleOpTypeSetTeller : public Teller { SimpleOpTypeSetTeller() { #if IS_TRT_VERSION_GE(7130) + // use TensorRT plugin teller_set.insert("group_norm"); + teller_set.insert("multiclass_nms3"); + teller_set.insert("multiclass_nms"); + int8_teller_set.insert("multiclass_nms3"); + int8_teller_set.insert("multiclass_nms"); #endif #if IS_TRT_VERSION_GE(7000) teller_set.insert("tile"); teller_set.insert("flatten_contiguous_range"); + int8_teller_set.insert("flatten_contiguous_range"); teller_set.insert("rnn"); int8_teller_set.insert("rnn"); teller_set.insert("fill_constant_batch_size_like"); @@ -57,253 +68,16 @@ struct SimpleOpTypeSetTeller : public Teller { #endif } - bool operator()(const std::string& op_type, - const framework::OpDesc& desc, - bool use_no_calib_int8) override { - if (use_no_calib_int8) { - return int8_teller_set.count(op_type); - } else { - return teller_set.count(op_type); - } - } - - private: - // use this set for no calib int8. 
- std::unordered_set int8_teller_set{ - "mul", - "matmul", - "conv2d", - "conv2d_fusion", - "pool2d", - "relu", - "elu", - "selu", - "softsign", - "softplus", - "stanh", - "thresholded_relu", - "exp", - "log", - "sqrt", - "abs", - "sin", - "cos", - "tan", - "sinh", - "cosh", - "asin", - "acos", - "atan", - "asinh", - "atanh", - "ceil", - "floor", - "erf", - "softmax", - "sigmoid", - "hard_swish", - "depthwise_conv2d", - "batch_norm", - "concat", - "tanh", - "pad", - "elementwise_add", - "elementwise_sub", - "elementwise_mul", - "elementwise_div", - "elementwise_pow", - "equal", - "dropout", - "prelu", - "conv2d_transpose", - "depthwise_conv2d_transpose", - "leaky_relu", - "fc", - "shuffle_channel", - "swish", - "silu", - "split", - "instance_norm", - "gelu", - "layer_norm", - "scale", - "stack", - "transpose2", - "transpose", - "top_k", - "top_k_v2", - "flatten2", - "flatten", - "gather", - "gather_nd", - "yolo_box", - "yolo_box_head", - "arg_max", - "roi_align", - "affine_channel", - "nearest_interp", - "anchor_generator", - "reduce_sum", - "reduce_mean", - "conv3d", - "conv3d_transpose", - "mish", - "nearest_interp_v2", - "bilinear_interp_v2", - "pool3d", - "deformable_conv", - "relu6", - "hard_sigmoid", - "clip", - "fused_embedding_eltwise_layernorm", - "multihead_matmul", - "skip_layernorm", - "slice", - "strided_slice", - "fused_preln_embedding_eltwise_layernorm", - "preln_residual_bias", - "c_allreduce_sum", - "c_allreduce_min", - "c_allreduce_max", - "c_allreduce_prod", - "roll", - "cast", - "preln_skip_layernorm", - "transformer_input_convert", - "recover_padding", - "remove_padding", - "fill_constant", - "sum", - "shape", - "squeeze2", - "unsqueeze2", - "layernorm_shift_partition"}; - std::unordered_set teller_set{ - "mul", - "matmul", - "conv2d", - "conv2d_fusion", - "pool2d", - "relu", - "elu", - "selu", - "softsign", - "softplus", - "stanh", - "thresholded_relu", - "exp", - "log", - "sqrt", - "abs", - "sin", - "cos", - "tan", - "sinh", - "cosh", - "asin", - "acos", - "atan", - "asinh", - "atanh", - "ceil", - "floor", - "erf", - "softmax", - "sigmoid", - "hard_swish", - "depthwise_conv2d", - "batch_norm", - "concat", - "tanh", - "pad", - "elementwise_add", - "elementwise_sub", - "elementwise_mul", - "elementwise_div", - "elementwise_pow", - "equal", - "dropout", - "prelu", - "conv2d_transpose", - "depthwise_conv2d_transpose", - "leaky_relu", - "fc", - "shuffle_channel", - "swish", - "silu", - "split", - "instance_norm", - "gelu", - "layer_norm", - "scale", - "stack", - "transpose2", - "transpose", - "top_k", - "top_k_v2", - "flatten2", - "flatten", - "gather", - "gather_nd", - "yolo_box", - "yolo_box_head", - "arg_max", - "roi_align", - "affine_channel", - "nearest_interp", - "anchor_generator", - "reduce_sum", - "reduce_mean", - "conv3d", - "conv3d_transpose", - "mish", - "bilinear_interp_v2", - "nearest_interp_v2", - "pool3d", - "deformable_conv", - "relu6", - "hard_sigmoid", - "clip", - "fused_embedding_eltwise_layernorm", - "multihead_matmul", - "skip_layernorm", - "slice", - "strided_slice", - "fused_preln_embedding_eltwise_layernorm", - "preln_skip_layernorm", - "preln_residual_bias", - "c_allreduce_sum", - "c_allreduce_min", - "c_allreduce_max", - "c_allreduce_prod", - "roll", - "cast", - "multiclass_nms3", - "transformer_input_convert", - "recover_padding", - "remove_padding", - "fill_constant", - "sum", - "shape", - "squeeze2", - "unsqueeze2", - "fused_token_prune", - "layernorm_shift_partition"}; -}; - -bool OpTeller::Tell(const framework::ir::Node* node, - bool 
use_no_calib_int8, - bool with_dynamic_shape) { - const std::string op_type = node->Op()->Type(); - const framework::OpDesc desc = *node->Op(); - // do not support the op which is labeled the `skip_quant` - if ((desc.HasAttr("namescope") && - PADDLE_GET_CONST(std::string, desc.GetAttr("op_namescope")) == - "/skip_quant_2/") || - desc.HasAttr("skip_quant")) - return false; - - for (auto& teller : tellers_) { + bool operator()(const framework::OpDesc& desc, + bool use_no_calib_int8 = false, + bool with_dynamic_shape = false) override { + const std::string op_type = desc.Type(); + // do not support the op which is labeled the `skip_quant` + if ((desc.HasAttr("namescope") && + PADDLE_GET_CONST(std::string, desc.GetAttr("op_namescope")) == + "/skip_quant_2/") || + desc.HasAttr("skip_quant")) + return false; std::unordered_set act_op_list = { "relu", "relu6", "sigmoid", "elu", "selu", "softsign", @@ -361,7 +135,30 @@ bool OpTeller::Tell(const framework::ir::Node* node, } } + if (op_type == "dropout") { + /* + * Some OpDescs Attribute support both constant value and dynamic + * runtime value (which is a Variable(s) type). But TensorRT maybe + * only support constant value Attribute, so we shall distinguish + * this case in time and return False in OpTeller.Tell(). + * If Attribute is Variable(s), HasAttr() will return False + */ + if (!desc.HasAttr("dropout_prob", /*with_attr_var=*/false)) { + VLOG(3) + << "Skip to convert into TRT while found Attribute('dropout_prob') " + "is Variable type in dropout."; + return false; + } + } + if (op_type == "pool2d") { + // If Attribute is Variable(s), HasAttr() will return False + if (!desc.HasAttr("ksize", /*with_attr_var=*/false)) { + VLOG(3) << "Skip to convert into TRT while found Attribute('ksize') is " + "Variable type in pool2d."; + return false; + } + std::vector paddings = PADDLE_GET_CONST(std::vector, desc.GetAttr("paddings")); if (paddings.size() > 2) { @@ -485,6 +282,15 @@ bool OpTeller::Tell(const framework::ir::Node* node, } } #endif + auto* block = desc.Block(); + if (block) { + auto* filter_var_desc = block->FindVar(desc.Input("Filter")[0]); + if (!filter_var_desc->Persistable()) { + VLOG(3) << "Trt not support filter is a intermediate tensor in " + "conv2d op."; + return false; + } + } } if (op_type == "deformable_conv") { @@ -532,10 +338,30 @@ bool OpTeller::Tell(const framework::ir::Node* node, } } - if (op_type == "matmul") { - auto* block = desc.Block(); - if (block == nullptr) { - VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + if (op_type == "bmm") { + if (!with_dynamic_shape) { + return false; + } + } + + if (op_type == "matmul_v2") { + if (!with_dynamic_shape) { + return false; + } + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " + "Developers need to check whether block_desc is passed in " + "the pass."; + return false; + } + return true; + } + + if (op_type == "matmul") { + auto* block = desc.Block(); + if (block == nullptr) { + VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " "Developers need to check whether block_desc is passed in " "the pass."; return false; @@ -726,6 +552,16 @@ bool OpTeller::Tell(const framework::ir::Node* node, "the pass."; return false; } + + auto index_var_name = desc.Input("Index")[0]; + auto* index_var_desc = block->FindVar(index_var_name); + + // The index input must be int32 datatype. 
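+      // TensorRT's gather layer only accepts INT32 indices, so an int64
+      // Index tensor is rejected here instead of failing later during
+      // conversion.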
+ if (index_var_desc->GetDataType() != + paddle::framework::proto::VarType_Type::VarType_Type_INT32) { + VLOG(3) << "gather op Index input data type must be int32"; + return false; + } #if !IS_TRT_VERSION_GE(7000) auto* x_var_desc = block->FindVar(desc.Input("X")[0]); const auto x_shape = x_var_desc->GetShape(); @@ -794,11 +630,21 @@ bool OpTeller::Tell(const framework::ir::Node* node, } if (op_type == "arg_max") { + if (!desc.HasAttr("axis", /*with_attr_var=*/false)) { + VLOG(3) << "Skip to convert into TRT while found Attribute('axis') is " + "Variable type in arg_max."; + return false; + } + int axis = desc.HasAttr("axis") ? PADDLE_GET_CONST(int64_t, desc.GetAttr("axis")) : -1; - bool flatten = PADDLE_GET_CONST(bool, desc.GetAttr("flatten")); - int dtype = PADDLE_GET_CONST(int, desc.GetAttr("dtype")); + bool flatten = desc.HasAttr("flatten") + ? PADDLE_GET_CONST(bool, desc.GetAttr("flatten")) + : false; + int dtype = desc.HasAttr("dtype") + ? PADDLE_GET_CONST(int, desc.GetAttr("dtype")) + : 3; if (axis == 0 || flatten || dtype != 2) return false; } @@ -824,7 +670,6 @@ bool OpTeller::Tell(const framework::ir::Node* node, } if (op_type == "multiclass_nms" || op_type == "multiclass_nms3") { - if (with_dynamic_shape) return false; auto* block = desc.Block(); if (block == nullptr) { VLOG(3) << "The block desc is nullptr, we can't continue to analyze. " @@ -1058,6 +903,13 @@ bool OpTeller::Tell(const framework::ir::Node* node, } if (op_type == "squeeze2") { + // If Attribute is Variable(s), HasAttr() will return False + if (!desc.HasAttr("axes", /*with_attr_var=*/false)) { + VLOG(3) << "Skip to convert into TRT while found Attribute('axes') is " + "Variable type in squeeze2."; + return false; + } + std::vector axes; if (desc.HasAttr("axes")) { axes = PADDLE_GET_CONST(std::vector, desc.GetAttr("axes")); @@ -1387,7 +1239,8 @@ bool OpTeller::Tell(const framework::ir::Node* node, if (op_type == "elementwise_add" || op_type == "elementwise_mul" || op_type == "elementwise_sub" || op_type == "elementwise_div" || - op_type == "elementwise_pow") { + op_type == "elementwise_pow" || op_type == "elementwise_min" || + op_type == "elementwise_max") { if (desc.Input("X").size() != 1) { VLOG(3) << "The input op's Input(\"X\").size() " "should equal to 1, but received Input(\"X\").size() = " @@ -1859,8 +1712,10 @@ bool OpTeller::Tell(const framework::ir::Node* node, return false; } } else { -#if !IS_TRT_VERSION_GE(8000) - VLOG(3) << "The version of TRT must be greater than 8000"; +#if (IS_TRT_VERSION_GE(8000) && IS_TRT_VERSION_LT(8100)) || \ + (IS_TRT_VERSION_LT(7200)) + VLOG(3) << "There are some bugs in v8.0.* and the versions lower than " + "v7.2 are not supported"; return false; #endif } @@ -1924,13 +1779,13 @@ bool OpTeller::Tell(const framework::ir::Node* node, } if (op_type == "reshape" || op_type == "reshape2") { - if (with_dynamic_shape) { - return true; - } if (!desc.HasAttr("shape")) { return false; } - // Paddle-TRT does not support the input tensors: Shape and ShapeTensor + if (with_dynamic_shape) { + return true; + } + // Static shape does not support the input tensors: Shape and ShapeTensor auto reshape_inputs = desc.Inputs(); if (reshape_inputs.find("Shape") != reshape_inputs.end()) { if (desc.Input("Shape").size() >= 1) { @@ -1999,6 +1854,13 @@ bool OpTeller::Tell(const framework::ir::Node* node, } if (op_type == "reduce_sum" || op_type == "reduce_mean") { + if (!desc.HasAttr("dim", /*with_attr_var=*/false)) { + VLOG(3) << "Skip to convert into TRT while found Attribute('dim') is " + 
"Variable type in " + << desc.Type(); + return false; + } + if (!(desc.HasAttr("keep_dim") && desc.HasAttr("dim") && desc.HasAttr("reduce_all"))) { VLOG(3) << "the " << op_type @@ -2256,12 +2118,348 @@ bool OpTeller::Tell(const framework::ir::Node* node, } } - if ((*teller)(op_type, desc, use_no_calib_int8)) return true; + if (use_no_calib_int8) { + return int8_teller_set.count(op_type); + } else { + return teller_set.count(op_type); + } + } + + private: + // use this set for no calib int8. + std::unordered_set int8_teller_set{ + "mul", + "matmul", + "matmul_v2", + "bmm", + "conv2d", + "conv2d_fusion", + "pool2d", + "relu", + "elu", + "selu", + "softsign", + "softplus", + "stanh", + "thresholded_relu", + "exp", + "log", + "sqrt", + "abs", + "sin", + "cos", + "tan", + "sinh", + "cosh", + "asin", + "acos", + "atan", + "asinh", + "atanh", + "ceil", + "floor", + "erf", + "softmax", + "sigmoid", + "hard_swish", + "depthwise_conv2d", + "batch_norm", + "concat", + "tanh", + "pad", + "elementwise_add", + "elementwise_sub", + "elementwise_mul", + "elementwise_div", + "elementwise_pow", + "elementwise_min", + "elementwise_max", + "equal", + "dropout", + "prelu", + "conv2d_transpose", + "depthwise_conv2d_transpose", + "leaky_relu", + "fc", + "shuffle_channel", + "swish", + "silu", + "split", + "instance_norm", + "gelu", + "layer_norm", + "scale", + "stack", + "transpose2", + "transpose", + "top_k", + "top_k_v2", + "flatten2", + "flatten", + "gather", + "gather_nd", + "yolo_box", + "yolo_box_head", + "arg_max", + "roi_align", + "affine_channel", + "nearest_interp", + "anchor_generator", + "reduce_sum", + "reduce_mean", + "conv3d", + "conv3d_transpose", + "mish", + "nearest_interp_v2", + "bilinear_interp_v2", + "pool3d", + "deformable_conv", + "relu6", + "hard_sigmoid", + "clip", + "fused_embedding_eltwise_layernorm", + "multihead_matmul", + "skip_layernorm", + "slice", + "strided_slice", + "fused_preln_embedding_eltwise_layernorm", + "preln_residual_bias", + "c_allreduce_sum", + "c_allreduce_min", + "c_allreduce_max", + "c_allreduce_prod", + "roll", + "cast", + "preln_skip_layernorm", + "transformer_input_convert", + "recover_padding", + "remove_padding", + "fill_constant", + "sum", + "shape", + "squeeze2", + "unsqueeze2", + "layernorm_shift_partition"}; + std::unordered_set teller_set{ + "mul", + "matmul", + "matmul_v2", + "bmm", + "conv2d", + "conv2d_fusion", + "pool2d", + "relu", + "elu", + "selu", + "softsign", + "softplus", + "stanh", + "thresholded_relu", + "exp", + "log", + "sqrt", + "abs", + "sin", + "cos", + "tan", + "sinh", + "cosh", + "asin", + "acos", + "atan", + "asinh", + "atanh", + "ceil", + "floor", + "erf", + "softmax", + "sigmoid", + "hard_swish", + "depthwise_conv2d", + "batch_norm", + "concat", + "tanh", + "pad", + "elementwise_add", + "elementwise_sub", + "elementwise_mul", + "elementwise_div", + "elementwise_pow", + "elementwise_min", + "elementwise_max", + "equal", + "dropout", + "prelu", + "conv2d_transpose", + "depthwise_conv2d_transpose", + "leaky_relu", + "fc", + "shuffle_channel", + "swish", + "silu", + "split", + "instance_norm", + "gelu", + "layer_norm", + "scale", + "stack", + "transpose2", + "transpose", + "top_k", + "top_k_v2", + "flatten2", + "flatten", + "gather", + "gather_nd", + "yolo_box", + "yolo_box_head", + "arg_max", + "roi_align", + "affine_channel", + "nearest_interp", + "anchor_generator", + "reduce_sum", + "reduce_mean", + "conv3d", + "conv3d_transpose", + "mish", + "bilinear_interp_v2", + "nearest_interp_v2", + "pool3d", + "deformable_conv", + "relu6", + 
"hard_sigmoid", + "clip", + "fused_embedding_eltwise_layernorm", + "multihead_matmul", + "skip_layernorm", + "slice", + "strided_slice", + "fused_preln_embedding_eltwise_layernorm", + "preln_skip_layernorm", + "preln_residual_bias", + "c_allreduce_sum", + "c_allreduce_min", + "c_allreduce_max", + "c_allreduce_prod", + "roll", + "cast", + "transformer_input_convert", + "recover_padding", + "remove_padding", + "fill_constant", + "sum", + "shape", + "squeeze2", + "unsqueeze2", + "fused_token_prune", + "layernorm_shift_partition"}; +}; + +struct GenericPluginTeller : public Teller { + public: + GenericPluginTeller() {} + bool operator()(const framework::OpDesc& desc, + bool use_no_calib_int8 = false, + bool with_dynamic_shape = false) override { + const std::string op_type = desc.Type(); + // only consider dynamic_shape mode + if (!with_dynamic_shape) { + return false; + } + if (op_type == "yolo_box") { + if (!desc.HasAttr("iou_aware") && !desc.HasAttr("iou_aware_factor")) + return false; + } + if (op_type == "pad3d") { + auto pad3d_inputs = desc.Inputs(); + if (pad3d_inputs.find("Paddings") != pad3d_inputs.end()) { + if (desc.Input("Paddings").size() >= 1) { + return false; + } + } + } + if (use_no_calib_int8) { + return false; + } else { + framework::InitDefaultKernelSignatureMap(); + bool res = phi::OpUtilsMap::Instance().HasArgumentMappingFn(op_type) || + phi::DefaultKernelSignatureMap::Instance().Has(op_type); + if (!res) { + VLOG(3) << op_type << " has no KernelSignature"; + return false; + } + res = phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type); + if (!res) { + VLOG(3) << op_type << " has no CompatiblePhiKernel in phi."; + return false; + } + auto& dynamic_infermeta_factory = + tensorrt::DynamicMetaFnFactory::Instance(); + res = dynamic_infermeta_factory.Contains(op_type); + if (!res) { + VLOG(3) << op_type << " has no DynamicMetaFn."; + return false; + } + return true; + } + } +}; + +struct CustomPluginTeller : public Teller { + public: + CustomPluginTeller() {} + bool operator()(const framework::OpDesc& desc, + bool use_no_calib_int8 = false, + bool with_dynamic_shape = false) override { + const std::string op_type = desc.Type(); + std::string expect_plugin_name; + + if (with_dynamic_shape) { + expect_plugin_name = op_type + "_paddle_trt_dynamic_plugin"; + } else { + expect_plugin_name = op_type + "_paddle_trt_plugin"; + } + + int num = 0; + auto creators = GetPluginRegistry()->getPluginCreatorList(&num); + + for (int i = 0; i < num; i++) { + if (std::string(creators[i]->getPluginName()) == expect_plugin_name) + return true; + } + return false; } +}; +bool OpTeller::Tell(const framework::ir::Node* node, + bool use_no_calib_int8, + bool with_dynamic_shape) { + const std::string op_type = node->Op()->Type(); + const framework::OpDesc desc = *node->Op(); + auto& default_teller = GetDefaultTeller(); + if ((*default_teller)(desc, use_no_calib_int8, with_dynamic_shape)) { + SetOpConverterType(op_type, OpConverterType::Default); + return true; + } + auto& generic_plugin_teller = GetGenericPluginTeller(); + if ((*generic_plugin_teller)(desc, use_no_calib_int8, with_dynamic_shape)) { + SetOpConverterType(op_type, OpConverterType::GenericPluginCreater); + return true; + } + auto& custom_plugin_teller = GetCustomPluginTeller(); + if ((*custom_plugin_teller)(desc, use_no_calib_int8, with_dynamic_shape)) { + SetOpConverterType(op_type, OpConverterType::CustomPluginCreater); + return true; + } return false; } -OpTeller::OpTeller() { tellers_.emplace_back(new SimpleOpTypeSetTeller); 
} + +OpTeller::OpTeller() { + tellers_.emplace_back(new tensorrt::SimpleOpTypeSetTeller); + tellers_.emplace_back(new tensorrt::GenericPluginTeller); + tellers_.emplace_back(new tensorrt::CustomPluginTeller); +} } // namespace tensorrt } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/op_teller.h b/paddle/fluid/inference/tensorrt/op_teller.h index 1a6ce092a18b43..2fa3dc361217ed 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.h +++ b/paddle/fluid/inference/tensorrt/op_teller.h @@ -38,9 +38,9 @@ namespace tensorrt { * issues such as op_desc. */ struct Teller { - virtual bool operator()(const std::string& op_type, - const framework::OpDesc& desc, - bool use_no_calib_int8) = 0; + virtual bool operator()(const framework::OpDesc& desc, + bool use_no_calib_int8 = false, + bool with_dynamic_shape = false) = 0; virtual ~Teller() = default; }; @@ -55,9 +55,15 @@ struct Teller { *}; */ +enum class OpConverterType { + Default = 0, + GenericPluginCreater, + CustomPluginCreater +}; /* * class OpTeller helps to tell whether a fluid - * operator can be transformed to a TensorRT layer. + * operator can be transformed to a TensorRT layer + * and use which kind of OpConverter */ class OpTeller { public: @@ -70,11 +76,26 @@ class OpTeller { bool use_no_calib_int8 = false, bool with_dynamic_shape = false); + std::unique_ptr& GetDefaultTeller() { return tellers_.at(0); } + + std::unique_ptr& GetGenericPluginTeller() { return tellers_.at(1); } + + std::unique_ptr& GetCustomPluginTeller() { return tellers_.at(2); } + + void SetOpConverterType(std::string name, OpConverterType type) { + op_converter_type_map_[name] = type; + } + + const std::map& GetOpConverterTypeMap() const { + return op_converter_type_map_; + } + private: OpTeller(); private: std::vector> tellers_; + std::map op_converter_type_map_; }; } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index f602714f21150b..9fe02cd731d828 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -32,7 +32,8 @@ list( c_allreduce_op_plugin.cu preln_residual_bias_plugin.cu fused_token_prune_op_plugin.cu - layernorm_shift_partition_op.cu) + layernorm_shift_partition_op.cu + generic_plugin.cu) if(CUSPARSELT_FOUND AND ${TENSORRT_MAJOR_VERSION} GREATER_EQUAL 8) list(APPEND TRT_FILES spmm_plugin.cu) @@ -41,7 +42,13 @@ endif() nv_library( tensorrt_plugin SRCS ${TRT_FILES} - DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) + DEPS enforce + tensorrt_engine + prelu + tensor + bert_encoder_functor + tensorrt_dynamic_shape_infermeta_factory + tensorrt_plugin_arg_mapping_context) nv_test( test_split_plugin diff --git a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu new file mode 100644 index 00000000000000..f335c63fa36614 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.cu @@ -0,0 +1,501 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/tensorrt/plugin/generic_plugin.h" +#include "paddle/fluid/framework/framework.pb.h" +#include "paddle/fluid/framework/op_kernel_type.h" +#include "paddle/fluid/framework/phi_utils.h" +#include "paddle/fluid/inference/tensorrt/dynamic_shape_infermeta_registry.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/phi/core/kernel_context.h" +#include "paddle/phi/core/kernel_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +void BuildPhiKernelContextAttr(const framework::OpDesc& op_desc, + phi::KernelContext* kernel_context, + const phi::KernelSignature& signature, + const phi::Kernel& phi_kernel) { + const phi::KernelArgsDef& args_def = phi_kernel.args_def(); + const auto& attr_names = signature.attr_names; + const auto& attr_defs = args_def.attribute_defs(); + + PADDLE_ENFORCE_EQ( + attr_names.size(), + attr_defs.size(), + platform::errors::InvalidArgument( + "The attr_names.size() should be equal to attr_defs.size().")); + + framework::AttrReader attr_reader(op_desc.GetAttrMap()); + + for (size_t k = 0; k < attr_names.size(); ++k) { + auto attr_name = attr_names[k]; + auto* attr_ptr = attr_reader.GetAttr(attr_name); + if (attr_ptr) { + switch (attr_defs[k].type_index) { + case phi::AttributeType::SCALAR: { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::FLOAT: + return kernel_context->EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(float, attr))); + break; + case framework::proto::AttrType::INT: + return kernel_context->EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(int, attr))); + break; + case framework::proto::AttrType::STRING: + return kernel_context->EmplaceBackAttr( + phi::Scalar(PADDLE_GET_CONST(std::string, attr))); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to Scalar when " + "ProtoAttr2PhiAttr.", + attr_name)); + } + } break; + + case phi::AttributeType::INT_ARRAY: { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::INTS: + kernel_context->EmplaceBackAttr(std::move( + phi::IntArray(PADDLE_GET_CONST(std::vector, attr)))); + break; + case framework::proto::AttrType::LONGS: + kernel_context->EmplaceBackAttr(std::move( + phi::IntArray(PADDLE_GET_CONST(std::vector, attr)))); + break; + case framework::proto::AttrType::INT: + kernel_context->EmplaceBackAttr( + phi::IntArray({PADDLE_GET_CONST(int, attr)})); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to IntArray when " + "ProtoAttr2PhiAttr.", + attr_name)); + } + } break; + + case phi::AttributeType::SCALARS: { + auto& attr = *attr_ptr; + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::INTS: { + const auto& vec = PADDLE_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + case 
framework::proto::AttrType::LONGS: { + const auto& vec = PADDLE_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::FLOATS: { + const auto& vec = PADDLE_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + case framework::proto::AttrType::FLOAT64S: { + const auto& vec = PADDLE_GET_CONST(std::vector, attr); + std::vector scalar_list; + scalar_list.reserve(vec.size()); + for (const auto& val : vec) { + scalar_list.emplace_back(val); + } + kernel_context->EmplaceBackAttr(std::move(scalar_list)); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector when " + "ProtoAttr2PhiAttr.", + attr_name)); + } + } break; + + default: { + auto& attr = *attr_ptr; + switch (attr_defs[k].type_index) { + case phi::AttributeType::FLOAT32: + kernel_context->EmplaceBackAttr(PADDLE_GET_CONST(float, attr)); + break; + case phi::AttributeType::INT32: + kernel_context->EmplaceBackAttr(PADDLE_GET_CONST(int, attr)); + break; + case phi::AttributeType::BOOL: + kernel_context->EmplaceBackAttr(PADDLE_GET_CONST(bool, attr)); + break; + case phi::AttributeType::INT64: + kernel_context->EmplaceBackAttr(PADDLE_GET_CONST(int64_t, attr)); + break; + case phi::AttributeType::INT32S: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::DATA_TYPE: { + auto data_type = paddle::framework::TransToPhiDataType( + static_cast( + PADDLE_GET_CONST(int, attr))); + kernel_context->EmplaceBackAttr(data_type); + } break; + case phi::AttributeType::STRING: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::string, attr)); + break; + case phi::AttributeType::INT64S: + switch (AttrTypeID(attr)) { + case framework::proto::AttrType::LONGS: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::vector, attr)); + break; + case framework::proto::AttrType::INTS: { + const auto& vector_int_attr = + PADDLE_GET_CONST(std::vector, attr); + const std::vector vector_int64_attr( + vector_int_attr.begin(), vector_int_attr.end()); + kernel_context->EmplaceBackAttr(vector_int64_attr); + } break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` to vector " + "when ProtoAttr2PhiAttr.", + attr_name)); + } + break; + case phi::AttributeType::FLOAT32S: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::STRINGS: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::BOOLS: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::vector, attr)); + break; + case phi::AttributeType::FLOAT64S: + kernel_context->EmplaceBackAttr( + PADDLE_GET_CONST(std::vector, attr)); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported cast op attribute `%s` when construct " + "ProtoAttr2PhiAttr.", + attr_name)); + } + } + } + } + } + CHECK_EQ(attr_names.size(), kernel_context->AttrsSize()); +} + +GenericPlugin::GenericPlugin( + const paddle::framework::proto::OpDesc& proto_op_desc, + const InputOutPutVarInfo& in_out_info) { + proto_op_desc_ = proto_op_desc; + op_desc_ = 
std::move(framework::OpDesc(proto_op_desc_, nullptr)); + proto_op_desc_.SerializeToString(&op_meta_data_); + inputs_data_type_ = in_out_info.inputs_data_type; + outputs_data_type_ = in_out_info.outputs_data_type; +} + +GenericPlugin::GenericPlugin( + const paddle::framework::proto::OpDesc& proto_op_desc, + const std::vector& inputs_data_type, + const std::vector& outputs_data_type) { + proto_op_desc_ = proto_op_desc; + op_desc_ = std::move(framework::OpDesc(proto_op_desc_, nullptr)); + proto_op_desc_.SerializeToString(&op_meta_data_); + inputs_data_type_ = inputs_data_type; + outputs_data_type_ = outputs_data_type; +} + +GenericPlugin::GenericPlugin(void const* serial_data, size_t serial_length) { + DeserializeValue(&serial_data, &serial_length, &inputs_data_type_); + DeserializeValue(&serial_data, &serial_length, &outputs_data_type_); + std::string op_meta_data((char*)(serial_data), serial_length); // NOLINT + op_meta_data_ = std::move(op_meta_data); + proto_op_desc_.ParseFromString(op_meta_data_); + op_desc_ = std::move(framework::OpDesc(proto_op_desc_, nullptr)); +} + +int GenericPlugin::getNbOutputs() const TRT_NOEXCEPT { + int res = 0; + for (auto& i : op_desc_.Outputs()) { + if (!i.second.empty()) res += i.second.size(); + } + return res; +} + +int GenericPlugin::getNbInputs() const TRT_NOEXCEPT { + int res = 0; + for (auto& i : op_desc_.Inputs()) { + if (!i.second.empty()) res += i.second.size(); + } + return res; +} + +nvinfer1::IPluginV2DynamicExt* GenericPlugin::clone() const TRT_NOEXCEPT { + nvinfer1::IPluginV2DynamicExt* plugin = + new GenericPlugin(proto_op_desc_, inputs_data_type_, outputs_data_type_); + plugin->initialize(); + return plugin; +} + +void GenericPlugin::serialize(void* buffer) const TRT_NOEXCEPT { + // inputs_data_type_ + SerializeValue(&buffer, inputs_data_type_); + // outputs_data_type_ + SerializeValue(&buffer, outputs_data_type_); + // serialize op_meta_data_ + std::memcpy(buffer, op_meta_data_.c_str(), op_meta_data_.size()); + reinterpret_cast(buffer) += op_meta_data_.size(); +} + +bool GenericPlugin::supportsFormatCombination( + int pos, + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, + int nb_outputs) TRT_NOEXCEPT { + if (op_desc_.Type() == "gather_nd" || op_desc_.Type() == "yolo_box") { + if (pos == 0) + return (in_out[pos].type == nvinfer1::DataType::kFLOAT) && + (in_out[pos].format == nvinfer1::TensorFormat::kLINEAR); + if (pos == 1) + return (in_out[pos].type == nvinfer1::DataType::kINT32) && + (in_out[pos].format == nvinfer1::TensorFormat::kLINEAR); + if (pos == 2) + return (in_out[pos].type == nvinfer1::DataType::kFLOAT) && + (in_out[pos].format == nvinfer1::TensorFormat::kLINEAR); + } else if (op_desc_.Type() == "scatter_nd_add") { + if (pos == 0) + return (in_out[pos].type == nvinfer1::DataType::kFLOAT) && + (in_out[pos].format == nvinfer1::TensorFormat::kLINEAR); + if (pos == 1) + return (in_out[pos].type == nvinfer1::DataType::kINT32) && + (in_out[pos].format == nvinfer1::TensorFormat::kLINEAR); + if (pos == 2) + return (in_out[pos].type == nvinfer1::DataType::kFLOAT) && + (in_out[pos].format == nvinfer1::TensorFormat::kLINEAR); + if (pos == 3) + return (in_out[pos].type == nvinfer1::DataType::kFLOAT) && + (in_out[pos].format == nvinfer1::TensorFormat::kLINEAR); + } else { + return (in_out[pos].type == nvinfer1::DataType::kFLOAT) && + (in_out[pos].format == nvinfer1::TensorFormat::kLINEAR); + } +} + +nvinfer1::DataType GenericPlugin::getOutputDataType( + int index, + const nvinfer1::DataType* input_types, + int nb_inputs) const 
TRT_NOEXCEPT { + return input_types[0]; +} + +int GenericPlugin::initialize() TRT_NOEXCEPT { + std::string op_type = op_desc_.Type(); + + phi::KernelSignature phi_kernel_signature; + if (phi::OpUtilsMap::Instance().HasArgumentMappingFn(op_type)) { + const phi::ArgumentMappingFn* argument_mapping_func = + phi::OpUtilsMap::Instance().GetArgumentMappingFn(op_type); + PluginArgumentMappingContext argument_mapping_context(&op_desc_); + phi_kernel_signature = (*argument_mapping_func)(argument_mapping_context); + } else { + phi_kernel_signature = + phi::DefaultKernelSignatureMap::Instance().Get(op_type); + } + + phi::KernelKey phi_kernel_key( + phi::Backend::GPU, phi::DataLayout::ANY, phi::DataType::FLOAT32); + + PADDLE_ENFORCE_EQ( + phi::KernelFactory::Instance().HasCompatiblePhiKernel(op_type), + true, + platform::errors::Fatal("%s has no compatible phi kernel!", + op_type.c_str())); + + const phi::Kernel& phi_kernel = phi::KernelFactory::Instance().SelectKernel( + phi_kernel_signature.name, phi_kernel_key); + phi_kernel_ = &phi_kernel; + + PADDLE_ENFORCE_EQ(phi_kernel_->IsValid(), + true, + platform::errors::Fatal("%s phi kernel is invalid!.", + phi_kernel_signature.name)); + + paddle::platform::DeviceContextPool& pool = + paddle::platform::DeviceContextPool::Instance(); + platform::CUDAPlace place(platform::GetCurrentDeviceId()); + auto* dev_ctx = static_cast(pool.Get(place)); + + if (!phi_kernel_context_) { + phi_kernel_context_ = new phi::KernelContext(dev_ctx); + BuildPhiKernelContextAttr( + op_desc_, phi_kernel_context_, phi_kernel_signature, phi_kernel); + } + if (!dense_tensor_inputs_) + dense_tensor_inputs_ = new std::vector(getNbInputs()); + if (!dense_tensor_outputs_) + dense_tensor_outputs_ = new std::vector(getNbOutputs()); + + return 0; +} + +nvinfer1::DimsExprs GenericPlugin::getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) TRT_NOEXCEPT { + CHECK(output_index < getNbOutputs()); + auto& dynamic_infermeta_factory = tensorrt::DynamicMetaFnFactory::Instance(); + PADDLE_ENFORCE_EQ(dynamic_infermeta_factory.Contains(op_desc_.Type()), + true, + platform::errors::InvalidArgument( + "The %s op has no dynamic plugin infershape function!", + op_desc_.Type().c_str())); + + auto* infershape_func = dynamic_infermeta_factory.Get(op_desc_.Type()); + return infershape_func( + output_index, inputs, nb_inputs, expr_builder, op_desc_); +} + +void GenericPlugin::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, + int nb_inputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nb_outputs) TRT_NOEXCEPT { + CHECK(phi_kernel_context_); + CHECK(phi_kernel_); + CHECK(nb_inputs == getNbInputs()); + CHECK(nb_outputs == getNbOutputs()); +} + +// Shutdown the layer. 
This is called when the engine is destroyed +void GenericPlugin::terminate() TRT_NOEXCEPT { + delete phi_kernel_context_; + delete dense_tensor_inputs_; + delete dense_tensor_outputs_; +} + +int GenericPlugin::enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT { + platform::CUDAPlace place(platform::GetCurrentDeviceId()); + + // [TODO]now generic plugin do not support FP16 and INT8 precision + auto protoType2PhiType = [](int proto_type) -> std::pair { + if (proto_type == + static_cast(framework::proto::VarType_Type::VarType_Type_FP32)) + return {phi::DataType::FLOAT32, sizeof(float)}; + else if (proto_type == + static_cast( + framework::proto::VarType_Type::VarType_Type_INT64) || + proto_type == + static_cast( + framework::proto::VarType_Type::VarType_Type_INT32)) + return {phi::DataType::INT32, sizeof(int32_t)}; + else if (proto_type == + static_cast( + framework::proto::VarType_Type::VarType_Type_BOOL)) + return {phi::DataType::BOOL, sizeof(bool)}; + else + CHECK(false) << "precision is not supported"; + }; + + // input + phi_kernel_context_->ClearInputOutput(); + + for (int i = 0; i < getNbInputs(); i++) { + auto const& input_dims = input_desc[i].dims; + + std::vector input_shape; + for (int j = 0; j < input_dims.nbDims; j++) + input_shape.push_back(input_dims.d[j]); + + int input_numel = 1; + for (int k = 0; k < input_shape.size(); k++) input_numel *= input_shape[k]; + + auto data_type_and_size = protoType2PhiType(inputs_data_type_[i]); + phi::DenseTensorMeta input_meta(data_type_and_size.first, + phi::make_ddim(input_shape)); + std::shared_ptr input_alloc( + new phi::Allocation((void*)(inputs[i]), // NOLINT + input_numel * data_type_and_size.second, + place)); + (*dense_tensor_inputs_)[i] = + std::move(phi::DenseTensor(input_alloc, input_meta)); + phi_kernel_context_->EmplaceBackInput(&((*dense_tensor_inputs_)[i])); + } + + // output + for (int i = 0; i < getNbOutputs(); i++) { + auto const& output_dims = output_desc[i].dims; + + std::vector output_shape; + for (int j = 0; j < output_dims.nbDims; j++) + output_shape.push_back(output_dims.d[j]); + + int output_numel = 1; + for (int k = 0; k < output_shape.size(); k++) + output_numel *= output_shape[k]; + + auto data_type_and_size = protoType2PhiType(inputs_data_type_[i]); + phi::DenseTensorMeta output_meta(data_type_and_size.first, + phi::make_ddim(output_shape)); + std::shared_ptr output_alloc( + new phi::Allocation(reinterpret_cast(outputs[i]), + output_numel * data_type_and_size.second, + place)); + phi::DenseTensor output_densetonsor(output_alloc, output_meta); + (*dense_tensor_outputs_)[i] = + std::move(phi::DenseTensor(output_alloc, output_meta)); + phi_kernel_context_->EmplaceBackOutput(&((*dense_tensor_outputs_)[i])); + } + + CHECK_EQ(phi_kernel_context_->InputsSize(), getNbInputs()); + CHECK_EQ(phi_kernel_context_->OutputsSize(), getNbOutputs()); + + (*phi_kernel_)(phi_kernel_context_); + + return cudaGetLastError() != cudaSuccess; +} + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/generic_plugin.h b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.h new file mode 100644 index 00000000000000..5705078ffa4412 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/generic_plugin.h @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/type_defs.h" +#include "paddle/fluid/inference/tensorrt/engine.h" +#include "paddle/fluid/inference/tensorrt/helper.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h" +#include "paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h" +#include "paddle/fluid/memory/allocation/cuda_allocator.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_context.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +void BuildPhiKernelContextAttr(const framework::OpDesc& op_desc, + phi::KernelContext* kernel_context, + const phi::KernelSignature& signature, + const phi::Kernel& phi_kernel); + +class GenericPlugin : public DynamicPluginTensorRT { + public: + struct InputOutPutVarInfo { + std::vector inputs_data_type; + std::vector outputs_data_type; + }; + + public: + GenericPlugin() {} + + GenericPlugin(const paddle::framework::proto::OpDesc& proto_op_desc, + const InputOutPutVarInfo& in_out_info); + + GenericPlugin(const paddle::framework::proto::OpDesc& proto_op_desc, + const std::vector& inputs_data_type, + const std::vector& outputs_data_type); + + // It was used for tensorrt deserialization. + // It should not be called by users. + GenericPlugin(void const* serialData, size_t serialLength); + + // IPluginV2 method + const char* getPluginType() const TRT_NOEXCEPT override { + return "generic_plugin"; + } + + int getNbOutputs() const TRT_NOEXCEPT override; + + int getNbInputs() const TRT_NOEXCEPT; + + // Initialize the layer for execution. + int initialize() TRT_NOEXCEPT override; + + // Shutdown the layer. 
This is called when the engine is destroyed + void terminate() TRT_NOEXCEPT override; + + void destroy() TRT_NOEXCEPT{}; + + size_t getSerializationSize() const TRT_NOEXCEPT { + return op_meta_data_.size() + SerializedSize(inputs_data_type_) + + SerializedSize(outputs_data_type_); + } + + void serialize(void* buffer) const TRT_NOEXCEPT; + + // The Func in IPluginV2 + nvinfer1::IPluginV2DynamicExt* clone() const TRT_NOEXCEPT; + + nvinfer1::DimsExprs getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) // NOLINT + TRT_NOEXCEPT; + + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* in_out, + int nb_inputs, + int nb_outputs) TRT_NOEXCEPT; + + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nb_inputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nb_outputs) TRT_NOEXCEPT; + + int enqueue(const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, + const void* const* inputs, + void* const* outputs, + void* workspace, + cudaStream_t stream) TRT_NOEXCEPT; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* input_types, + int nb_inputs) const TRT_NOEXCEPT; + + private: + std::string op_meta_data_; + framework::proto::OpDesc proto_op_desc_; + framework::OpDesc op_desc_; + + private: + const phi::Kernel* phi_kernel_{nullptr}; + + phi::KernelContext* phi_kernel_context_{nullptr}; + std::vector* dense_tensor_inputs_{nullptr}; + std::vector* dense_tensor_outputs_{nullptr}; + + private: + InputOutPutVarInfo in_out_info_; + std::vector inputs_data_type_; + std::vector outputs_data_type_; +}; + +class GenericPluginCreator : public TensorRTPluginCreator { + public: + const char* getPluginName() const TRT_NOEXCEPT override { + return "generic_plugin"; + } + + const char* getPluginVersion() const TRT_NOEXCEPT override { return "1"; } + + nvinfer1::IPluginV2DynamicExt* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) + TRT_NOEXCEPT override { + return new GenericPlugin(serial_data, serial_length); + } +}; +REGISTER_TRT_PLUGIN_V2(GenericPluginCreator); + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu b/paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu index 0605b392163a6b..ca59d4e9daeee3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu +++ b/paddle/fluid/inference/tensorrt/plugin/layernorm_shift_partition_op.cu @@ -1,4 +1,5 @@ // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -91,8 +92,12 @@ __global__ void layernorm_shift_partition(T *out, float mean = 0.0f; float variance = 0.0f; +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) float local_out = (tid < n) ? static_cast(__ldg(input + bid * n + tid)) : 0.0f; +#else + float local_out = (tid < n) ? 
static_cast(input[bid * n + tid]) : 0.0f; +#endif mean = blockReduceSum(local_out); if (threadIdx.x == 0) { @@ -108,14 +113,20 @@ __global__ void layernorm_shift_partition(T *out, __syncthreads(); if (tid < n) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) out[output_bid * n + tid] = (T)(((local_out - s_mean) * rsqrtf(s_variance)) * static_cast(__ldg(&gamma[tid])) + static_cast(__ldg(&beta[tid]))); +#else + out[output_bid * n + tid] = + (T)(((local_out - s_mean) * rsqrtf(s_variance)) * + static_cast(gamma[tid]) + + static_cast(beta[tid])); +#endif } } -#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) template <> __global__ void layernorm_shift_partition(half2 *out_ptr, const half2 *input_ptr, @@ -128,6 +139,7 @@ __global__ void layernorm_shift_partition(half2 *out_ptr, int shift_size, int window_size, const float eps) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) const int batch_offset = blockIdx.z * gridDim.y * gridDim.x; const int bid = batch_offset + blockIdx.y * gridDim.x + blockIdx.x; const int shifted_H_idx = @@ -184,8 +196,8 @@ __global__ void layernorm_shift_partition(half2 *out_ptr, (local_out_fp2.y - s_mean) * s_variance * gamma_val.y + beta_val.y; out_ptr[output_bid * n + tid] = __float22half2_rn(local_out_fp2); } -} #endif +} #define kITE 4 template @@ -232,7 +244,11 @@ __global__ void layernorm_shift_partition_v2(T *out, for (int i = 0; i < kITE; i++) { int col_id = i * blockDim.x + tid; if (col_id < n) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) local_out[i] = static_cast(__ldg(input + offset + col_id)); +#else + local_out[i] = static_cast(input[offset + col_id]); +#endif sum += local_out[i]; } } @@ -264,15 +280,20 @@ __global__ void layernorm_shift_partition_v2(T *out, for (int i = 0; i < kITE; i++) { int col_id = i * blockDim.x + tid; if (col_id < n) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) out[output_offset + col_id] = (T)(local_out[i] * s_variance * static_cast(__ldg(&gamma[col_id])) + static_cast(__ldg(&beta[col_id]))); +#else + out[output_offset + col_id] = + (T)(local_out[i] * s_variance * static_cast(gamma[col_id]) + + static_cast(beta[col_id])); +#endif } } } -#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) template <> __global__ void layernorm_shift_partition_v2(half2 *out_ptr, const half2 *__restrict input_ptr, @@ -285,6 +306,7 @@ __global__ void layernorm_shift_partition_v2(half2 *out_ptr, int shift_size, int window_size, const float eps) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) // constexpr int ite = 4; const int tid = threadIdx.x; const int batch_offset = blockIdx.z * gridDim.y * gridDim.x; @@ -358,8 +380,8 @@ __global__ void layernorm_shift_partition_v2(half2 *out_ptr, __ldg(&beta_ptr[col_id]); } } -} #endif +} template void invokeLayernormShiftPartition(T *out, diff --git a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h index df404ae3e10e24..433ff37aac7bb8 100644 --- a/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/mish_op_plugin.h @@ -125,10 +125,11 @@ class MishPluginDynamic : public DynamicPluginTensorRT { size_t getSerializationSize() const TRT_NOEXCEPT override; void serialize(void* buffer) const TRT_NOEXCEPT override; - nvinfer1::DimsExprs getOutputDimensions(int output_index, - const nvinfer1::DimsExprs* inputs, - int nb_inputs, - nvinfer1::IExprBuilder& expr_builder) + nvinfer1::DimsExprs getOutputDimensions( + int output_index, + const nvinfer1::DimsExprs* inputs, + int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) // 
NOLINT TRT_NOEXCEPT override; bool supportsFormatCombination(int pos, diff --git a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu index 21eb89d135efa6..52959d8f90b6d1 100644 --- a/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/pool_op_plugin.cu @@ -321,16 +321,16 @@ int PoolPluginDynamic::enqueue(const nvinfer1::PluginTensorDesc *input_desc, paddings[1] = 0; output_shape[2] = 1; output_shape[3] = 1; + if (adaptive_) { + output_shape[2] = h; + output_shape[3] = w; + } } else { auto data_dim = CalcOutputSize( {h, w}, ceil_mode_, adaptive_, ksize_, strides_, paddings_); output_shape[2] = data_dim[0]; output_shape[3] = data_dim[1]; } - if (adaptive_) { - output_shape[2] = h; - output_shape[3] = w; - } if (pool_type_ == "max") { phi::funcs::MaxPool pool_process; diff --git a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu index 3963b48a26c6c7..c6be871709452a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/recover_padding_plugin.cu @@ -118,7 +118,28 @@ int RecoverPaddingPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const int32_t* input1 = static_cast(inputs[1]); // pos_id_tensor float* output = static_cast(outputs[0]); - const int32_t num_threads = 256; + int32_t num_threads; + if (input0_desc.dims.d[1] % 512 == 0) { + num_threads = 512; + } else if (input0_desc.dims.d[1] % 256 == 0) { + num_threads = 256; + } else if (input0_desc.dims.d[1] % 128 == 0) { + num_threads = 128; + } else if (input0_desc.dims.d[1] % 64 == 0) { + num_threads = 64; + } else if (input0_desc.dims.d[1] % 32 == 0) { + num_threads = 32; + } else if (input0_desc.dims.d[1] % 16 == 0) { + num_threads = 16; + } else if (input0_desc.dims.d[1] % 8 == 0) { + num_threads = 8; + } else if (input0_desc.dims.d[1] % 4 == 0) { + num_threads = 4; + } else if (input0_desc.dims.d[1] % 2 == 0) { + num_threads = 2; + } else { + num_threads = 1; + } const dim3 num_blocks( input1_desc.dims.d[0] - 1, input2_desc.dims.d[1], diff --git a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu index 418ecb015784fe..9f1a1d6d2c109a 100644 --- a/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/remove_padding_plugin.cu @@ -110,10 +110,29 @@ int RemovePaddingPlugin::enqueue(const nvinfer1::PluginTensorDesc* inputDesc, const int32_t* input1 = static_cast(inputs[1]); // pos_id_tensor float* output = static_cast(outputs[0]); - const auto input0_desc = inputDesc[0]; - - const int32_t num_threads = 256; + int32_t num_threads; + if (input0_desc.dims.d[2] % 512 == 0) { + num_threads = 512; + } else if (input0_desc.dims.d[2] % 256 == 0) { + num_threads = 256; + } else if (input0_desc.dims.d[2] % 128 == 0) { + num_threads = 128; + } else if (input0_desc.dims.d[2] % 64 == 0) { + num_threads = 64; + } else if (input0_desc.dims.d[2] % 32 == 0) { + num_threads = 32; + } else if (input0_desc.dims.d[2] % 16 == 0) { + num_threads = 16; + } else if (input0_desc.dims.d[2] % 8 == 0) { + num_threads = 8; + } else if (input0_desc.dims.d[2] % 4 == 0) { + num_threads = 4; + } else if (input0_desc.dims.d[2] % 2 == 0) { + num_threads = 2; + } else { + num_threads = 1; + } const dim3 num_blocks( input0_desc.dims.d[0], input0_desc.dims.d[1], diff --git 
a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h index 8c105230d27d4f..f08a8a75ba4067 100644 --- a/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/trt_plugin.h @@ -38,6 +38,13 @@ namespace inference { namespace tensorrt { namespace plugin { +#if defined(_WIN32) +#define UNUSED +#define __builtin_expect(EXP, C) (EXP) +#else +#define UNUSED __attribute__((unused)) +#endif + class PluginTensorRT; typedef std::function @@ -372,6 +379,26 @@ class TensorRTPluginCreator : public nvinfer1::IPluginCreator { std::vector plugin_attributes_; }; +class TrtPluginRegistry { + public: + static TrtPluginRegistry* Global() { + static TrtPluginRegistry registry; + return ®istry; + } + bool Regist(const std::string& name, const std::function& func) { + map.emplace(name, func); + return true; + } + void RegistToTrt() { + for (auto& it : map) { + it.second(); + } + } + + private: + std::unordered_map> map; +}; + template class TrtPluginRegistrarV2 { public: @@ -386,9 +413,14 @@ class TrtPluginRegistrarV2 { T creator; }; -#define REGISTER_TRT_PLUGIN_V2(name) \ - static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2 \ - plugin_registrar_##name {} +#define REGISTER_TRT_PLUGIN_V2(name) REGISTER_TRT_PLUGIN_V2_HELPER(name) + +#define REGISTER_TRT_PLUGIN_V2_HELPER(name) \ + UNUSED static bool REGISTER_TRT_PLUGIN_V2_HELPER##name = \ + TrtPluginRegistry::Global()->Regist(#name, []() -> void { \ + static paddle::inference::tensorrt::plugin::TrtPluginRegistrarV2 \ + plugin_registrar_##name{}; \ + }); } // namespace plugin } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc new file mode 100644 index 00000000000000..a76e5310ddf9f0 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.cc @@ -0,0 +1,136 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
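+
+// PluginArgumentMappingContext adapts a framework::OpDesc so that phi's
+// ArgumentMappingFn can be evaluated at conversion time, when no runtime
+// scope exists: input/output presence, sizes and attributes are answered
+// from the OpDesc, while the tensor-type queries conservatively return false.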
+ +#include "paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +bool PluginArgumentMappingContext::HasInput(const std::string& name) const { + auto inputs = op_desc_ptr_->Inputs(); + for (auto& i : inputs) { + if (i.first == name && !i.second.empty()) return true; + } + return false; +} + +bool PluginArgumentMappingContext::HasOutput(const std::string& name) const { + auto outputs = op_desc_ptr_->Outputs(); + for (auto& i : outputs) { + if (i.first == name && !i.second.empty()) return true; + } + return false; +} + +bool PluginArgumentMappingContext::HasAttr(const std::string& name) const { + return op_desc_ptr_->HasAttr(name); +} + +paddle::any PluginArgumentMappingContext::Attr( + const std::string& attr_name) const { + auto attr_type = op_desc_ptr_->GetAttrType(attr_name); + switch (attr_type) { + case framework::proto::AttrType::INT: { + return PADDLE_GET_CONST(int, op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::FLOAT: { + return PADDLE_GET_CONST(float, op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::STRING: { + return PADDLE_GET_CONST(std::string, op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::INTS: { + return PADDLE_GET_CONST(std::vector, + op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::FLOATS: { + return PADDLE_GET_CONST(std::vector, + op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::STRINGS: { + return PADDLE_GET_CONST(std::vector, + op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::BOOLEAN: { + return PADDLE_GET_CONST(bool, op_desc_ptr_->GetAttr(attr_name)); + break; + }; + case framework::proto::AttrType::BOOLEANS: { + return PADDLE_GET_CONST(std::vector, + op_desc_ptr_->GetAttr(attr_name)); + break; + }; + default: { + LOG(ERROR) << "Can't conver op's attribute [" << attr_name + << "] to paddle any."; + } + } + return paddle::any(); +} + +size_t PluginArgumentMappingContext::InputSize(const std::string& name) const { + return op_desc_ptr_->Inputs().at(name).size(); +} +size_t PluginArgumentMappingContext::OutputSize(const std::string& name) const { + return op_desc_ptr_->Outputs().at(name).size(); +} +bool PluginArgumentMappingContext::IsDenseTensorInput( + const std::string& name) const { + return false; +} +bool PluginArgumentMappingContext::IsDenseTensorInputs( + const std::string& name) const { + return false; +} +bool PluginArgumentMappingContext::IsSelectedRowsInput( + const std::string& name) const { + return false; +} + +bool PluginArgumentMappingContext::IsSparseCooTensorInput( + const std::string& name) const { + return false; +} +bool PluginArgumentMappingContext::IsSparseCsrTensorInput( + const std::string& name) const { + return false; +} + +bool PluginArgumentMappingContext::IsSelectedRowsInputs( + const std::string& name) const { + return false; +} +bool PluginArgumentMappingContext::IsDenseTensorVectorInput( + const std::string& name) const { + return false; +} + +bool PluginArgumentMappingContext::IsDenseTensorOutput( + const std::string& name) const { + return false; +} +bool PluginArgumentMappingContext::IsSelectedRowsOutput( + const std::string& name) const { + return false; +} +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h 
b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h new file mode 100644 index 00000000000000..b84c9fc915db01 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/phi/core/compat/arg_map_context.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +class PluginArgumentMappingContext : public ::phi::ArgumentMappingContext { + public: + explicit PluginArgumentMappingContext(framework::OpDesc* op_desc_ptr) + : op_desc_ptr_(op_desc_ptr) {} + + bool HasInput(const std::string& name) const override; + + bool HasOutput(const std::string& name) const override; + + bool HasAttr(const std::string& name) const override; + + paddle::any Attr(const std::string& attr_name) const override; + + size_t InputSize(const std::string& name) const override; + + size_t OutputSize(const std::string& name) const override; + + bool IsDenseTensorInput(const std::string& name) const override; + + bool IsDenseTensorInputs(const std::string& name) const override; + + bool IsSelectedRowsInput(const std::string& name) const override; + + bool IsSparseCooTensorInput(const std::string& name) const override; + + bool IsSparseCsrTensorInput(const std::string& name) const override; + + bool IsSelectedRowsInputs(const std::string& name) const override; + + bool IsDenseTensorVectorInput(const std::string& name) const override; + + bool IsDenseTensorOutput(const std::string& name) const override; + + bool IsSelectedRowsOutput(const std::string& name) const override; + + bool IsForInferShape() const override { return false; } + + private: + framework::OpDesc* op_desc_ptr_; +}; +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc b/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc new file mode 100644 index 00000000000000..75716a91f574f7 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/test_arg_mapping_context.cc @@ -0,0 +1,132 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
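The new PluginArgumentMappingContext (declaration above, implementation earlier in this diff) adapts a framework::OpDesc to phi's ArgumentMappingContext interface so the TensorRT side can reuse phi argument-mapping functions. A usage sketch modelled on the unit test that follows; the op proto, the input slot "X" and the attribute "axis" are placeholders:

    #include "paddle/fluid/framework/op_desc.h"
    #include "paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h"

    void QueryOp(const paddle::framework::proto::OpDesc& op_proto) {
      // Wrap the proto in an OpDesc, then query it through the phi interface.
      paddle::framework::OpDesc op_desc(op_proto, /*block=*/nullptr);
      paddle::inference::tensorrt::PluginArgumentMappingContext ctx(&op_desc);

      bool has_x = ctx.HasInput("X");  // true only if the slot has arguments
      if (ctx.HasAttr("axis")) {
        // Attr() returns paddle::any; cast it as the test below does.
        int axis = paddle::any_cast<int>(ctx.Attr("axis"));
        (void)axis;
      }
      (void)has_x;
    }
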
*/ + +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/inference/tensorrt/plugin_arg_mapping_context.h" + +namespace paddle { +namespace inference { +namespace tensorrt { + +TEST(ArgMappingContexTest, BasicFunction) { + paddle::framework::proto::OpDesc op; + op.set_type("imaged_op"); + auto *input_var = op.add_inputs(); + input_var->set_parameter("X"); + *input_var->add_arguments() = "input"; + + auto *output_var = op.add_outputs(); + output_var->set_parameter("Out"); + *output_var->add_arguments() = "output"; + + auto *attr = op.add_attrs(); + attr->set_name("int_attr"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(1); + + attr = op.add_attrs(); + attr->set_name("float_attr"); + attr->set_type(paddle::framework::proto::AttrType::FLOAT); + attr->set_f(1.0); + + attr = op.add_attrs(); + attr->set_name("string_attr"); + attr->set_type(paddle::framework::proto::AttrType::STRING); + attr->set_s("1"); + + attr = op.add_attrs(); + attr->set_name("bool_attr"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEAN); + attr->set_b(true); + + attr = op.add_attrs(); + attr->set_name("ints_attr"); + attr->set_type(paddle::framework::proto::AttrType::INTS); + attr->add_ints(1); + attr->add_ints(2); + + attr = op.add_attrs(); + attr->set_name("floats_attr"); + attr->set_type(paddle::framework::proto::AttrType::FLOATS); + attr->add_floats(1.0); + attr->add_floats(2.0); + + attr = op.add_attrs(); + attr->set_name("strings_attr"); + attr->set_type(paddle::framework::proto::AttrType::STRINGS); + attr->add_strings("1"); + attr->add_strings("2"); + + attr = op.add_attrs(); + attr->set_name("bools_attr"); + attr->set_type(paddle::framework::proto::AttrType::BOOLEANS); + attr->add_bools(true); + attr->add_bools(true); + + framework::OpDesc op_desc(op, nullptr); + PluginArgumentMappingContext context(&op_desc); + + EXPECT_EQ(context.HasInput("X"), true); + EXPECT_EQ(context.HasOutput("Out"), true); + EXPECT_EQ(context.HasAttr("int_attr"), true); + + int int_attr = any_cast(context.Attr("int_attr")); + EXPECT_EQ(int_attr, 1); + + float flaot_attr = any_cast(context.Attr("float_attr")); + EXPECT_EQ(flaot_attr, 1); + + std::string string_attr = any_cast(context.Attr("string_attr")); + EXPECT_EQ(string_attr, "1"); + + bool bool_attr = any_cast(context.Attr("bool_attr")); + EXPECT_EQ(bool_attr, true); + + std::vector ints_attr = + any_cast>(context.Attr("ints_attr")); + EXPECT_EQ(ints_attr[0], 1); + EXPECT_EQ(ints_attr[1], 2); + + std::vector floats_attr = + any_cast>(context.Attr("floats_attr")); + EXPECT_EQ(floats_attr[0], 1.0); + EXPECT_EQ(floats_attr[1], 2.0); + + std::vector strings_attr = + any_cast>(context.Attr("strings_attr")); + EXPECT_EQ(strings_attr[0], "1"); + EXPECT_EQ(strings_attr[1], "2"); + + std::vector bools_attr = + any_cast>(context.Attr("bools_attr")); + EXPECT_EQ(bools_attr[0], true); + EXPECT_EQ(bools_attr[1], true); + + EXPECT_EQ(context.InputSize("X"), true); + EXPECT_EQ(context.OutputSize("Out"), true); + EXPECT_EQ(context.IsDenseTensorInput("X"), false); + EXPECT_EQ(context.IsDenseTensorInputs("X"), false); + EXPECT_EQ(context.IsSelectedRowsInput("X"), false); + EXPECT_EQ(context.IsDenseTensorVectorInput("X"), false); + + EXPECT_EQ(context.IsDenseTensorOutput("Out"), false); + EXPECT_EQ(context.IsSelectedRowsOutput("Out"), false); + EXPECT_EQ(context.IsForInferShape(), false); +} + +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc 
b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc index 6ac23e32856bec..fc93844e93a793 100644 --- a/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc +++ b/paddle/fluid/inference/tensorrt/test_dynamic_engine.cc @@ -31,6 +31,137 @@ namespace paddle { namespace inference { namespace tensorrt { +class TensorRTDynamicShapeValueEngineTest : public ::testing::Test { + protected: + void SetUp() override { + ctx_ = new phi::GPUContext(platform::CUDAPlace(0)); + ctx_->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(platform::CUDAPlace(0), ctx_->stream()) + .get()); + ctx_->SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + ctx_->SetZeroAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetZeroAllocator(platform::CUDAPlace(0)) + .get()); + ctx_->SetPinnedAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CUDAPinnedPlace()) + .get()); + ctx_->PartialInitWithAllocator(); + + std::map> min_input_shape = { + {"input", {1, 32}}}; + std::map> max_input_shape = { + {"input", {18, 32}}}; + std::map> optim_input_shape = { + {"input", {18, 32}}}; + std::map> min_input_value = { + {"shape", {1, 8, 4}}}; + std::map> max_input_value = { + {"shape", {18, 8, 4}}}; + std::map> optim_input_value = { + {"shape", {18, 8, 4}}}; + engine_ = new TensorRTEngine(16, + 1 << 10, + AnalysisConfig::Precision::kFloat32, + nullptr, + 0, + min_input_shape, + max_input_shape, + optim_input_shape, + min_input_value, + max_input_value, + optim_input_value, + false, + phi::DataType::FLOAT32, + NaiveLogger::Global()); + engine_->InitNetwork(); + } + + void TearDown() override { + if (engine_) { + delete engine_; + engine_ = nullptr; + } + } + + void PrepareInputOutput(const std::vector &input, + std::vector output_shape) { + paddle::framework::TensorFromVector(input, *ctx_, &input_); + output_.Resize(phi::make_ddim(output_shape)); + } + void PrepareShapeInput(const std::vector &input) { + paddle::framework::TensorFromVector(input, *ctx_, &shape_); + } + void GetOutput(std::vector *output) { + paddle::framework::TensorToVector(output_, *ctx_, output); + } + + protected: + framework::LoDTensor input_; + framework::LoDTensor shape_; + framework::LoDTensor output_; + TensorRTEngine *engine_; + phi::GPUContext *ctx_; +}; + +TEST_F(TensorRTDynamicShapeValueEngineTest, test_trt_dynamic_shape_value) { + std::vector buffers(3); + std::cout << "with_dynamic_shape: " << engine_->with_dynamic_shape() + << std::endl; + auto *x = engine_->DeclareInput( + "input", nvinfer1::DataType::kFLOAT, nvinfer1::Dims2{-1, 32}); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = 3; + auto *shape = + engine_->DeclareInput("shape", nvinfer1::DataType::kINT32, shape_dim); + auto layer = engine_->network()->addShuffle(*x); + layer->setInput(1, *shape); + PADDLE_ENFORCE_NOT_NULL( + layer, + platform::errors::InvalidArgument("TRT shuffle layer building failed.")); + engine_->DeclareOutput(layer, 0, "y"); + engine_->FreezeNetwork(); + ASSERT_EQ(engine_->engine()->getNbBindings(), 3); + + std::vector x_v(8 * 32); + for (int i = 0; i < 8 * 32; i++) { + x_v[i] = i % (8 * 32); + } + + std::vector shape_v = {8, 8, 4}; + PrepareInputOutput(x_v, {8, 8, 4}); + PrepareShapeInput(shape_v); + engine_->context()->setBindingDimensions(0, nvinfer1::Dims2{8, 32}); + engine_->context()->setBindingDimensions(1, shape_dim); + 
engine_->context()->setInputShapeBinding(1, shape_v.data()); + + auto *x_gpu_data = input_.mutable_data(ctx_->GetPlace()); + auto *shape_gpu_data = shape_.mutable_data(ctx_->GetPlace()); + auto *y_gpu_data = output_.mutable_data(ctx_->GetPlace()); + + buffers[0] = reinterpret_cast(x_gpu_data); + buffers[1] = reinterpret_cast(shape_gpu_data); + buffers[2] = reinterpret_cast(y_gpu_data); + + engine_->Execute(-1, &buffers, ctx_->stream()); + cudaStreamSynchronize(ctx_->stream()); + std::vector y_cpu; + GetOutput(&y_cpu); + ASSERT_EQ(y_cpu[0], 0); + ASSERT_EQ(y_cpu[1], 1); + auto dims = engine_->context()->getBindingDimensions(2); + ASSERT_EQ(dims.nbDims, 3); + ASSERT_EQ(dims.d[0], 8); + ASSERT_EQ(dims.d[1], 8); + ASSERT_EQ(dims.d[2], 4); + return; +} + class TensorRTDynamicEngineTest : public ::testing::Test { protected: void SetUp() override { @@ -67,6 +198,9 @@ class TensorRTDynamicEngineTest : public ::testing::Test { min_input_shape, max_input_shape, optim_input_shape, + std::map>(), + std::map>(), + std::map>(), false, phi::DataType::FLOAT32, NaiveLogger::Global()); @@ -241,6 +375,9 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { min_input_shape, max_input_shape, optim_input_shape, + std::map>(), + std::map>(), + std::map>(), false, phi::DataType::FLOAT32, NaiveLogger::Global()); @@ -284,6 +421,7 @@ class TensorRTDynamicTestFusedTokenPrune : public ::testing::Test { TEST_F(TensorRTDynamicTestFusedTokenPrune, test_fused_token_prune) { #if IS_TRT_VERSION_GE(8000) + tensorrt::plugin::TrtPluginRegistry::Global()->RegistToTrt(); auto *attn = engine_->DeclareInput( "attn", nvinfer1::DataType::kHALF, nvinfer1::Dims4{-1, 1, 4, 4}); auto *x = engine_->DeclareInput( diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 77831167ddd5df..f8650ef366e156 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -416,6 +416,9 @@ download_result(${ERNIE_INSTALL_DIR} "Ernie_result.txt.tar.gz" if(WITH_GPU) inference_analysis_api_test(test_analyzer_ernie ${ERNIE_INSTALL_DIR} analyzer_ernie_tester.cc) + inference_analysis_api_test(gpu_ernie_half_test ${ERNIE_INSTALL_DIR} + gpu_ernie_half_test.cc) + set_tests_properties(gpu_ernie_half_test PROPERTIES TIMEOUT 60) endif() inference_analysis_api_int8_test(test_analyzer_ernie_int8 ${ERNIE_INSTALL_DIR} analyzer_ernie_int8_tester.cc) diff --git a/paddle/fluid/inference/tests/api/gpu_ernie_half_test.cc b/paddle/fluid/inference/tests/api/gpu_ernie_half_test.cc new file mode 100644 index 00000000000000..6b83e89a4447d3 --- /dev/null +++ b/paddle/fluid/inference/tests/api/gpu_ernie_half_test.cc @@ -0,0 +1,290 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
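The TensorRTDynamicShapeValueEngineTest above exercises a shape tensor: the engine constructor now accepts min/max/opt input values in addition to min/max/opt input shapes, and at run time the test provides both the binding dimensions and the shape tensor's host-side contents. Condensed, the two calls involved are (binding indices and dims as in the test, using the TensorRT binding API the test relies on):

    // Execution tensor: only its dimensions need to be set before enqueue.
    context->setBindingDimensions(/*bindingIndex=*/0, nvinfer1::Dims2{8, 32});

    // Shape tensor: its values feed downstream shape computation, so they are
    // supplied from host memory in addition to its (1-D) dimensions.
    std::vector<int32_t> shape_values = {8, 8, 4};
    context->setBindingDimensions(/*bindingIndex=*/1, shape_dims);  // shape_dims: nbDims = 1, d[0] = 3
    context->setInputShapeBinding(/*bindingIndex=*/1, shape_values.data());
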
+ +#include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" + +namespace paddle { +namespace inference { + +using paddle::PaddleTensor; + +template +void GetValueFromStream(std::stringstream *ss, T *t) { + (*ss) >> (*t); +} + +template <> +void GetValueFromStream(std::stringstream *ss, std::string *t) { + *t = ss->str(); +} + +// Split string to vector +template +void Split(const std::string &line, char sep, std::vector *v) { + std::stringstream ss; + T t; + for (auto c : line) { + if (c != sep) { + ss << c; + } else { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } + } + + if (!ss.str().empty()) { + GetValueFromStream(&ss, &t); + v->push_back(std::move(t)); + ss.str({}); + ss.clear(); + } +} + +// Parse tensor from string +template +bool ParseTensor(const std::string &field, paddle::PaddleTensor *tensor) { + std::vector data; + Split(field, ':', &data); + if (data.size() < 2) return false; + + std::string shape_str = data[0]; + + std::vector shape; + Split(shape_str, ' ', &shape); + + std::string mat_str = data[1]; + + std::vector mat; + Split(mat_str, ' ', &mat); + + tensor->shape = shape; + auto size = + std::accumulate(shape.begin(), shape.end(), 1, std::multiplies()) * + sizeof(T); + tensor->data.Resize(size); + std::copy(mat.begin(), mat.end(), static_cast(tensor->data.data())); + tensor->dtype = GetPaddleDType(); + + return true; +} + +// Parse input tensors from string +bool ParseLine(const std::string &line, + std::vector *tensors) { + std::vector fields; + Split(line, ';', &fields); + + tensors->clear(); + tensors->reserve(4); + + int i = 0; + auto input_name = FLAGS_ernie_large ? "eval_placeholder_" : "placeholder_"; + for (; i < 3; i++) { + paddle::PaddleTensor temp; + ParseTensor(fields[i], &temp); + temp.name = input_name + std::to_string(i); + tensors->push_back(temp); + } + + // input_mask + paddle::PaddleTensor input_mask; + ParseTensor(fields[i], &input_mask); + input_mask.name = input_name + std::to_string(i); + tensors->push_back(input_mask); + + return true; +} + +bool LoadInputData(std::vector> *inputs, + int batch_size = 1) { + if (FLAGS_infer_data.empty()) { + LOG(ERROR) << "please set input data path"; + return false; + } + + std::ifstream fin(FLAGS_infer_data); + std::string line; + int sample = 0; + + // The unit-test dataset only have 10 samples, each sample have 5 feeds. 
+ while (std::getline(fin, line)) { + std::vector feed_data; + ParseLine(line, &feed_data); + inputs->push_back(std::move(feed_data)); + sample++; + if (!FLAGS_test_all_data && sample == batch_size) break; + } + LOG(INFO) << "number of samples: " << sample; + return true; +} + +// Compare results +TEST(Ernie_gpu_fp16_no_ir, compare_results) { + AnalysisConfig config; + config.SetModel(FLAGS_infer_model); + config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kHalf); + config.SwitchIrOptim(false); + + auto predictor = CreatePaddlePredictor(config); + + std::vector> input_slots_all; + LoadInputData(&input_slots_all); + + std::ifstream fin(FLAGS_refer_result); + std::string line; + std::vector ref; + + while (std::getline(fin, line)) { + Split(line, ' ', &ref); + } + + std::vector outputs; + for (size_t i = 0; i < input_slots_all.size(); i++) { + outputs.clear(); + predictor->Run(input_slots_all[i], &outputs); + + auto output = outputs.front(); + size_t outputs_size = 1; + for (auto dim : output.shape) { + outputs_size *= dim; + } + float *result = reinterpret_cast(output.data.data()); + for (size_t j = 0; j < outputs_size; ++j) { + EXPECT_NEAR(ref[i * outputs_size + j], result[j], 8e-3); + } + } +} + +// Compare results +TEST(Ernie_gpu_fp16_with_ir, compare_results) { + AnalysisConfig config; + config.SetModel(FLAGS_infer_model); + config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kHalf); + config.SwitchIrOptim(true); + // There is a problem with the model itself, which has nothing to do with + // constant_folding_pass. + config.pass_builder()->DeletePass("constant_folding_pass"); + + auto predictor = CreatePaddlePredictor(config); + + std::vector> input_slots_all; + LoadInputData(&input_slots_all); + + std::ifstream fin(FLAGS_refer_result); + std::string line; + std::vector ref; + + while (std::getline(fin, line)) { + Split(line, ' ', &ref); + } + + std::vector outputs; + for (size_t i = 0; i < input_slots_all.size(); i++) { + outputs.clear(); + predictor->Run(input_slots_all[i], &outputs); + + auto output = outputs.front(); + size_t outputs_size = 1; + for (auto dim : output.shape) { + outputs_size *= dim; + } + float *result = reinterpret_cast(output.data.data()); + for (size_t j = 0; j < outputs_size; ++j) { + EXPECT_NEAR(ref[i * outputs_size + j], result[j], 2e-2); + } + } +} + +// Compare results +TEST(Ernie_gpu_bf16_no_ir, compare_results) { + AnalysisConfig config; + config.SetModel(FLAGS_infer_model); + config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kBf16); + config.SwitchIrOptim(false); + + auto predictor = CreatePaddlePredictor(config); + + std::vector> input_slots_all; + LoadInputData(&input_slots_all); + + std::ifstream fin(FLAGS_refer_result); + std::string line; + std::vector ref; + + while (std::getline(fin, line)) { + Split(line, ' ', &ref); + } + + std::vector outputs; + for (size_t i = 0; i < input_slots_all.size(); i++) { + outputs.clear(); + predictor->Run(input_slots_all[i], &outputs); + + auto output = outputs.front(); + size_t outputs_size = 1; + for (auto dim : output.shape) { + outputs_size *= dim; + } + float *result = reinterpret_cast(output.data.data()); + for (size_t j = 0; j < outputs_size; ++j) { + EXPECT_NEAR(ref[i * outputs_size + j], result[j], 1e-2); + } + } +} + +// Compare results +TEST(Ernie_gpu_bf16_with_ir, compare_results) { + AnalysisConfig config; + config.SetModel(FLAGS_infer_model); + config.EnableUseGpu(512, 0, paddle_infer::PrecisionType::kBf16); + config.SwitchIrOptim(true); + // There is a problem with the model itself, 
which has nothing to do with + // constant_folding_pass. + config.pass_builder()->DeletePass("constant_folding_pass"); + + auto predictor = CreatePaddlePredictor(config); + + std::vector> input_slots_all; + LoadInputData(&input_slots_all); + + std::ifstream fin(FLAGS_refer_result); + std::string line; + std::vector ref; + + while (std::getline(fin, line)) { + Split(line, ' ', &ref); + } + + std::vector outputs; + for (size_t i = 0; i < input_slots_all.size(); i++) { + outputs.clear(); + predictor->Run(input_slots_all[i], &outputs); + + auto output = outputs.front(); + size_t outputs_size = 1; + for (auto dim : output.shape) { + outputs_size *= dim; + } + float *result = reinterpret_cast(output.data.data()); + for (size_t j = 0; j < outputs_size; ++j) { + EXPECT_NEAR(ref[i * outputs_size + j], result[j], 5e-3); + } + } +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc index 8cff649b97092a..9029cefc9a424f 100644 --- a/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc +++ b/paddle/fluid/inference/tests/api/paddle_infer_api_test.cc @@ -1,4 +1,4 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,15 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include -#include -#include - #include "gflags/gflags.h" -#include "paddle/fluid/inference/tests/api/trt_test_helper.h" +#include "paddle/fluid/inference/tests/api/tester_helper.h" namespace paddle_infer { diff --git a/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc b/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc index fa01d7540228a2..a075192a58054b 100644 --- a/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc +++ b/paddle/fluid/inference/tests/infer_ut/test_ppyolo_mbv3.cc @@ -73,7 +73,7 @@ TEST(tensorrt_tester_ppyolo_mbv3, multi_thread4_trt_fp32_bz2) { FLAGS_modeldir + "/model.pdiparams"); config.EnableUseGpu(100, 0); config.EnableTensorRtEngine( - 1 << 20, 2, 3, paddle_infer::PrecisionType::kFloat32, false, false); + 1 << 25, 2, 3, paddle_infer::PrecisionType::kFloat32, false, false); LOG(INFO) << config.Summary(); // get groudtruth by disbale ir paddle_infer::services::PredictorPool pred_pool_no_ir(config_no_ir, 1); diff --git a/paddle/fluid/inference/utils/io_utils.cc b/paddle/fluid/inference/utils/io_utils.cc index e45e3a1035fe5d..ef342a4eefce3d 100644 --- a/paddle/fluid/inference/utils/io_utils.cc +++ b/paddle/fluid/inference/utils/io_utils.cc @@ -182,7 +182,10 @@ void SerializeShapeRangeInfo( const std::string &path, const std::map> &min_shape, const std::map> &max_shape, - const std::map> &opt_shape) { + const std::map> &opt_shape, + const std::map> &min_value, + const std::map> &max_value, + const std::map> &opt_value) { paddle::inference::proto::ShapeRangeInfos shape_range_infos; for (auto it : min_shape) { auto *s = shape_range_infos.add_shape_range_info(); @@ -192,10 +195,18 @@ void SerializeShapeRangeInfo( s->add_max_shape(max_shape.at(it.first)[i]); s->add_opt_shape(opt_shape.at(it.first)[i]); } + // If it.first is a shape tensor, we should collect values from it. 
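That is, for inputs whose runtime values (not just their dimensions) determine downstream shapes, the serialized ShapeRangeInfo record now carries min/max/opt values alongside min/max/opt shapes, and SerializeShapeRangeInfo / DeserializeShapeRangeInfo grow three extra map parameters. A call sketch mirroring the updated io_utils test (names and numbers are illustrative; `path` is any writable file path):

    std::map<std::string, std::vector<int>> min_shape{{"shape", {3}}},
        max_shape{{"shape", {3}}}, opt_shape{{"shape", {3}}};
    // Value ranges are only meaningful for shape-tensor inputs.
    std::map<std::string, std::vector<int>> min_value{{"shape", {1, 8, 4}}},
        max_value{{"shape", {18, 8, 4}}}, opt_value{{"shape", {18, 8, 4}}};

    paddle::inference::SerializeShapeRangeInfo(
        path, min_shape, max_shape, opt_shape, min_value, max_value, opt_value);
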
+ if (min_value.count(it.first)) { + for (size_t i = 0; i < min_value.at(it.first).size(); ++i) { + s->add_min_value(min_value.at(it.first)[i]); + s->add_max_value(max_value.at(it.first)[i]); + s->add_opt_value(opt_value.at(it.first)[i]); + } + } } - inference::SerializeShapeRangeInfo(path, shape_range_infos); } + void DeserializeShapeRangeInfo( const std::string &path, paddle::inference::proto::ShapeRangeInfos *info) { int fd = open(path.c_str(), O_RDONLY); @@ -213,7 +224,10 @@ void DeserializeShapeRangeInfo( const std::string &path, std::map> *min_shape, std::map> *max_shape, - std::map> *opt_shape) { + std::map> *opt_shape, + std::map> *min_value, + std::map> *max_value, + std::map> *opt_value) { paddle::inference::proto::ShapeRangeInfos shape_range_infos; DeserializeShapeRangeInfo(path, &shape_range_infos); for (int i = 0; i < shape_range_infos.shape_range_info_size(); ++i) { @@ -236,6 +250,26 @@ void DeserializeShapeRangeInfo( opt_shape->insert(std::make_pair(name, tmp)); } } + for (int i = 0; i < shape_range_infos.shape_range_info_size(); ++i) { + auto info = shape_range_infos.shape_range_info(i); + auto name = info.name(); + if (min_value->count(name) || max_value->count(name) || + opt_value->count(name)) { + continue; + } else { + std::vector tmp(info.min_value_size()); + for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.min_value(k); + min_value->insert(std::make_pair(name, tmp)); + + tmp.resize(info.max_value_size()); + for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.max_value(k); + max_value->insert(std::make_pair(name, tmp)); + + tmp.resize(info.opt_value_size()); + for (size_t k = 0; k < tmp.size(); ++k) tmp[k] = info.opt_value(k); + opt_value->insert(std::make_pair(name, tmp)); + } + } } void UpdateShapeRangeInfo( @@ -264,6 +298,7 @@ void UpdateShapeRangeInfo( } } } + inference::SerializeShapeRangeInfo(path, shape_range_infos); } diff --git a/paddle/fluid/inference/utils/io_utils.h b/paddle/fluid/inference/utils/io_utils.h index 682bbdef05edcc..64d6b3be4d94cb 100644 --- a/paddle/fluid/inference/utils/io_utils.h +++ b/paddle/fluid/inference/utils/io_utils.h @@ -42,23 +42,22 @@ void SerializePDTensorsToFile(const std::string& path, const std::vector& tensors); void DeserializePDTensorsToFile(const std::string& path, std::vector* tensors); - -void SerializeShapeRangeInfo( - const std::string& path, - const paddle::inference::proto::ShapeRangeInfos& info); void SerializeShapeRangeInfo( const std::string& path, const std::map>& min_shape, const std::map>& max_shape, - const std::map>& opt_shape); -void DeserializeShapeRangeInfo(const std::string& path, - paddle::inference::proto::ShapeRangeInfos* info); + const std::map>& opt_shape, + const std::map>& min_value, + const std::map>& max_value, + const std::map>& opt_value); void DeserializeShapeRangeInfo( const std::string& path, std::map>* min_shape, std::map>* max_shape, - std::map>* opt_shape); - + std::map>* opt_shape, + std::map>* min_value, + std::map>* max_value, + std::map>* opt_value); void UpdateShapeRangeInfo( const std::string& path, const std::map>& min_shape, diff --git a/paddle/fluid/inference/utils/io_utils_tester.cc b/paddle/fluid/inference/utils/io_utils_tester.cc index 7707140fb9762e..812c22aa67d2ee 100644 --- a/paddle/fluid/inference/utils/io_utils_tester.cc +++ b/paddle/fluid/inference/utils/io_utils_tester.cc @@ -100,28 +100,48 @@ TEST(infer_io_utils, tensors) { TEST(shape_info_io, read_and_write) { const std::string path = "test_shape_info_io"; std::map> min_shape, max_shape, opt_shape; + std::map> 
min_value, max_value, opt_value; min_shape.insert( std::make_pair("test1", std::vector{1, 3, 112, 112})); max_shape.insert( std::make_pair("test1", std::vector{1, 3, 224, 224})); opt_shape.insert( std::make_pair("test1", std::vector{1, 3, 224, 224})); + min_value.insert( + std::make_pair("test1", std::vector{1, 3, 112, 112})); + max_value.insert( + std::make_pair("test1", std::vector{1, 3, 224, 224})); + opt_value.insert( + std::make_pair("test1", std::vector{1, 3, 224, 224})); paddle::inference::SerializeShapeRangeInfo( - path, min_shape, max_shape, opt_shape); + path, min_shape, max_shape, opt_shape, min_value, max_value, opt_value); min_shape.clear(); max_shape.clear(); opt_shape.clear(); + min_value.clear(); + max_value.clear(); + opt_value.clear(); opt_shape.insert( std::make_pair("test2", std::vector{1, 3, 224, 224})); - paddle::inference::DeserializeShapeRangeInfo( - path, &min_shape, &max_shape, &opt_shape); + paddle::inference::DeserializeShapeRangeInfo(path, + &min_shape, + &max_shape, + &opt_shape, + &min_value, + &max_value, + &opt_value); min_shape.insert(std::make_pair("test1", std::vector{1, 3, 56, 56})); std::vector names{"test1"}; paddle::inference::UpdateShapeRangeInfo( path, min_shape, max_shape, opt_shape, names); - ASSERT_THROW(paddle::inference::DeserializeShapeRangeInfo( - "no_exists_file", &min_shape, &max_shape, &opt_shape); + ASSERT_THROW(paddle::inference::DeserializeShapeRangeInfo("no_exists_file", + &min_shape, + &max_shape, + &opt_shape, + &min_value, + &max_value, + &opt_value); , paddle::platform::EnforceNotMet); } diff --git a/paddle/fluid/inference/utils/shape_range_info.proto b/paddle/fluid/inference/utils/shape_range_info.proto index fcb2d635b52261..53f018cb593489 100644 --- a/paddle/fluid/inference/utils/shape_range_info.proto +++ b/paddle/fluid/inference/utils/shape_range_info.proto @@ -23,6 +23,9 @@ message ShapeRangeInfos { repeated int32 min_shape = 2; repeated int32 max_shape = 3; repeated int32 opt_shape = 4; + repeated int32 min_value = 5; + repeated int32 max_value = 6; + repeated int32 opt_value = 7; } repeated ShapeRangeInfo shape_range_info = 1; diff --git a/paddle/fluid/jit/CMakeLists.txt b/paddle/fluid/jit/CMakeLists.txt index 565bd670b98bf1..c1a41fe595d954 100644 --- a/paddle/fluid/jit/CMakeLists.txt +++ b/paddle/fluid/jit/CMakeLists.txt @@ -34,7 +34,8 @@ cc_library( cc_library( jit_function SRCS function.cc - DEPS jit_function_utils jit_executor_engine jit_pe_engine) + DEPS jit_function_utils jit_executor_engine jit_pe_engine + jit_predictor_engine) cc_library( jit_layer @@ -46,6 +47,7 @@ cc_library( jit_function_schema jit_executor_engine jit_pe_engine + jit_predictor_engine jit_function) if(WITH_TESTING AND NOT WIN32) diff --git a/paddle/fluid/jit/engine/CMakeLists.txt b/paddle/fluid/jit/engine/CMakeLists.txt index 92a1f9582c931f..949a89a595334d 100644 --- a/paddle/fluid/jit/engine/CMakeLists.txt +++ b/paddle/fluid/jit/engine/CMakeLists.txt @@ -7,3 +7,8 @@ cc_library( jit_pe_engine SRCS pe_engine.cc DEPS parallel_executor) + +cc_library( + jit_predictor_engine + SRCS predictor_engine.cc + DEPS paddle_inference_api analysis_predictor) diff --git a/paddle/fluid/jit/engine/executor_engine.cc b/paddle/fluid/jit/engine/executor_engine.cc index 58d80426e5fbae..1cde715b8f0301 100644 --- a/paddle/fluid/jit/engine/executor_engine.cc +++ b/paddle/fluid/jit/engine/executor_engine.cc @@ -44,14 +44,17 @@ std::vector ExecutorEngine::operator()( std::vector ExecutorEngine::operator()( const std::vector &inputs) { 
utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_); + const auto out_names = info_->OutputArgNames(); inner_exe_.Run(info_->ProgramDesc(), &scope_, /*blockID=*/0, false, true, - info_->OutputArgNames()); + out_names); std::vector outputs; - utils::FetchOuts(info_->OutputArgNames(), scope_, &outputs); + utils::FetchOuts(out_names, scope_, &outputs); + // Erase output vars to avoid data rewriting. + scope_.EraseVars(out_names); return outputs; } diff --git a/paddle/fluid/jit/engine/pe_engine.cc b/paddle/fluid/jit/engine/pe_engine.cc index 2d35a8792ef704..576687c0efaf1a 100644 --- a/paddle/fluid/jit/engine/pe_engine.cc +++ b/paddle/fluid/jit/engine/pe_engine.cc @@ -85,7 +85,6 @@ void PEEngine::CreateGraphAndPE() { graph_ = std::make_shared(program_desc, start_op_index, end_op_index); inner_pe_ = std::make_shared( place_, &scope_, execution_strategy, build_strategy, graph_.get()); - inner_pe_->PrepareVariables(&scope_); inner_pe_->SkipMemoryReuse(/*scope_idx=*/0, info_->InputArgNames()); } @@ -97,18 +96,15 @@ std::vector PEEngine::operator()(const std::vector &inputs) { std::vector PEEngine::operator()( const std::vector &inputs) { utils::ShareIntoScope(info_->InputArgNames(), inputs, &scope_); - - // update op_handle scope_map in pe->executor_->Graph - std::unordered_map scope_map = { - {inner_pe_->GetLocalScopes().front(), &scope_}}; - inner_pe_->ResetOpHandleScopeMapOfGraphs(scope_map); + const auto out_names = info_->OutputArgNames(); // need to recreate tmp variables in new scope inner_pe_->PrepareVariables(&scope_); - - inner_pe_->RunWithoutFetch(info_->OutputArgNames()); + inner_pe_->RunWithoutFetch(out_names); std::vector outputs; - utils::FetchOuts(info_->OutputArgNames(), scope_, &outputs); + utils::FetchOuts(out_names, scope_, &outputs); + // Erase output vars to avoid data rewriting. + scope_.EraseVars(out_names); scope_.DropKids(); return outputs; } diff --git a/paddle/fluid/jit/engine/predictor_engine.cc b/paddle/fluid/jit/engine/predictor_engine.cc new file mode 100644 index 00000000000000..6a44c192c16f72 --- /dev/null +++ b/paddle/fluid/jit/engine/predictor_engine.cc @@ -0,0 +1,192 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
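This diff also adds a third JIT backend: the new PredictorEngine (implementation follows) wraps an AnalysisPredictor so a jit::Layer function can run through the inference stack, converting between DenseTensor and PaddleTensor around Run(). The serializer.cc hunk later in the diff selects it via the existing FLAGS_jit_engine_type switch; a dispatch sketch with the template arguments spelled out, reconstructed from the engine names each branch refers to and assuming the usual gflags plumbing (e.g. --jit_engine_type=Predictor) to set the flag:

    // Per-function dispatch in the deserializer: choose the engine backing a jit::Layer.
    if (FLAGS_jit_engine_type == "Executor") {
      layer.SetEngine(func_name, utils::MakeEngine<ExecutorEngine>(info, params_dict, place));
    } else if (FLAGS_jit_engine_type == "PE") {
      layer.SetEngine(func_name, utils::MakeEngine<PEEngine>(info, params_dict, place));
    } else if (FLAGS_jit_engine_type == "Predictor") {
      layer.SetEngine(func_name, utils::MakeEngine<PredictorEngine>(info, params_dict, place));
    } else {
      PD_THROW("Invalid JitLayer engine type.");
    }
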
+ +#include "paddle/fluid/jit/engine/predictor_engine.h" + +#include "paddle/fluid/inference/api/analysis_predictor.h" +#include "paddle/fluid/inference/api/paddle_api.h" +#include "paddle/fluid/jit/function_utils.h" +#include "paddle/fluid/platform/device_context.h" + +namespace paddle { +namespace jit { + +static PaddleTensor DenseTensorToPaddleTensor(DenseTensor *t); +static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, + DenseTensor *t, + const platform::Place &place); + +PredictorEngine::PredictorEngine(const std::shared_ptr &info, + const VariableMap ¶ms_dict, + const phi::Place &place) + : info_(info), scope_(new framework::Scope()), place_(place) { + utils::ShareParamsIntoScope(info_->ParamNames(), params_dict, scope_.get()); + VLOG(6) << framework::GenScopeTreeDebugInfo(scope_.get()); + + // TODO(Aurelius84): Expose AnalysisConfig to user. + AnalysisConfig config; + config.SetProgFile(info->ProgramFilePath()); + if (platform::is_gpu_place(place_)) { + config.EnableUseGpu(100, place_.GetDeviceId()); + } else if (platform::is_cpu_place(place_)) { + config.DisableGpu(); + config.EnableMKLDNN(); + config.EnableMkldnnInt8(); + config.SetMkldnnCacheCapacity(0); + } + config.SetSkipLoadParams(true); + config.SetApplyOptim(true); + config.SwitchIrOptim(true); + + predictor_.reset(new AnalysisPredictor(config)); + + predictor_->Init( + scope_, std::make_shared(info_->ProgramDesc())); +} + +std::vector PredictorEngine::operator()( + const std::vector &inputs) { + auto dense_tensors = utils::ToDenseTensors(inputs); + return utils::ToTensors(this->operator()(dense_tensors)); +} + +std::vector PredictorEngine::operator()( + const std::vector &inputs) { + std::vector pt_inputs; + std::vector pt_outputs; + for (auto &t : inputs) { + auto non_const_t = const_cast(&t); + pt_inputs.emplace_back(DenseTensorToPaddleTensor(non_const_t)); + } + + predictor_->Run(pt_inputs, &pt_outputs); + + std::vector outputs; + for (auto &pt : pt_outputs) { + DenseTensor t; + PaddleTensorToDenseTensor(pt, &t, place_); + outputs.emplace_back(t); + } + + return outputs; +} + +static PaddleTensor DenseTensorToPaddleTensor(DenseTensor *t) { + PaddleTensor pt; + switch (framework::TransToProtoVarType(t->dtype())) { + case framework::proto::VarType::INT32: { + pt.data.Reset(t->data(), t->numel() * sizeof(int32_t)); + pt.dtype = PaddleDType::INT32; + } break; + case framework::proto::VarType::INT64: { + pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); + pt.dtype = PaddleDType::INT64; + } break; + case framework::proto::VarType::FP32: { + pt.data.Reset(t->data(), t->numel() * sizeof(float)); + pt.dtype = PaddleDType::FLOAT32; + } break; + default: + PADDLE_THROW( + platform::errors::Unimplemented("Unsupported tensor date type. 
Now " + "only supports INT64, FP32, INT32.")); + } + pt.shape = phi::vectorize(t->dims()); + return pt; +} + +static bool PaddleTensorToDenseTensor(const PaddleTensor &pt, + DenseTensor *t, + const platform::Place &place) { + framework::DDim ddim = phi::make_ddim(pt.shape); + void *input_ptr; + switch (pt.dtype) { + case PaddleDType::INT64: + input_ptr = t->mutable_data(ddim, place); + break; + case PaddleDType::FLOAT32: + input_ptr = t->mutable_data(ddim, place); + break; + case PaddleDType::INT32: + input_ptr = t->mutable_data(ddim, place); + break; + case PaddleDType::FLOAT16: + input_ptr = t->mutable_data(ddim, place); + break; + default: + LOG(ERROR) << "unsupported feed type " << pt.dtype; + return false; + } + + PADDLE_ENFORCE_NOT_NULL( + input_ptr, + paddle::platform::errors::Fatal( + "Cannot convert to LoDTensor because LoDTensor creation failed.")); + PADDLE_ENFORCE_NOT_NULL( + pt.data.data(), + paddle::platform::errors::InvalidArgument( + "The data contained in the input PaddleTensor is illegal.")); + + if (platform::is_cpu_place(place)) { + // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. + std::memcpy( + static_cast(input_ptr), pt.data.data(), pt.data.length()); + } else if (platform::is_ipu_place(place)) { +#ifdef PADDLE_WITH_IPU + std::memcpy( + static_cast(input_ptr), pt.data.data(), pt.data.length()); +#else + PADDLE_THROW(paddle::platform::errors::Fatal( + "Not compile with WITH_IPU, should not reach here.")); +#endif + } else if (platform::is_gpu_place(place)) { + PADDLE_ENFORCE_EQ(platform::is_xpu_place(place), + false, + platform::errors::InvalidArgument( + "Only one choice can be made between CPU and XPU.")); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto *dev_ctx = static_cast(pool.Get(place)); + auto dst_gpu_place = place; + memory::Copy(dst_gpu_place, + static_cast(input_ptr), + platform::CPUPlace(), + pt.data.data(), + pt.data.length(), + dev_ctx->stream()); +#else + PADDLE_THROW(paddle::platform::errors::Fatal( + "Not compile with CUDA, should not reach here.")); +#endif + } else if (platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU + auto dst_xpu_place = place; + memory::Copy(dst_xpu_place, + static_cast(input_ptr), + platform::CPUPlace(), + pt.data.data(), + pt.data.length()); +#else + PADDLE_THROW(paddle::platform::errors::Fatal( + "Not compile with XPU, should not reach here.")); +#endif + } else { + PADDLE_THROW(paddle::platform::errors::InvalidArgument( + "The analysis predictor supports CPU, GPU and XPU now.")); + } + return true; +} + +} // namespace jit +} // namespace paddle diff --git a/paddle/fluid/jit/engine/predictor_engine.h b/paddle/fluid/jit/engine/predictor_engine.h new file mode 100644 index 00000000000000..026b012cbfb02f --- /dev/null +++ b/paddle/fluid/jit/engine/predictor_engine.h @@ -0,0 +1,50 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/jit/engine/base_engine.h" +#include "paddle/fluid/jit/function_schema.h" +#include "paddle/fluid/jit/function_utils.h" + +namespace paddle { +class AnalysisPredictor; + +namespace framework { +class Scope; +} + +namespace jit { + +class PredictorEngine : public BaseEngine { + public: + PredictorEngine(const std::shared_ptr &info, + const VariableMap ¶ms_dict, + const phi::Place &place); + + ~PredictorEngine() noexcept {} + + std::vector operator()(const std::vector &inputs); + + std::vector operator()(const std::vector &inputs); + + private: + std::shared_ptr info_; + std::shared_ptr scope_; + phi::Place place_; + std::shared_ptr predictor_; +}; + +} // namespace jit +} // namespace paddle diff --git a/paddle/fluid/jit/function_schema.cc b/paddle/fluid/jit/function_schema.cc index 8150d3b2e7589c..0d2014153e1d76 100644 --- a/paddle/fluid/jit/function_schema.cc +++ b/paddle/fluid/jit/function_schema.cc @@ -82,6 +82,14 @@ const std::vector FunctionInfo::OutputArgNames() const { return schema_.OutputArgNames(); } +const std::string& FunctionInfo::ProgramFilePath() const { + return prog_file_path_; +} + +void FunctionInfo::SetProgramFilePath(const std::string& path) { + prog_file_path_ = path; +} + void FunctionInfo::RemoveDescFeedFetch() { utils::RemoveFeedFetch(program_desc_.get()); } diff --git a/paddle/fluid/jit/function_schema.h b/paddle/fluid/jit/function_schema.h index 9f593dd7eee241..31d82b15311376 100644 --- a/paddle/fluid/jit/function_schema.h +++ b/paddle/fluid/jit/function_schema.h @@ -19,7 +19,6 @@ #include namespace paddle { - namespace framework { class ProgramDesc; } // namespace framework @@ -72,6 +71,10 @@ class FunctionInfo { const std::vector OutputArgNames() const; + const std::string& ProgramFilePath() const; + + void SetProgramFilePath(const std::string& path); + void RemoveDescFeedFetch(); private: @@ -79,6 +82,7 @@ class FunctionInfo { std::vector param_names_; std::shared_ptr program_desc_; FunctionSchema schema_; + std::string prog_file_path_; }; } // namespace jit diff --git a/paddle/fluid/jit/serializer.cc b/paddle/fluid/jit/serializer.cc index 65a39bc7f9a564..ca3a8823fca713 100644 --- a/paddle/fluid/jit/serializer.cc +++ b/paddle/fluid/jit/serializer.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/jit/engine/executor_engine.h" #include "paddle/fluid/jit/engine/pe_engine.h" +#include "paddle/fluid/jit/engine/predictor_engine.h" #include "paddle/fluid/jit/layer.h" #include "paddle/fluid/jit/property.h" #include "paddle/fluid/jit/serializer_utils.h" @@ -53,6 +54,7 @@ Layer Deserializer::operator()(const std::string& path, param_names_set.insert(persist_var_names.begin(), persist_var_names.end()); info_map[func_name] = std::make_shared( func_name, persist_var_names, program_desc); + info_map[func_name]->SetProgramFilePath(it.second); } VariableMap params_dict; @@ -69,16 +71,19 @@ Layer Deserializer::operator()(const std::string& path, for (auto it = info_map.begin(); it != info_map.end(); ++it) { const std::string& func_name = it->first; auto& info = it->second; + VLOG(3) << "Add function type: " << FLAGS_jit_engine_type + << " Function name: " << func_name; if (FLAGS_jit_engine_type == "Executor") { - VLOG(3) << "Add function type: ExecutorEngine. 
Function name: " - << func_name; layer.SetEngine( func_name, utils::MakeEngine(info, params_dict, place)); } else if (FLAGS_jit_engine_type == "PE") { - VLOG(3) << "Add function type: PEEngine. Function name: " << func_name; layer.SetEngine(func_name, utils::MakeEngine(info, params_dict, place)); + } else if (FLAGS_jit_engine_type == "Predictor") { + layer.SetEngine( + info->FunctionName(), + utils::MakeEngine(info, params_dict, place)); } else { PD_THROW("Invalid JitLayer engine type."); } diff --git a/paddle/fluid/memory/allocation/system_allocator.cc b/paddle/fluid/memory/allocation/system_allocator.cc index fcfece978cb7fc..15cd2f6d1f3719 100644 --- a/paddle/fluid/memory/allocation/system_allocator.cc +++ b/paddle/fluid/memory/allocation/system_allocator.cc @@ -111,15 +111,14 @@ void CPUAllocator::Free(void* p, size_t size, size_t index) { munlock(p, size); #endif } + HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); + platform::RecordMemEvent( + p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree); #ifdef _WIN32 _aligned_free(p); #else free(p); #endif - - HOST_MEMORY_STAT_UPDATE(Reserved, 0, -size); - platform::RecordMemEvent( - p, CPUPlace(), size, platform::TracerMemEventType::ReservedFree); } bool CPUAllocator::UseGpu() const { return false; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 39faf87406d589..ac1d89ede50214 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -8,6 +8,7 @@ add_definitions(-D_USE_MATH_DEFINES) unset(GLOB_OP_LIB CACHE) unset(OP_LIBRARY CACHE) set(pybind_file ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h.tmp CACHE INTERNAL "pybind.h file") +set(pybind_file_prune ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h.prune CACHE INTERNAL "pybind.h file") set(pybind_file_final ${PADDLE_BINARY_DIR}/paddle/fluid/pybind/pybind.h) file(WRITE ${pybind_file} "// Generated by the paddle/fluid/operators/CMakeLists.txt. 
DO NOT EDIT!\n\n") @@ -101,7 +102,7 @@ else() cc_library(gather_scatter_kernel SRCS gather_scatter_kernel.cc gather_scatter_kernel.cu DEPS tensor) endif() -set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel backward_infermeta) +set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_api_utils gather_scatter_kernel backward_infermeta sparse_backward_infermeta) register_operators(EXCLUDES py_layer_op py_func_op warpctc_op dgc_op load_combine_op lstm_op run_program_op eye_op quantize_linear_op recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS}) @@ -254,3 +255,11 @@ cc_test(copy_cross_scope_test SRCS copy_cross_scope_test.cc DEPS op_registry cop endif() copy_if_different(${pybind_file} ${pybind_file_final}) + +if (WITH_CUSTOM_DEVICE) +cc_library(custom_device_common_op_registry SRCS custom_device_common_op_registry.cc DEPS operator) +endif() + +if(NOT "${OP_LIST}" STREQUAL "") +prune_pybind_h() +endif() diff --git a/paddle/fluid/operators/abs_op.cc b/paddle/fluid/operators/abs_op.cc index 465637f3ed63e7..9a2a75a642ab74 100644 --- a/paddle/fluid/operators/abs_op.cc +++ b/paddle/fluid/operators/abs_op.cc @@ -154,7 +154,7 @@ class AbsDoubleGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const { + const framework::OpKernelType& expected_kernel_type) const override { return framework::OpKernelType( framework::TransToProtoVarType(tensor.dtype()), tensor.place(), diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 62b805cf422d95..f3ff19b78c06e2 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -38,29 +38,20 @@ static constexpr bool CanInplaceAct() { GradFunctor::FwdDeps() == ActBwdOpFwdDeps::kNoDeps; } -#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ - class OP_NAME##OpMaker \ - : public ::paddle::framework::OpProtoAndCheckerMaker { \ - public: \ - void Make() override { \ - AddInput("X", \ - "Input of " #OP_NAME \ - " operator, an N-D Tensor, with data type float32, " \ - "float64 or float16."); \ - AddOutput("Out", \ - "Output of " #OP_NAME \ - " operator, a Tensor with shape same as input."); \ - AddAttr("use_mkldnn", \ - "(bool, default false) Only used in mkldnn kernel") \ - .SetDefault(false) \ - .AsExtra(); \ - AddAttr("use_cudnn", \ - "(bool, default false) Only used in cudnn kernel, need " \ - "install cudnn") \ - .SetDefault(false) \ - .AsExtra(); \ - AddComment(OP_COMMENT); \ - } \ +#define REGISTER_ACTIVATION_OP_MAKER(OP_NAME, OP_COMMENT) \ + class OP_NAME##OpMaker \ + : public ::paddle::framework::OpProtoAndCheckerMaker { \ + public: \ + void Make() override { \ + AddInput("X", \ + "Input of " #OP_NAME \ + " operator, an N-D Tensor, with data type float32, " \ + "float64 or float16."); \ + AddOutput("Out", \ + "Output of " #OP_NAME \ + " operator, a Tensor with shape same as input."); \ + AddComment(OP_COMMENT); \ + } \ } template @@ -107,8 +98,7 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, // } // #endif #ifdef PADDLE_WITH_MKLDNN - auto it = oper.Attrs().find("use_mkldnn"); - if (library == framework::LibraryType::kPlain && it != oper.Attrs().end() && + if (library == framework::LibraryType::kPlain && oper.CanMKLDNNBeUsed(ctx, data_type)) { library = framework::LibraryType::kMKLDNN; layout = 
framework::DataLayout::kMKLDNN; @@ -135,7 +125,7 @@ class ActivationOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const { + const framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN // When activation is first oneDNN op (there was some non oneDNN op // previously) @@ -182,9 +172,9 @@ class ActivationOpGrad : public framework::OperatorWithKernel { }; UNUSED constexpr char SigmoidDoc[] = R"DOC( -Sigmoid Activation Operator +Sigmoid Activation -$$out = \\frac{1}{1 + e^{-x}}$$ +$$out = \frac{1}{1 + e^{-x}}$$ )DOC"; @@ -458,10 +448,6 @@ class LeakyReluOpMaker : public framework::OpProtoAndCheckerMaker { "A LoDTensor or Tensor with the same type and size as that of x."); AddAttr("alpha", "Slope of the activation function at x < 0.") .SetDefault(0.02f); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); AddComment(R"DOC( LeakyRelu Activation Operator. @@ -483,35 +469,6 @@ class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("beta", "The value of beta for Softplus.").SetDefault(1.0f); AddAttr("threshold", "The value of threshold for Softplus.") .SetDefault(20.0f); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel.") - .SetDefault(false) - .AsExtra(); - AddAttr( - "use_cudnn", - "(bool, default false) Only used in cudnn kernel, need install cudnn.") - .SetDefault(false) - .AsExtra(); - AddAttr( - "fuse_activation_type", - "Fused activation type used in softplus OneDNN kernel.") - .SetDefault("") - .AsExtra(); - AddAttr( - "fuse_activation_alpha", - "Fused activation alpha parameter type used in softplus OneDNN kernel.") - .SetDefault(0.0f) - .AsExtra(); - AddAttr( - "fuse_activation_beta", - "Fused activation beta parameter type used in softplus OneDNN kernel.") - .SetDefault(0.0f) - .AsExtra(); - AddAttr( - "fuse_activation_scale", - "Fused activation scale parameter type used in softplus OneDNN kernel.") - .SetDefault(1.0f) - .AsExtra(); AddComment(R"DOC( :strong:`Softplus Activation Operator` @@ -613,10 +570,6 @@ class ELUOpMaker : public framework::OpProtoAndCheckerMaker { "The output is a multi-dimensional Tensor which has same " "dimension and data type as the ``x``."); AddAttr("alpha", "The alpha value of ELU").SetDefault(1.0f); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); AddComment(R"DOC( ELU Activation Operator. @@ -712,10 +665,6 @@ class Relu6OpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("threshold", "The threshold value of Relu6. Default is 6.0. ") .SetDefault(6.0f); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); AddComment(R"DOC( Relu6 Activation Operator. @@ -817,10 +766,6 @@ class SwishOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "Input of Swish operator"); AddOutput("Out", "Output of Swish operator"); AddAttr("beta", "Constant beta of swish operator").SetDefault(1.0f); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); AddComment(R"DOC( Swish Activation Operator. 
@@ -841,10 +786,6 @@ class MishOpMaker : public framework::OpProtoAndCheckerMaker { "of softplus will be used if absolute value of input is greater than " ":attr:`threshold`") .SetDefault(20.f); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); AddComment(R"DOC( Mish Activation Operator. @@ -871,10 +812,6 @@ class HardSwishOpMaker : public framework::OpProtoAndCheckerMaker { .SetDefault(6.0f); AddAttr("offset", "The offset parameter of HardSwish operator") .SetDefault(3.0f); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); AddComment(R"DOC( HardSwish Activation Operator. diff --git a/paddle/fluid/operators/bmm_op_xpu.cc b/paddle/fluid/operators/bmm_op_xpu.cc deleted file mode 100644 index f6e1d0227c8fca..00000000000000 --- a/paddle/fluid/operators/bmm_op_xpu.cc +++ /dev/null @@ -1,226 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#ifdef PADDLE_WITH_XPU - -#include -#include - -#include "paddle/fluid/operators/matmul_v2_op.h" -#include "paddle/fluid/operators/xpu_api_wrapper.h" -#include "paddle/fluid/platform/device/device_wrapper.h" - -namespace paddle { -namespace operators { - -template -static void MatMulXPUFunction(const Tensor* x, - const Tensor* y, - Tensor* out, - bool trans_x, - bool trans_y, - const paddle::framework::ExecutionContext& ctx) { - using XPUType = typename XPUTypeTrait::Type; - const auto& x_dims = x->dims(); - const auto& y_dims = y->dims(); - auto& dev_ctx = - ctx.template device_context(); - - auto mat_dim_a = phi::funcs::CreateMatrixDescriptor( - RowMatrixFromVector(x_dims), 0, trans_x); - auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( - ColumnMatrixFromVector(y_dims), 0, trans_y); - - T* data_c = out->data(); - int m = mat_dim_a.height_; - int n = mat_dim_b.width_; - int k = mat_dim_a.width_; - int batch_size = mat_dim_a.batch_size_; - // batch matmul - int r = xpu::fc_batched( - dev_ctx.x_context(), // Context* ctx, - batch_size, // int batch_size, - mat_dim_a.trans_, // bool x_trans, - mat_dim_b.trans_, // bool w_trans, - m, // int m, - n, // int n, - k, // int k, - 1.0, // float alpha, - reinterpret_cast(x->data()), // const TX* x, - mat_dim_a.stride_, // int stride_a, - reinterpret_cast(y->data()), // const TW* w, - mat_dim_b.stride_, // int stride_b, - 0.0, // float beta, - reinterpret_cast(data_c), // TY* y, - m * n, // int stride_c, - nullptr, // const float* x_maxptr, - nullptr); // const float* w_maxptr - - PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_batched"); -} - -template -class BmmXPUKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); - auto* y = ctx.Input("Y"); - auto* out = ctx.Output("Out"); - out->mutable_data(ctx.GetPlace()); - - if (x->numel() == 0 || y->numel() == 0) { - return; - } - bool trans_x = false; - bool trans_y = false; - 
- auto x_dims = x->dims(); - auto y_dims = y->dims(); - - PADDLE_ENFORCE_EQ(x_dims.size(), - 3, - platform::errors::InvalidArgument( - "Input(X) of BmmOp must be 3-dimensional in BmmOp, " - "but received X's shape: [%s].", - x_dims)); - PADDLE_ENFORCE_EQ(y_dims.size(), - 3, - platform::errors::InvalidArgument( - "Input(Y) of BmmOp must be 3-dimensional in BmmOp, " - "but received Y's shape: [%s].", - y_dims)); - PADDLE_ENFORCE_EQ( - x_dims[0], - y_dims[0], - platform::errors::InvalidArgument( - "Input(X) and Input(Y) must have the same batch size in BmmOp, " - "but received X's batch size: [%s]," - "Y's batch size [%s]", - x_dims[0], - y_dims[0])); - PADDLE_ENFORCE_EQ( - x_dims[2], - y_dims[1], - platform::errors::InvalidArgument( - "Input(X)'s width must be equal with Input(Y)'s height in BmmOp," - "but receive X's width: [%s]," - "Y's height: [%s].", - x_dims[2], - y_dims[1])); - - if (std::is_same::value) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); - } else { - if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); - } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { - MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); - } else { - MatMulXPUFunction(x, y, out, trans_x, trans_y, ctx); - } - } - } -}; - -template -class BmmXPUGradKernel : public framework::OpKernel { - public: - void MatMul(const framework::ExecutionContext& ctx, - const framework::Tensor& a, - bool trans_a, - const framework::Tensor& b, - bool trans_b, - framework::Tensor* out) const { - out->mutable_data(ctx.GetPlace()); - if (std::is_same::value) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); - } else { - if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); - } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); - } else { - MatMulXPUFunction(&a, &b, out, trans_a, trans_b, ctx); - } - } - } - - void CalcInputGrad(const framework::ExecutionContext& context, - const framework::Tensor& a, - bool trans_a, - const framework::Tensor& b, - bool trans_b, - framework::Tensor* out) const { - if (out == nullptr) return; - MatMul(context, a, trans_a, b, trans_b, out); - } - - void Compute(const framework::ExecutionContext& context) const override { - auto x = *context.Input("X"); - auto y = *context.Input("Y"); - auto dout = - *context.Input(framework::GradVarName("Out")); - auto* dx = context.Output(framework::GradVarName("X")); - auto* dy = context.Output(framework::GradVarName("Y")); - ReshapeXYOutIntoMatrixSequence(&x, &y, &dout, false, false); - - framework::DDim dx_dims; - if (dx) { - dx_dims = dx->dims(); - if (dx_dims != x.dims()) { - dx->Resize(x.dims()); - } - } - - framework::DDim dy_dims; - if (dy) { - dy_dims = dy->dims(); - if (dy_dims != y.dims()) { - dy->Resize(y.dims()); - } - } - - CalcInputGrad(context, dout, false, y, true, dx); - CalcInputGrad(context, x, true, dout, false, dy); - - // CalcInputGrad(context, dout, false, false, y, true, false, dx); - // CalcInputGrad(context, x, true, true, dout, false, true, dy); - - if (dx) { - if (dx_dims != x.dims()) { - dx->Resize(dx_dims); - } - } - - if (dy) { - if (dy_dims != y.dims()) { - dy->Resize(dy_dims); - } - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_XPU_KERNEL(bmm, - ops::BmmXPUKernel, - ops::BmmXPUKernel); -REGISTER_OP_XPU_KERNEL(bmm_grad, 
- ops::BmmXPUGradKernel, - ops::BmmXPUGradKernel); - -#endif diff --git a/paddle/fluid/operators/cinn/cinn_launch_context.cc b/paddle/fluid/operators/cinn/cinn_launch_context.cc index 3c983994925144..a796050c7ec2ac 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_context.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_context.cc @@ -270,8 +270,9 @@ void CinnLaunchContext::AssignExternalVariable(const std::string& var_name) { [this, var_name](void* ctx, cinn_buffer_t* buffer) { auto* tensor = cached_scope_->GetVar(var_name)->GetMutable(); tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions)); - buffer->memory = reinterpret_cast( - tensor->mutable_data(*cached_place_)); + buffer->memory = reinterpret_cast(tensor->mutable_data( + *cached_place_, + framework::paddle2cinn::TransToPaddleDataType(buffer->type))); return 0; }); @@ -295,8 +296,9 @@ void CinnLaunchContext::AssignInternalVariable(const std::string& var_name) { auto* tensor = cached_temp_scope_->Var(var_name)->GetMutable(); tensor->Resize(framework::DDim(buffer->dims, buffer->dimensions)); - buffer->memory = reinterpret_cast( - tensor->mutable_data(*cached_place_)); + buffer->memory = reinterpret_cast(tensor->mutable_data( + *cached_place_, + framework::paddle2cinn::TransToPaddleDataType(buffer->type))); return 0; }); @@ -437,7 +439,8 @@ ParallelExecutor* CinnLaunchContext::InitializePE(const platform::Place& place, auto* buffer = GetCinnBufferOfVar(var_name); auto dim = framework::DDim(buffer->dims, buffer->dimensions); var->GetMutable()->Resize(dim); - var->GetMutable()->mutable_data(place); + var->GetMutable()->mutable_data( + place, framework::paddle2cinn::TransToPaddleDataType(buffer->type)); } return parallel_executor_.get(); } diff --git a/paddle/fluid/operators/collective/CMakeLists.txt b/paddle/fluid/operators/collective/CMakeLists.txt index c94b0c93eb34aa..e29b3f6639f1ed 100644 --- a/paddle/fluid/operators/collective/CMakeLists.txt +++ b/paddle/fluid/operators/collective/CMakeLists.txt @@ -17,6 +17,10 @@ foreach(src ${OPS}) ${COLLECTIVE_COMPILE_FLAGS}) endforeach() +if(WITH_GLOO) + set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) +endif() + register_operators( EXCLUDES c_gen_bkcl_id_op @@ -35,10 +39,6 @@ if(WITH_NCCL OR WITH_RCCL) op_library(gen_nccl_id_op DEPS ${COLLECTIVE_DEPS}) endif() -if(WITH_GLOO) - set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} gloo_wrapper) -endif() - if(WITH_XPU_BKCL) set(COLLECTIVE_DEPS ${COLLECTIVE_DEPS} collective_helper) op_library(c_gen_bkcl_id_op DEPS ${COLLECTIVE_DEPS}) diff --git a/paddle/fluid/operators/collective/barrier_op_mlu.cc b/paddle/fluid/operators/collective/barrier_op_mlu.cc new file mode 100644 index 00000000000000..d463e66fe62581 --- /dev/null +++ b/paddle/fluid/operators/collective/barrier_op_mlu.cc @@ -0,0 +1,63 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/barrier_op.h" +#if defined(PADDLE_WITH_CNCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/mlu/cncl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class BarrierOpMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { +#if defined(PADDLE_WITH_CNCL) + auto in = ctx.Input("X"); + auto out = ctx.Output("Out"); + + auto place = ctx.GetPlace(); + cnclDataType_t dtype = + platform::ToCNCLDataType(framework::TransToProtoVarType(in->dtype())); + int64_t numel = in->numel(); + const void* sendbuff = in->data(); + void* recvbuff = out->mutable_data(place); + + int rid = ctx.Attr("ring_id"); + auto cncl_comm = platform::CNCLCommContext::Instance().Get(rid, place); + auto* comm = cncl_comm->comm(); + auto comm_stream = cncl_comm->stream(); + auto& dev_ctx = + ctx.template device_context(); + cnclReduceOp_t cncl_red_type = cnclSum; + dev_ctx.Wait(); + PADDLE_ENFORCE_MLU_SUCCESS(cnclAllReduce( + sendbuff, recvbuff, numel, dtype, cncl_red_type, comm, comm_stream)); + PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(comm_stream)); +#else + PADDLE_THROW(platform::errors::Unavailable( + "PaddlePaddle should compile with CNCL.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(barrier, ops::BarrierOpMLUKernel); diff --git a/paddle/fluid/operators/collective/c_allgather_op_mlu.cc b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc index fc3ad8a006ec53..347349ac7a49b5 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_mlu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_mlu.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/collective/c_allgather_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" #if defined(PADDLE_WITH_CNCL) #include "paddle/fluid/platform/collective_helper.h" @@ -27,15 +28,14 @@ template class CAllGatherOpMLUKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { + auto place = ctx.GetPlace(); + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); #if defined(PADDLE_WITH_CNCL) - auto x = ctx.Input("X"); - auto out = ctx.Output("Out"); - cnclDataType_t dtype = - platform::ToCNCLDataType(framework::TransToProtoVarType(x->dtype())); + auto x = ctx.Input("X"); + auto out = ctx.Output("Out"); int nranks = ctx.Attr("nranks"); int rid = ctx.Attr("ring_id"); - auto place = ctx.GetPlace(); auto comm = platform::CNCLCommContext::Instance().Get(rid, place); PADDLE_ENFORCE_EQ( nranks, @@ -48,19 +48,56 @@ class CAllGatherOpMLUKernel : public framework::OpKernel { out->mutable_data(out_dims, place); uint32_t send_numel = x->numel(); - void* send_buff = reinterpret_cast(const_cast(x->data())); - void* recv_buff = reinterpret_cast(out->data()); + void* send_buff; + void* recv_buff; + phi::DenseTensor in_tensor, out_tensor; + if (framework::TransToProtoVarType(x->dtype()) == + framework::proto::VarType::INT64) { + // cast from int64 to int32 since cncl do not support int64 + in_tensor.mutable_data(x->dims(), place); + out_tensor.mutable_data(out->dims(), place); + MLUCnnlTensorDesc x_int64_desc(*x); + MLUCnnlTensorDesc x_int32_desc(in_tensor); + cnnlCastDataType_t cast_type = GetCastDataType(VT::INT64, VT::INT32); + MLUCnnl::Cast(ctx, + cast_type, + x_int64_desc.get(), + GetBasePtr(x), + x_int32_desc.get(), + GetBasePtr(&in_tensor)); + send_buff = reinterpret_cast(in_tensor.data()); + recv_buff = reinterpret_cast(out_tensor.data()); + } else { + in_tensor.ShareDataWith(*x); + out_tensor.ShareDataWith(*out); + send_buff = reinterpret_cast(in_tensor.data()); + recv_buff = reinterpret_cast(out_tensor.data()); + } mluStream stream = nullptr; if (ctx.Attr("use_calc_stream")) { - auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); stream = static_cast(dev_ctx)->stream(); } else { stream = comm->stream(); } + cnclDataType_t dtype = platform::ToCNCLDataType( + framework::TransToProtoVarType(in_tensor.dtype())); PADDLE_ENFORCE_MLU_SUCCESS(cnclAllGather( send_buff, recv_buff, send_numel, dtype, comm->comm(), stream)); + if (framework::TransToProtoVarType(x->dtype()) == + framework::proto::VarType::INT64) { + // cast back from int64 out_tensor to out + MLUCnnlTensorDesc out_int64_desc(*out); + MLUCnnlTensorDesc out_int32_desc(out_tensor); + cnnlCastDataType_t cast_type = GetCastDataType(VT::INT32, VT::INT64); + MLUCnnl::Cast(ctx, + cast_type, + out_int32_desc.get(), + GetBasePtr(&out_tensor), + out_int64_desc.get(), + GetBasePtr(out)); + } #else PADDLE_THROW(platform::errors::PreconditionNotMet( "PaddlePaddle should compile with MLU.")); @@ -80,4 +117,5 @@ REGISTER_OP_MLU_KERNEL(c_allgather, ops::CAllGatherOpMLUKernel, ops::CAllGatherOpMLUKernel, ops::CAllGatherOpMLUKernel, + ops::CAllGatherOpMLUKernel, ops::CAllGatherOpMLUKernel); diff --git a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu index ef7e298aaf6a3c..f1640d2f4a3f53 100644 --- a/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu @@ -265,7 +265,7 
@@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { auto map = distributed::ProcessGroupMapFromGid::getInstance(); distributed::ProcessGroup* pg = map->get(rid); distributed::AllreduceOptions opts; - opts.reduce_op = distributed::ReduceOp::SUM; + opts.reduce_op = distributed::ReduceOp::MAX; // allocate memory on device. softmax->mutable_data(place); @@ -348,6 +348,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { in_out.clear(); in_out.push_back(predicted_logits); + opts.reduce_op = distributed::ReduceOp::SUM; pg->AllReduce(in_out, in_out, opts)->Synchronize(); // step 4, obtain exp(logit) @@ -364,6 +365,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor { in_out.clear(); in_out.push_back(sum_exp_logits); + opts.reduce_op = distributed::ReduceOp::SUM; pg->AllReduce(in_out, in_out, opts)->Synchronize(); auto eigen_loss = math::EigenMatrix::From(loss_2d); diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index ec18a172e1f8bd..faf5bd8a0d76a8 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -217,6 +217,15 @@ class RecvOpV2CUDAKernel : public framework::OpKernel { } PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclRecv( out->data(), numel, dtype, peer, comm->comm(), stream)); + // VLOG(0) << "nccl bcast"; + // PADDLE_ENFORCE_GPU_SUCCESS( + // platform::dynload::ncclBcast(reinterpret_cast(out->data()), + // numel, + // dtype, + // peer, + // comm->comm(), + // stream)); VLOG(3) << "rank " << comm->rank() << " recv " << phi::product(out->dims()) << " from " << peer; #else diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index 37b18703031de3..4a7a921c3b9b28 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -203,6 +203,14 @@ class SendOpV2CUDAKernel : public framework::OpKernel { platform::ToNCCLDataType(framework::TransToProtoVarType(x->dtype())); PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclSend( x->data(), numel, dtype, peer, comm->comm(), stream)); + // VLOG(0) << "nccl bcast"; + // PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclBcast( + // reinterpret_cast(const_cast(x->data())), + // numel, + // dtype, + // comm->rank(), + // comm->comm(), + // stream)); VLOG(3) << "rank " << comm->rank() << " send " << phi::product(x->dims()) << " to " << peer; #else diff --git a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc index 2ddcc7eb72cd4b..cb52baa7e64991 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_infer_op.cc @@ -14,6 +14,11 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/controlflow/conditional_block_op.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +DECLARE_bool(use_mkldnn); namespace paddle { namespace framework { class OpDesc; @@ -73,14 +78,33 @@ class ConditionalBlockInferOp : public ConditionalOp { scopes->front() = &scope.NewScope(); auto &cur_scope = *scopes->front(); - framework::Executor exec(dev_place); auto *block = Attr("sub_block"); VLOG(3) << "Conditional block.idx = " << block->ID() << ", scope = " << &cur_scope; - exec.Run(*block->Program(), &cur_scope, block->ID(), false); + + if (!exec_ || !platform::is_same_place(exec_->GetPlace(), dev_place)) { + auto &pdesc = *block->Program(); + exec_.reset(new framework::Executor(dev_place)); +#ifdef PADDLE_WITH_MKLDNN + if (FLAGS_use_mkldnn) exec_->EnableMKLDNN(pdesc); +#endif + ctx_ = exec_->Prepare( + pdesc, block->ID(), std::vector(), false); +#ifdef PADDLE_WITH_MKLDNN + if (FLAGS_use_mkldnn) { + platform::AttachPointerHashToMKLDNNKey(exec_.get(), dev_place); + platform::RegisterModelLayout(ctx_->ops_, dev_place); + } +#endif + } + exec_->RunPreparedContext(ctx_.get(), &cur_scope, false, true, false); scope.DeleteScope(scopes->front()); } } + + private: + mutable std::shared_ptr exec_{nullptr}; + mutable std::unique_ptr ctx_{nullptr}; }; } // namespace operators diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index bdc07efbc0f8c4..3a4ee516d08f99 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -312,6 +312,37 @@ class ConditionalBlockGradOp : public ConditionalOp { } }; +template +struct FilterNoGradInput {}; + +template <> +struct FilterNoGradInput { + static void filter(const framework::BlockDesc *desc, + std::vector *vec) { + auto f = [desc](const std::string &name) -> std::string { + if (name == framework::kEmptyVarName) { + // don't drop empty var name, you can use Input(name, true) to drop it. 
+ return framework::kEmptyVarName; + } + auto var_desc = + desc->FindVarRecursive(framework::GradOriginalVarName(name)); + std::set not_support_backward_dtype = { + framework::proto::VarType::BOOL, + framework::proto::VarType::INT8, + framework::proto::VarType::UINT8, + framework::proto::VarType::INT16, + framework::proto::VarType::INT32, + framework::proto::VarType::INT64, + }; + if (!var_desc || + not_support_backward_dtype.count(var_desc->GetDataType())) + return framework::kEmptyVarName; + return name; + }; + std::transform(vec->begin(), vec->end(), vec->begin(), f); + } +}; + class ConditionalBlockGradInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *context) const override { @@ -369,8 +400,11 @@ class ConditionalBlockGradMaker : public framework::SingleGradOpMaker { this->OutputGrad(ConditionalOp::kOutputs)); grad_op->SetInput(ConditionalOp::kScope, this->Output(ConditionalOp::kScope)); + + auto fwd_inputs = this->InputGrad(ConditionalOp::kInputs, false); + FilterNoGradInput::filter(this->GetForwardOpBlock(), &fwd_inputs); grad_op->SetOutput(framework::GradVarName(ConditionalOp::kInputs), - this->InputGrad(ConditionalOp::kInputs, false)); + fwd_inputs); grad_op->SetBlockAttr("sub_block", this->grad_block_[0]); grad_op->SetAttr("is_scalar_condition", this->GetAttr("is_scalar_condition")); diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.h b/paddle/fluid/operators/controlflow/conditional_block_op.h index d85eca8f5cb3a2..f2407e9a3f05ac 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.h +++ b/paddle/fluid/operators/controlflow/conditional_block_op.h @@ -119,11 +119,6 @@ class ConditionalBlockOpProtoMaker : public framework::OpProtoAndCheckerMaker { "The conditional variable (Cond) is used as scalar " "condition.") .SetDefault(false); - AddAttr>(ConditionalOp::kSkipEagerDeletionVars, - "Vars that would not be deleted when " - "garbage collection strategy enables") - .SetDefault(std::vector()) - .AsExtra(); AddComment(R"DOC(Conditional block operator If `is_scalar_condition` is True, the conditional variable (Cond) is a scalar, diff --git a/paddle/fluid/operators/controlflow/feed_op.cc b/paddle/fluid/operators/controlflow/feed_op.cc index 4cef104496510f..e36ddace5b6e11 100644 --- a/paddle/fluid/operators/controlflow/feed_op.cc +++ b/paddle/fluid/operators/controlflow/feed_op.cc @@ -11,6 +11,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/tensor_utils.h" namespace paddle { namespace framework { @@ -61,6 +62,22 @@ class FeedVariableVisitor { *out_str = in_str; } + void operator()(const phi::SparseCooTensor &in_tensor) const { + phi::SparseCooTensor *out_tensor = + out_var_->GetMutable(); + if (platform::is_same_place(in_tensor.place(), place_)) { + *out_tensor = in_tensor; + } else { + platform::DeviceContext *context = + platform::DeviceContextPool::Instance().Get(place_); + + phi::DenseTensor indices, values; + framework::TensorCopy(in_tensor.indices(), place_, *context, &indices); + framework::TensorCopy(in_tensor.values(), place_, *context, &values); + out_tensor->SetMember(indices, values, in_tensor.meta()); + } + } + private: framework::Variable *out_var_; const platform::Place &place_; diff --git a/paddle/fluid/operators/controlflow/fetch_op.cc b/paddle/fluid/operators/controlflow/fetch_op.cc index c1ed46867f1aca..7f179f9d97b968 100644 --- a/paddle/fluid/operators/controlflow/fetch_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_op.cc @@ -123,6 +123,9 @@ class FetchOp : public framework::OperatorBase { auto &src_item = fetch_var->Get(); auto *dst_item = &(PADDLE_GET(framework::Vocab, fetch_list->at(col))); *dst_item = src_item; + } else if (fetch_var->IsType()) { + auto &src_item = fetch_var->Get(); + fetch_list->at(col) = src_item; } else { auto &src_item = fetch_var->Get(); framework::LoDTensorArray tmp(src_item.size()); diff --git a/paddle/fluid/operators/controlflow/fetch_v2_op.cc b/paddle/fluid/operators/controlflow/fetch_v2_op.cc index 64489c294d1233..02af91100c25a5 100644 --- a/paddle/fluid/operators/controlflow/fetch_v2_op.cc +++ b/paddle/fluid/operators/controlflow/fetch_v2_op.cc @@ -98,6 +98,12 @@ class FetchV2Op : public framework::OperatorWithKernel { return framework::OpKernelType(framework::proto::VarType::FP32, platform::CPUPlace()); } + } else if (fetch_var->IsType()) { + auto &src_item = fetch_var->Get(); + if (!src_item.initialized()) { + return framework::OpKernelType(framework::proto::VarType::FP32, + platform::CPUPlace()); + } } else { auto &src_item = fetch_var->Get(); if (src_item.empty() || !src_item[0].IsInitialized()) { @@ -163,6 +169,12 @@ class FetchV2Kernel { dst_item->ShareDataWith(src_item); dst_item->set_lod(src_item.lod()); } + } else if (fetch_var->IsType()) { + auto &src_item = fetch_var->Get(); + if (!src_item.initialized()) { + return; + } + fetch_list->at(col) = src_item; } else { auto &src_item = fetch_var->Get(); framework::LoDTensorArray tmp(src_item.size()); diff --git a/paddle/fluid/operators/controlflow/op_variant.cc b/paddle/fluid/operators/controlflow/op_variant.cc index 48b7a434106728..8d43a21e66437f 100644 --- a/paddle/fluid/operators/controlflow/op_variant.cc +++ b/paddle/fluid/operators/controlflow/op_variant.cc @@ -81,5 +81,22 @@ void AppendOpVariantByOpName(const std::vector &op_descs, } } +void AppendOpVariantByOpName( + const std::vector &op_descs, + const std::string &candidate_op_name, + std::unordered_set *result_ops) { + PADDLE_ENFORCE_NOT_NULL( + result_ops, + platform::errors::Unavailable("result_ops should not be a null_ptr.")); + for (auto *op_desc : op_descs) { + PADDLE_ENFORCE_NOT_NULL( + op_desc, + platform::errors::Unavailable("op_desc should not be a null_ptr.")); + if (op_desc->Type() == candidate_op_name) { + result_ops->emplace(op_desc); + } + } +} + } // namespace operators } // namespace paddle diff --git 
a/paddle/fluid/operators/controlflow/op_variant.h b/paddle/fluid/operators/controlflow/op_variant.h index 738e7a4acc7eb0..ad7cc6b741eb9e 100644 --- a/paddle/fluid/operators/controlflow/op_variant.h +++ b/paddle/fluid/operators/controlflow/op_variant.h @@ -78,5 +78,10 @@ void AppendOpVariantByOpName(const std::vector &op_descs, const std::string &candidate_op_name, std::vector *result_ops); +void AppendOpVariantByOpName( + const std::vector &op_descs, + const std::string &candidate_op_name, + std::unordered_set *result_ops); + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 4e0344b3b93916..10fa24b1bd4f5b 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -221,11 +221,6 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); - AddAttr>(kSkipEagerDeletionVars, - "Vars that would skip eager deletion." - "Users should not set this manually.") - .SetDefault(std::vector()) - .AsExtra(); AddComment(R"DOC( )DOC"); } diff --git a/paddle/fluid/operators/conv_base_helper.h b/paddle/fluid/operators/conv_base_helper.h index b52936c1972189..4dc83c9717ae77 100644 --- a/paddle/fluid/operators/conv_base_helper.h +++ b/paddle/fluid/operators/conv_base_helper.h @@ -36,17 +36,10 @@ using framework::ConvSearchCache; template using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; -// As the basic for SearchAlgorithm struct. -template -struct SearchAlgorithm {}; - // As the container of searchAlgorithm::Find() result. template struct SearchResult { SearchResult() {} - explicit SearchResult(const phi::autotune::DnnNode& node) - : algo(static_cast(node.algo)), - workspace_size(node.workspace_size) {} explicit SearchResult(AlgoT a) : algo(a) {} explicit SearchResult(AlgoT a, float t, size_t size) @@ -55,12 +48,21 @@ struct SearchResult { AlgoT algo = static_cast(0); float time = -1.f; size_t workspace_size = 0; + bool exhaustive_search = false; }; template static std::ostream& operator<<(std::ostream& out, const std::vector& v) { out << "["; - for (auto const& tmp : v) out << tmp << ","; + bool is_first = true; + for (auto const& tmp : v) { + if (is_first) { + out << tmp; + is_first = false; + } else { + out << ", " << tmp; + } + } out << "]"; return out; } @@ -113,7 +115,7 @@ struct ConvArgsBase { auto w_shape = phi::vectorize(w->dims()); VLOG(10) << "[ConvArgs] x_dims=" << x_shape << ", w_dims=" << w_shape << ", strides=" << s << ", paddings=" << p << ", dilations=" << d - << ",data= " << paddle::experimental::CppTypeToDataType::Type() + << ", data=" << paddle::experimental::CppTypeToDataType::Type() << ", group=" << group << ", data layout=" << static_cast(data_layout); diff --git a/paddle/fluid/operators/conv_cudnn_helper.h b/paddle/fluid/operators/conv_cudnn_helper.h index e6fcf2be286ecf..52c530d71f04fa 100644 --- a/paddle/fluid/operators/conv_cudnn_helper.h +++ b/paddle/fluid/operators/conv_cudnn_helper.h @@ -17,8 +17,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_base_helper.h" #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/profiler.h" #include "paddle/phi/kernels/autotune/switch_autotune.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" namespace paddle { @@ -52,11 +52,9 @@ static void RemovePaddingSlice(const phi::GPUContext& context, } auto in_t = - framework::EigenTensor::From( - *input); - auto out_t = - framework::EigenTensor::From( - *out, new_out_dims); + phi::EigenTensor::From(*input); + auto out_t = phi::EigenTensor::From( + *out, new_out_dims); phi::funcs::EigenSlice, T, D>::Eval( place, out_t, in_t, offsets, extents); @@ -146,83 +144,21 @@ void ChooseAlgoByWorkspace(const std::vector& perf_results, } } -static void SetConvMathType(const phi::GPUContext& ctx, - cudnnDataType_t dtype, - const platform::ConvolutionDescriptor& cdesc) { -#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) - if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_TENSOR_OP_MATH)); - VLOG(5) << "use cudnn_tensor_op_math"; -#if CUDA_VERSION >= 11000 -#if CUDNN_VERSION_MIN(8, 1, 0) - } else if (ctx.GetComputeCapability() >= 80 && dtype == CUDNN_DATA_BFLOAT16) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_TENSOR_OP_MATH)); -#endif // CUDNN_VERSION_MIN(8, 1, 0) - } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_FMA_MATH)); -#endif // CUDA_VERSION >= 11000 - } else { - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( - cdesc.desc(), CUDNN_DEFAULT_MATH)); - VLOG(5) << "NOT use cudnn_tensor_op_math"; - } -#endif -} +template +struct SearchAlgorithmBase {}; // cuDNN convolution forward algorithm searcher, consisted of three searching // modes, namely: deterministic, heuristic and exhaustive_search mode. // As well as one workspace size acquirsition function with respect to // the chosen alogrithm. template <> -struct SearchAlgorithm { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionFwdAlgoPerf_t; using AlgoT = cudnnConvolutionFwdAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = + phi::autotune::AlgorithmType::kConvForward; - template - static SearchResult Find(const ConvArgs& args, - bool exhaustive_search, - bool deterministic, - const phi::GPUContext& ctx) { - SearchResult result; - auto dtype = platform::CudnnDataType::type; - SetConvMathType(ctx, dtype, args.cdesc); - - if (deterministic) { - result = FindAlgoDeterministic(args); - } else { - // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. - // 2. Once turning on auto-tune, runn heuristic search(default) before - // auto-tune process, run exhaustive_search during mentioned process. - // 3. After auto-tune process, run cached algorithm if cached, run - // default mode for the rest. 
- auto key = args.Convert2ConvCacheKey(); - auto& cache = phi::autotune::AutoTuneCache::Instance().GetConvForward(); - if (cache.Find(key)) { - auto t = cache.Get(key); - result.algo = static_cast(t.algo); - result.workspace_size = t.workspace_size; - } else { - bool use_autotune = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); - if (exhaustive_search || use_autotune) { - result = FindAlgoExhaustiveSearch(args, ctx); - } else { - result = FindAlgoHeuristic(args, ctx); - } - phi::autotune::DnnNode node(static_cast(result.algo), - result.workspace_size); - cache.Set(key, node); - } - } - VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search - << ", deterministic=" << deterministic - << ", choose algo=" << result.algo - << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB"; - return result; - } + static const std::string GetPerfName() { return "ConvForward"; } static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionFwdAlgo_t algo) { @@ -239,7 +175,7 @@ struct SearchAlgorithm { return workspace_size; } - private: + protected: static SearchResult FindAlgoDeterministic(const ConvArgs& args) { auto workspace_size = GetWorkspaceSize(args, static_cast(1)); return SearchResult(static_cast(1), -1.0, workspace_size); @@ -271,6 +207,10 @@ struct SearchAlgorithm { if (result.workspace_size > workspace_size_limit) { #if CUDNN_VERSION >= 8000 + VLOG(4) << GetPerfResultString("[Heuristic] FwdAlgo Perf result", + perf_results, + actual_perf_count, + workspace_size_limit); // cudnnGetConvolutionForwardAlgorithm is removed in CUDNN-8 ChooseAlgoByWorkspace( perf_results, workspace_size_limit, &result); @@ -387,53 +327,13 @@ struct SearchAlgorithm { // As well as one workspace size acquirsition function with // respect to the chosen alogrithm. template <> -struct SearchAlgorithm { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionBwdDataAlgoPerf_t; using AlgoT = cudnnConvolutionBwdDataAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = + phi::autotune::AlgorithmType::kConvBackwardData; - template - static SearchResult Find(const ConvArgs& args, - bool exhaustive_search, - bool deterministic, - const phi::GPUContext& ctx) { - SearchResult result; - auto dtype = platform::CudnnDataType::type; - SetConvMathType(ctx, dtype, args.cdesc); - - if (deterministic) { - result = FindAlgoDeterministic(args); - } else { - // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. - // 2. Once turning on auto-tune, runn heuristic search(default) before - // auto-tune process, run exhaustive_search during mentioned process. - // 3. After auto-tune process, run cached algorithm if cached, run - // default mode for the rest. 
- auto key = args.Convert2ConvCacheKey(); - auto& cache = - phi::autotune::AutoTuneCache::Instance().GetConvBackwardData(); - if (cache.Find(key)) { - auto t = cache.Get(key); - result.algo = static_cast(t.algo); - result.workspace_size = t.workspace_size; - } else { - bool use_autotune = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); - if (exhaustive_search || use_autotune) { - result = FindAlgoExhaustiveSearch(args, ctx); - } else { - result = FindAlgoHeuristic(args, ctx); - } - phi::autotune::DnnNode node(static_cast(result.algo), - result.workspace_size); - cache.Set(key, node); - } - } - VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search - << ", deterministic=" << deterministic - << ", choose algo=" << result.algo - << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB"; - return result; - } + static const std::string GetPerfName() { return "ConvBackwardData"; } static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionBwdDataAlgo_t algo) { @@ -450,7 +350,7 @@ struct SearchAlgorithm { return workspace_size; } - private: + protected: static SearchResult FindAlgoDeterministic(const ConvArgs& args) { auto workspace_size = GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1); @@ -609,54 +509,13 @@ struct SearchAlgorithm { // exhaustive_search mode. As well as one workspace size acquirsition function // with respect to the chosen alogrithm. template <> -struct SearchAlgorithm { +struct SearchAlgorithmBase { using PerfT = cudnnConvolutionBwdFilterAlgoPerf_t; using AlgoT = cudnnConvolutionBwdFilterAlgo_t; + constexpr static phi::autotune::AlgorithmType kAlgoType = + phi::autotune::AlgorithmType::kConvBackwardFilter; - template - static SearchResult Find(const ConvArgs& args, - bool exhaustive_search, - bool deterministic, - const phi::GPUContext& ctx) { - platform::CUDAGraphCaptureModeGuard guard; - SearchResult result; - auto dtype = platform::CudnnDataType::type; - SetConvMathType(ctx, dtype, args.cdesc); - - if (deterministic) { - result = FindAlgoDeterministic(args); - } else { - // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. - // 2. Once turning on auto-tune, runn heuristic search(default) before - // auto-tune process, run exhaustive_search during mentioned process. - // 3. After auto-tune process, run cached algorithm if cached, run - // default mode for the rest. 
- auto key = args.Convert2ConvCacheKey(); - auto& cache = - phi::autotune::AutoTuneCache::Instance().GetConvBackwardFilter(); - if (cache.Find(key)) { - auto t = cache.Get(key); - result.algo = static_cast(t.algo); - result.workspace_size = t.workspace_size; - } else { - bool use_autotune = - phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); - if (exhaustive_search || use_autotune) { - result = FindAlgoExhaustiveSearch(args, ctx); - } else { - result = FindAlgoHeuristic(args, ctx); - } - phi::autotune::DnnNode node(static_cast(result.algo), - result.workspace_size); - cache.Set(key, node); - } - } - VLOG(3) << "[cuDNN Convoltion] exhaustive_search=" << exhaustive_search - << ", deterministic=" << deterministic - << ", choose algo=" << result.algo - << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB"; - return result; - } + static const std::string GetPerfName() { return "ConvBackwardFilter"; } static size_t GetWorkspaceSize(const ConvArgs& args, cudnnConvolutionBwdFilterAlgo_t algo) { @@ -674,7 +533,7 @@ struct SearchAlgorithm { return workspace_size; } - private: + protected: static SearchResult FindAlgoDeterministic(const ConvArgs& args) { auto workspace_size = GetWorkspaceSize(args, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1); @@ -891,5 +750,103 @@ struct SearchAlgorithm { } }; +template +struct SearchAlgorithm : public SearchAlgorithmBase { + using AlgoT = typename SearchAlgorithmBase::AlgoT; + + template + static SearchResult Find(const phi::GPUContext& ctx, + const ConvArgs& args, + bool exhaustive_search, + bool deterministic, + bool enable_autotune = true) { + SearchResult result; + bool use_autotune = false; + auto dtype = platform::CudnnDataType::type; + SetConvMathType(ctx, dtype, args.cdesc); + + if (deterministic) { + result = SearchAlgorithmBase::FindAlgoDeterministic(args); + } else { + // 1. Once turning on exhaustive FLAGS, always get exhaustive_search. + // 2. Once turning on auto-tune, run heuristic (default) before + // auto-tune process, run exhaustive_search during mentioned process. + // Auto tune is only enabled between specified range. + // 3. After auto-tune process, run cached algorithm if cached, run + // default mode for the rest. + auto key = args.Convert2ConvCacheKey(); + auto& cache = phi::autotune::AutoTuneCache::Instance().GetConv( + SearchAlgorithmBase::kAlgoType); + bool find_in_cache = cache.Find(key); + if (find_in_cache) { + auto t = cache.Get(key); + result.algo = static_cast(t.algo); + result.workspace_size = t.workspace_size; + result.exhaustive_search = t.exhaustive_search; + } + if (!result.exhaustive_search) { + bool need_update_cache = false; + // In conv2d_tranpose, enable_autotune is set to false because some + // algorithm picked by exhaustive search method produce wrong result. + use_autotune = enable_autotune && + phi::autotune::AutoTuneStatus::Instance().UseAutoTune(); + if (exhaustive_search || use_autotune) { + // Once autotune is enabled, the autotuned result can rewrite the + // previous result in cache found by heuristic method. 
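          // A hedged illustration of the expected cache life cycle for one
          // ConvArgs key, assuming the exhaustive-search flag is off and
          // auto-tune runs only during a warm-up step range:
          //   before tuning : cache miss -> FindAlgoHeuristic(), entry cached
          //                   with exhaustive_search == false
          //   during tuning : cache hit, but the cached flag is false and
          //                   use_autotune is true -> FindAlgoExhaustiveSearch()
          //                   overwrites the heuristic entry with
          //                   exhaustive_search == true
          //   after tuning  : cache hit with exhaustive_search == true -> the
          //                   cached algorithm is reused without re-searching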
+ result = + SearchAlgorithmBase::template FindAlgoExhaustiveSearch( + args, ctx); + need_update_cache = true; + } else if (!find_in_cache) { + result = SearchAlgorithmBase::FindAlgoHeuristic(args, ctx); + need_update_cache = true; + } + if (need_update_cache) { + phi::autotune::ConvAutoTuneResult node( + static_cast(result.algo), + result.workspace_size, + exhaustive_search || use_autotune); + cache.Set(key, node); + } + } + } + VLOG(3) << "[cuDNN " << SearchAlgorithmBase::GetPerfName() + << "] exhaustive_search=" << exhaustive_search + << ", use_autotune=" << use_autotune + << ", deterministic=" << deterministic + << ", choose algo=" << result.algo + << ", workspace=" << ToMegaBytes(result.workspace_size) << " MB"; + return result; + } + + static void SetConvMathType(const phi::GPUContext& ctx, + cudnnDataType_t dtype, + const platform::ConvolutionDescriptor& cdesc) { +#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1) + if (ctx.GetComputeCapability() >= 70 && dtype == CUDNN_DATA_HALF) { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_TENSOR_OP_MATH)); + VLOG(5) << "Enable Tensor Core for FLOAT16"; +#if CUDA_VERSION >= 11000 +#if CUDNN_VERSION_MIN(8, 1, 0) + } else if (ctx.GetComputeCapability() >= 80 && + dtype == CUDNN_DATA_BFLOAT16) { + VLOG(5) << "Enable Tensor Core for BFLOAT16"; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_TENSOR_OP_MATH)); +#endif // CUDNN_VERSION_MIN(8, 1, 0) + } else if (dtype == CUDNN_DATA_FLOAT && !cdesc.allow_tf32_) { + VLOG(5) << "Disable TensorFloat (Tensor Core) for FLOAT"; + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_FMA_MATH)); +#endif // CUDA_VERSION >= 11000 + } else { + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSetConvolutionMathType( + cdesc.desc(), CUDNN_DEFAULT_MATH)); + } +#endif + } +}; + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/conv_miopen_helper.h b/paddle/fluid/operators/conv_miopen_helper.h index 1e78dcb6b731ef..648116647b04a7 100644 --- a/paddle/fluid/operators/conv_miopen_helper.h +++ b/paddle/fluid/operators/conv_miopen_helper.h @@ -14,6 +14,7 @@ limitations under the License. 
*/ #pragma once +#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/operators/conv_base_helper.h" namespace paddle { @@ -55,6 +56,9 @@ static void RemovePaddingSlice(const phi::GPUContext& context, out_t.device(place) = in_t.slice(offsets, extents); } +template +struct SearchAlgorithm {}; + template <> struct SearchAlgorithm { using perf_t = miopenConvAlgoPerf_t; diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index 0d98f5b75e4fb9..a02bf699b32dd5 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -142,7 +142,7 @@ class CrossEntropyGradientOpBase : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const { + void InferShape(framework::InferShapeContext* ctx) const override { OP_INOUT_CHECK( ctx->HasInput("Label"), "Input", "Label", "CrossEntropyGradientOpBase"); OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Y")), diff --git a/paddle/fluid/operators/cum_op.cc b/paddle/fluid/operators/cum_op.cc index 09d3f1dbe74093..29bc83bd9ae518 100644 --- a/paddle/fluid/operators/cum_op.cc +++ b/paddle/fluid/operators/cum_op.cc @@ -72,13 +72,9 @@ class CumsumGradMaker : public framework::SingleGradOpMaker { grad_op->SetType("cumsum"); grad_op->SetInput("X", this->OutputGrad("Out")); grad_op->SetOutput("Out", this->InputGrad("X")); - grad_op->SetAttr("axis", PADDLE_GET_CONST(int, this->GetAttr("axis"))); - grad_op->SetAttr("flatten", - PADDLE_GET_CONST(bool, this->GetAttr("flatten"))); + grad_op->SetAttrMap(this->Attrs()); grad_op->SetAttr("reverse", !PADDLE_GET_CONST(bool, this->GetAttr("reverse"))); - grad_op->SetAttr("exclusive", - PADDLE_GET_CONST(bool, this->GetAttr("exclusive"))); } }; diff --git a/paddle/fluid/operators/custom_device_common_op_registry.cc b/paddle/fluid/operators/custom_device_common_op_registry.cc new file mode 100644 index 00000000000000..704d85acf13621 --- /dev/null +++ b/paddle/fluid/operators/custom_device_common_op_registry.cc @@ -0,0 +1,60 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/custom_device_common_op_registry.h" +#include "paddle/fluid/operators/run_program_op.h" +#include "paddle/fluid/operators/save_combine_op.h" +#include "paddle/phi/backends/device_manager.h" + +#define REGISTER_OP_CUSTOM_DEVICE_KERNEL(op_type, dev_type, ...) 
\ + static paddle::framework::OpKernelRegistrar \ + __op_custom_device_kernel_registrar_##op_type##_##__acosf##__( \ + #op_type, \ + dev_type, \ + paddle::framework::OpKernelType::kDefaultCustomizedTypeValue); \ + __op_custom_device_kernel_registrar_##op_type##_##__acosf##__.Touch(); + +namespace paddle { +namespace operators { + +void RegisterCustomDeviceCommonKernel(const std::string& dev_type) { + auto device_type = dev_type.c_str(); + /* see [Why use single type kernel] */ + REGISTER_OP_CUSTOM_DEVICE_KERNEL( + run_program, + device_type, + paddle::operators:: + RunProgramOpKernel); + REGISTER_OP_CUSTOM_DEVICE_KERNEL( + run_program_grad, + device_type, + paddle::operators :: + RunProgramGradOpKernel); + REGISTER_OP_CUSTOM_DEVICE_KERNEL( + save_combine, + device_type, + paddle::operators :: + SaveCombineOpKernel, + paddle::operators :: + SaveCombineOpKernel, + paddle::operators :: + SaveCombineOpKernel, + paddle::operators :: + SaveCombineOpKernel); +} + +} // namespace operators +} // namespace paddle + +#undef REGISTER_OP_CUSTOM_DEVICE_KERNEL diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.h b/paddle/fluid/operators/custom_device_common_op_registry.h similarity index 66% rename from paddle/phi/api/lib/sparse_api_custom_impl.h rename to paddle/fluid/operators/custom_device_common_op_registry.h index 6053d281f0ff16..421c745c536845 100644 --- a/paddle/phi/api/lib/sparse_api_custom_impl.h +++ b/paddle/fluid/operators/custom_device_common_op_registry.h @@ -14,19 +14,16 @@ limitations under the License. */ #pragma once -#include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/common/backend.h" +#ifdef PADDLE_WITH_CUSTOM_DEVICE -namespace paddle { -namespace experimental { -namespace sparse { - -Tensor to_dense_impl(const Tensor& x); +#include -Tensor to_sparse_coo_impl(const Tensor& x, const int64_t sparse_dim); +namespace paddle { +namespace operators { -Tensor to_sparse_csr_impl(const Tensor& x); +void RegisterCustomDeviceCommonKernel(const std::string& device_type); -} // namespace sparse -} // namespace experimental +} // namespace operators } // namespace paddle + +#endif diff --git a/paddle/fluid/operators/dequantize_abs_max_op.cc b/paddle/fluid/operators/dequantize_abs_max_op.cc index 64807329a4043a..ff4bb5f53341bd 100644 --- a/paddle/fluid/operators/dequantize_abs_max_op.cc +++ b/paddle/fluid/operators/dequantize_abs_max_op.cc @@ -69,7 +69,7 @@ class DequantizeMaxAbsOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { + const framework::ExecutionContext& ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); auto type = framework::OpKernelType(data_type, ctx.device_context()); return type; diff --git a/paddle/fluid/operators/dequantize_log_op.cc b/paddle/fluid/operators/dequantize_log_op.cc index c80c050b14afd7..b3c1770493c9c6 100644 --- a/paddle/fluid/operators/dequantize_log_op.cc +++ b/paddle/fluid/operators/dequantize_log_op.cc @@ -76,7 +76,7 @@ class DequantizeLogOp : public framework::OperatorWithKernel { } framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { + const framework::ExecutionContext& ctx) const override { auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); auto type = framework::OpKernelType(data_type, ctx.device_context()); return type; diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 
d965e1ace5fc31..81860c60492039 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -42,19 +42,23 @@ if(WITH_XPU) detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_xpu.cc) detection_library(prior_box_op SRCS prior_box_op.cc) + detection_library(yolo_box_op SRCS yolo_box_op.cc) detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc) elseif(WITH_MLU) detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_mlu.cc) - detection_library(prior_box_op SRCS prior_box_op.cc) + detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_mlu.cc) + detection_library(yolo_box_op SRCS yolo_box_op.cc yolo_box_op_mlu.cc) elseif(WITH_ASCEND_CL) detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op_npu.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op_npu.cc) + detection_library(yolo_box_op SRCS yolo_box_op.cc) else() detection_library(iou_similarity_op SRCS iou_similarity_op.cc iou_similarity_op.cu) detection_library(prior_box_op SRCS prior_box_op.cc) + detection_library(yolo_box_op SRCS yolo_box_op.cc) # detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc) endif() @@ -73,7 +77,6 @@ detection_library(locality_aware_nms_op SRCS locality_aware_nms_op.cc DEPS gpc) detection_library(matrix_nms_op SRCS matrix_nms_op.cc DEPS gpc) detection_library(box_clip_op SRCS box_clip_op.cc box_clip_op.cu) detection_library(yolov3_loss_op SRCS yolov3_loss_op.cc) -detection_library(yolo_box_op SRCS yolo_box_op.cc) detection_library(box_decoder_and_assign_op SRCS box_decoder_and_assign_op.cc box_decoder_and_assign_op.cu) detection_library(sigmoid_focal_loss_op SRCS sigmoid_focal_loss_op.cc diff --git a/paddle/fluid/operators/detection/prior_box_op_mlu.cc b/paddle/fluid/operators/detection/prior_box_op_mlu.cc new file mode 100644 index 00000000000000..04402f6ae200a1 --- /dev/null +++ b/paddle/fluid/operators/detection/prior_box_op_mlu.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/operators/detection/prior_box_op.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +template +class PriorBoxMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* variances = ctx.Output("Variances"); + float step_w = ctx.Attr("step_w"); + float step_h = ctx.Attr("step_h"); + float offset = ctx.Attr("offset"); + bool clip = ctx.Attr("clip"); + bool min_max_aspect_ratios_order = + ctx.Attr("min_max_aspect_ratios_order"); + + int im_width = image->dims()[3]; + int im_height = image->dims()[2]; + int width = input->dims()[3]; + int height = input->dims()[2]; + + auto aspect_ratios = ctx.Attr>("aspect_ratios"); + bool flip = ctx.Attr("flip"); + std::vector new_aspect_ratios; + ExpandAspectRatios(aspect_ratios, flip, &new_aspect_ratios); + auto& dev_ctx = ctx.template device_context(); + phi::DenseTensor ratios; + paddle::framework::TensorFromVector(new_aspect_ratios, dev_ctx, &ratios); + MLUOpTensorDesc new_aspect_ratios_desc(ratios); + + auto min_sizes = ctx.Attr>("min_sizes"); + phi::DenseTensor min; + paddle::framework::TensorFromVector(min_sizes, dev_ctx, &min); + MLUOpTensorDesc min_sizes_desc(min); + + auto max_sizes = ctx.Attr>("max_sizes"); + phi::DenseTensor max; + paddle::framework::TensorFromVector(max_sizes, dev_ctx, &max); + MLUOpTensorDesc max_sizes_desc(max); + + auto variances_attr = ctx.Attr>("variances"); + phi::DenseTensor var_tensor; + paddle::framework::TensorFromVector(variances_attr, dev_ctx, &var_tensor); + MLUOpTensorDesc variances_attr_desc(var_tensor); + + auto place = ctx.GetPlace(); + + boxes->mutable_data(place); + variances->mutable_data(place); + + MLUOpTensorDesc var_desc(*variances); + MLUOpTensorDesc output_desc(*boxes); + MLUOP::OpPriorBox(ctx, + min_sizes_desc.get(), + GetBasePtr(&min), + new_aspect_ratios_desc.get(), + GetBasePtr(&ratios), + variances_attr_desc.get(), + GetBasePtr(&var_tensor), + max_sizes_desc.get(), + GetBasePtr(&max), + height, + width, + im_height, + im_width, + step_h, + step_w, + offset, + clip, + min_max_aspect_ratios_order, + output_desc.get(), + GetBasePtr(boxes), + var_desc.get(), + GetBasePtr(variances)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(prior_box, ops::PriorBoxMLUKernel); diff --git a/paddle/fluid/operators/detection/yolo_box_op_mlu.cc b/paddle/fluid/operators/detection/yolo_box_op_mlu.cc new file mode 100644 index 00000000000000..739c05805d68a2 --- /dev/null +++ b/paddle/fluid/operators/detection/yolo_box_op_mlu.cc @@ -0,0 +1,137 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { +template +class YoloBoxMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* img_size = ctx.Input("ImgSize"); + auto* boxes = ctx.Output("Boxes"); + auto* scores = ctx.Output("Scores"); + const std::vector anchors = ctx.Attr>("anchors"); + auto class_num = ctx.Attr("class_num"); + auto conf_thresh = ctx.Attr("conf_thresh"); + auto downsample_ratio = ctx.Attr("downsample_ratio"); + auto clip_bbox = ctx.Attr("clip_bbox"); + auto scale = ctx.Attr("scale_x_y"); + auto iou_aware = ctx.Attr("iou_aware"); + auto iou_aware_factor = ctx.Attr("iou_aware_factor"); + + int anchor_num = anchors.size() / 2; + int64_t size = anchors.size(); + auto dim_x = x->dims(); + int n = dim_x[0]; + int s = anchor_num; + int h = dim_x[2]; + int w = dim_x[3]; + + // The output of mluOpYoloBox: A 4-D tensor with shape [N, anchor_num, 4, + // H*W], the coordinates of boxes, and a 4-D tensor with shape [N, + // anchor_num, :attr:`class_num`, H*W], the classification scores of boxes. + std::vector boxes_dim_mluops({n, s, 4, h * w}); + std::vector scores_dim_mluops({n, s, class_num, h * w}); + + // In Paddle framework: A 3-D tensor with shape [N, M, 4], the coordinates + // of boxes, and a 3-D tensor with shape [N, M, :attr:`class_num`], the + // classification scores of boxes. + std::vector boxes_out_dim({n, s, h * w, 4}); + std::vector scores_out_dim({n, s, h * w, class_num}); + + auto& dev_ctx = ctx.template device_context(); + phi::DenseTensor boxes_tensor_mluops = + ctx.AllocateTmpTensor({n, s, 4, h * w}, dev_ctx); + phi::DenseTensor scores_tensor_mluops = + ctx.AllocateTmpTensor({n, s, class_num, h * w}, + dev_ctx); + MLUOpTensorDesc boxes_trans_desc_mluops( + 4, boxes_dim_mluops.data(), ToMluOpDataType()); + MLUCnnlTensorDesc boxes_trans_desc_cnnl( + 4, boxes_dim_mluops.data(), ToCnnlDataType()); + MLUOpTensorDesc scores_trans_desc_mluops( + 4, scores_dim_mluops.data(), ToMluOpDataType()); + MLUCnnlTensorDesc scores_trans_desc_cnnl( + 4, scores_dim_mluops.data(), ToCnnlDataType()); + + boxes->mutable_data(ctx.GetPlace()); + scores->mutable_data(ctx.GetPlace()); + FillMLUTensorWithHostValue(ctx, static_cast(0), boxes); + FillMLUTensorWithHostValue(ctx, static_cast(0), scores); + + MLUOpTensorDesc x_desc(*x, MLUOP_LAYOUT_ARRAY, ToMluOpDataType()); + MLUOpTensorDesc img_size_desc( + *img_size, MLUOP_LAYOUT_ARRAY, ToMluOpDataType()); + Tensor anchors_temp(framework::TransToPhiDataType(VT::INT32)); + anchors_temp.Resize({size}); + paddle::framework::TensorFromVector( + anchors, ctx.device_context(), &anchors_temp); + MLUOpTensorDesc anchors_desc(anchors_temp); + MLUCnnlTensorDesc boxes_desc_cnnl( + 4, boxes_out_dim.data(), ToCnnlDataType()); + MLUCnnlTensorDesc scores_desc_cnnl( + 4, scores_out_dim.data(), ToCnnlDataType()); + + MLUOP::OpYoloBox(ctx, + x_desc.get(), + GetBasePtr(x), + img_size_desc.get(), + GetBasePtr(img_size), + anchors_desc.get(), + GetBasePtr(&anchors_temp), + class_num, + conf_thresh, + downsample_ratio, + clip_bbox, + scale, + iou_aware, + iou_aware_factor, + boxes_trans_desc_mluops.get(), + GetBasePtr(&boxes_tensor_mluops), + scores_trans_desc_mluops.get(), + GetBasePtr(&scores_tensor_mluops)); + const std::vector perm = {0, 1, 3, 2}; + + // 
transpose the boxes from [N, S, 4, H*W] to [N, S, H*W, 4] + MLUCnnl::Transpose(ctx, + perm, + 4, + boxes_trans_desc_cnnl.get(), + GetBasePtr(&boxes_tensor_mluops), + boxes_desc_cnnl.get(), + GetBasePtr(boxes)); + + // transpose the scores from [N, S, class_num, H*W] to [N, S, H*W, + // class_num] + MLUCnnl::Transpose(ctx, + perm, + 4, + scores_trans_desc_cnnl.get(), + GetBasePtr(&scores_tensor_mluops), + scores_desc_cnnl.get(), + GetBasePtr(scores)); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(yolo_box, ops::YoloBoxMLUKernel); diff --git a/paddle/fluid/operators/dropout_op_mlu.cc b/paddle/fluid/operators/dropout_op_mlu.cc index 923e6cc5ed9942..142e047e6c2b11 100644 --- a/paddle/fluid/operators/dropout_op_mlu.cc +++ b/paddle/fluid/operators/dropout_op_mlu.cc @@ -39,8 +39,17 @@ class DropoutMLUKernel : public framework::OpKernel { MLUCnnlTensorDesc x_desc(*x); MLUCnnlTensorDesc out_desc(*out); - if (!is_test) { - // exec dropout op for training only. + if (is_test && is_upscale) { + // dropout op for inference: out = input. + framework::TensorCopy( + *x, + ctx.GetPlace(), + ctx.template device_context(), + out); + return; + } else if (!is_test) { + // dropout op for training: out = input * mask / ( 1.0 - dropout_prob ) or + // out = input * mask. int seed_data = 0; if (seed_tensor) { if (platform::is_mlu_place(seed_tensor->place())) { @@ -79,50 +88,44 @@ class DropoutMLUKernel : public framework::OpKernel { const int device_id = ctx.GetPlace().GetDeviceId(); auto mlu_gen_random = GetMLURandomGenerator(ctx, device_id, seed_data); - const float prob = is_upscale ? dropout_prob : 0.0f; + // compute out = input * mask / ( 1.0 - dropout_prob ) MLUCnnl::FusedDropout(ctx, mlu_gen_random->get(), x_desc.get(), GetBasePtr(x), - prob, + dropout_prob, GetBasePtr(&(mlu_gen_random->get_state())), mask_desc.get(), GetBasePtr(mask), out_desc.get(), GetBasePtr(out)); - } else { - // exec dropout op for inference only. + if (is_upscale) { - framework::TensorCopy( - *x, - ctx.GetPlace(), - ctx.template device_context(), - out); - } else { - auto scale = static_cast(1.0f - dropout_prob); - Tensor scale_tensor(x->dtype()); - scale_tensor.mutable_data({1}, ctx.GetPlace()); - MLUCnnlTensorDesc scale_desc(scale_tensor); - MLUCnnl::Fill(ctx, - CNNL_POINTER_MODE_HOST, - &scale, - scale_desc.get(), - GetBasePtr(&scale_tensor)); - - auto data_type = ToCnnlDataType(); - MLUCnnlOpTensorDesc op_tensor_desc( - CNNL_OP_TENSOR_MUL, data_type, CNNL_NOT_PROPAGATE_NAN); - MLUCnnl::OpTensor(ctx, - op_tensor_desc.get(), - x_desc.get(), - GetBasePtr(x), - scale_desc.get(), - GetBasePtr(&scale_tensor), - out_desc.get(), - GetBasePtr(out), - data_type); + return; } } + + // In downgrade_in_infer mode, need to multiply (1.0f - dropout_prob). + Tensor scale_tensor(x->dtype()); + Tensor bias_tensor(x->dtype()); + scale_tensor.mutable_data({1}, ctx.GetPlace()); + bias_tensor.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc scale_desc(scale_tensor); + MLUCnnlTensorDesc bias_desc(bias_tensor); + FillMLUTensorWithHostValue( + ctx, static_cast(1.0f - dropout_prob), &scale_tensor); + FillMLUTensorWithHostValue(ctx, static_cast(0.0f), &bias_tensor); + + MLUCnnl::Scale(ctx, + 0, + is_test ? x_desc.get() : out_desc.get(), + is_test ? 
GetBasePtr(x) : GetBasePtr(out), + scale_desc.get(), + GetBasePtr(&scale_tensor), + bias_desc.get(), + GetBasePtr(&bias_tensor), + out_desc.get(), + GetBasePtr(out)); } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 610e5932b1c36d..e722d5f7e6e992 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -216,47 +216,12 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker { AddInputX(); AddInputY(); AddOpOutput(); - AddAttr("axis", "(int, default -1). If X.dimension != Y.dimension," "Y.dimension must be a subsequence of x.dimension. And axis " "is the start dimension index " "for broadcasting Y onto X. ") .SetDefault(-1); - AddAttr("use_mkldnn", "(bool, default false). Used by MKLDNN.") - .SetDefault(false) - .AsExtra(); - AddAttr("x_data_format", "This parameter is no longer used.") - .SetDefault("") - .AsExtra(); - AddAttr("y_data_format", "This parameter is no longer used.") - .SetDefault("") - .AsExtra(); - AddAttr( - "use_quantizer", - "(bool, default false) " - "This parameter is no longer used. Use 'mkldnn_data_type' instead.") - .SetDefault(false) - .AsExtra(); - AddAttr( - "mkldnn_data_type", - "(string, default \"float32\"). Data type of mkldnn kernel") - .SetDefault("float32") - .InEnum({"float32", "int8", "bfloat16"}) - .AsExtra(); - /* int8 parameters */ - AddAttr("Scale_x", - "(float, default 1.0f), The quantize scale of X tensor") - .SetDefault(1.0f) - .AsExtra(); - AddAttr("Scale_y", - "(float, default 1.0f), The quantize scale of Y tensor") - .SetDefault(1.0f) - .AsExtra(); - AddAttr("Scale_out", - "(float, default 1.0f), The quantize scale of output data") - .SetDefault(1.0f) - .AsExtra(); AddOpComment(); } diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h index 42d749b7b8e3e4..31372dc323f45c 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h @@ -79,6 +79,7 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { scale_x, scale_y, scale_o, + true, get_post_ops(ctx)); // oneDNN's binary is optimized for broadcasting y into x, so in other case @@ -127,7 +128,16 @@ class EltwiseMKLDNNKernel : public framework::OpKernel { binary_prim->execute(astream, args); astream.wait(); - z->set_mem_desc(dst_memory->get_desc()); + if (handler.use_broadcasting_hack == false) { + platform::SetOutMemDescWithLogicalLayoutFusesSupport( + ctx, z, dst_memory->get_desc()); + } else { + auto dims = dst_memory->get_desc().dims(); + dims.insert(dims.begin(), x->dims()[0]); + dims[1] /= dims[0]; + platform::SetOutMemDescWithLogicalLayoutFusesSupport( + ctx, z, dst_memory->get_desc().reshape(dims)); + } } }; @@ -149,12 +159,20 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); auto* dout = ctx.Input(framework::GradVarName("Out")); + VLOG(4) << "element sub: dx " << dx << " dy " << dy << " dout " << dout; // oneDNN's binary is optimized for broadcasting y into x, so in other case // we have to swap tensors to achieve optimal performance + bool swap_x_y = false; if (x->numel() < y->numel()) { std::swap(x, y); std::swap(dx, dy); + swap_x_y = true; + } + + std::vector scales{1.0}; + if (swap_x_y) { + scales[0] = (BINARY_OP == 
dnnl::algorithm::binary_add) ? 1 : -1; } int axis = ctx.Attr("axis"); @@ -172,7 +190,6 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { dout->mem_desc(), platform::to_void_cast(dout->data())); auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - if (dx) { std::shared_ptr dst_memory; @@ -181,8 +198,11 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { BINARY_OP == dnnl::algorithm::binary_sub) { dst_memory = reorder_handler.AcquireDstMemory( dx, dout->mem_desc(), ctx.GetPlace()); - auto reorder_p = - reorder_handler.AcquireReorder(dst_memory, reorder_src_memory_p); + + dnnl::primitive_attr reorder_attr; + reorder_attr.set_output_scales(0, scales); + auto reorder_p = reorder_handler.AcquireReorder( + dst_memory, reorder_src_memory_p, reorder_attr); platform::RecordEvent record_reorder( "int_reorder", platform::TracerEventType::UserDefined, @@ -190,6 +210,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { platform::EventRole::kUniqueOp); reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory); + } else { // elementwise_mul & elementwise_div platform::BinaryMKLDNNHandler binary_handler(BINARY_OP, axis, @@ -200,7 +221,8 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { dx, 1.0f, 1.0f, - 1.0f); + 1.0f, + false); const auto src_dout_memory = binary_handler.AcquireSrcMemory(dout); const auto src_y_memory = binary_handler.AcquireSecondSrcMemory(y); @@ -233,11 +255,10 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { dy, dout->mem_desc(), ctx.GetPlace()); dnnl::primitive_attr reorder_attr; - std::vector scales(1); - scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 1 : -1; reorder_attr.set_output_scales(0, scales); - auto reorder_p = std::make_shared( - *(reorder_src_memory_p), *(reorder_dst_memory_p), reorder_attr); + + auto reorder_p = reorder_handler.AcquireReorder( + reorder_dst_memory_p, reorder_src_memory_p, reorder_attr); platform::RecordEvent record_reorder( "int_reorder", platform::TracerEventType::UserDefined, @@ -267,7 +288,8 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { nullptr, 1.0f, 1.0f, - 1.0f); + 1.0f, + false); src_1_memory = binary_handler.AcquireSecondSrcMemory(x); @@ -282,7 +304,8 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { nullptr, 1.0f, 1.0f, - 1.0f); + 1.0f, + false); post_op_memory = post_op_binary_handler.AcquireSrcMemory(y); @@ -301,6 +324,7 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { -1.0f, 1.0f, 1.0f, + false, po); src_1_memory = binary_handler.AcquireSecondSrcMemory(out); @@ -331,7 +355,8 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel { // Broadcasting if (BINARY_OP == dnnl::algorithm::binary_sub) { dnnl::post_ops po; - po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, -1.0f, 0); + po.append_eltwise( + 1.0f, dnnl::algorithm::eltwise_linear, scales[0], 0); broadcast_reduction_attr.set_post_ops(po); } diff --git a/paddle/fluid/operators/expand_as_v2_op.cc b/paddle/fluid/operators/expand_as_v2_op.cc index 6fcf301897f295..7263192e139247 100644 --- a/paddle/fluid/operators/expand_as_v2_op.cc +++ b/paddle/fluid/operators/expand_as_v2_op.cc @@ -26,6 +26,13 @@ using framework::Tensor; class ExpandAsV2Op : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), + 
ctx.device_context()); + } }; class ExpandAsV2OpMaker : public framework::OpProtoAndCheckerMaker { diff --git a/paddle/fluid/operators/expand_v2_op.cc b/paddle/fluid/operators/expand_v2_op.cc index d548023bfbf2ec..fd92a43318c583 100644 --- a/paddle/fluid/operators/expand_v2_op.cc +++ b/paddle/fluid/operators/expand_v2_op.cc @@ -88,16 +88,6 @@ class ExpandV2OpMaker : public framework::OpProtoAndCheckerMaker { "the corresponding value given by Attr(expand_times)."); AddAttr>("shape", "The expanded shape for each dimension.") .SetDefault({}); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); - AddAttr( - "mkldnn_data_type", - "(string, default \"float32\"). Data type of mkldnn kernel") - .SetDefault("float32") - .InEnum({"float32", "bfloat16"}) - .AsExtra(); AddComment(R"DOC( Expand the input to the given shape. The rank of X should be in [1, 6] and size of 'shape' must be in [1, 6] also. diff --git a/paddle/fluid/operators/fake_quantize_op.cc b/paddle/fluid/operators/fake_quantize_op.cc index cb8263714a5e47..bf1f0103f768bb 100644 --- a/paddle/fluid/operators/fake_quantize_op.cc +++ b/paddle/fluid/operators/fake_quantize_op.cc @@ -432,24 +432,6 @@ class FakeQuantOrWithDequantAbsMaxOpMaker "the received is %d", bit_length)); }); - AddAttr( - "round_type", - "(int, default 1) The round type of fp32 to int." - "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2" - "1: rounding to nearest ties away from zero. Eg: round(1.5)=2, " - "round(2.5)=3") - .SetDefault(1) - .AddCustomChecker([](const int &round_type) { - PADDLE_ENFORCE_EQ( - round_type == 0 || round_type == 1, - true, - platform::errors::InvalidArgument( - "'round_type' should be 0 or 1, 0 rounding to " - "nearest ties to even and 1 is rounding to nearest " - "ties away from zero.but the received is %d", - round_type)); - }) - .AsExtra(); AddComment(R"DOC( This is a Base Op which supports FakeQuantAbsMaxOpMaker and FakeQuantDequantAbsMaxOpMaker. FakeQuantAbsMaxOp operator is used in the dynamic quantization. @@ -529,24 +511,6 @@ class FakeChannelWiseQuantizeAbsMaxOpMaker "the received is %d", bit_length)); }); - AddAttr( - "round_type", - "(int, default 1) The round type of fp32 to int." - "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2" - "1: rounding to nearest ties away from zero. Eg: round(1.5)=2, " - "round(2.5)=3") - .SetDefault(1) - .AddCustomChecker([](const int &round_type) { - PADDLE_ENFORCE_EQ( - round_type == 0 || round_type == 1, - true, - platform::errors::InvalidArgument( - "'round_type' should be 0 or 1, 0 rounding to " - "nearest ties to even and 1 is rounding to nearest " - "ties away from zero.but the received is %d", - round_type)); - }) - .AsExtra(); AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") @@ -628,24 +592,6 @@ class FakeChannelWiseQuantizeDequantizeAbsMaxOpMaker "the received is %d", bit_length)); }); - AddAttr( - "round_type", - "(int, default 1) The round type of fp32 to int." - "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2" - "1: rounding to nearest ties away from zero. 
Eg: round(1.5)=2, " - "round(2.5)=3") - .SetDefault(1) - .AddCustomChecker([](const int &round_type) { - PADDLE_ENFORCE_EQ( - round_type == 0 || round_type == 1, - true, - platform::errors::InvalidArgument( - "'round_type' should be 0 or 1, 0 rounding to " - "nearest ties to even and 1 is rounding to nearest " - "ties away from zero.but the received is %d", - round_type)); - }) - .AsExtra(); AddComment(R"DOC( The scale of FakeChannelWiseQuantize operator is a vector. In detail, each channel of the input X has a scale value. @@ -715,24 +661,6 @@ class FakeQuantizeRangeAbsMaxOpMaker "the received is %d", bit_length)); }); - AddAttr( - "round_type", - "(int, default 1) The round type of fp32 to int." - "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2" - "1: rounding to nearest ties away from zero. Eg: round(1.5)=2, " - "round(2.5)=3") - .SetDefault(1) - .AddCustomChecker([](const int &round_type) { - PADDLE_ENFORCE_EQ( - round_type == 0 || round_type == 1, - true, - platform::errors::InvalidArgument( - "'round_type' should be 0 or 1, 0 rounding to " - "nearest ties to even and 1 is rounding to nearest " - "ties away from zero.but the received is %d", - round_type)); - }) - .AsExtra(); AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") @@ -815,24 +743,6 @@ class FakeQuantOrWithDequantMovingAverageAbsMaxOpMaker "the received is %d", bit_length)); }); - AddAttr( - "round_type", - "(int, default 1) The round type of fp32 to int." - "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2" - "1: rounding to nearest ties away from zero. Eg: round(1.5)=2, " - "round(2.5)=3") - .SetDefault(1) - .AddCustomChecker([](const int &round_type) { - PADDLE_ENFORCE_EQ( - round_type == 0 || round_type == 1, - true, - platform::errors::InvalidArgument( - "'round_type' should be 0 or 1, 0 rounding to " - "nearest ties to even and 1 is rounding to nearest " - "ties away from zero.but the received is %d", - round_type)); - }) - .AsExtra(); AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") diff --git a/paddle/fluid/operators/fake_quantize_op.cu.h b/paddle/fluid/operators/fake_quantize_op.cu.h index 22ba8254cdc2c2..9c71cce770f0e6 100644 --- a/paddle/fluid/operators/fake_quantize_op.cu.h +++ b/paddle/fluid/operators/fake_quantize_op.cu.h @@ -590,20 +590,16 @@ __global__ void ChannelClipAndQuantDequantKernelQuantAxis0(const T *in, const T *scale, const int bin_cnt, const int round_type, - const int n, - const int c, + const int wh_size, + const int num, + const int cout, T *out) { - int tid = threadIdx.x; - - int channel_size = n / c; - const T *in_c = in + blockIdx.x * channel_size; - T *out_c = out + blockIdx.x * channel_size; - - T s = scale[blockIdx.x]; - T inv_s = inverse(s); + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = tid; i < channel_size; i += blockDim.x) { - T x = in_c[i]; + for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) { + T s = scale[(i / wh_size) % cout]; + T inv_s = inverse(s); + T x = in[i]; if (round_type == 0) { x = bin_cnt * inv_s * x; x = roundWithTiesToEven(x); @@ -611,12 +607,12 @@ __global__ void ChannelClipAndQuantDequantKernelQuantAxis0(const T *in, T min_bound = -bin_cnt - static_cast(1); x = x > max_bound ? max_bound : x; x = x < min_bound ? 
min_bound : x; - out_c[i] = (x * s) / bin_cnt; + out[i] = (x * s) / bin_cnt; } else { T v = x > s ? s : x; v = v < -s ? -s : v; v = bin_cnt * inv_s * v; - out_c[i] = round(v) * s / bin_cnt; + out[i] = round(v) * s / bin_cnt; } } } @@ -627,19 +623,16 @@ __global__ void ChannelClipAndQuantDequantKernelQuantAxis1(const T *in, const T *scale, const int bin_cnt, const int round_type, - const int n, - const int cin, + const int wh_size, + const int num, const int cout, T *out) { - T s = scale[blockIdx.x % cout]; - T inv_s = inverse(s); - - int wh_size = n / (cin * cout); - const T *in_c = in + blockIdx.x * wh_size; - T *out_c = out + blockIdx.x * wh_size; + int64_t idx = blockDim.x * blockIdx.x + threadIdx.x; - for (int i = threadIdx.x; i < wh_size; i += blockDim.x) { - T x = in_c[i]; + for (int64_t i = idx; i < num; i += blockDim.x * gridDim.x) { + T s = scale[(i / wh_size) % cout]; + T inv_s = inverse(s); + T x = in[i]; if (round_type == 0) { x = bin_cnt * inv_s * x; x = roundWithTiesToEven(x); @@ -647,12 +640,12 @@ __global__ void ChannelClipAndQuantDequantKernelQuantAxis1(const T *in, T min_bound = -bin_cnt - static_cast(1); x = x > max_bound ? max_bound : x; x = x < min_bound ? min_bound : x; - out_c[i] = (x * s) / bin_cnt; + out[i] = (x * s) / bin_cnt; } else { T v = x > s ? s : x; v = v < -s ? -s : v; v = bin_cnt * inv_s * v; - out_c[i] = round(v) * s / bin_cnt; + out[i] = round(v) * s / bin_cnt; } } } @@ -682,30 +675,39 @@ struct ChannelClipFakeQuantDequantFunctor { const T *scale_data = scale.data(); T *out_data = out->mutable_data(ctx.GetPlace()); + int64_t block_size = + std::min(static_cast(num), + static_cast(ctx.GetMaxThreadsPerBlock() / 4)); + + int64_t max_threads = ctx.GetMaxPhysicalThreadCount(); // SM * block_per_SM + const int64_t max_blocks = + std::max(((max_threads - 1) / block_size + 1), static_cast(1)); + const int64_t grid_size = + std::min(max_blocks, (num + block_size - 1) / block_size); + if (quant_axis == 0) { - int grid = in_dims[0]; - int block = 1024; + const int window_size = num / in_dims[0]; ChannelClipAndQuantDequantKernelQuantAxis0 - <<>>(in_data, - scale_data, - bin_cnt, - round_type, - num, - in_dims[0], - out_data); + <<>>(in_data, + scale_data, + bin_cnt, + round_type, + window_size, + num, + in_dims[0], + out_data); } else if (quant_axis == 1) { - int grid = in_dims[0] * in_dims[1]; - int block = 1024; + const int window_size = num / (in_dims[0] * in_dims[1]); ChannelClipAndQuantDequantKernelQuantAxis1 - <<>>(in_data, - scale_data, - bin_cnt, - round_type, - num, - in_dims[0], - in_dims[1], - out_data); + <<>>(in_data, + scale_data, + bin_cnt, + round_type, + window_size, + num, + in_dims[1], + out_data); } } }; diff --git a/paddle/fluid/operators/fill_any_like_op.cc b/paddle/fluid/operators/fill_any_like_op.cc index 528ea076a322be..eb66cc88b3145c 100644 --- a/paddle/fluid/operators/fill_any_like_op.cc +++ b/paddle/fluid/operators/fill_any_like_op.cc @@ -58,7 +58,7 @@ class FillAnyLikeOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("Out", "The variable will be filled up with specified value."); AddAttr("value", "The filled value").SetDefault(0.0); AddAttr("dtype", - "Output tensor data type. defalut value is -1," + "Output tensor data type. 
default value is -1," "according to the input dtype.") .SetDefault(-1); AddComment(R"DOC( diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc index 0dd0e1dcecf6bf..28167c4736fa3e 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -174,10 +174,6 @@ class FillConstantOpMaker : public framework::OpProtoAndCheckerMaker { "3: XPUPlace. " "4: NPUPlace. ") .SetDefault(-1); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); AddOutput("Out", "(Tensor) Tensor of specified shape will be filled " "with the specified value"); diff --git a/paddle/fluid/operators/flip_op.cc b/paddle/fluid/operators/flip_op.cc deleted file mode 100644 index 7f00fad6e3d121..00000000000000 --- a/paddle/fluid/operators/flip_op.cc +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include -#include - -#include "paddle/fluid/framework/infershape_utils.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/phi/core/infermeta_utils.h" -#include "paddle/phi/infermeta/unary.h" - -namespace paddle { -namespace operators { - -using framework::OpKernelType; -using framework::Tensor; - -class FlipOp : public framework::OperatorWithKernel { - public: - using framework::OperatorWithKernel::OperatorWithKernel; - - framework::OpKernelType GetExpectedKernelType( - const framework::ExecutionContext& ctx) const { - framework::LibraryType library = framework::LibraryType::kPlain; - framework::DataLayout layout = framework::DataLayout::kAnyLayout; - int customized_type_value = - framework::OpKernelType::kDefaultCustomizedTypeValue; - auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - return framework::OpKernelType(input_data_type, - ctx.GetPlace(), - layout, - library, - customized_type_value); - } -}; - -class FlipOpMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput("X", "(Tensor), The input tensor of flip op."); - AddOutput("Out", "(Tensor), The output tensor of flip op."); - AddAttr>("axis", "The axes to flip on."); - AddComment(R"DOC( - Flip Operator. - Reverse the order of a n-D tensor along given axis in axes. 
- )DOC"); - } -}; - -class FlipOpInferVarType : public framework::PassInDtypeAndVarTypeToOutput { - protected: - std::unordered_map& GetInputOutputWithSameType() - const override { - static std::unordered_map m{{"X", /*->*/ "Out"}}; - return m; - } -}; - -template -class FlipOpGradMaker : public framework::SingleGradOpMaker { - public: - using framework::SingleGradOpMaker::SingleGradOpMaker; - - protected: - void Apply(GradOpPtr retv) const override { - retv->SetType("flip"); - retv->SetInput("X", this->OutputGrad("Out")); - retv->SetOutput("Out", this->InputGrad("X")); - retv->SetAttrMap(this->Attrs()); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -DECLARE_INFER_SHAPE_FUNCTOR(flip, - FlipInferShapeFunctor, - PD_INFER_META(phi::FlipInferMeta)); -REGISTER_OPERATOR(flip, - ops::FlipOp, - ops::FlipOpMaker, - ops::FlipOpInferVarType, - ops::FlipOpGradMaker, - ops::FlipOpGradMaker, - FlipInferShapeFunctor); - -/* ========================== register checkpoint ===========================*/ -REGISTER_OP_VERSION(flip).AddCheckpoint( - R"ROC(Upgrade flip, add new attr [axis] and delete attr [dims].)ROC", - paddle::framework::compatible::OpVersionDesc() - .NewAttr("axis", - "The added attr 'axis' doesn't set default value.", - paddle::none) - .DeleteAttr("dims", "The attr 'dims' is deleted.")); diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 02a3f4d7a0eb6d..9a14d35b59990b 100755 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -23,6 +23,7 @@ register_operators( fused_transformer_op fused_feedforward_op fused_multi_transformer_op + fused_multi_transformer_int8_op fused_bias_dropout_residual_layer_norm_op resnet_unit_op fused_gemm_epilogue_op @@ -36,6 +37,7 @@ op_library(fusion_lstm_op) if(WITH_XPU) op_library(resnet_basic_block_op) op_library(resnet_unit_op) + op_library(fused_gemm_epilogue_op) endif() if(WITH_GPU OR WITH_ROCM) @@ -118,6 +120,7 @@ if(WITH_GPU OR WITH_ROCM) # fused_attention_op op_library(fused_attention_op) op_library(fused_multi_transformer_op) + op_library(fused_multi_transformer_int8_op) op_library(fused_bias_dropout_residual_layer_norm_op) endif() # resnet_unit needs cudnn 8.0 above diff --git a/paddle/fluid/operators/fused/attention_layer_norm.h b/paddle/fluid/operators/fused/attention_layer_norm.h index baed3ca7a1aa23..e54bca8a89368f 100644 --- a/paddle/fluid/operators/fused/attention_layer_norm.h +++ b/paddle/fluid/operators/fused/attention_layer_norm.h @@ -19,7 +19,8 @@ limitations under the License. 
*/ namespace paddle { namespace operators { -template +// NOTE: T must be the same as OutType in ComputeBackward +template class AttnLayerNorm { public: AttnLayerNorm(const phi::GPUContext& dev_ctx, @@ -33,17 +34,28 @@ class AttnLayerNorm { ~AttnLayerNorm() {} - void ComputeForward(const T* x_data, + void ComputeForward(const InType* x_data, const LayerNormParamType* scale_data, const LayerNormParamType* bias_data, - T* y_data, + OutType* y_data, LayerNormParamType* mean_data, - LayerNormParamType* var_data) { + LayerNormParamType* var_data, + const float* dequant_out_scale_data = nullptr, + const int quant_out_scale_offset = 0, + const float quant_in_scale = 1.0, + const int quant_round_type = 1, + const float quant_max_bound = 127.0, + const float quant_min_bound = -127.0) { auto stream = dev_ctx_.stream(); switch (GetDesiredBlockDim(feature_size_)) { FIXED_BLOCK_DIM_CASE( - LayerNormForward, kBlockDim> + LayerNormForward, + kBlockDim, + false, + InType, + OutType> <<>>(x_data, scale_data, bias_data, @@ -51,7 +63,13 @@ class AttnLayerNorm { mean_data, var_data, epsilon_, - feature_size_)); + feature_size_, + dequant_out_scale_data, + quant_out_scale_offset, + quant_in_scale, + quant_round_type, + quant_max_bound, + quant_min_bound)); default: PADDLE_THROW(platform::errors::InvalidArgument( "Feature_size must be larger than 1")); diff --git a/paddle/fluid/operators/fused/attn_gemm_int8.h b/paddle/fluid/operators/fused/attn_gemm_int8.h new file mode 100644 index 00000000000000..ba114df9085fbc --- /dev/null +++ b/paddle/fluid/operators/fused/attn_gemm_int8.h @@ -0,0 +1,189 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include "paddle/fluid/operators/fused/cublaslt.h" +#include "paddle/fluid/operators/fused/quant_dequant_kernel.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class AttnMatmulINT8 { + public: + AttnMatmulINT8( + const phi::GPUContext& dev_ctx, int m, int n, int k, bool compute_bias) + : dev_ctx_(dev_ctx), m_(m), n_(n), k_(k), compute_bias_(compute_bias) { + auto helper = std::make_shared(m, k, n); + helpers_.emplace_back(helper); + } + ~AttnMatmulINT8() {} + + // This function is used to execute GEMM, with input and output's types are + // both T. 
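+  // The T -> T path below runs in three steps that match its body: (1) the T
+  // input is quantized into input_tmp (int8) using quant_in_scale and the
+  // given round type / clipping bounds, (2) the int8 GEMM is executed through
+  // the cuBLASLt helper with int32 accumulation into output_tmp, and (3)
+  // output_tmp is dequantized back to T into output using dequant_out_scale.
+  // When compute_bias_ is set, bias is finally added into bias_out with a
+  // broadcast elementwise add.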
+ void ComputeForward(const framework::Tensor* weight, + const framework::Tensor* input, + framework::Tensor* input_tmp, + const framework::Tensor* bias, + framework::Tensor* output, + framework::Tensor* output_tmp, + framework::Tensor* bias_out, + const float quant_in_scale, + const framework::Tensor* dequant_out_scale, + const int quant_out_scale_offset, + const int quant_round_type = 1, + const float quant_max_bound = 127.0, + const float quant_min_bound = -127.0) { + quantize_kernel_launcher(input->data(), + input_tmp->data(), + quant_in_scale, + m_, + k_, + quant_round_type, + quant_max_bound, + quant_min_bound, + dev_ctx_.stream()); + + helpers_[0]->GEMM(input_tmp->data(), + weight->data(), + output_tmp->data(), + dev_ctx_.stream()); + + dequantize_kernel_launcher(output_tmp->data(), + output->data(), + m_, + n_, + dev_ctx_.stream(), + quant_in_scale, + dequant_out_scale->data(), + quant_out_scale_offset); + + if (compute_bias_) { + // bias_out = output + bias + std::vector ins = {output, bias}; + std::vector outs = {bias_out}; + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); + PADDLE_ENFORCE_EQ(cudaGetLastError(), + cudaSuccess, + platform::errors::Fatal( + "cuda error occured after computing bias. " + "But it does not mean this error is caused by " + "bias computing")); + } + } + + // This function is used to execute GEMM, with input and output's types are + // both INT8. + void ComputeForwardINT8ToINT8(const framework::Tensor* weight, + framework::Tensor* input, + const framework::Tensor* bias, + framework::Tensor* output, + framework::Tensor* bias_out) { + helpers_[0]->GEMM(input->data(), + weight->data(), + output->data(), + dev_ctx_.stream()); + } + + // This function is used to execute GEMM, with input and output's types are + // INT8 and T. + void ComputeForwardINT8ToT(const framework::Tensor* weight, + const float quant_in_scale, + framework::Tensor* input, + const framework::Tensor* bias, + framework::Tensor* output, + framework::Tensor* output_tmp, + framework::Tensor* bias_out, + const framework::Tensor* dequant_out_scale, + const int quant_out_scale_offset) { + helpers_[0]->GEMM(input->data(), + weight->data(), + output_tmp->data(), + dev_ctx_.stream()); + + dequantize_kernel_launcher(output_tmp->data(), + output->data(), + m_, + n_, + dev_ctx_.stream(), + quant_in_scale, + dequant_out_scale->data(), + quant_out_scale_offset); + + if (compute_bias_) { + // bias_out = output + bias + std::vector ins = {output, bias}; + std::vector outs = {bias_out}; + phi::funcs::BroadcastKernel( + dev_ctx_, ins, &outs, -1, phi::funcs::AddFunctor()); + PADDLE_ENFORCE_EQ(cudaGetLastError(), + cudaSuccess, + platform::errors::Fatal( + "cuda error occured after computing bias. " + "But it does not mean this error is caused by " + "bias computing")); + } + } + + // This function is used to execute GEMM, with input and output's types are T + // and INT8. 
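+  // The T -> INT8 path below mirrors ComputeForward without the dequantize
+  // step: the T input is quantized into input_tmp using quant_in_scale and
+  // the GEMM result is written directly to output, leaving dequantization
+  // (and any bias handling) to the caller.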
+ void ComputeForwardTToINT8(const framework::Tensor* weight, + const float quant_in_scale, + const framework::Tensor* input, + framework::Tensor* input_tmp, + const framework::Tensor* bias, + framework::Tensor* output, + framework::Tensor* bias_out, + const int quant_round_type = 1, + const float quant_max_bound = 127.0, + const float quant_min_bound = -127.0) { + quantize_kernel_launcher(input->data(), + input_tmp->data(), + quant_in_scale, + m_, + k_, + quant_round_type, + quant_max_bound, + quant_min_bound, + dev_ctx_.stream()); + + helpers_[0]->GEMM(input_tmp->data(), + weight->data(), + output->data(), + dev_ctx_.stream()); + } + + private: + const phi::GPUContext& dev_ctx_; + + int m_; // m + int n_; // n + int k_; // k + + int compute_bias_; + std::vector> helpers_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/conv_fusion_op.cu b/paddle/fluid/operators/fused/conv_fusion_op.cu index 4f05e6b6e2f463..6f0ebc2c7ebf66 100644 --- a/paddle/fluid/operators/fused/conv_fusion_op.cu +++ b/paddle/fluid/operators/fused/conv_fusion_op.cu @@ -35,6 +35,7 @@ using ScopedActivationDescriptor = platform::ScopedActivationDescriptor; using DataLayout = platform::DataLayout; using framework::AlgorithmsCache; using framework::ConvSearchCache; +using framework::SearchFuseResult; template using ScalingParamType = typename platform::CudnnDataType::ScalingParamType; @@ -348,34 +349,35 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { &perf_count, perf_results.get())); algo = (perf_results.get())[best_algo_idx].algo; +#else PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, - algo, - &workspace_size_in_bytes)); - if (workspace_size_in_bytes > workspace_size_limit) - workspace_size_limit = workspace_size_in_bytes; -#else + CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, + workspace_size_limit, + &algo)); +#endif PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardAlgorithm( + platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, - CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, - workspace_size_limit, - &algo)); + algo, + &workspace_size_in_bytes)); + if (workspace_size_in_bytes > workspace_size_limit) + workspace_size_limit = workspace_size_in_bytes; VLOG(3) << "cuDNN forward algo " << algo; -#endif } else { - std::function search_func = - [&]() -> cudnnConvolutionFwdAlgo_t { + std::function()> search_func = + [&]() -> SearchFuseResult { int returned_algo_count; + SearchFuseResult fwd_result; std::array fwd_perf_stat; auto cudnn_find_func = [&](void* cudnn_workspace) { @@ -402,11 +404,34 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { VLOG(3) << stat.algo << ": " << stat.status << " " << stat.time << " " << stat.memory; } - return fwd_perf_stat[0].algo; + + PADDLE_ENFORCE_GPU_SUCCESS( + platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( + handle, + cudnn_input_desc, + cudnn_filter_desc, + cudnn_conv_desc, + cudnn_output_desc, + fwd_perf_stat[0].algo, + &workspace_size_in_bytes)); + // PADDLE_ENFORCE_LE( + // workspace_size_in_bytes, + // workspace_size_limit, + // platform::errors::InvalidArgument( + // "The actual workspace size to be allocated for cuDNN is + // expected " "to be less than the limit. 
But received: the + // actual workspace " "size = %d, limit = %d.", + // workspace_size_in_bytes, + // workspace_size_limit)); + + fwd_result.algo = fwd_perf_stat[0].algo; + fwd_result.workspace_size = workspace_size_in_bytes; + return fwd_result; }; - AlgorithmsCache& algo_cache = + AlgorithmsCache>& algo_cache = *(framework::ConvSearchCache::Instance().GetConvFusion()); int search_times = ctx.Attr("search_times"); + SearchFuseResult algo_result; search_times = std::max( static_cast(FLAGS_cudnn_exhaustive_search_times), search_times); // TODO(dangqingqing): Unify this if-else. @@ -414,39 +439,24 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { // The searched algo will be cached by `search_times` times for // different input dimension. For other dimensions, select the algo // of closest area. - algo = algo_cache.GetAlgorithm( + algo_result = algo_cache.GetAlgorithm( x_dims[2] * x_dims[3], search_times, 0, search_func); + algo = algo_result.algo; + workspace_size_in_bytes = algo_result.workspace_size; } else { - algo = algo_cache.GetAlgorithm(x_dims, - f_dims, - strides, - paddings, - dilations, - 0, - dtype, - search_func); + algo_result = algo_cache.GetAlgorithm(x_dims, + f_dims, + strides, + paddings, + dilations, + 0, + dtype, + search_func); + algo = algo_result.algo; + workspace_size_in_bytes = algo_result.workspace_size; } VLOG(3) << "choose algo " << algo; } - - PADDLE_ENFORCE_GPU_SUCCESS( - platform::dynload::cudnnGetConvolutionForwardWorkspaceSize( - handle, - cudnn_input_desc, - cudnn_filter_desc, - cudnn_conv_desc, - cudnn_output_desc, - algo, - &workspace_size_in_bytes)); - // PADDLE_ENFORCE_LE( - // workspace_size_in_bytes, - // workspace_size_limit, - // platform::errors::InvalidArgument( - // "The actual workspace size to be allocated for cuDNN is expected - // " "to be less than the limit. But received: the actual workspace - // " "size = %d, limit = %d.", workspace_size_in_bytes, - // workspace_size_limit)); - if ((activation == "identity") && (!residual)) { // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. diff --git a/paddle/fluid/operators/fused/cublaslt.h b/paddle/fluid/operators/fused/cublaslt.h new file mode 100644 index 00000000000000..b9cc6b56f13eea --- /dev/null +++ b/paddle/fluid/operators/fused/cublaslt.h @@ -0,0 +1,211 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/platform/dynload/cublasLt.h" + +namespace dyl = paddle::platform::dynload; + +namespace paddle { +namespace operators { +class CublasLtHelper { + public: + CublasLtHelper(int m, int k, int n) + : alpha_(1), beta_(0), m_(m), k_(k), n_(n) { + cublasStatus_t status; + // handle and matmul desc + status = dyl::cublasLtCreate(&handle_); +#if CUBLAS_VER_MAJOR < 11 + cudaDataType_t cudaComputeType = CUDA_R_32I; +#else + cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I; +#endif + + PADDLE_ENFORCE_EQ( + status, + CUBLAS_STATUS_SUCCESS, + platform::errors::External( + "cublasLtMatrixLayoutCreate execution error" + "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " + "information")); + +#if CUBLAS_VER_MAJOR < 11 + status = dyl::cublasLtMatmulDescCreate(&matmul_desc_, cudaComputeType); +#else + status = dyl::cublasLtMatmulDescCreate( + &matmul_desc_, cudaComputeType, CUDA_R_32I); +#endif + + PADDLE_ENFORCE_EQ( + status, + CUBLAS_STATUS_SUCCESS, + platform::errors::External( + "cublasLtMatmulDescCreate execution error" + "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " + "information")); + cublasOperation_t op_transpose = CUBLAS_OP_T; + status = dyl::cublasLtMatmulDescSetAttribute(matmul_desc_, + CUBLASLT_MATMUL_DESC_TRANSA, + &op_transpose, + sizeof(op_transpose)); + PADDLE_ENFORCE_EQ( + status, + CUBLAS_STATUS_SUCCESS, + platform::errors::External( + "cublasLtMatmulDescSetAttribute execution error" + "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " + "information")); + + // matrix desc + status = dyl::cublasLtMatrixLayoutCreate(&B_desc_, CUDA_R_8I, k, n, k); + PADDLE_ENFORCE_EQ( + status, + CUBLAS_STATUS_SUCCESS, + platform::errors::External( + "cublasLtMatrixLayoutCreate execution error" + "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " + "information")); + + status = dyl::cublasLtMatrixLayoutCreate(&A_desc_, CUDA_R_8I, k, m, k); + PADDLE_ENFORCE_EQ( + status, + CUBLAS_STATUS_SUCCESS, + platform::errors::External( + "cublasLtMatrixLayoutCreate execution error" + "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " + "information")); + + status = dyl::cublasLtMatrixLayoutCreate(&C_desc_, CUDA_R_32I, n, m, n); + PADDLE_ENFORCE_EQ( + status, + CUBLAS_STATUS_SUCCESS, + platform::errors::External( + "cublasLtMatrixLayoutCreate execution error" + "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " + "information")); + } + ~CublasLtHelper() { + if (handle_) dyl::cublasLtDestroy(handle_); + if (matmul_desc_) dyl::cublasLtMatmulDescDestroy(matmul_desc_); + if (A_desc_) dyl::cublasLtMatrixLayoutDestroy(A_desc_); + if (B_desc_) dyl::cublasLtMatrixLayoutDestroy(B_desc_); + if (C_desc_) dyl::cublasLtMatrixLayoutDestroy(C_desc_); + } + + void GEMM(int8_t* A_dev, + const int8_t* B_dev, + int32_t* C_dev, + cudaStream_t stream) { + cublasStatus_t status; + +#if __CUDA_ARCH__ >= 800 && CUDA_VERSION >= 11020 + cublasLtMatmulAlgo_t algo; + int algoId = 21; + int swizzle = 0; + int customOption = 0; + int tile = 15; + int splitK_val = 0; + int reductionScheme = 0; +#if CUDA_VERSION >= 11000 + int stages = 23; +#endif + +#if CUBLAS_VER_MAJOR < 11 + cudaDataType_t cudaComputeType = CUDA_R_32I; +#else + cublasComputeType_t cudaComputeType = CUBLAS_COMPUTE_32I; +#endif + + dyl::cublasLtMatmulAlgoInit(handle_, + cudaComputeType, + CUDA_R_32I, + CUDA_R_8I, + CUDA_R_8I, + CUDA_R_32I, + CUDA_R_32I, 
+ algoId, + &algo); + dyl::cublasLtMatmulAlgoConfigSetAttribute( + &algo, + CUBLASLT_ALGO_CONFIG_CUSTOM_OPTION, + &(customOption), + sizeof(customOption)); + dyl::cublasLtMatmulAlgoConfigSetAttribute( + &algo, CUBLASLT_ALGO_CONFIG_TILE_ID, &(tile), sizeof(tile)); + dyl::cublasLtMatmulAlgoConfigSetAttribute(&algo, + CUBLASLT_ALGO_CONFIG_SPLITK_NUM, + &(splitK_val), + sizeof(splitK_val)); + dyl::cublasLtMatmulAlgoConfigSetAttribute( + &algo, CUBLASLT_ALGO_CONFIG_CTA_SWIZZLING, &(swizzle), sizeof(swizzle)); + dyl::cublasLtMatmulAlgoConfigSetAttribute( + &algo, + CUBLASLT_ALGO_CONFIG_REDUCTION_SCHEME, + &(reductionScheme), + sizeof(int)); +#if CUDA_VERSION >= 11000 + dyl::cublasLtMatmulAlgoConfigSetAttribute( + &algo, CUBLASLT_ALGO_CONFIG_STAGES_ID, &(stages), sizeof(stages)); +#endif +#endif + status = dyl::cublasLtMatmul(handle_, + matmul_desc_, + &alpha_, + B_dev, + B_desc_, + A_dev, + A_desc_, + &beta_, + C_dev, + C_desc_, + C_dev, + C_desc_, +#if __CUDA_ARCH__ >= 800 && CUDA_VERSION >= 11020 + &algo, +#else + nullptr, +#endif + nullptr, + 0, + stream); + PADDLE_ENFORCE_EQ( + status, + CUBLAS_STATUS_SUCCESS, + platform::errors::External( + "cublasLtMatmul execution error" + "refer https://docs.nvidia.com/cuda/cublas/index.html to get more " + "information")); + } + + private: + cublasLtHandle_t handle_; + cublasLtMatmulDesc_t matmul_desc_; + cublasLtMatrixLayout_t A_desc_; + cublasLtMatrixLayout_t B_desc_; + cublasLtMatrixLayout_t C_desc_; + int32_t alpha_; + int32_t beta_; + + int m_; + int k_; + int n_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h index cde4ed061423e8..6eb71442c6a3ca 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h +++ b/paddle/fluid/operators/fused/cudnn_norm_conv.cu.h @@ -45,6 +45,14 @@ struct NormConvolutionArgs { int stride, int dilation, int group) { + PADDLE_ENFORCE_LT( + ctx.GetComputeCapability(), + 90, + phi::errors::PreconditionNotMet( + "Expect compute compatiblity to be less than 90, but got %d. 
" + "CUDNN FusedOps is no longer available on H100 and later " + "devices.", + ctx.GetComputeCapability())); PADDLE_ENFORCE_EQ( input_shape.size(), 4U, diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index ef93612ffce39a..52a0efc225fc4a 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -442,7 +442,7 @@ TEST(CudnnNormConvFp16, K1S1) { phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() < 70) { + if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3, true), @@ -472,7 +472,7 @@ TEST(CudnnNormConvFp16, K3S1) { phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() < 70) { + if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3, true), @@ -502,7 +502,7 @@ TEST(CudnnNormConvFp16, K1S1O4) { phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() < 70) { + if (ctx->GetComputeCapability() < 70 || ctx->GetComputeCapability() >= 90) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3, true), @@ -532,7 +532,7 @@ TEST(CudnnNormConvFp16, K1S2O4) { phi::GPUContext *ctx = static_cast( platform::DeviceContextPool::Instance().Get(platform::CUDAPlace(0))); - if (ctx->GetComputeCapability() <= 70) { + if (ctx->GetComputeCapability() <= 70 || ctx->GetComputeCapability() >= 90) { ASSERT_THROW(test.CheckForward(1e-3, true), paddle::platform::EnforceNotMet); ASSERT_THROW(test.CheckBackward(1e-3), paddle::platform::EnforceNotMet); diff --git a/paddle/fluid/operators/fused/fused_attention_op.cc b/paddle/fluid/operators/fused/fused_attention_op.cc index 30badd3125588d..8d03ba451bdae4 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cc +++ b/paddle/fluid/operators/fused/fused_attention_op.cc @@ -704,6 +704,13 @@ class FusedAttentionGradOpMaker : public framework::SingleGradOpMaker { } }; +DECLARE_NO_NEED_BUFFER_VARS_INFERER(FusedAttentionGradNoNeedBufferInferer, + "QKVBiasOut", + "QKVOut", + "QKOut", + "QKTVOut", + "OutLinearOut"); + } // namespace operators } // namespace paddle @@ -713,7 +720,9 @@ REGISTER_OPERATOR(fused_attention, ops::FusedAttentionOpMaker, ops::FusedAttentionGradOpMaker, ops::FusedAttentionGradOpMaker); -REGISTER_OPERATOR(fused_attention_grad, ops::FusedAttentionGradOp); +REGISTER_OPERATOR(fused_attention_grad, + ops::FusedAttentionGradOp, + ops::FusedAttentionGradNoNeedBufferInferer); REGISTER_OP_VERSION(fused_attention) .AddCheckpoint( diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 059d94031ac8e5..ac9e219075174d 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -30,7 +30,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -50,13 +50,15 @@ static void AllReduce(framework::Tensor &tensor, // NOLINT if (map->has(ring_id)) { paddle::distributed::ProcessGroup *pg = map->get(ring_id); + auto pg_nccl = static_cast(pg); + std::vector in_tensor; std::vector out_tensor; in_tensor.push_back(tensor); out_tensor.push_back(tensor); paddle::distributed::AllreduceOptions opts; opts.reduce_op = distributed::ReduceOp::SUM; - auto task = pg->AllReduce(in_tensor, out_tensor, opts); + auto task = pg_nccl->AllReduce(in_tensor, out_tensor, opts, true, true); task->Wait(); } else { auto dtype = platform::ToNCCLDataType( @@ -408,28 +410,24 @@ class FusedAttentionGradKernel : public framework::OpKernel { (out_linear_bias == nullptr) ? nullptr : out_linear_bias->data(); // fw output - auto *fmha_out = ctx.Input("FMHAOut"); - auto *transpose_out_2 = ctx.Input("TransposeOut2"); - auto *qk_out = ctx.Input("QKOut"); - auto *qktv_out = ctx.Input("QKTVOut"); - auto *softmax_out = ctx.Input("SoftmaxOut"); - auto *attn_dropout_mask_out = ctx.Input("AttnDropoutMaskOut"); - auto *attn_dropout_out = ctx.Input("AttnDropoutOut"); - auto *src_mask_out = ctx.Input("SrcMaskOut"); - auto *out_linear_out = ctx.Input("OutLinearOut"); - auto *ln_2_mean = ctx.Input("Ln2Mean"); - auto *ln_2_var = ctx.Input("Ln2Variance"); - auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); + auto *fmha_out = ctx.Input("FMHAOut"); + auto *transpose_out_2 = ctx.Input("TransposeOut2"); + auto *qk_out = ctx.Input("QKOut"); + auto *softmax_out = ctx.Input("SoftmaxOut"); + auto *attn_dropout_mask_out = + ctx.Input("AttnDropoutMaskOut"); + auto *attn_dropout_out = ctx.Input("AttnDropoutOut"); + auto *src_mask_out = ctx.Input("SrcMaskOut"); + auto *ln_2_mean = ctx.Input("Ln2Mean"); + auto *ln_2_var = ctx.Input("Ln2Variance"); + auto *dropout_mask_out = ctx.Input("DropoutMaskOut"); auto *bias_dropout_residual_out = ctx.Input("BiasDropoutResidualOut"); auto *fmha_out_data = fmha_out->data(); auto *transpose_out_2_data = transpose_out_2->data(); - auto *qk_out_data = qk_out->data(); - auto *qktv_out_data = qktv_out->data(); auto *softmax_out_data = softmax_out->data(); auto *src_mask_out_data = (src_mask == nullptr) ? 
nullptr : src_mask_out->data(); - auto *out_linear_out_data = out_linear_out->data(); auto *dropout_mask_out_data = dropout_mask_out->data(); // output's grad diff --git a/paddle/fluid/operators/fused/fused_dropout_act_bias.h b/paddle/fluid/operators/fused/fused_dropout_act_bias.h index 732da5fa52a8bc..e3e19d9ea6ebcb 100644 --- a/paddle/fluid/operators/fused/fused_dropout_act_bias.h +++ b/paddle/fluid/operators/fused/fused_dropout_act_bias.h @@ -60,19 +60,32 @@ struct GeluGradFunctor { * the src, mask and dst shape is (rows, cols) * the bias shape is (1, cols) */ -template -__global__ void FusedDropoutActBias(Functor act, - const uint64_t seed, - const uint64_t rows, - const uint64_t cols, - const int increment, - const float dropout_prob, - const bool is_upscale_in_train, - const bool is_test, - const T *__restrict__ src, - const T *__restrict__ bias, - T *dst, - MaskType *mask) { +template +__global__ void FusedDropoutActBias( + Functor act, + const uint64_t seed, + const uint64_t rows, + const uint64_t cols, + const int increment, + const float dropout_prob, + const bool is_upscale_in_train, + const bool is_test, + const InType *__restrict__ src, + const T *__restrict__ bias, + OutType *dst, + MaskType *mask, + const float quant_last_in_scale = 1.0, + const float *dequant_out_scale_data = nullptr, + const int quant_out_scale_offset = 0, + const float quant_next_in_scale = 1.0, + const int quant_round_type = 1, + const float quant_max_bound = 127.0, + const float quant_min_bound = -127.0) { int col_id = blockDim.x * blockIdx.x + threadIdx.x; int row_id = blockIdx.y; int idx = row_id * cols + col_id; @@ -90,7 +103,9 @@ __global__ void FusedDropoutActBias(Functor act, VecSize, false, true, - Functor>(r, + Functor, + InType, + OutType>(r, i, cols, &state, @@ -104,7 +119,14 @@ __global__ void FusedDropoutActBias(Functor act, is_test, nullptr, nullptr, - act); + act, + quant_last_in_scale, + dequant_out_scale_data, + quant_out_scale_offset, + quant_next_in_scale, + quant_round_type, + quant_max_bound, + quant_min_bound); } } } @@ -112,7 +134,11 @@ __global__ void FusedDropoutActBias(Functor act, /** * @brief dst = dropout(activation(src + bias)); */ -template +template void LaunchDropoutActBias(Functor act_functor, const uint64_t seed, const uint32_t rows, @@ -121,14 +147,21 @@ void LaunchDropoutActBias(Functor act_functor, const float dropout_prob, const bool is_upscale_in_train, const bool is_test, - const T *src, + const InType *src, const T *bias, - T *dst, + OutType *dst, MaskType *mask_data, - const phi::GPUContext &ctx) { + const phi::GPUContext &ctx, + const float quant_last_in_scale = 1.0, + const float *dequant_out_scale_data = nullptr, + const int quant_out_scale_offset = 0, + const float quant_next_in_scale = 1.0, + const int quant_round_type = 1, + const float quant_max_bound = 127.0, + const float quant_min_bound = -127.0) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { - SetZero(ctx, dst, rows * cols); + SetZero(ctx, reinterpret_cast(dst), rows * cols); SetZero(ctx, mask_data, rows * cols); return; } @@ -137,7 +170,7 @@ void LaunchDropoutActBias(Functor act_functor, const int real_vec_size = cols % VecSize == 0 ? 
VecSize : 1; const auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size); if (cols % VecSize == 0) { - FusedDropoutActBias + FusedDropoutActBias <<>>( act_functor, seed, @@ -150,9 +183,13 @@ void LaunchDropoutActBias(Functor act_functor, src, bias, dst, - mask_data); + mask_data, + quant_last_in_scale, + dequant_out_scale_data, + quant_out_scale_offset, + quant_next_in_scale); } else { - FusedDropoutActBias + FusedDropoutActBias <<>>( act_functor, seed, @@ -165,7 +202,11 @@ void LaunchDropoutActBias(Functor act_functor, src, bias, dst, - mask_data); + mask_data, + quant_last_in_scale, + dequant_out_scale_data, + quant_out_scale_offset, + quant_next_in_scale); } } @@ -215,17 +256,19 @@ template -__global__ void FusedDropoutActBiasGrad(Functor act_grad, - const T *dout, - const MaskType *mask, - const T *src, - const T *bias, - const T factor, - const int64_t rows, - const int64_t cols, - T *dx, - T *dbias) { + typename Functor, + int THREADS_PER_CTA = BlockSizeX *BlockSizeY> +__global__ __launch_bounds__(THREADS_PER_CTA) void FusedDropoutActBiasGrad( + Functor act_grad, + const T *dout, + const MaskType *mask, + const T *src, + const T *bias, + const T factor, + const int64_t rows, + const int64_t cols, + T *dx, + T *dbias) { int64_t col_id = blockIdx.x * blockDim.x + threadIdx.x; using LoadT = phi::AlignedVector; diff --git a/paddle/fluid/operators/fused/fused_dropout_common.h b/paddle/fluid/operators/fused/fused_dropout_common.h index 0f37d242ebcb3c..1b8dc4bb324ca7 100644 --- a/paddle/fluid/operators/fused/fused_dropout_common.h +++ b/paddle/fluid/operators/fused/fused_dropout_common.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/fused/quant_dequant_kernel.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" diff --git a/paddle/fluid/operators/fused/fused_dropout_helper.h b/paddle/fluid/operators/fused/fused_dropout_helper.h index 208b2a58bca691..2d1491fefb07e1 100644 --- a/paddle/fluid/operators/fused/fused_dropout_helper.h +++ b/paddle/fluid/operators/fused/fused_dropout_helper.h @@ -109,7 +109,10 @@ struct DropoutParam { } }; -template +template class FusedDropoutHelper { private: int GetIncrement(const phi::GPUContext& ctx) { @@ -140,25 +143,34 @@ class FusedDropoutHelper { // out = residual + dropout( src + bias ) void ResidualDropoutBias(const phi::GPUContext& ctx, - const T* src, + const InType* src, const T* residual, const T* bias, - T* out, - MaskType* mask) { + OutType* out, + MaskType* mask, + const float quant_last_in_scale = 1.0, + const float* dequant_out_scale_data = nullptr, + const int quant_out_scale_offset = 0, + const float quant_next_in_scale = 1.0) { auto increment = GetIncrement(ctx); - LaunchResidualDropoutBias(rows_, - cols_, - increment, - dropout_param_.seed, - dropout_param_.dropout_prob, - dropout_param_.is_test, - dropout_param_.is_upscale_in_train, - src, - residual, - bias, - mask, - out, - ctx); + LaunchResidualDropoutBias( + rows_, + cols_, + increment, + dropout_param_.seed, + dropout_param_.dropout_prob, + dropout_param_.is_test, + dropout_param_.is_upscale_in_train, + src, + residual, + bias, + mask, + out, + ctx, + quant_last_in_scale, + dequant_out_scale_data, + quant_out_scale_offset, + quant_next_in_scale); } void ResidualDropoutBiasGrad(const phi::GPUContext& 
ctx, @@ -189,15 +201,22 @@ class FusedDropoutHelper { // out = dropout(activation(src + bias)) void DropoutActBias(const phi::GPUContext& ctx, - const T* src, + const InType* src, const T* bias, const std::string& act_method, - T* out, - MaskType* mask) { + OutType* out, + MaskType* mask, + const float quant_last_in_scale = 1.0, + const float* dequant_out_scale_data = nullptr, + const int quant_out_scale_offset = 0, + const float quant_next_in_scale = 1.0, + const int quant_round_type = 1, + const float quant_max_bound = 127.0, + const float quant_min_bound = -127.0) { auto increment = GetIncrement(ctx); if (act_method == "gelu") { GeluFunctor gelu; - LaunchDropoutActBias>( + LaunchDropoutActBias, InType, OutType>( gelu, dropout_param_.seed, rows_, @@ -210,23 +229,40 @@ class FusedDropoutHelper { bias, out, mask, - ctx); + ctx, + quant_last_in_scale, + dequant_out_scale_data, + quant_out_scale_offset, + quant_next_in_scale, + quant_round_type, + quant_max_bound, + quant_min_bound); } else if (act_method == "relu") { phi::funcs::ReluFunctor relu; - LaunchDropoutActBias>( - relu, - dropout_param_.seed, - rows_, - cols_, - increment, - dropout_param_.dropout_prob, - dropout_param_.is_upscale_in_train, - dropout_param_.is_test, - src, - bias, - out, - mask, - ctx); + LaunchDropoutActBias, + InType, + OutType>(relu, + dropout_param_.seed, + rows_, + cols_, + increment, + dropout_param_.dropout_prob, + dropout_param_.is_upscale_in_train, + dropout_param_.is_test, + src, + bias, + out, + mask, + ctx, + quant_last_in_scale, + dequant_out_scale_data, + quant_out_scale_offset, + quant_next_in_scale, + quant_round_type, + quant_max_bound, + quant_min_bound); } else { PADDLE_THROW(platform::errors::InvalidArgument( "Currently only supports gelu or relu activation functions!")); @@ -283,8 +319,12 @@ class FusedDropoutHelper { DropoutParam dropout_param_; }; -template -class FusedDropoutLayerNormHelper : public FusedDropoutHelper { +template +class FusedDropoutLayerNormHelper + : public FusedDropoutHelper { public: FusedDropoutLayerNormHelper() {} FusedDropoutLayerNormHelper(const int rows, @@ -301,23 +341,24 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { const int cols, const DropoutParam& dropout_param, const float epsilon) - : FusedDropoutHelper(ctx, rows, cols, dropout_param) { + : FusedDropoutHelper( + ctx, rows, cols, dropout_param) { using U = LayerNormParamType; epsilon_ = epsilon; } // call layer_norm void LayerNorm(const phi::GPUContext& ctx, - const T* src, + const InType* src, const LayerNormParamType* gamma, const LayerNormParamType* beta, - T* out, + OutType* out, LayerNormParamType* mean, LayerNormParamType* variance) { using U = LayerNormParamType; switch (GetDesiredBlockDim(this->cols_)) { FIXED_BLOCK_DIM_CASE( - LayerNormForward + LayerNormForward <<rows_, kBlockDim, 0, ctx.stream()>>>( src, gamma, beta, out, mean, variance, epsilon_, this->cols_)); } @@ -349,17 +390,25 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { // out = layernorm(residual + dropout(src + bias)) template , bool is_same_type = false> - void LayernormResidualDropoutBias(const phi::GPUContext& ctx, - const T* src, - const T* residual, - const T* bias, - const P* gamma, - const P* beta, - T* dropout_out, - MaskType* mask, - T* out, - LayerNormParamType* mean, - LayerNormParamType* variance) { + void LayernormResidualDropoutBias( + const phi::GPUContext& ctx, + const InType* src, + const T* residual, + const T* bias, + const P* gamma, + const P* beta, + T* dropout_out, + MaskType* 
mask, + OutType* out, + LayerNormParamType* mean, + LayerNormParamType* variance, + const float quant_last_in_scale = 1.0, + const float* dequant_out_scale_data = nullptr, + const int quant_out_scale_offset = 0, + const float quant_next_in_scale = 1.0, + const int quant_round_type = 1, + const float quant_max_bound = 127.0, + const float quant_min_bound = -127.0) { using U = LayerNormParamType; int vec_size = MAX_CACHE_BYTES / sizeof(T); if (this->cols_ % vec_size != 0) { @@ -368,7 +417,12 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { int threads = GetDesiredBlockDim(this->cols_ / vec_size); int increment = ((this->cols_ - 1) / (threads * vec_size) + 1) * vec_size; increment = this->dropout_param_.UpdateSeedAndIncrement(ctx, increment); - LaunchLayernormResidualDropoutBias( + LaunchLayernormResidualDropoutBias( this->rows_, this->cols_, increment, @@ -387,7 +441,14 @@ class FusedDropoutLayerNormHelper : public FusedDropoutHelper { out, mean, variance, - ctx); + ctx, + quant_last_in_scale, + dequant_out_scale_data, + quant_out_scale_offset, + quant_next_in_scale, + quant_round_type, + quant_max_bound, + quant_min_bound); } template , bool is_same_type = false> diff --git a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu index 758fb8a23f8f92..c2cb6f46010daf 100644 --- a/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu +++ b/paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu @@ -223,13 +223,7 @@ __global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data, // For layer_norm, reduce to calculate mean and std sum_i += static_cast(tmp_3); -#if defined(PADDLE_WITH_CUDA) && __CUDA_ARCH__ >= 530 - square_sum_i += static_cast(__hmul(tmp_3, tmp_3)); -#elif defined(PADDLE_WITH_CUDA) square_sum_i += static_cast(tmp_3) * static_cast(tmp_3); -#else - square_sum_i += static_cast(tmp_3 * tmp_3); -#endif } auto pair = BlockReduce(temp_storage) .Reduce(PairForLayerNorm(sum_i, square_sum_i), @@ -282,9 +276,9 @@ __global__ void InplaceAddReluAddLayerNormKernel(const float16* y_data, half tmp_0 = __hdiv(__hsub(save_ptr[save_index], mean_i), std_i); half tmp_1 = scale ? __hmul(scale[j], tmp_0) : tmp_0; #else - half tmp_0 = static_cast(static_cast(save_ptr[save_index]) + - static_cast(mean_i) / - static_cast(std_i)); + half tmp_0 = static_cast((static_cast(save_ptr[save_index]) - + static_cast(mean_i)) / + static_cast(std_i)); half tmp_1 = scale ? 
static_cast(static_cast(scale[j]) * static_cast(tmp_0)) : tmp_0; @@ -400,19 +394,16 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel { auto* out_data = dev_ctx.template Alloc(out, out->numel() * sizeof(T)); auto blas = phi::funcs::GetBlas(dev_ctx); - blas.GEMM(false, - false, + blas.GEMM(CblasNoTrans, + CblasNoTrans, M, N, K, static_cast(1.0), x_data, - K, w_data, - N, static_cast(0.0), - out_data, - N); + out_data); auto* y = ctx.Input("Y"); auto* bias_0 = ctx.Input("Bias0"); auto* bias_1 = ctx.Input("Bias1"); diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cc b/paddle/fluid/operators/fused/fused_feedforward_op.cc index 9b8b256a9ee54f..f47f01465da2df 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cc +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cc @@ -276,10 +276,12 @@ class FusedFeedForwardOpGrad : public framework::OperatorWithKernel { "Input", "Dropout1Out", "FusedFeedForwardGrad"); - OP_INOUT_CHECK(ctx->HasInput("Dropout2Out"), - "Input", - "Dropout2Out", - "FusedFeedForwardGrad"); + if (!pre_layer_norm) { + OP_INOUT_CHECK(ctx->HasInput("Dropout2Out"), + "Input", + "Dropout2Out", + "FusedFeedForwardGrad"); + } OP_INOUT_CHECK(ctx->HasInput("Linear1Weight"), "Input", "Linear1Weight", @@ -368,10 +370,12 @@ class FusedFeedForwardOpGradMaker : public framework::SingleGradOpMaker { op->SetInput("Dropout2Mask", this->Output("Dropout2Mask")); op->SetInput("Linear1Out", this->Output("Linear1Out")); op->SetInput("Dropout1Out", this->Output("Dropout1Out")); - op->SetInput("Dropout2Out", this->Output("Dropout2Out")); op->SetAttrMap(this->Attrs()); bool pre_layer_norm = PADDLE_GET_CONST(bool, op->GetAttr("pre_layer_norm")); + if (!pre_layer_norm) { + op->SetInput("Dropout2Out", this->Output("Dropout2Out")); + } op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); if (pre_layer_norm) { diff --git a/paddle/fluid/operators/fused/fused_feedforward_op.cu b/paddle/fluid/operators/fused/fused_feedforward_op.cu index 33d1e89bf28fe2..b8ba7b8810000b 100644 --- a/paddle/fluid/operators/fused/fused_feedforward_op.cu +++ b/paddle/fluid/operators/fused/fused_feedforward_op.cu @@ -23,7 +23,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/elementwise_functor.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/distributed/collective/ProcessGroupNCCL.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/device/gpu/nccl_helper.h" #endif @@ -43,13 +43,15 @@ static void AllReduce(framework::Tensor& tensor, // NOLINT if (map->has(ring_id)) { paddle::distributed::ProcessGroup* pg = map->get(ring_id); + auto pg_nccl = static_cast(pg); + std::vector in_tensor; std::vector out_tensor; in_tensor.push_back(tensor); out_tensor.push_back(tensor); paddle::distributed::AllreduceOptions opts; opts.reduce_op = distributed::ReduceOp::SUM; - auto task = pg->AllReduce(in_tensor, out_tensor, opts); + auto task = pg_nccl->AllReduce(in_tensor, out_tensor, opts, true, true); task->Wait(); } else { auto dtype = platform::ToNCCLDataType( @@ -337,7 +339,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { const framework::Tensor& linear1_out, const framework::Tensor* ln1_out, const framework::Tensor& dropout1_out, - const framework::Tensor& dropout2_out, + const framework::Tensor* dropout2_out, const framework::Tensor& linear1_weight, const framework::Tensor* linear1_bias, const framework::Tensor& linear2_weight, @@ -420,7 +422,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { fused_dropout_layernorm_helper.LayernormResidualDropoutBiasGrad( ctx, d_out.data(), - dropout2_out.data(), + dropout2_out->data(), dropout2_mask.data(), ln2_gamma_ptr, ln2_mean->data(), @@ -504,7 +506,7 @@ class FusedFeedForwardGradKernel : public framework::OpKernel { auto* ln1_out = pre_layer_norm ? context.Input("Ln1Out") : nullptr; auto dropout1_out = *context.Input("Dropout1Out"); - auto dropout2_out = *context.Input("Dropout2Out"); + auto* dropout2_out = context.Input("Dropout2Out"); auto linear1_weight = *context.Input("Linear1Weight"); auto* linear1_bias = context.Input("Linear1Bias"); auto linear2_weight = *context.Input("Linear2Weight"); diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc index f1eeae8651cdb1..c0978bca6568a4 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cc @@ -139,9 +139,8 @@ class FusedGemmEpilogueOp : public framework::OperatorWithKernel { } ctx->SetOutputDim("Out", phi::make_ddim(out_dims)); - // Note (Ming Huang): Reserve space of relu is a bit-mask, - // which cannot pass nan_and_inf checking if shape is set. 
- if (activation == "gelu" && ctx->HasOutput("ReserveSpace")) { + + if (ctx->HasOutput("ReserveSpace")) { ctx->SetOutputDim("ReserveSpace", phi::make_ddim(out_dims)); } } @@ -392,14 +391,13 @@ class FusedGemmEpilogueOpGradMaker : public framework::SingleGradOpMaker { protected: void Apply(GradOpPtr op) const override { const auto& act_type = this->template Attr("activation"); - PADDLE_ENFORCE_EQ( - act_type, - "none", - phi::errors::InvalidArgument("The activation should be none.")); op->SetType(this->ForwardOpType() + "_grad"); op->SetInput("X", this->Input("X")); op->SetInput("Y", this->Input("Y")); + if (act_type != "none") { + op->SetInput("ReserveSpace", this->Input("ReserveSpace")); + } op->SetInput("DOut", this->OutputGrad("Out")); op->SetOutput("DX", this->InputGrad("X")); diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu index 22340210b5715d..1c8f966f2aae20 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu @@ -17,6 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/scope_guard.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/dynload/cublasLt.h" #include "paddle/fluid/platform/float16.h" @@ -62,6 +63,9 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { if (std::is_same::value) { mat_type = CUDA_R_16F; } + if (std::is_same::value) { + mat_type = CUDA_R_16BF; + } if (std::is_same::value) { mat_type = CUDA_R_64F; scale_type = CUDA_R_64F; @@ -103,15 +107,21 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { sizeof(bias_data))); if (enable_auxiliary && activation != "none") { - size_t reserve_space_size = 0; + // Note (Ming Huang): The initialization of ReseveSpace is happened in the + // dev_ctx.Alloc. Therefore, we set real date type up here. if (activation == "relu") { - // Count in bits. 
- reserve_space_size = phi::product(out->dims()) / 8; + paddle::experimental::DataType rs_type = + paddle::experimental::DataType::BOOL; + size_t reserve_space_size = + phi::product(reserve_space->dims()) * SizeOf(rs_type); + dev_ctx.Alloc(reserve_space, rs_type, reserve_space_size); } else { - reserve_space_size = phi::product(out->dims()) * sizeof(T); + size_t reserve_space_size = + phi::product(reserve_space->dims()) * sizeof(T); + dev_ctx.Alloc(reserve_space, reserve_space_size); } - dev_ctx.Alloc(reserve_space, out->type(), reserve_space_size); - void* aux_data = reinterpret_cast(reserve_space->data()); + + void* aux_data = reserve_space->data(); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( @@ -181,7 +191,6 @@ class FusedGemmEpilogueKernel : public framework::OpKernel { stream, workspace->ptr(), workspace_size); - PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmul(lt_handle, operation_desc, @@ -352,6 +361,9 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { if (std::is_same::value) { mat_type = CUDA_R_16F; } + if (std::is_same::value) { + mat_type = CUDA_R_16BF; + } if (std::is_same::value) { mat_type = CUDA_R_64F; scale_type = CUDA_R_64F; @@ -471,7 +483,7 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel { sizeof(epiloque_func_for_dx))); if (activation_grad != "none") { - auto* aux_data = reserve_space->data(); + auto* aux_data = reserve_space->data(); PADDLE_ENFORCE_GPU_SUCCESS( platform::dynload::cublasLtMatmulDescSetAttribute( dx_operation_desc, @@ -686,12 +698,14 @@ REGISTER_OP_CUDA_KERNEL( fused_gemm_epilogue, ops::FusedGemmEpilogueKernel, ops::FusedGemmEpilogueKernel, - ops::FusedGemmEpilogueKernel); + ops::FusedGemmEpilogueKernel, + ops::FusedGemmEpilogueKernel); REGISTER_OP_CUDA_KERNEL( fused_gemm_epilogue_grad, ops::FusedGemmEpilogueGradKernel, ops::FusedGemmEpilogueGradKernel, ops::FusedGemmEpilogueGradKernel); + paddle::platform::float16>, + ops::FusedGemmEpilogueKernel); #endif diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h index 160bda908185f6..0f4fa9f88954db 100644 --- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.h @@ -15,7 +15,12 @@ limitations under the License. */ #pragma once +#ifdef PADDLE_WITH_CUDA + #include +#include "cuda.h" // NOLINT + +#if CUDA_VERSION >= 11060 #include #include @@ -321,3 +326,6 @@ class GemmEpilogueAlgoCache { } // namespace operators } // namespace paddle + +#endif +#endif diff --git a/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc new file mode 100644 index 00000000000000..2b4b03e32cd8ef --- /dev/null +++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op_xpu.cc @@ -0,0 +1,241 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +Copyright (c) 2022 NVIDIA Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/framework/scope_guard.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class FusedGemmEpilogueXPUKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = ctx.template device_context(); + + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + const Tensor* bias = ctx.Input("Bias"); + + Tensor* out = ctx.Output("Out"); + Tensor* reserve_space = ctx.Output("ReserveSpace"); + + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + + std::string activation = ctx.Attr("activation"); + VLOG(5) << "trans_x = " << trans_x << " , trans_y = " << trans_y + << " , activation = " << activation; + + auto x_mat_dims = + phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + + // (M * K) * (K * N) for new api use + // int64_t M = trans_x ? x_mat_dims[1] : x_mat_dims[0]; + // int64_t K = trans_y ? y->dims()[1] : y->dims()[0]; + // int64_t N = trans_y ? y->dims()[0] : y->dims()[1]; + + // 调用新接口,这里先分开调用,等待qingpen的新接口 + int r = 0; + xpu::Activation_t act = xpu::Activation_t::LINEAR; + if (activation == "relu") { + act = xpu::Activation_t::RELU; + } else if (activation == "gelu") { + act = xpu::Activation_t::GELU; + } + // fc + bias + act + // 1. fc + phi::XpuFcInfo fc_info; + + phi::GetFCInfo(x_mat_dims, y->dims(), trans_x, trans_y, &fc_info); + VLOG(0) << "FusedGemmEpilogueXPUKernel 000"; + xpu::Context* xpu_ctx = dev_ctx.x_context(); + + const XPUType* x_ptr = reinterpret_cast(x->data()); + const XPUType* y_ptr = reinterpret_cast(y->data()); + XPUType* out_ptr = + reinterpret_cast(out->mutable_data(ctx.GetPlace())); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + XPUType* fc_out_ptr = RAII_GUARD.alloc_l3_or_gm(out->numel()); + phi::MatMulXPUFunction( + xpu_ctx, x_ptr, y_ptr, fc_out_ptr, fc_info, 1.0f); + XPUType* bias_out_ptr = out_ptr; + if (activation != "none" && reserve_space) { + bias_out_ptr = reinterpret_cast( + reserve_space->mutable_data(ctx.GetPlace())); + } + // 2 bias + const XPUType* bias_ptr = reinterpret_cast(bias->data()); + r = xpu::broadcast_add(xpu_ctx, + fc_out_ptr, + bias_ptr, + bias_out_ptr, + {fc_info.m, fc_info.n}, + {fc_info.n}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add"); + // 3 act + if (activation == "relu") { + r = xpu::relu(xpu_ctx, bias_out_ptr, out_ptr, out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu"); + } else if (activation == "gelu") { + r = xpu::gelu(xpu_ctx, bias_out_ptr, out_ptr, out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu"); + } + } +}; + +template +class FusedGemmEpilogueXPUGradKernel : public framework::OpKernel { + using XPUType = typename XPUTypeTrait::Type; + + public: + void Compute(const framework::ExecutionContext& ctx) const override { + bool trans_x = ctx.Attr("trans_x"); + bool trans_y = ctx.Attr("trans_y"); + auto& dev_ctx = ctx.template device_context(); + const Tensor* dout = ctx.Input("DOut"); + const Tensor* x = ctx.Input("X"); + const Tensor* y = ctx.Input("Y"); + + const Tensor* reserve_space = ctx.Input("ReserveSpace"); + + Tensor* dx = ctx.Output("DX"); + Tensor* dy = ctx.Output("DY"); + Tensor* dbias = ctx.Output("DBias"); + + std::string activation = "none"; + if 
(ctx.HasAttr("activation")) { + activation = ctx.Attr("activation"); + } else if (ctx.HasAttr("activation_grad")) { + activation = ctx.Attr("activation_grad"); + } + + auto* xpu_ctx = dev_ctx.x_context(); + xpu::ctx_guard RAII_GUARD(xpu_ctx); + const XPUType* dout_ptr = reinterpret_cast(dout->data()); + + const XPUType* dout_fc_ptr = dout_ptr; + const XPUType* x_ptr = reinterpret_cast(x->data()); + const XPUType* y_ptr = reinterpret_cast(y->data()); + + // const XPUType* + const XPUType* reserve_space_ptr = + (reserve_space == NULL) + ? (reinterpret_cast(NULL)) + : (reinterpret_cast(reserve_space->data())); + XPUType* d_act_input_ptr; + if (activation != "none") { + d_act_input_ptr = RAII_GUARD.alloc_l3_or_gm(dout->numel()); + dout_fc_ptr = d_act_input_ptr; + } + + // 1. act_grad 2. fc_grad 3. dbias + int r = 0; + if (activation == "relu") { + r = xpu::relu_grad(xpu_ctx, + reserve_space_ptr, + reserve_space_ptr, + dout_ptr, + d_act_input_ptr, + dout->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu_grad"); + } else if (activation == "gelu") { + r = xpu::gelu_grad(xpu_ctx, + reserve_space_ptr, + reserve_space_ptr, + dout_ptr, + d_act_input_ptr, + dout->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "gelu_grad"); + } + + auto x_mat_dims = + phi::flatten_to_2d(x->dims(), trans_x ? 1 : x->dims().size() - 1); + phi::XpuFcInfo info_forward; + phi::GetFCInfo(x_mat_dims, y->dims(), trans_x, trans_y, &info_forward); + + // 2. fc_grad + const XPUType* a_1 = reinterpret_cast(NULL); + const XPUType* b_1 = reinterpret_cast(NULL); + const XPUType* a_2 = reinterpret_cast(NULL); + const XPUType* b_2 = reinterpret_cast(NULL); + XPUType* c_1 = + (dx == NULL) + ? reinterpret_cast(NULL) + : reinterpret_cast(dx->mutable_data(ctx.GetPlace())); + XPUType* c_2 = + (dy == NULL) + ? reinterpret_cast(NULL) + : reinterpret_cast(dy->mutable_data(ctx.GetPlace())); + phi::XpuFcInfo info_dx; + phi::XpuFcInfo info_dy; + std::tuple + fc_info = phi::MatmulGradFcInfo(xpu_ctx, + &RAII_GUARD, + info_forward, + trans_x, + trans_y, + x_ptr, + y_ptr, + dout_fc_ptr); + std::tie(info_dx, info_dy, a_1, b_1, a_2, b_2) = fc_info; + if (dx) { + phi::MatMulXPUFunction(xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f); + } + if (dy) { + phi::MatMulXPUFunction(xpu_ctx, a_2, b_2, c_2, info_dy, 1.0f); + } + // 3. 
dbias + if (dbias) { + XPUType* dbias_ptr = + reinterpret_cast(dbias->mutable_data(ctx.GetPlace())); + r = xpu::reduce_sum(xpu_ctx, + dout_fc_ptr, + dbias_ptr, + {info_forward.m, info_forward.n}, + {0}); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum"); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_XPU_KERNEL( + fused_gemm_epilogue, + ops::FusedGemmEpilogueXPUKernel, + ops::FusedGemmEpilogueXPUKernel); + +REGISTER_OP_XPU_KERNEL( + fused_gemm_epilogue_grad, + ops::FusedGemmEpilogueXPUGradKernel, + ops::FusedGemmEpilogueXPUGradKernel); diff --git a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h index 7bb3498567cc3f..137943afbfb94d 100644 --- a/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_layernorm_residual_dropout_bias.h @@ -418,7 +418,9 @@ template + int LDGS = ELTS_PER_ROW / ELTS_PER_ROW_PER_CTA, + typename InType = T, + typename OutType = T> __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( int rows, int cols, @@ -428,7 +430,7 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( const bool is_test, const uint64_t increment, const float epsilon, - const T *__restrict__ x_ptr, + const InType *__restrict__ x_ptr, const T *__restrict__ residual_ptr, const T *__restrict__ bias_ptr, const ScaleT *__restrict__ gamma_ptr, @@ -437,10 +439,20 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( U *__restrict__ mean_out_ptr, U *__restrict__ var_out_ptr, T *__restrict__ residual_out_ptr, - T *__restrict__ y_ptr) { + OutType *__restrict__ y_ptr, + const float quant_last_in_scale = 1.0, + const float *__restrict__ quant_out_scale_ptr = nullptr, + const int quant_out_scale_offset = 0, + const float quant_next_in_scale = 1.0, + const int quant_round_type = 1, + const float quant_max_bound = 127.0, + const float quant_min_bound = -127.0) { __shared__ U smem[WARPS_M * WARPS_N]; using Vec = phi::AlignedVector; using Vec_scale = phi::AlignedVector; + using Vec_in_type = phi::AlignedVector; + using Vec_out_type = phi::AlignedVector; + using Vec_float = phi::AlignedVector; using MaskStoreT = phi::AlignedVector; const int tidx = threadIdx.x; @@ -481,12 +493,21 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( constexpr U rn = 1.f / U(ELTS_PER_ROW); for (int row = r; row < rows; row += gridDim.x * ROWS_PER_CTA) { Vec x[LDGS]; + Vec_in_type x_input[LDGS]; Vec residual[LDGS]; + Vec_float dequant_out_scale[LDGS]; + #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - phi::Load(x_ptr + row * ELTS_PER_ROW + col * VecSize, &x[it]); phi::Load(residual_ptr + row * ELTS_PER_ROW + col * VecSize, &residual[it]); + phi::Load(x_ptr + row * ELTS_PER_ROW + col * VecSize, + &x_input[it]); + if (quant_out_scale_ptr != nullptr) { + phi::Load( + quant_out_scale_ptr + quant_out_scale_offset + col * VecSize, + &dequant_out_scale[it]); + } col += THREADS_PER_ROW; } @@ -520,10 +541,21 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( #pragma unroll for (int jt = 0; jt < VecSize; jt++) { // dropout(x) + residual - x[it][jt] = (x[it][jt] + bias[it][jt]) * - static_cast(mask_vec[it][jt]) * factor + - residual[it][jt]; - xf[it * VecSize + jt] = U(x[it][jt]); + if (std::is_same::value) { + T tmp = (static_cast(static_cast(x_input[it][jt]) * + quant_last_in_scale / + 
dequant_out_scale[it][jt]) + + bias[it][jt]) * + static_cast(mask_vec[it][jt]) * factor + + residual[it][jt]; + x[it][jt] = tmp; + xf[it * VecSize + jt] = U(tmp); + } else { + x[it][jt] = (static_cast(x_input[it][jt]) + bias[it][jt]) * + static_cast(mask_vec[it][jt]) * factor + + residual[it][jt]; + xf[it * VecSize + jt] = U(x[it][jt]); + } } } } else { @@ -532,8 +564,19 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( #pragma unroll for (int jt = 0; jt < VecSize; jt++) { // dropout(x) + residual - x[it][jt] = x[it][jt] * static_cast(mask_vec[it][jt]) * factor + - residual[it][jt]; + if (std::is_same::value) { + // for int32 input, we need to dequantize. + T tmp = static_cast(static_cast(x_input[it][jt]) * + quant_last_in_scale / + dequant_out_scale[it][jt]) * + static_cast(mask_vec[it][jt]) * factor + + residual[it][jt]; + x[it][jt] = tmp; + } else { + x[it][jt] = static_cast(x_input[it][jt]) * + static_cast(mask_vec[it][jt]) * factor + + residual[it][jt]; + } xf[it * VecSize + jt] = U(x[it][jt]); } } @@ -626,6 +669,8 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( var_out_ptr[row] = var_local * rn; } + Vec_out_type x_output[LDGS]; + #pragma unroll for (int it = 0; it < LDGS; it++) { #pragma unroll @@ -638,12 +683,26 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( U tmp = rsigma * (static_cast(xf[it * VecSize + jt]) - mu_local); x[it][jt] = static_cast(static_cast(gamma[it][jt]) * tmp + static_cast(beta[it][jt])); + + if (std::is_same::value) + x_output[it][jt] = quant_helper(x[it][jt], + quant_next_in_scale, + quant_round_type, + quant_max_bound, + quant_min_bound); } } #pragma unroll for (int it = 0, col = c; it < LDGS; it++) { - phi::Store(x[it], y_ptr + row * ELTS_PER_ROW + col * VecSize); + if (std::is_same::value) { + phi::Store( + x_output[it], y_ptr + row * ELTS_PER_ROW + col * VecSize); + } else { + phi::Store( + x[it], + reinterpret_cast(y_ptr) + row * ELTS_PER_ROW + col * VecSize); + } col += THREADS_PER_ROW; } } @@ -668,7 +727,9 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_fast_ln_fwd_kernel( template + bool ScaleBiasWithSameTypeX = false, + typename InType = T, + typename OutType = T> void LaunchLayernormResidualDropoutBias( const uint32_t rows, const uint32_t cols, @@ -678,18 +739,26 @@ void LaunchLayernormResidualDropoutBias( const float epsilon, const bool is_upscale_in_train, const bool is_test, - const T *src, + const InType *src, const T *residual, const T *bias, const LayerNormScaleBiasT *scale, const LayerNormScaleBiasT *layernorm_bias, MaskType *mask_data, T *dst, - T *layernorm_dst, + OutType *layernorm_dst, LayerNormParamType *mean, LayerNormParamType *var, - const phi::GPUContext &ctx) { + const phi::GPUContext &ctx, + const float quant_last_in_scale = 1.0, + const float *dequant_out_scale_data = nullptr, + const int quant_out_scale_offset = 0, + const float quant_next_in_scale = 1.0, + const int quant_round_type = 1, + const float quant_max_bound = 127.0, + const float quant_min_bound = -127.0) { // dropout_prob == 1.0f + // NOTE(minghaoBD): OutType should be T if drop_out_rate == 1.0 if (std::abs(dropout_prob - 1.0f) < 1e-5) { auto cuda_place = ctx.GetPlace(); memory::Copy(cuda_place, @@ -705,14 +774,15 @@ void LaunchLayernormResidualDropoutBias( switch (GetDesiredBlockDim(cols)) { FIXED_BLOCK_DIM_CASE( LayerNormForward - <<>>(dst, - scale, - layernorm_bias, - layernorm_dst, - mean, - var, - epsilon, - cols)); + <<>>( + dst, + scale, + layernorm_bias, + 
reinterpret_cast(layernorm_dst), + mean, + var, + epsilon, + cols)); default: PADDLE_THROW(platform::errors::InvalidArgument( "Product from begin_norm_axis to end must be larger than 1")); @@ -722,44 +792,63 @@ void LaunchLayernormResidualDropoutBias( return; } -#define LAUNCH_FUSED_FAST_LN_KERNEL_BASE(cols) \ - case (cols): { \ - constexpr int WARPS_N = cols < 1024 ? 1 : (cols / 1024); \ - constexpr int WARPS_M = 4 / WARPS_N; \ - const int THREADS_PER_WARP = 32; \ - const int BYTES_PER_LDG = 16; \ - const int VecSize = BYTES_PER_LDG / sizeof(T); \ - const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; \ - const int ROWS_PER_CTA = WARPS_M; \ - const int grid = \ - static_cast(std::ceil(rows / static_cast(ROWS_PER_CTA))); \ - fused_fast_ln_fwd_kernel< \ - T, \ - U, \ - LayerNormScaleBiasT, \ - uint8_t, \ - VecSize, \ - WARPS_M, \ - WARPS_N, \ - BYTES_PER_LDG, \ - cols><<>>(rows, \ - cols, \ - seed, \ - dropout_prob, \ - is_upscale_in_train, \ - is_test, \ - increment, \ - epsilon, \ - src, \ - residual, \ - bias, \ - scale, \ - layernorm_bias, \ - mask_data, \ - mean, \ - var, \ - dst, \ - layernorm_dst); \ +#define LAUNCH_FUSED_FAST_LN_KERNEL_BASE(cols) \ + case (cols): { \ + constexpr int WARPS_N = cols < 1024 ? 1 : (cols / 1024); \ + constexpr int WARPS_M = 4 / WARPS_N; \ + const int THREADS_PER_WARP = 32; \ + const int BYTES_PER_LDG = 16; \ + const int VecSize = BYTES_PER_LDG / sizeof(T); \ + const int THREADS_PER_CTA = WARPS_N * THREADS_PER_WARP * WARPS_M; \ + const int ROWS_PER_CTA = WARPS_M; \ + const int THREADS_PER_ROW = WARPS_N * THREADS_PER_WARP; \ + const int ELTS_PER_ROW_PER_CTA = THREADS_PER_ROW * VecSize; \ + const int LDGS = cols / ELTS_PER_ROW_PER_CTA; \ + const int grid = \ + static_cast(std::ceil(rows / static_cast(ROWS_PER_CTA))); \ + fused_fast_ln_fwd_kernel< \ + T, \ + U, \ + LayerNormScaleBiasT, \ + uint8_t, \ + VecSize, \ + WARPS_M, \ + WARPS_N, \ + BYTES_PER_LDG, \ + cols, \ + THREADS_PER_WARP, \ + THREADS_PER_ROW, \ + THREADS_PER_CTA, \ + ROWS_PER_CTA, \ + ELTS_PER_ROW_PER_CTA, \ + LDGS, \ + InType, \ + OutType> \ + <<>>(rows, \ + cols, \ + seed, \ + dropout_prob, \ + is_upscale_in_train, \ + is_test, \ + increment, \ + epsilon, \ + src, \ + residual, \ + bias, \ + scale, \ + layernorm_bias, \ + mask_data, \ + mean, \ + var, \ + dst, \ + layernorm_dst, \ + quant_last_in_scale, \ + dequant_out_scale_data, \ + quant_out_scale_offset, \ + quant_next_in_scale, \ + quant_round_type, \ + quant_max_bound, \ + quant_min_bound); \ } break #define LAUNCH_FUSED_FAST_LN_KERNEL \ @@ -784,24 +873,25 @@ void LaunchLayernormResidualDropoutBias( if (cols % VecSize != 0) { int blockDim = GetDesiredBlockDim(cols); FusedLayernormResidualDropoutBias - <<>>(rows, - cols, - seed, - dropout_prob, - is_upscale_in_train, - is_test, - increment, - epsilon, - src, - residual, - bias, - scale, - layernorm_bias, - mask_data, - dst, - layernorm_dst, - mean, - var); + <<>>( + rows, + cols, + seed, + dropout_prob, + is_upscale_in_train, + is_test, + increment, + epsilon, + reinterpret_cast(src), + residual, + bias, + scale, + layernorm_bias, + mask_data, + dst, + reinterpret_cast(layernorm_dst), + mean, + var); } else { if (can_call_fast_ln_kernel) { switch (cols) { @@ -819,24 +909,25 @@ void LaunchLayernormResidualDropoutBias( VecSize, U, ScaleBiasWithSameTypeX> - <<>>(rows, - cols, - seed, - dropout_prob, - is_upscale_in_train, - is_test, - increment, - epsilon, - src, - residual, - bias, - scale, - layernorm_bias, - mask_data, - dst, - layernorm_dst, - mean, - var); + <<>>( + rows, 
+ cols, + seed, + dropout_prob, + is_upscale_in_train, + is_test, + increment, + epsilon, + reinterpret_cast(src), + residual, + bias, + scale, + layernorm_bias, + mask_data, + dst, + reinterpret_cast(layernorm_dst), + mean, + var); } } } diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc new file mode 100644 index 00000000000000..9572a87aba21de --- /dev/null +++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cc @@ -0,0 +1,369 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +class FusedMultiTransformerINT8Op : public framework::OperatorWithKernel { + private: + static constexpr const char *OpName = "FusedMultiTransformerINT8Op"; + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { +#define CHECK_INPUT(name) \ + OP_INOUT_CHECK(ctx->HasInput(#name), "Input", #name, OpName) +#define CHECK_INPUTS(name) \ + OP_INOUT_CHECK(ctx->HasInputs(#name), "Input", #name, OpName) +#define CHECK_OUTPUT(name) \ + OP_INOUT_CHECK(ctx->HasOutput(#name), "Output", #name, OpName) +#define CHECK_OUTPUTS(name) \ + OP_INOUT_CHECK(ctx->HasOutputs(#name), "Output", #name, OpName) + + CHECK_INPUT(X); + + // attention + CHECK_INPUTS(QKVW); + CHECK_INPUTS(OutLinearW); + + if (ctx->HasInput("TimeStep")) { + CHECK_INPUTS(CacheKV); + } + + if (ctx->HasInputs("CacheKV")) { + CHECK_OUTPUTS(CacheKVOut); + } + + // ffn + CHECK_INPUTS(FFN1Weight); + CHECK_INPUTS(FFN2Weight); + + CHECK_OUTPUT(Out); + + // x: qkv's input [batch_size, seq_len, dim_embed] + // y: qkv's weight: [3, num_head, dim_head, dim_embed] + auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputsDim("QKVW")[0]; + bool trans_qkvw = ctx->Attrs().Get("trans_qkvw"); + PADDLE_ENFORCE_EQ( + x_dim.size(), + 3, + platform::errors::InvalidArgument("The dimensions of x must be 3" + "(batch_size, seq_len, dim_embed)," + "but received dimensions of" + "Input is [%d]", + x_dim.size())); + PADDLE_ENFORCE_EQ(y_dim.size(), + 4, + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "but received dimensions of" + "Input is [%d]", + y_dim.size())); + PADDLE_ENFORCE_EQ( + x_dim[2], + trans_qkvw ? y_dim[3] : y_dim[0], + platform::errors::InvalidArgument( + "ShapeError: the dimension of x_dim[2] and y_dim[3](trans_qkvw is " + "true) or y_dim[0](trans_qkvw is false)" + "must be equal. 
But received: the shape " + "of input x = [%s], and the shape of " + "input qkv_weight = [%s]", + x_dim, + y_dim)); + + if (ctx->Attrs().Get("ring_id") == -1) { + if (trans_qkvw) { + PADDLE_ENFORCE_EQ(y_dim[1] * y_dim[2], + y_dim[3], + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(3, num_head, dim_head, dim_embed)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + + } else { + PADDLE_ENFORCE_EQ(y_dim[2] * y_dim[3], + y_dim[0], + platform::errors::InvalidArgument( + "The dimensions of qkv_weight must be 4" + "(dim_embed, 3, num_head, dim_head)," + "and must satisfy the limitations: " + "(num_head * dim_head == dim_embed)")); + } + } + + if (ctx->HasInputs("CacheKV")) { + // [2, batch_size, num_head, max_seq_len, head_size] + const auto &c_dims = ctx->GetInputsDim("CacheKV"); + const auto &c_dim = c_dims[0]; + + PADDLE_ENFORCE_EQ( + c_dim.size(), + 5, + paddle::platform::errors::InvalidArgument( + "The CacheKV must be 5 dims, but got %d", c_dim.size())); + PADDLE_ENFORCE_EQ(c_dim[0], + 2, + paddle::platform::errors::InvalidArgument( + "The first dim of CacheKV must be 2, but got %d", + c_dim[0])); // 2 + PADDLE_ENFORCE_EQ(c_dim[1], + x_dim[0], + paddle::platform::errors::InvalidArgument( + "The second dim of CacheKV must be equal with " + "batch size %d, but got %d", + x_dim[0], + c_dim[1])); // batch_size + PADDLE_ENFORCE_EQ(c_dim[2], + trans_qkvw ? y_dim[1] : y_dim[2], + paddle::platform::errors::InvalidArgument( + "The third dim of CacheKV must be equal with num " + "head %d, but got %d", + trans_qkvw ? y_dim[1] : y_dim[2], + c_dim[2])); // num_head + PADDLE_ENFORCE_GT( + c_dim[3], + 0, + paddle::platform::errors::InvalidArgument( + "The forth dim of CacheKV must be greater than 0, but got %d", + c_dim[3])); // cache_seq_len + PADDLE_ENFORCE_EQ(c_dim[4], + trans_qkvw ? y_dim[2] : y_dim[3], + paddle::platform::errors::InvalidArgument( + "The fifth dim of CacheKV must be equal with head " + "size %d, but got %d", + trans_qkvw ? y_dim[2] : y_dim[3], + c_dim[4])); // head_size + } + + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } + + framework::OpKernelType GetKernelTypeForVar( + const std::string &var_name, + const Tensor &tensor, + const framework::OpKernelType &expected_kernel_type) const override { + if (var_name == "TimeStep") { + VLOG(10) << "var_name:" << var_name << " need not to transform"; + return expected_kernel_type; + } + return framework::OpKernelType( + expected_kernel_type.data_type_, tensor.place(), tensor.layout()); + } +}; + +class FusedMultiTransformerINT8OpMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input tensor."); + AddInput("LnScale", + "Scale is a 1-dimensional tensor of size " + "H. Here, H represents the last dimension of its input tensor.") + .AsDuplicable(); + AddInput("LnBias", + "Bias is a 1-dimensional tensor of size " + "H. 
Here, H represents the last dimension of its input tensor.") + .AsDuplicable(); + AddInput("QKVW", "The qkv weight tensor.").AsDuplicable(); + AddInput("QKVBias", "The qkv bias tensor.").AsDispensable().AsDuplicable(); + + AddInput("CacheKV", "(optional) The cached KV for generation inference.") + .AsDispensable() + .AsDuplicable(); + AddInput("TimeStep", + "(optional, int) The time step for generation inference.") + .AsDispensable(); + AddInput("SrcMask", "(optional) The attention mask tensor in fmha.") + .AsDispensable(); + AddInput("OutLinearW", "The out_linear weight tensor.").AsDuplicable(); + AddInput("OutLinearBias", "The out_linear bias tensor.") + .AsDispensable() + .AsDuplicable(); + + AddInput("FFNLnScale", "The layer_norm scale of FusedFeedForward op") + .AsDuplicable(); + AddInput("FFNLnBias", "The layer_norm bias of FusedFeedForward op") + .AsDuplicable(); + + AddInput("FFN1Weight", "The linear1 weight of FusedFeedForward op") + .AsDuplicable(); + AddInput("FFN1Bias", "The linear1 bias of FusedFeedForward op") + .AsDispensable() + .AsDuplicable(); + + AddInput("FFN2Weight", "The linear2 weight of FusedFeedForward op") + .AsDuplicable(); + AddInput("FFN2Bias", "The linear2 bias input of FusedFeedForward op") + .AsDispensable() + .AsDuplicable(); + + AddInput("QKVOutScale", + "QKVOutScale is used to dequantize qkv output tensor." + "In order to keep consistent with the PTQ/QAT calculation logic," + "QKVOutScale should be max_bound * max_bound / max_range." + "Here max_range is per-channel weight scale." + "The shape of QKVOutScale is [num_layers, num_channels]") + .AsDispensable(); + AddInput("OutLinearOutScale", + "OutLinearOutScale is used to dequantize out_linear output tensor." + "The definition and shape is the same as QKVOutScale") + .AsDispensable(); + AddInput("FFN1OutScale", + "FFN1OutScale is used to dequantize ffn1 output tensor." + "The definition and shape is the same as QKVOutScale") + .AsDispensable(); + AddInput("FFN2OutScale", + "FFN2OutScale is used to dequantize ffn2 output tensor." + "The definition and shape is the same as QKVOutScale") + .AsDispensable(); + + AddOutput("CacheKVOut", "The updated cache KV. Inplace with CacheKV") + .AsDispensable() + .AsDuplicable(); + AddOutput("Out", "Result after multi ."); + + AddAttr("pre_layer_norm", + "if true, the attention op uses pre_layer_norm architecure, " + "else, uses post_layer_norm architecuture. " + "[default true].") + .SetDefault(true); + AddAttr("epsilon", + "Constant for numerical stability [default 1e-5].") + .SetDefault(1e-5) + .AddCustomChecker([](const float &epsilon) { + PADDLE_ENFORCE_EQ(epsilon >= 0.0f && epsilon <= 0.001f, + true, + platform::errors::InvalidArgument( + "'epsilon' in Op(LayerNorm) should be between" + "0.0 and 0.001, But received [%s].", + epsilon)); + }); + + AddAttr("dropout_rate", "Probability of setting units to zero.") + .SetDefault(.5f) + .AddCustomChecker([](const float &drop_p) { + PADDLE_ENFORCE_EQ(drop_p >= 0.0f && drop_p <= 1.0f, + true, + platform::errors::InvalidArgument( + "'dropout_rate' must be between 0.0 and 1.0.")); + }); + + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. 
Some layers may run faster when this is true.") + .SetDefault(false); + AddAttr( + "dropout_implementation", + "[\"downgrade_in_infer\"|\"upscale_in_train\"]" + "The meaning is the same as 'attn_dropout_implementation'.") + .SetDefault("downgrade_in_infer") + .AddCustomChecker([](const std::string &type) { + PADDLE_ENFORCE_EQ( + type == "downgrade_in_infer" || type == "upscale_in_train", + true, + platform::errors::InvalidArgument( + "dropout_implementation can only be downgrade_in_infer or " + "upscale_in_train")); + }); + AddAttr("act_method", "act_method").SetDefault("gelu"); + AddAttr( + "trans_qkvw", + "Whether the weights of qkv should be transposed. If true," + "the shape eights of qkv should be [3, num_head, dim_head, dim_embed]." + "Otherwise the shape of weights of qkv should be" + "[dim_embed, 3, num_head, dim_head]") + .SetDefault(true); + + AddAttr( + "ring_id", + "ring id for tensor model parallel. distributed training and inference") + .SetDefault(-1); + + AddAttr("num_head", "num_head").SetDefault(0); + AddAttr("dim_head", "dim_head").SetDefault(0); + AddAttr("dim_ffn", "dim_ffn").SetDefault(0); + + AddAttr>( + "qkv_in_scale", + "qkv_in_scale is used to quantize qkv input tensor." + "in_scale is generated by PTQ or QAT, which represents valid max range " + "of this tensor." + "the size of qkv_in_scale should be num_layers, which is equal to " + "QKVW.dims()[0]") + .SetDefault({}); + AddAttr>( + "out_linear_in_scale", + "out_linear_in_scale is used to quantize out_linear input tensor." + "the size of out_linear_in_scale is the same as qkv_in_scale") + .SetDefault({}); + AddAttr>( + "ffn1_in_scale", + "ffn1_in_scale is used to quantize ffn1 input tensor." + "the size of ffn1_in_scale is the same as qkv_in_scale") + .SetDefault({}); + AddAttr>( + "ffn2_in_scale", + "ffn2_in_scale is used to quantize ffn2 input tensor." + "the size of ffn2_in_scale is the same as qkv_in_scale") + .SetDefault({}); + + AddAttr( + "quant_round_type", + "(int, default 1) The round type of fp32 to int." + "0: rounding to nearest ties to even. Eg: round(1.5)=2, round(2.5)=2" + "1: rounding to nearest ties away from zero. Eg: round(1.5)=2, " + "round(-2.5)=-3") + .SetDefault(1); + AddAttr( + "quant_max_bound", + "(float, default 127.0) the max bound of float type to int type") + .SetDefault(127.0); + AddAttr( + "quant_min_bound", + "(float, default -127.0) the min bound of float type to int type") + .SetDefault(-127.0); + + AddComment(R"DOC(fused multi transformer layers op)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + fused_multi_transformer_int8, + ops::FusedMultiTransformerINT8Op, + ops::FusedMultiTransformerINT8OpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu new file mode 100644 index 00000000000000..8e200275f8171a --- /dev/null +++ b/paddle/fluid/operators/fused/fused_multi_transformer_int8_op.cu @@ -0,0 +1,670 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/fused/attn_gemm_int8.h" +#include "paddle/fluid/operators/fused/fused_multi_transformer_op.h" + +namespace paddle { +namespace operators { + +template +class FusedMultiTransformerINT8OpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + using U = LayerNormParamType; + auto &dev_ctx = ctx.cuda_device_context(); + + auto *time_step = ctx.Input("TimeStep"); + // 0. input + auto *input_x = ctx.Input("X"); + const auto input_x_dims = input_x->dims(); + int bsz = input_x_dims[0]; + int seq_len = input_x_dims[1]; + int dim_embed = input_x_dims[2]; + int bsz_seq = bsz * seq_len; + + // quant input scales, vector, size = num_layers + auto qkv_in_scale = ctx.Attr>("qkv_in_scale"); + auto out_linear_in_scale = + ctx.Attr>("out_linear_in_scale"); + auto ffn1_in_scale = ctx.Attr>("ffn1_in_scale"); + auto ffn2_in_scale = ctx.Attr>("ffn2_in_scale"); + + // quant round type and bound + auto quant_round_type = ctx.Attr("quant_round_type"); + auto quant_max_bound = ctx.Attr("quant_max_bound"); + auto quant_min_bound = ctx.Attr("quant_min_bound"); + + // dequant output scales, tensor, size = [num_layers, n], n is gemm output + // size + auto *qkv_out_scale = ctx.Input("QKVOutScale"); + auto *out_linear_out_scale = ctx.Input("OutLinearOutScale"); + auto *ffn1_out_scale = ctx.Input("FFN1OutScale"); + auto *ffn2_out_scale = ctx.Input("FFN2OutScale"); + + int qkv_out_scale_n = qkv_out_scale->dims()[1]; + int out_linear_out_scale_n = out_linear_out_scale->dims()[1]; + int ffn1_out_scale_n = ffn1_out_scale->dims()[1]; + int ffn2_out_scale_n = ffn2_out_scale->dims()[1]; + + // 1. layer norm + const auto pre_layer_norm = ctx.Attr("pre_layer_norm"); + const float epsilon = ctx.Attr("epsilon"); + auto ln_scales = ctx.MultiInput("LnScale"); + auto ln_biases = ctx.MultiInput("LnBias"); + + auto ln_compute = + AttnLayerNorm(dev_ctx, epsilon, bsz_seq, dim_embed); + Tensor ln_mean, ln_var; + ln_mean.Resize({{bsz_seq}}); + auto *ln_mean_data = + dev_ctx.Alloc(&ln_mean, ln_mean.numel() * sizeof(U)); + ln_var.Resize({{bsz_seq}}); + auto *ln_var_data = dev_ctx.Alloc(&ln_var, ln_var.numel() * sizeof(U)); + + // 2. qkv + // x: qkv's input [batch_size, seq_len, dim_embed] + // y: qkv's weight: [3, num_head, dim_head, dim_embed] + auto qkv_weights = ctx.MultiInput("QKVW"); + auto qkv_biases = ctx.MultiInput("QKVBias"); + const bool trans_qkvw = ctx.Attr("trans_qkvw"); + const auto qkv_w_dims = qkv_weights[0]->dims(); + int num_head = trans_qkvw ? qkv_w_dims[1] : qkv_w_dims[2]; + int dim_head = trans_qkvw ? qkv_w_dims[2] : qkv_w_dims[3]; + int hidden_size = num_head * dim_head; + int output_size = 3 * hidden_size; + int input_size = dim_embed; + + bool compute_bias = qkv_biases.size() > 0 && time_step == nullptr; + // (transA, transB, compute_bias) = (false, trans_qkvw, false) + AttnMatmulINT8 qkv_compute( + dev_ctx, bsz_seq, output_size, input_size, compute_bias); + Tensor qkv_out; + qkv_out.Resize({{bsz, seq_len, 3, num_head, dim_head}}); + auto *qkv_out_data = + dev_ctx.Alloc(&qkv_out, qkv_out.numel() * sizeof(T)); + + // 3. 
fmha + AttnDropoutParam attn_param( + true, "upscale_in_train", 0.0, true, true, 0, nullptr); + auto fmha_compute = + FMHARef(dev_ctx, bsz, seq_len, num_head, dim_head, attn_param); + auto *src_mask = ctx.Input("SrcMask"); + auto cache_kvs = ctx.MultiInput("CacheKV"); + auto cache_kv_outs = ctx.MultiOutput("CacheKVOut"); + // auto *time_step = ctx.Input("TimeStep"); + + auto out_seq_len = seq_len; + if (time_step) { + PADDLE_ENFORCE_EQ(time_step->place(), + platform::CPUPlace(), + platform::errors::PreconditionNotMet( + "The place of input(TimeStep) must be CPUPlace.")); + // cache_seq_len + int time_step_value = time_step->data()[0]; + PADDLE_ENFORCE_GT(time_step_value, + 0, + platform::errors::PreconditionNotMet( + "The value of time_step must > 0, but now is %d", + time_step_value)); + PADDLE_ENFORCE_EQ( + seq_len, + 1, + platform::errors::PreconditionNotMet( + "In decode stage, the seq_len of input must be 1, but now is %d", + seq_len)); + out_seq_len += time_step_value; + } + + Tensor transpose_out_2, qk_out; + transpose_out_2.Resize({{3, bsz, num_head, seq_len, dim_head}}); + auto *transpose_out_2_data = + dev_ctx.Alloc(&transpose_out_2, transpose_out_2.numel() * sizeof(T)); + qk_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); + auto *qk_out_data = dev_ctx.Alloc(&qk_out, qk_out.numel() * sizeof(T)); + + Tensor softmax_out; + Tensor attn_dropout_mask_out, attn_dropout_out; + Tensor qktv_out, fmha_out; + softmax_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); + auto *softmax_out_data = + dev_ctx.Alloc(&softmax_out, softmax_out.numel() * sizeof(T)); + + attn_dropout_mask_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); + auto *attn_dropout_mask_out_data = dev_ctx.Alloc( + &attn_dropout_mask_out, attn_dropout_mask_out.numel() * sizeof(T)); + attn_dropout_out.Resize({{bsz, num_head, seq_len, out_seq_len}}); + auto *attn_dropout_data_data = dev_ctx.Alloc( + &attn_dropout_out, attn_dropout_out.numel() * sizeof(T)); + + qktv_out.Resize({{bsz, num_head, seq_len, dim_head}}); + auto *qktv_out_data = + dev_ctx.Alloc(&qktv_out, qktv_out.numel() * sizeof(T)); + fmha_out.Resize({{bsz, seq_len, num_head, dim_head}}); + auto *fmha_out_data = + dev_ctx.Alloc(&fmha_out, fmha_out.numel() * sizeof(T)); + + // 4. out_linear + auto out_linear_weights = ctx.MultiInput("OutLinearW"); + auto out_linear_biases = ctx.MultiInput("OutLinearBias"); + int ring_id = ctx.Attr("ring_id"); + // (transA, transB, compute_bias) = (false, false, false) + AttnMatmulINT8 out_linear_compute( + dev_ctx, bsz_seq, dim_embed, hidden_size, false); + + // 5. ln(residual + bias) + DropoutParam dropout_param2(true, 0, true, true, 0.0, nullptr, 0); + FusedDropoutLayerNormHelper + fused_dropout_layernorm_helper( + dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); + FusedDropoutLayerNormHelper + fused_dropout_layernorm_helper_for_post_layernorm( + dev_ctx, bsz_seq, dim_embed, dropout_param2, epsilon); + auto ffn_ln_scales = ctx.MultiInput("FFNLnScale"); + auto ffn_ln_biases = ctx.MultiInput("FFNLnBias"); + Tensor bias_dropout_residual_out, dropout_mask_out; + T *bias_dropout_residual_out_data = nullptr; + if (pre_layer_norm) { + bias_dropout_residual_out.Resize({{bsz, seq_len, dim_embed}}); + bias_dropout_residual_out_data = + dev_ctx.Alloc(&bias_dropout_residual_out, + bias_dropout_residual_out.numel() * sizeof(T)); + } + dropout_mask_out.Resize({{bsz, seq_len, dim_embed}}); + auto *dropout_mask_out_data = dev_ctx.Alloc( + &dropout_mask_out, dropout_mask_out.numel() * sizeof(uint8_t)); + + // 6. 
ffn matmul1 + auto ffn1_weights = ctx.MultiInput("FFN1Weight"); + auto ffn1_biases = ctx.MultiInput("FFN1Bias"); + auto ffn1_weight_dim = ffn1_weights[0]->dims(); + + int dim_ffn = ffn1_weight_dim[0]; + AttnMatmulINT8 ffn1_linear_compute( + dev_ctx, bsz_seq, dim_ffn, dim_embed, false); + Tensor ffn1_out; + ffn1_out.Resize({{bsz_seq, dim_ffn}}); + auto *ffn1_out_data = + dev_ctx.Alloc(&ffn1_out, ffn1_out.numel() * sizeof(T)); + + // 7. ffn act + bias + DropoutParam ffn1_dropout_param(true, 0, true, true, 0.0, nullptr, 0); + FusedDropoutHelper fused_act_dropout_helper( + dev_ctx, bsz_seq, dim_ffn, ffn1_dropout_param); + FusedDropoutHelper fused_act_dropout_helper_for_post_layernorm( + dev_ctx, bsz_seq, dim_ffn, ffn1_dropout_param); + Tensor ffn1_dropout_out, ffn1_dropout_mask; + ffn1_dropout_out.Resize({{bsz_seq, dim_ffn}}); + auto *ffn1_dropout_out_data = dev_ctx.Alloc( + &ffn1_dropout_out, ffn1_dropout_out.numel() * sizeof(T)); + ffn1_dropout_mask.Resize({{bsz_seq, dim_ffn}}); + auto *ffn1_dropout_mask_data = dev_ctx.Alloc( + &ffn1_dropout_mask, ffn1_dropout_mask.numel() * sizeof(uint8_t)); + + // 8. ffn2 matmul + auto ffn2_weights = ctx.MultiInput("FFN2Weight"); + auto ffn2_biases = ctx.MultiInput("FFN2Bias"); + AttnMatmulINT8 ffn2_linear_compute( + dev_ctx, bsz_seq, dim_embed, dim_ffn, false); + + // 9. ffn2 residual bias + DropoutParam ffn2_dropout_param(true, 0, true, true, 0.0, nullptr, 0); + FusedDropoutLayerNormHelper + ffn2_fused_dropout_helper( + dev_ctx, bsz_seq, dim_embed, ffn2_dropout_param, epsilon); + FusedDropoutLayerNormHelper + ffn2_fused_dropout_dequant_helper( + dev_ctx, bsz_seq, dim_embed, ffn2_dropout_param, epsilon); + FusedDropoutLayerNormHelper + ffn2_fused_dropout_helper_for_post_layernorm( + dev_ctx, bsz_seq, dim_embed, ffn2_dropout_param, epsilon); + + // []. init workspace for cublasLt transform + Tensor input_workspace, output_workspace; + // for input and output transform data is CUBLASLT_ORDER_COL32 format, + int m_max = bsz_seq, k_max = std::max(dim_embed, dim_ffn), + n_max = std::max({output_size, dim_embed, dim_ffn}); + + input_workspace.Resize( + {{32 * ((m_max + 32 - 1) / 32), (k_max + 31) / 32 * 32}}); + dev_ctx.Alloc(&input_workspace, + input_workspace.numel() * sizeof(int8_t)); + output_workspace.Resize({{n_max * 4, (m_max + 31) / 32 * 32 * 4}}); + dev_ctx.Alloc(&output_workspace, + output_workspace.numel() * sizeof(int32_t)); + + // calc + auto *out = ctx.Output("Out"); + auto *from_data = dev_ctx.Alloc(out, out->numel() * sizeof(T)); + Tensor *from_tensor = out; + Tensor tmp_out; + tmp_out.Resize({{bsz, seq_len, dim_embed}}); + auto *tmp_out_data = + dev_ctx.Alloc(&tmp_out, tmp_out.numel() * sizeof(T)); + + auto *x_data = input_x->data(); + Tensor *buf0 = nullptr; + Tensor *buf1 = nullptr; + + // step0: x --> buf1 + // step1: buf1 --> buf0 + // step2: buf0 --> buf1 + int layers = qkv_weights.size(); + if (pre_layer_norm) { + buf1 = out; + } else { + buf0 = &tmp_out; + buf1 = out; + } + + for (int i = 0; i < layers; ++i) { + // step1. layer_norm + if (i == 0 && pre_layer_norm) { + auto *ln_scale_data = ln_scales[i]->data(); + auto *ln_bias_data = ln_biases[i]->data(); + // TODO(wangxi): can remove mean var in inference + ln_compute.ComputeForward(x_data, + ln_scale_data, + ln_bias_data, + input_workspace.data(), + ln_mean_data, + ln_var_data, + nullptr, + 0, + qkv_in_scale[i], + quant_round_type, + quant_max_bound, + quant_min_bound); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step1"; +#endif + + // step2. 
qkv + const Tensor *qkv_bias = qkv_biases.size() > 0 ? qkv_biases[i] : nullptr; + // NOTE: in decoder stage, bias is fused in fmha + const Tensor *bias = time_step ? nullptr : qkv_bias; + if (!pre_layer_norm && i == 0) { + qkv_compute.ComputeForward(qkv_weights[i], + input_x, + &input_workspace, + bias, + &qkv_out, + &output_workspace, + &qkv_out, + qkv_in_scale[i], + qkv_out_scale, + i * qkv_out_scale_n, + quant_round_type, + quant_max_bound, + quant_min_bound); + } else if (!pre_layer_norm) { + qkv_compute.ComputeForward(qkv_weights[i], + buf1, + &input_workspace, + bias, + &qkv_out, + &output_workspace, + &qkv_out, + qkv_in_scale[i], + qkv_out_scale, + i * qkv_out_scale_n, + quant_round_type, + quant_max_bound, + quant_min_bound); + } else { + qkv_compute.ComputeForwardINT8ToT(qkv_weights[i], + qkv_in_scale[i], + &input_workspace, + bias, + &qkv_out, + &output_workspace, + &qkv_out, + qkv_out_scale, + i * qkv_out_scale_n); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step2"; +#endif + + // step3. fmha + const Tensor *cache_kv = cache_kvs.size() > 0 ? cache_kvs[i] : nullptr; + Tensor *cache_kv_out = cache_kv ? cache_kv_outs[i] : nullptr; + + if (time_step) { // generation decoder stage + // [2, batch_size, num_head, max_seq_len, head_size] + int max_seq_len = cache_kv->dims()[3]; + fmha(dev_ctx, + qkv_out, + *qkv_bias, + *src_mask, + cache_kv_out, + &fmha_out, + bsz, + max_seq_len, + num_head, + dim_head, + time_step->data()[0], + 1. / sqrt(dim_head)); + } else if (cache_kv_out) { // generation context stage + // TODO(wangxi): can remove dropout in inference + fmha_compute.ComputeForward(qkv_out, + nullptr, + src_mask, + &transpose_out_2, + nullptr, + &qk_out, + nullptr, + &softmax_out, + &attn_dropout_mask_out, + &attn_dropout_out, + &qktv_out, + &fmha_out); + // [3, bsz, num_head, seq_len, head_dim] + T *qkv_data = transpose_out_2_data; + int64_t q_size = bsz * seq_len * num_head * dim_head; + int64_t k_size = q_size; + const T *q_ptr = qkv_data; + const T *k_ptr = q_ptr + q_size; + const T *v_ptr = k_ptr + k_size; + + // [2, bsz, num_head, max_seq_len, head_dim] + int max_seq_len = cache_kv_out->dims()[3]; + T *cache_kv_data = cache_kv_out->data(); + int64_t cache_k_size = bsz * num_head * max_seq_len * dim_head; + + T *cache_k_ptr = cache_kv_data; + T *cache_v_ptr = cache_kv_data + cache_k_size; + + write_cache_kv(dev_ctx, + cache_k_ptr, + cache_v_ptr, + k_ptr, + v_ptr, + bsz, + num_head, + seq_len, + max_seq_len, + dim_head); + } else { // not generation + // TODO(wangxi): can remove dropout in inference + fmha_compute.ComputeForward(qkv_out, + cache_kv, + src_mask, + &transpose_out_2, + cache_kv_out, + &qk_out, + nullptr, + &softmax_out, + &attn_dropout_mask_out, + &attn_dropout_out, + &qktv_out, + &fmha_out); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step3"; +#endif + + if (pre_layer_norm) { + out_linear_compute.ComputeForwardTToINT8(out_linear_weights[i], + out_linear_in_scale[i], + &fmha_out, + &input_workspace, + nullptr, + &output_workspace, + nullptr, + quant_round_type, + quant_max_bound, + quant_min_bound); + AllReduce(output_workspace, + ring_id, + bsz * seq_len * num_head * dim_head, + dev_ctx); + } else { + out_linear_compute.ComputeForward(out_linear_weights[i], + &fmha_out, + &input_workspace, + nullptr, + buf0, + &output_workspace, + nullptr, + out_linear_in_scale[i], + out_linear_out_scale, + i * out_linear_out_scale_n, + quant_round_type, + quant_max_bound, + quant_min_bound); + AllReduce(*buf0, ring_id, buf0->numel(), dev_ctx); + } 
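// Editor's illustrative sketch (assumption, not lines from this patch): in the
// pre_layer_norm branch above, the int8 out_linear GEMM leaves int32
// accumulators in output_workspace; the LayernormResidualDropoutBias call in
// step5 below receives out_linear_in_scale[i], out_linear_out_scale and
// ffn1_in_scale[i] so it can dequantize those accumulators and re-quantize the
// normalized result for the next int8 GEMM. The helper below only mirrors that
// per-element round trip as seen in fused_fast_ln_fwd_kernel; the name
// DequantRequant and the exact rounding calls are assumptions, not Paddle APIs.
#include <algorithm>
#include <cmath>
#include <cstdint>

inline int8_t DequantRequant(int32_t acc,          // int32 GEMM accumulator
                             float last_in_scale,  // e.g. out_linear_in_scale[i]
                             float out_scale,      // per-channel dequant scale
                             float next_in_scale,  // e.g. ffn1_in_scale[i]
                             int round_type,       // 0: ties to even, 1: ties away from zero
                             float max_bound = 127.0f,
                             float min_bound = -127.0f) {
  // Dequantize, matching x_input * quant_last_in_scale / dequant_out_scale.
  float x = static_cast<float>(acc) * last_in_scale / out_scale;
  // Re-quantize with the next layer's input scale (the kernel's quant_helper
  // may fold max_bound into this scale; treated here as a plain multiply),
  // round according to round_type, and clamp to the int8 bounds.
  float q = x * next_in_scale;
  q = (round_type == 0) ? std::nearbyint(q) : std::round(q);
  return static_cast<int8_t>(std::min(std::max(q, min_bound), max_bound));
}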
+#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step4"; +#endif + + // step5. ln(residual + dropout(input + bias)) + if (pre_layer_norm) { + auto *ln_scale_data = ffn_ln_scales[i]->data(); + auto *ln_bias_data = ffn_ln_biases[i]->data(); + auto *out_linear_bias_data = out_linear_biases[i]->data(); + + // inplace + // non-inplace: buf1 -> input_workspace + fused_dropout_layernorm_helper.LayernormResidualDropoutBias( + dev_ctx, + output_workspace.data(), + x_data, + out_linear_bias_data, + ln_scale_data, + ln_bias_data, + bias_dropout_residual_out_data, + dropout_mask_out_data, + input_workspace.data(), + ln_mean_data, + ln_var_data, + out_linear_in_scale[i], + out_linear_out_scale->data(), + i * out_linear_out_scale_n, + ffn1_in_scale[i], + quant_round_type, + quant_max_bound, + quant_min_bound); + } else { + auto *ln_scale_data = ln_scales[i]->data(); + auto *ln_bias_data = ln_biases[i]->data(); + auto *out_linear_bias_data = out_linear_biases[i]->data(); + auto *residual_data = (i == 0 ? x_data : buf1->data()); + fused_dropout_layernorm_helper_for_post_layernorm + .LayernormResidualDropoutBias(dev_ctx, + buf0->data(), + residual_data, + out_linear_bias_data, + ln_scale_data, + ln_bias_data, + buf0->data(), + dropout_mask_out_data, + buf1->data(), + ln_mean_data, + ln_var_data); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step5"; +#endif + + // step6. ffn matmul1 + + if (pre_layer_norm) { + ffn1_linear_compute.ComputeForwardINT8ToINT8(ffn1_weights[i], + &input_workspace, + nullptr, + &output_workspace, + nullptr); + } else { + ffn1_linear_compute.ComputeForward(ffn1_weights[i], + buf1, + &input_workspace, + nullptr, + &ffn1_out, + &output_workspace, + nullptr, + ffn1_in_scale[i], + ffn1_out_scale, + i * ffn1_out_scale_n, + quant_round_type, + quant_max_bound, + quant_min_bound); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step6"; +#endif + + // step7. act bias + // TODO(wangxi): remove dropout mask in inference + if (pre_layer_norm) { + fused_act_dropout_helper.DropoutActBias( + dev_ctx, + output_workspace.data(), + ffn1_biases[i]->data(), + "gelu", + input_workspace.data(), + ffn1_dropout_mask_data, + ffn1_in_scale[i], + ffn1_out_scale->data(), + i * ffn1_out_scale_n, + ffn2_in_scale[i], + quant_round_type, + quant_max_bound, + quant_min_bound); + } else { + fused_act_dropout_helper_for_post_layernorm.DropoutActBias( + dev_ctx, + ffn1_out_data, + ffn1_biases[i]->data(), + "gelu", + ffn1_dropout_out_data, + ffn1_dropout_mask_data); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step7"; +#endif + + // step8. ffn matmul2 + if (pre_layer_norm) { + ffn2_linear_compute.ComputeForwardINT8ToINT8(ffn2_weights[i], + &input_workspace, + nullptr, + &output_workspace, + nullptr); + } else { + ffn2_linear_compute.ComputeForward(ffn2_weights[i], + &ffn1_dropout_out, + &input_workspace, + nullptr, + buf0, + &output_workspace, + nullptr, + ffn2_in_scale[i], + ffn2_out_scale, + i * ffn2_out_scale_n, + quant_round_type, + quant_max_bound, + quant_min_bound); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step8.0"; +#endif + + if (pre_layer_norm) { + AllReduce(output_workspace, + ring_id, + bsz * seq_len * num_head * dim_head, + dev_ctx); + } else { + AllReduce(*buf0, ring_id, buf0->numel(), dev_ctx); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step8.1"; +#endif + + // step9. 
residual bias + if (pre_layer_norm) { + // TODO(wangxi): remove dropout mask in inference + if (i < layers - 1) { + auto *ln_scale_data = ln_scales[i + 1]->data(); + auto *ln_bias_data = ln_biases[i + 1]->data(); + + ffn2_fused_dropout_helper.LayernormResidualDropoutBias( + dev_ctx, + output_workspace.data(), + bias_dropout_residual_out_data, + ffn2_biases[i]->data(), + ln_scale_data, + ln_bias_data, + buf1->data(), + dropout_mask_out_data, + input_workspace.data(), + ln_mean_data, + ln_var_data, + ffn2_in_scale[i], + ffn2_out_scale->data(), + i * ffn2_out_scale_n, + qkv_in_scale[i + 1], + quant_round_type, + quant_max_bound, + quant_min_bound); + } else { + ffn2_fused_dropout_dequant_helper.ResidualDropoutBias( + dev_ctx, + output_workspace.data(), + bias_dropout_residual_out_data, + ffn2_biases[i]->data(), + buf1->data(), + dropout_mask_out_data, + ffn2_in_scale[i], + ffn2_out_scale->data(), + i * ffn2_out_scale_n, + 1.0); + } + } else { + auto *ln_scale_data = ffn_ln_scales[i]->data(); + auto *ln_bias_data = ffn_ln_biases[i]->data(); + ffn2_fused_dropout_helper_for_post_layernorm + .LayernormResidualDropoutBias(dev_ctx, + buf0->data(), + buf1->data(), + ffn2_biases[i]->data(), + ln_scale_data, + ln_bias_data, + buf0->data(), + dropout_mask_out_data, + buf1->data(), + ln_mean_data, + ln_var_data); + } +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + VLOG(0) << "step9"; +#endif + if (pre_layer_norm) { + x_data = buf1->data(); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OP_CUDA_KERNEL(fused_multi_transformer_int8, + ops::FusedMultiTransformerINT8OpKernel, + ops::FusedMultiTransformerINT8OpKernel); diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu index 04681f3d7a3d75..5cf22885aabba6 100644 --- a/paddle/fluid/operators/fused/fused_multi_transformer_op.cu +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.cu @@ -1,1161 +1,19 @@ /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -// This file has been adapted from FasterTransformer file: -// https://github.com/NVIDIA/FasterTransformer/blob/v4.0/fastertransformer/cuda/masked_multihead_attention.cu -// We add License in the head. 
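// --- Illustrative sketch (not part of the patch) ---------------------------
// At the end of the layer loop above (pre_layer_norm path), the output buffer
// of layer i becomes the residual/input of layer i + 1, so two buffers are
// reused in a ping-pong fashion across all layers. run_one_layer_sketch is a
// hypothetical stand-in; only the buffer-rotation idea is taken from the loop
// tail above.
for (int i = 0; i < layers; ++i) {
  const T *layer_in = (i == 0) ? x_data : buf1->data<T>();
  run_one_layer_sketch(layer_in, buf0 /* scratch */, buf1 /* layer output */);
  // buf1 now feeds layer i + 1; buf0 is free to be overwritten.
}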
- -#include -#include -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/fused/attention_layer_norm.h" -#include "paddle/fluid/operators/fused/attn_gemm.h" -#include "paddle/fluid/operators/fused/fmha_ref.h" -#include "paddle/fluid/operators/fused/fused_dropout_helper.h" -#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" -#include "paddle/phi/api/include/tensor.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) -#include "paddle/fluid/distributed/collective/ProcessGroup.h" -#include "paddle/fluid/platform/collective_helper.h" -#include "paddle/fluid/platform/device/gpu/nccl_helper.h" -#endif +#include "paddle/fluid/operators/fused/fused_multi_transformer_op.h" namespace paddle { namespace operators { -using Tensor = framework::Tensor; - -// for debug -// #define _DEBUG_FUSED_MULTI_TRANSFORMER - -template -static void AllReduce(framework::Tensor &tensor, // NOLINT - const int ring_id, - const phi::GPUContext &ctx) { - if (ring_id == -1) return; -#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) - auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); - - if (map->has(ring_id)) { - paddle::distributed::ProcessGroup *pg = map->get(ring_id); - std::vector in_tensor; - std::vector out_tensor; - in_tensor.push_back(tensor); - out_tensor.push_back(tensor); - paddle::distributed::AllreduceOptions opts; - opts.reduce_op = distributed::ReduceOp::SUM; - auto task = pg->AllReduce(in_tensor, out_tensor, opts); - task->Wait(); - } else { - auto dtype = platform::ToNCCLDataType( - framework::TransToProtoVarType(tensor.dtype())); - int64_t numel = tensor.numel(); - const void *sendbuff = tensor.data(); - auto place = ctx.GetPlace(); - void *recvbuff = ctx.Alloc(&tensor, tensor.numel() * sizeof(T)); - auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); - auto stream = ctx.stream(); - PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( - sendbuff, recvbuff, numel, dtype, ncclSum, comm->comm(), stream)); - } -#else - PADDLE_THROW(platform::errors::Unimplemented( - "PaddlePaddle should compile with NCCL or RCCL when used tensor model " - "parallel op.")); -#endif -} - -namespace { - -namespace plat = paddle::platform; -using float16 = plat::float16; - -#define MMHA_USE_FP32_ACUM_FOR_LOGITS -#define MMHA_USE_FP32_ACUM_FOR_OUT - -template -struct Masked_multihead_attention_params { - // output buffer, [B, 1(seq_len), num_head * dim_head] - T *out; - // qkv_out, [B, 1(seq_len), 3, num_head * dim_head] - const T *qkv; - // bias, [3, num_head, dim_head] - const T *qkv_bias; - // TODO(wangxi): optimize with input_lengths and max_input_len? 
- // [bsz, 1, 1, time_step(cache_seq_length)+1] - const T *attn_mask; - - // [2, B, num_head, max_seq_len(valid cache_seq_len), dim_head] - // k [B, num_head, dim_head/x, max_seq_len, x], that is `seq_len` first - // v [B, num_head, max_seq_len, dim_head] - T *cache_kv; - - int batch_size; - int num_head; - int timestep; // cache_seq_length - int max_seq_length; - - // 1.f / sqrt(Dh) - float inv_sqrt_dh; -}; - -struct Float8_ { - float2 x; - float2 y; - float2 z; - float2 w; -}; - -// clang-format off - -template struct Qk_vec_ {}; -template <> struct Qk_vec_ { using Type = float; }; -template <> struct Qk_vec_ { using Type = float2; }; -template <> struct Qk_vec_ { using Type = float4; }; -template <> struct Qk_vec_ { using Type = float4; }; -template <> struct Qk_vec_ { using Type = uint32_t; }; -template <> struct Qk_vec_ { using Type = uint32_t; }; -template <> struct Qk_vec_ { using Type = uint2; }; -template <> struct Qk_vec_ { using Type = uint4; }; - -template struct K_vec_ {}; -template <> struct K_vec_ { using Type = float; }; -template <> struct K_vec_ { using Type = float2; }; -template <> struct K_vec_ { using Type = float4; }; -template <> struct K_vec_ { using Type = uint32_t; }; -template <> struct K_vec_ { using Type = uint2; }; -template <> struct K_vec_ { using Type = uint4; }; - -template struct V_vec_ {}; -template <> struct V_vec_ { using Type = float; }; -template <> struct V_vec_ { using Type = float2; }; -template <> struct V_vec_ { using Type = float4; }; -template <> struct V_vec_ { using Type = uint32_t; }; -template <> struct V_vec_ { using Type = uint2; }; -template <> struct V_vec_ { using Type = uint4; }; - -#ifdef MMHA_USE_FP32_ACUM_FOR_OUT -template struct V_vec_acum_fp32_ {}; -// template <> struct V_vec_acum_fp32_ { using Type = float; }; -// template <> struct V_vec_acum_fp32_ { using Type = float2; }; -template <> struct V_vec_acum_fp32_ { using Type = float4; }; -// template <> struct V_vec_acum_fp32_ { using Type = float2; }; -// template <> struct V_vec_acum_fp32_ { using Type = Float4_; }; -template <> struct V_vec_acum_fp32_ { using Type = Float8_; }; -#endif - -// clang-format on - -inline __device__ float half_to_float(uint16_t h) { - float f; - asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h)); - return f; -} - -inline __device__ float2 half2_to_float2(uint32_t v) { - uint16_t lo, hi; - asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v)); - return make_float2(half_to_float(lo), half_to_float(hi)); -} - -inline __device__ uint32_t float2_to_half2(float2 f) { - union { - uint32_t u32; - uint16_t u16[2]; - } tmp; -#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 - asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" - : "=r"(tmp.u32) - : "f"(f.y), "f"(f.x)); -#else - asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x)); - asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y)); -#endif - return tmp.u32; -} - -inline __device__ float add(float a, float b) { return a + b; } - -inline __device__ float2 add(float2 a, float2 b) { - float2 c; - c.x = add(a.x, b.x); - c.y = add(a.y, b.y); - return c; -} - -inline __device__ float4 add(float4 a, float4 b) { - float4 c; - c.x = add(a.x, b.x); - c.y = add(a.y, b.y); - c.z = add(a.z, b.z); - c.w = add(a.w, b.w); - return c; -} - -inline __device__ uint16_t add(uint16_t a, uint16_t b) { - uint16_t c; - asm volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); - return c; -} - -inline __device__ uint32_t add(uint32_t a, uint32_t b) { - uint32_t c; - 
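// --- Illustrative sketch (not part of the patch) ---------------------------
// The Qk_vec_/K_vec_/V_vec_ traits above map an element type plus a head-size
// bucket to the widest safe load type; their angle-bracket parameters do not
// survive in this rendering of the diff. The usual shape of such a trait, with
// assumed parameter names and bucket sizes:
#include <cstdint>
#include <cuda_fp16.h>
template <typename T, int Dh_MAX> struct Qk_vec_sketch {};
template <> struct Qk_vec_sketch<float, 32>   { using Type = float;    };  //  4-byte load
template <> struct Qk_vec_sketch<float, 128>  { using Type = float4;   };  // 16-byte load
template <> struct Qk_vec_sketch<__half, 32>  { using Type = uint32_t; };  // 2 packed halves
template <> struct Qk_vec_sketch<__half, 128> { using Type = uint4;    };  // 8 packed halves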
asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); - return c; -} - -inline __device__ uint2 add(uint2 a, uint2 b) { - uint2 c; - c.x = add(a.x, b.x); - c.y = add(a.y, b.y); - return c; -} - -inline __device__ uint4 add(uint4 a, uint4 b) { - uint4 c; - c.x = add(a.x, b.x); - c.y = add(a.y, b.y); - c.z = add(a.z, b.z); - c.w = add(a.w, b.w); - return c; -} - -inline __device__ float2 add(uint32_t a, float2 fb) { - float2 fa = half2_to_float2(a); - return add(fa, fb); -} - -inline __device__ Float8_ add(uint4 a, Float8_ fb) { - Float8_ fc; - fc.x = add(a.x, fb.x); - fc.y = add(a.y, fb.y); - fc.z = add(a.z, fb.z); - fc.w = add(a.w, fb.w); - return fc; -} - -template -inline __device__ Acc mul(A a, B b); - -template <> -inline __device__ float mul(float a, float b) { - return a * b; -} - -template <> -inline __device__ float2 mul(float2 a, float2 b) { - float2 c; - c.x = a.x * b.x; - c.y = a.y * b.y; - return c; -} - -template <> -inline __device__ float4 mul(float4 a, float4 b) { - float4 c; - c.x = a.x * b.x; - c.y = a.y * b.y; - c.z = a.z * b.z; - c.w = a.w * b.w; - return c; -} - -template <> -inline __device__ uint16_t mul(uint16_t a, uint16_t b) { - uint16_t c; - asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); - return c; -} - -template <> -inline __device__ uint32_t mul(uint32_t a, uint32_t b) { - uint32_t c; - asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); - return c; -} - -template <> -inline __device__ uint2 mul(uint2 a, uint2 b) { - uint2 c; - c.x = mul(a.x, b.x); - c.y = mul(a.y, b.y); - return c; -} - -template <> -inline __device__ uint4 mul(uint4 a, uint4 b) { - uint4 c; - c.x = mul(a.x, b.x); - c.y = mul(a.y, b.y); - c.z = mul(a.z, b.z); - c.w = mul(a.w, b.w); - return c; -} - -template <> -inline __device__ uint32_t mul(uint32_t a, float b) { - float2 tmp = half2_to_float2(a); - float2 tmp_res; - tmp_res.x = tmp.x * b; - tmp_res.y = tmp.y * b; - uint32_t res = float2_to_half2(tmp_res); - return res; -} - -template <> -inline __device__ uint2 mul(uint2 a, float b) { - uint2 res; - res.x = mul(a.x, b); - res.y = mul(a.y, b); - return res; -} - -template <> -inline __device__ uint4 mul(uint4 a, float b) { - uint4 res; - res.x = mul(a.x, b); - res.y = mul(a.y, b); - res.z = mul(a.z, b); - res.w = mul(a.w, b); - return res; -} - -template <> -inline __device__ float2 mul(float2 a, float b) { - float2 res; - res.x = a.x * b; - res.y = a.y * b; - return res; -} - -template <> -inline __device__ float4 mul(float4 a, float b) { - float4 res; - res.x = a.x * b; - res.y = a.y * b; - res.z = a.z * b; - res.w = a.w * b; - return res; -} - -inline __device__ float sum(float v) { return v; } -inline __device__ float sum(float2 v) { return v.x + v.y; } -inline __device__ float sum(float4 v) { return v.x + v.y + v.z + v.w; } -inline __device__ float sum(uint16_t v) { return half_to_float(v); } -inline __device__ float sum(uint32_t v) { - float2 tmp = half2_to_float2(v); - return tmp.x + tmp.y; -} - -inline __device__ float sum(uint2 v) { - uint32_t c = add(v.x, v.y); - return sum(c); -} - -inline __device__ float sum(uint4 v) { - uint32_t c = add(v.x, v.y); - c = add(c, v.z); - c = add(c, v.w); - return sum(c); -} - -template -inline __device__ float dot(T a, T b) { - return sum(mul(a, b)); -} - -template -inline __device__ float dot(T a, T b) { - return sum(mul(a, b)); -} - -inline __device__ constexpr uint32_t shfl_mask(int threads) { - return threads == 32 ? 
uint32_t(-1) : (1u << threads) - 1u; -} - -template -inline __device__ __host__ T div_up(T m, T n) { - return (m + n - 1) / n; -} - -inline __device__ float fma(float a, float b, float c) { return a * b + c; } - -inline __device__ float2 fma(float2 a, float2 b, float2 c) { - float2 d; - d.x = fma(a.x, b.x, c.x); - d.y = fma(a.y, b.y, c.y); - return d; -} - -inline __device__ float4 fma(float4 a, float4 b, float4 c) { - float4 d; - d.x = fma(a.x, b.x, c.x); - d.y = fma(a.y, b.y, c.y); - d.z = fma(a.z, b.z, c.z); - d.w = fma(a.w, b.w, c.w); - return d; -} - -inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) { - uint32_t d; - asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" - : "=r"(d) - : "r"(a), "r"(b), "r"(c)); - return d; -} - -inline __device__ uint2 fma(uint2 a, uint2 b, uint2 c) { - uint2 d; - d.x = fma(a.x, b.x, c.x); - d.y = fma(a.y, b.y, c.y); - return d; -} - -inline __device__ uint4 fma(uint4 a, uint4 b, uint4 c) { - uint4 d; - d.x = fma(a.x, b.x, c.x); - d.y = fma(a.y, b.y, c.y); - d.z = fma(a.z, b.z, c.z); - d.w = fma(a.w, b.w, c.w); - return d; -} - -inline __device__ float2 fma(float a, float2 b, float2 c) { - float2 d; - d.x = fma(a, b.x, c.x); - d.y = fma(a, b.y, c.y); - return d; -} - -inline __device__ float4 fma(float a, float4 b, float4 c) { - float4 d; - d.x = fma(a, b.x, c.x); - d.y = fma(a, b.y, c.y); - d.z = fma(a, b.z, c.z); - d.w = fma(a, b.w, c.w); - return d; -} - -inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) { - Float8_ d; - d.x = fma(a, b.x, c.x); - d.y = fma(a, b.y, c.y); - d.z = fma(a, b.z, c.z); - d.w = fma(a, b.w, c.w); - return d; -} - -inline __device__ uint32_t h0_h0(uint16_t a) { - uint32_t b; - asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a)); - return b; -} - -inline __device__ uint32_t fma(uint16_t a, uint32_t b, uint32_t c) { - return fma(h0_h0(a), b, c); -} - -inline __device__ uint2 fma(uint16_t a, uint2 b, uint2 c) { - uint32_t s = h0_h0(a); - uint2 d; - d.x = fma(s, b.x, c.x); - d.y = fma(s, b.y, c.y); - return d; -} - -inline __device__ uint4 fma(uint16_t a, uint4 b, uint4 c) { - uint32_t s = h0_h0(a); - uint4 d; - d.x = fma(s, b.x, c.x); - d.y = fma(s, b.y, c.y); - d.z = fma(s, b.z, c.z); - d.w = fma(s, b.w, c.w); - return d; -} - -inline __device__ float cast_to_float(float u) { return u; } - -inline __device__ float2 cast_to_float(float2 u) { return u; } - -inline __device__ float4 cast_to_float(float4 u) { return u; } - -inline __device__ Float8_ cast_to_float(uint4 u) { - Float8_ tmp; - tmp.x = half2_to_float2(u.x); - tmp.y = half2_to_float2(u.y); - tmp.z = half2_to_float2(u.z); - tmp.w = half2_to_float2(u.w); - return tmp; -} - -template -inline __device__ float qk_dot_(const K_vec (&q)[N], - const K_vec (&k)[N], - float inv_sqrt_dh) { - K_vec inv_q = mul(q[0], inv_sqrt_dh); - K_vec qk_vec = mul(inv_q, k[0]); -#pragma unroll - for (int ii = 1; ii < N; ++ii) { - inv_q = mul(q[ii], inv_sqrt_dh); - qk_vec = fma(inv_q, k[ii], qk_vec); - } - - float qk = sum(qk_vec); -#pragma unroll - for (int mask = THREADS_PER_KEY / 2; mask >= 1; mask /= 2) { - qk += __shfl_xor_sync(uint32_t(-1), qk, mask); - } - return qk; -} - -template -struct Qk_dot { - template - static inline __device__ float dot(const K_vec (&q)[N], - const K_vec (&k)[N], - float inv_sqrt_dh) { - return qk_dot_(q, k, inv_sqrt_dh); - } -}; - -template -inline __device__ float block_sum(float *red_smem, float sum) { - int warp = threadIdx.x / WARP_SIZE; - int lane = threadIdx.x % WARP_SIZE; - -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= 
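// --- Illustrative sketch (not part of the patch) ---------------------------
// qk_dot_ and block_sum above use the standard XOR-butterfly shuffle reduction:
// after log2(WIDTH) rounds every lane in the group holds the group-wide sum.
// Minimal standalone form of the warp-level step:
template <int WIDTH>  // power of two, at most 32
__device__ float warp_reduce_sum_sketch(float v) {
#pragma unroll
  for (int mask = WIDTH / 2; mask >= 1; mask /= 2) {
    v += __shfl_xor_sync(0xffffffffu, v, mask);
  }
  return v;
}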
1; mask /= 2) { - sum += __shfl_xor_sync(uint32_t(-1), sum, mask); - } - - if (lane == 0) { - red_smem[warp] = sum; - } - __syncthreads(); - - if (lane < WARPS_PER_BLOCK) { - sum = red_smem[lane]; - } - -#pragma unroll - for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - sum += __shfl_xor_sync(uint32_t(-1), sum, mask); - } - - return __shfl_sync(uint32_t(-1), sum, 0); -} - -inline __device__ void convert_from_float(float &dst, float src) { // NOLINT - dst = src; -} - -inline __device__ void convert_from_float(float4 &dst, float4 src) { // NOLINT - dst = src; -} - -inline __device__ void convert_from_float(plat::float16 &dst, // NOLINT - float src) { - dst = static_cast(src); -} - -inline __device__ void convert_from_float(uint4 &dst, Float8_ src) { // NOLINT - dst.x = float2_to_half2(src.x); - dst.y = float2_to_half2(src.y); - dst.z = float2_to_half2(src.z); - dst.w = float2_to_half2(src.w); -} - -inline __device__ void zero(uint16_t &dst) { dst = uint16_t(0); } // NOLINT - -template -inline __device__ void zero(T &dst) { // NOLINT - constexpr int WORDS = sizeof(T) / 4; - union { - T raw; - uint32_t words[WORDS]; - } tmp; -#pragma unroll - for (int ii = 0; ii < WORDS; ++ii) { - tmp.words[ii] = 0u; - } - dst = tmp.raw; -} - -template -__global__ void masked_multihead_attention_kernel( - Masked_multihead_attention_params params) { -#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) - - static_assert(Dh_MAX % THREADS_PER_KEY == 0, ""); - static_assert(Dh_MAX % THREADS_PER_VALUE == 0, ""); - - constexpr int WARP_SIZE = 32; - constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; - - extern __shared__ char smem_[]; - - float *qk_smem = reinterpret_cast(smem_); - - char *logits_smem_ = smem_; - // fp32 accum for logits - float *logits_smem = reinterpret_cast(logits_smem_); - - T *out_smem = reinterpret_cast(smem_); - - __shared__ float red_smem[WARPS_PER_BLOCK * 2]; - using Qk_vec = typename Qk_vec_::Type; - __shared__ __align__(sizeof(Qk_vec)) T q_smem[Dh_MAX]; - - const int bi = blockIdx.y; - const int hi = blockIdx.x; - const int bhi = bi * params.num_head + hi; - const int tid = threadIdx.x; - - float qk_max = -FLT_MAX; - float qk = 0; - - // qkv [B, S=1, 3, num_head, head_dim] - int qkv_base_offset = bi * 3 * params.num_head * Dh + hi * Dh; - - constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); - static_assert(Dh_MAX % QK_VEC_SIZE == 0, ""); - // Use block reduction if needed - // static_assert(Dh_MAX / QK_VEC_SIZE <= WARP_SIZE, ""); - constexpr int QK_VECS_PER_WARP = Dh_MAX / QK_VEC_SIZE; - - // cache_k, [B, num_head, head_dim / x, max_seq_len, x] - // x == 4/8 for FP32/FP16, 128bit, 16Byte - constexpr int QK_ELTS_IN_16B = 16 / sizeof(T); - constexpr int QK_VECS_IN_16B = 16 / sizeof(Qk_vec); - - const T *q_base = params.qkv; - const T *k_base = params.qkv + params.num_head * Dh; - const T *q_bias_base = params.qkv_bias; - const T *k_bias_base = params.qkv_bias + params.num_head * Dh; - - if (tid < QK_VECS_PER_WARP) { - int qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; - int qk_bias_offset = hi * Dh + tid * QK_VEC_SIZE; - - Qk_vec q; - zero(q); - q = (Dh == Dh_MAX || tid * QK_VEC_SIZE < Dh) - ? *reinterpret_cast(&q_base[qk_offset]) - : q; - Qk_vec k; - zero(k); - k = (Dh == Dh_MAX || tid * QK_VEC_SIZE < Dh) - ? *reinterpret_cast(&k_base[qk_offset]) - : k; - - Qk_vec q_bias; - zero(q_bias); - q_bias = - (Dh == Dh_MAX || tid * QK_VEC_SIZE < Dh) - ? 
*reinterpret_cast(&q_bias_base[qk_bias_offset]) - : q_bias; - Qk_vec k_bias; - zero(k_bias); - k_bias = - (Dh == Dh_MAX || tid * QK_VEC_SIZE < Dh) - ? *reinterpret_cast(&k_bias_base[qk_bias_offset]) - : k_bias; - - q = add(q, q_bias); - // TODO(wangxi): See this https://github.com/microsoft/unilm/issues/510 - // we may not require k_bias. - k = add(k, k_bias); - - *reinterpret_cast(&q_smem[tid * QK_VEC_SIZE]) = q; - - int co = tid / QK_VECS_IN_16B; - int ci = (tid % QK_VECS_IN_16B) * QK_VEC_SIZE; - int offset = bhi * params.max_seq_length * Dh + - co * params.max_seq_length * QK_ELTS_IN_16B + - params.timestep * QK_ELTS_IN_16B + ci; - if (Dh == Dh_MAX || co < Dh / QK_ELTS_IN_16B) { - *reinterpret_cast(¶ms.cache_kv[offset]) = k; - } - - qk = dot(q, k); - - if (QK_VECS_PER_WARP <= WARP_SIZE) { -#pragma unroll - for (int mask = QK_VECS_PER_WARP / 2; mask >= 1; mask /= 2) { - qk += __shfl_xor_sync(shfl_mask(QK_VECS_PER_WARP), qk, mask); - } - } - } - if (QK_VECS_PER_WARP > WARP_SIZE) { - constexpr int WARPS_PER_RED = - (QK_VECS_PER_WARP + WARP_SIZE - 1) / WARP_SIZE; - qk = block_sum(&red_smem[WARPS_PER_RED], qk); - } - if (tid == 0) { - // NOTE(wangxi): mask must be 0.0 - // T mask = params.attn_mask[ - // bi * (params.timestep + 1) + params.timestep]; - // qk += static_cast(mask); - qk *= params.inv_sqrt_dh; - qk_max = qk; - qk_smem[params.timestep] = qk; - } - __syncthreads(); - -#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER - if (bi == 0 && hi == 0 && tid == 0) { - printf("=======q_out=======\n"); - for (int i = 0; i < Dh; ++i) printf("%f ", static_cast(q_smem[i])); - printf("\n"); - } - __syncthreads(); -#endif - - using K_vec = typename K_vec_::Type; - constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(T); - static_assert(Dh_MAX % K_VEC_SIZE == 0, ""); - constexpr int K_ELTS_PER_THREAD = Dh_MAX / THREADS_PER_KEY; - constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; - - int ko = tid / THREADS_PER_KEY; - int ki = (tid % THREADS_PER_KEY) * K_VEC_SIZE; - - static_assert(Dh_MAX == THREADS_PER_KEY * K_VEC_SIZE * K_VECS_PER_THREAD, ""); - - K_vec q[K_VECS_PER_THREAD]; -#pragma unroll - for (int i = 0; i < K_VECS_PER_THREAD; ++i) { - q[i] = *reinterpret_cast( - &q_smem[ki + i * THREADS_PER_KEY * K_VEC_SIZE]); - } - - constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; - constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; - - T *k_cache = ¶ms.cache_kv[bhi * params.max_seq_length * Dh + ki]; - int ti_end = div_up(params.timestep, K_PER_WARP) * K_PER_WARP; - - for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { - K_vec k[K_VECS_PER_THREAD]; - K_vec k_vec_zero; - zero(k_vec_zero); -#pragma unroll - for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { - int jj = ii * params.max_seq_length + ti; - if (ti < params.timestep) { - k[ii] = - (Dh == Dh_MAX || jj * QK_ELTS_IN_16B < Dh * params.max_seq_length) - ? *reinterpret_cast( - &k_cache[jj * QK_ELTS_IN_16B]) - : k_vec_zero; - } - } - - // NOTE(liyurui): We should multiple q with inv_sqrt_dh first, for dot(q, k) - // may overflow with FP16 in large model. - float qk = Qk_dot::dot(q, k, params.inv_sqrt_dh); - - // bool is_mask = false; - if (ti < params.timestep && tid % THREADS_PER_KEY == 0) { - // qk_max = is_mask ? 
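// --- Illustrative sketch (not part of the patch) ---------------------------
// The k-cache write above targets a [B, num_head, Dh/x, max_seq_len, x] layout
// with x = 16 / sizeof(T), so each decode step appends one x-wide chunk per
// 16-byte row of the head. The flat offset computed from co/ci above, spelled out:
__host__ __device__ int k_cache_offset_sketch(int bh,        // b * num_head + h
                                              int dh_idx,    // element index within the head
                                              int timestep,  // position being written
                                              int Dh, int max_seq_len, int x) {
  const int chunk = dh_idx / x;  // which 16-byte row of the head
  const int lane  = dh_idx % x;  // element inside that row
  return bh * max_seq_len * Dh + chunk * max_seq_len * x + timestep * x + lane;
}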
qk_max : fmaxf(qk_max, qk); - T mask = params.attn_mask[bi * (params.timestep + 1) + ti]; - qk += static_cast(mask); - qk_max = fmaxf(qk_max, qk); - - qk_smem[ti] = qk; - } - } - -#pragma unroll - for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - const int warp = tid / WARP_SIZE; - const int lane = tid % WARP_SIZE; - - if (lane == 0) { - red_smem[warp] = qk_max; - } - - __syncthreads(); - - qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; -#pragma unroll - for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { - qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); - } - - qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); - -#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER - if (bi == 0 && hi == 0 && tid == 0) { - printf("=======qk_out=======\n"); - for (int i = 0; i <= params.timestep; ++i) printf("%f ", qk_smem[i]); - printf("qk_max=%f\n", qk_max); - } - __syncthreads(); -#endif - - float sum = 0.f; - for (int ti = tid; ti <= params.timestep; ti += THREADS_PER_BLOCK) { - // bool is_mask = false; - // float logit = is_mask ? 0.f : __expf(qk_smem[ti] - qk_max); - float logit = __expf(qk_smem[ti] - qk_max); - sum += logit; - qk_smem[ti] = logit; - } - - sum = block_sum(&red_smem[WARPS_PER_BLOCK], sum); - - // FIXME(wangxi): need add 1.e-6f? - float inv_sum = __fdividef(1.f, sum + 1.e-6f); - for (int ti = tid; ti <= params.timestep; ti += THREADS_PER_BLOCK) { - convert_from_float(logits_smem[ti], qk_smem[ti] * inv_sum); - } - __syncthreads(); - - constexpr int V_VEC_SIZE = Dh_MAX / THREADS_PER_VALUE; - using V_vec = typename V_vec_::Type; - - int vo = tid / THREADS_PER_VALUE; - int vi = (tid % THREADS_PER_VALUE) * V_VEC_SIZE; - - T *v_cache = ¶ms.cache_kv[params.batch_size * params.num_head * - params.max_seq_length * Dh + - bhi * params.max_seq_length * Dh + vi]; - -#ifdef MMHA_USE_FP32_ACUM_FOR_OUT - using V_vec_acum = typename V_vec_acum_fp32_::Type; -#else - using V_vec_acum = V_vec; -#endif - - V_vec_acum out; - zero(out); - - constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; - if (Dh == Dh_MAX || vi < Dh) { - for (int ti = vo; ti < params.timestep; ti += V_PER_ITER) { - V_vec v = *reinterpret_cast(&v_cache[ti * Dh]); -#if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) - float logit = logits_smem[ti]; - out = fma(logit, cast_to_float(v), out); -#else - T logit = logits_smem[ti]; - // Update the partial sums. 
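// --- Illustrative sketch (not part of the patch) ---------------------------
// The qk_smem/logits_smem handling above is a numerically stable softmax over
// the timestep + 1 cached positions: block-wide max, exp(qk - max), block-wide
// sum, then scale by 1 / (sum + 1e-6f). A scalar reference of the same math:
#include <cmath>
void softmax_ref_sketch(float *logits, int n) {
  float m = logits[0];
  for (int i = 1; i < n; ++i) m = fmaxf(m, logits[i]);
  float s = 0.f;
  for (int i = 0; i < n; ++i) { logits[i] = expf(logits[i] - m); s += logits[i]; }
  const float inv = 1.f / (s + 1e-6f);
  for (int i = 0; i < n; ++i) logits[i] *= inv;
}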
- out = fma(logit, v, out); -#endif - } - } - -#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER - if (bi == 0 && hi == 0 && tid == 0) { - printf("======logits_out=====\n"); - for (int i = 0; i <= params.timestep; ++i) printf("%f ", logits_smem[i]); - printf("\n"); - } - __syncthreads(); -#endif - - V_vec v_bias; - zero(v_bias); - if (vo == (params.timestep % V_PER_ITER) && (Dh == Dh_MAX || vi < Dh)) { - V_vec v = *reinterpret_cast( - ¶ms.qkv[2 * params.num_head * Dh + qkv_base_offset + vi]); - v_bias = *reinterpret_cast( - ¶ms.qkv_bias[2 * params.num_head * Dh + hi * Dh + vi]); - v = add(v, v_bias); - *reinterpret_cast(&v_cache[params.timestep * Dh]) = v; - -#if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) - out = fma(logits_smem[params.timestep], cast_to_float(v), out); -#else - out = fma(logits_smem[params.timestep], v, out); -#endif - } - - __syncthreads(); - - if (Dh == Dh_MAX || vi < Dh) { -#pragma unroll - for (int active_groups = V_PER_ITER; active_groups >= 2; - active_groups /= 2) { - int midpoint = active_groups / 2; - - if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { -#ifdef MMHA_USE_FP32_ACUM_FOR_OUT - convert_from_float( - *reinterpret_cast(&out_smem[(vo - midpoint) * Dh + vi]), - out); -#else - *reinterpret_cast(&out_smem[(vo - midpoint) * Dh + vi]) = out; -#endif - } - __syncthreads(); - if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { - out = - add(*reinterpret_cast(&out_smem[vo * Dh + vi]), out); - } - __syncthreads(); - } - } - - if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { -#ifdef MMHA_USE_FP32_ACUM_FOR_OUT - convert_from_float(*reinterpret_cast(¶ms.out[bhi * Dh + vi]), - out); -#else - *reinterpret_cast(¶ms.out[bhi * Dh + vi]) = out; -#endif - } - -#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER - __syncthreads(); - if (bi == 0 && hi == 0 && tid == 0) { - printf("======fmha_out=====\n"); - for (int i = 0; i < Dh; ++i) - printf("%f ", static_cast(params.out[i])); - printf("\n"); - } -#endif -#else - assert(false); -#endif -} - -template -inline size_t smem_size_in_bytes( - const Masked_multihead_attention_params ¶ms, - int dim_head, - int threads_per_value, - int threads_per_block) { - size_t qk_sz = div_up(params.timestep + 1, 4) * 16; - size_t logits_sz = 0; - -#ifndef MMHA_USE_FP32_ACUM_FOR_LOGITS - if (sizeof(T) != 4) { - logits_sz = div_up(params.max_seq_length, 4) * 4 * sizeof(T); - } -#endif - size_t softmax_sz = qk_sz + logits_sz; - - int rows_per_red = threads_per_block / threads_per_value; - size_t red_sz = rows_per_red * dim_head * sizeof(T) / 2; - - return max(softmax_sz, red_sz); -} - -#define MMHA_LAUNCH_KERNEL( \ - T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ - size_t smem_sz = \ - smem_size_in_bytes(params, Dh, THDS_PER_VALUE, THDS_PER_BLOCK); \ - dim3 grid(params.num_head, params.batch_size); \ - masked_multihead_attention_kernel \ - <<>>(params) - -template -void fmha_launch_kernel(const Masked_multihead_attention_params ¶ms, - const cudaStream_t &stream) { - constexpr int THREADS_PER_VALUE = Dh_MAX * sizeof(T) / 16; - if (params.timestep < 32) { - MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 64, stream); - } else if (params.timestep < 2048) { - MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 2, THREADS_PER_VALUE, 128, stream); - } else { - MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 1, THREADS_PER_VALUE, 256, stream); - } -} - -template -void fmha(const phi::GPUContext &dev_ctx, - const Tensor &qkv_tensor, - const Tensor &qkv_bias_tensor, - const Tensor &src_mask_tensor, - Tensor *cache_kv_tensor, - Tensor *out_tensor, - int batch_size, - 
int max_seq_length, - int num_head, - int dim_head, - int timestep, - float inv_sqrt_dh) { - Masked_multihead_attention_params params; - params.out = out_tensor->data(); - params.qkv = qkv_tensor.data(); - params.qkv_bias = qkv_bias_tensor.data(); - params.attn_mask = src_mask_tensor.data(); - params.cache_kv = cache_kv_tensor->data(); - - params.batch_size = batch_size; - params.num_head = num_head; - params.timestep = timestep; - params.max_seq_length = max_seq_length; - params.inv_sqrt_dh = inv_sqrt_dh; - - switch (dim_head) { - case 10: - fmha_launch_kernel(params, dev_ctx.stream()); - break; - case 26: - fmha_launch_kernel(params, dev_ctx.stream()); - break; - case 32: - fmha_launch_kernel(params, dev_ctx.stream()); - break; - case 64: - fmha_launch_kernel(params, dev_ctx.stream()); - break; - case 96: - fmha_launch_kernel(params, dev_ctx.stream()); - break; - case 128: - fmha_launch_kernel(params, dev_ctx.stream()); - break; - case 192: - fmha_launch_kernel(params, dev_ctx.stream()); - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "Dim_head = %d is unsupport!", dim_head)); - } -} - -// NOTE: simd with 16Bytes(128bit), float is 4, float16 is 8 -constexpr int VEC_16B = 16; - -template -__global__ void write_cache_k_kernel(T *cache_k, - const T *k, - const int num_head, - const int dim_head, - const int seq_len, - const int max_seq_len) { - const int bi = blockIdx.y; - const int hi = blockIdx.z; - constexpr int X_ELEMS = VEC_16B / sizeof(T); - - // [bsz, num_head, seq_len, dim_head/x, x] - auto k_src = reinterpret_cast( - k + bi * num_head * seq_len * dim_head + hi * seq_len * dim_head); - // [bsz, num_head, dim_head/x, max_seq_len, x] - auto k_dst = reinterpret_cast( - cache_k + bi * num_head * max_seq_len * dim_head + - hi * max_seq_len * dim_head); - - const int out_idx = blockIdx.x * blockDim.x + threadIdx.x; - // vec size - int dim_head_div_x = dim_head / X_ELEMS; - - // FIXME(wangxi): num_head is not need? 
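// --- Illustrative sketch (not part of the patch) ---------------------------
// fmha() above dispatches dim_head to a template instantiation so vector widths
// and thread tiling are fixed at compile time; the template arguments are not
// visible in this rendering of the diff. The likely pattern, where Dh_MAX is
// assumed to be dim_head padded up to a power of two:
switch (dim_head) {
  case 32:  fmha_launch_kernel<T, /*Dh=*/32,  /*Dh_MAX=*/32 >(params, dev_ctx.stream()); break;
  case 96:  fmha_launch_kernel<T, /*Dh=*/96,  /*Dh_MAX=*/128>(params, dev_ctx.stream()); break;
  case 128: fmha_launch_kernel<T, /*Dh=*/128, /*Dh_MAX=*/128>(params, dev_ctx.stream()); break;
  default:
    PADDLE_THROW(platform::errors::Unimplemented("Dim_head = %d is unsupport!", dim_head));
}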
- // if (out_idx >= num_head * dim_head_div_x * max_seq_len) return; - if (out_idx >= dim_head_div_x * max_seq_len) return; - - int idx = out_idx; - const int k_seq_len_id = idx % max_seq_len; - // idx = (idx - k_seq_len_id) / max_seq_len; - idx = idx / max_seq_len; - const int k_vec_id = idx % dim_head_div_x; - - if (k_seq_len_id < seq_len) { - k_dst[out_idx] = k_src[k_seq_len_id * dim_head_div_x + k_vec_id]; - } -} - -template -__global__ void write_cache_v_kernel(T *cache_v, - const T *v, - const int num_head, - const int dim_head, - const int seq_len, - const int max_seq_len) { - const int bi = blockIdx.y; - const int hi = blockIdx.z; - - // [bsz, num_head, seq_len, dim_head/x, x] - auto v_src = reinterpret_cast( - v + bi * num_head * seq_len * dim_head + hi * seq_len * dim_head); - // [bsz, num_head, max_seq_len, dim_head/x, x] - auto v_dst = reinterpret_cast( - cache_v + bi * num_head * max_seq_len * dim_head + - hi * max_seq_len * dim_head); - - const int idx = blockIdx.x * blockDim.x + threadIdx.x; - constexpr int X_ELEMS = VEC_16B / sizeof(T); - const int dim_head_div_x = dim_head / X_ELEMS; - - if (idx >= dim_head_div_x * seq_len) return; - - v_dst[idx] = v_src[idx]; -} - -template -void write_cache_kv(const phi::GPUContext &dev_ctx, - T *cache_k, - T *cache_v, - const T *k, - const T *v, - const int bsz, - const int num_head, - const int seq_len, - const int max_seq_len, - const int dim_head) { - constexpr int block_sz = 128; - constexpr int x = VEC_16B / sizeof(T); - - assert(dim_head % x == 0); - PADDLE_ENFORCE_EQ( - dim_head % x, - 0, - platform::errors::PreconditionNotMet( - "dim_head=%d must be divisible by vec_size=%d", dim_head, x)); - - int max_size = max_seq_len * dim_head / x; - int size = seq_len * dim_head / x; - dim3 grid(div_up(max_size, block_sz), bsz, num_head); - dim3 grid_v(div_up(size, block_sz), bsz, num_head); - - // transpose [bsz, num_head, seq_len, dim_head/x, x]-> - // [bsz, num_head, dim_head/x, max_seq_len, x] - write_cache_k_kernel<<>>( - cache_k, k, num_head, dim_head, seq_len, max_seq_len); - - // copy [bsz, num_head, seq_len, dim_head/x, x]-> - // [bsz, num_head, max_seq_len, dim_head/x, x] - write_cache_v_kernel<<>>( - cache_v, v, num_head, dim_head, seq_len, max_seq_len); -} - -} // namespace - template class FusedMultiTransformerOpKernel : public framework::OpKernel { public: @@ -1480,11 +338,11 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { if (pre_layer_norm) { out_linear_compute.ComputeForward( out_linear_weights[i], &fmha_out, nullptr, buf1, nullptr); - AllReduce(*buf1, ring_id, dev_ctx); + AllReduce(*buf1, ring_id, buf1->numel(), dev_ctx); } else { out_linear_compute.ComputeForward( out_linear_weights[i], &fmha_out, nullptr, buf0, nullptr); - AllReduce(*buf0, ring_id, dev_ctx); + AllReduce(*buf0, ring_id, buf0->numel(), dev_ctx); } #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER VLOG(0) << "step4"; @@ -1563,9 +421,9 @@ class FusedMultiTransformerOpKernel : public framework::OpKernel { #endif if (pre_layer_norm) { - AllReduce(*buf1, ring_id, dev_ctx); + AllReduce(*buf1, ring_id, buf1->numel(), dev_ctx); } else { - AllReduce(*buf0, ring_id, dev_ctx); + AllReduce(*buf0, ring_id, buf0->numel(), dev_ctx); } #ifdef _DEBUG_FUSED_MULTI_TRANSFORMER VLOG(0) << "step8.1"; diff --git a/paddle/fluid/operators/fused/fused_multi_transformer_op.h b/paddle/fluid/operators/fused/fused_multi_transformer_op.h new file mode 100644 index 00000000000000..761a31ce094d12 --- /dev/null +++ b/paddle/fluid/operators/fused/fused_multi_transformer_op.h @@ 
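// --- Illustrative sketch (not part of the patch) ---------------------------
// The call-site hunks above change AllReduce to take an explicit element count
// instead of always reducing tensor.numel(); the INT8 path uses this to reduce
// only the live prefix (bsz * seq_len * num_head * dim_head elements) of a
// shared workspace buffer. Minimal shape of such a count-based wrapper over the
// real ncclAllReduce API (error handling elided):
template <typename T>
void all_reduce_count_sketch(T *buf, int64_t count, ncclDataType_t dtype,
                             ncclComm_t comm, cudaStream_t stream) {
  // in-place sum-reduction of only the first `count` elements
  ncclAllReduce(buf, buf, count, dtype, ncclSum, comm, stream);
}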
-0,0 +1,1161 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + * Copyright (c) 2011-2021, NVIDIA CORPORATION. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +// This file has been adapted from FasterTransformer file: +// https://github.com/NVIDIA/FasterTransformer/blob/v4.0/fastertransformer/cuda/masked_multihead_attention.cu +// We add License in the head. + +#include +#include + +#include + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/fused/attention_layer_norm.h" +#include "paddle/fluid/operators/fused/attn_gemm.h" +#include "paddle/fluid/operators/fused/fmha_ref.h" +#include "paddle/fluid/operators/fused/fused_dropout_helper.h" +#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/device/gpu/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +// for debug +// #define _DEBUG_FUSED_MULTI_TRANSFORMER + +template +static void AllReduce(framework::Tensor &tensor, // NOLINT + const int ring_id, + const int count, + const phi::GPUContext &ctx) { + if (ring_id == -1) return; +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + auto map = paddle::distributed::ProcessGroupMapFromGid::getInstance(); + + if (map->has(ring_id)) { + paddle::distributed::ProcessGroup *pg = map->get(ring_id); + std::vector in_tensor; + std::vector out_tensor; + in_tensor.push_back(tensor); + out_tensor.push_back(tensor); + paddle::distributed::AllreduceOptions opts; + opts.reduce_op = distributed::ReduceOp::SUM; + auto task = pg->AllReduce(in_tensor, out_tensor, opts); + task->Wait(); + } else { + auto dtype = platform::ToNCCLDataType( + framework::TransToProtoVarType(tensor.dtype())); + int64_t numel = tensor.numel(); + const void *sendbuff = tensor.data(); + auto place = ctx.GetPlace(); + void *recvbuff = tensor.mutable_data(place); + auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place); + auto stream = ctx.stream(); + PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( + sendbuff, recvbuff, count, dtype, ncclSum, comm->comm(), stream)); + } +#else + PADDLE_THROW(platform::errors::Unimplemented( + "PaddlePaddle should compile with NCCL or RCCL when used tensor model " + "parallel op.")); +#endif +} + +namespace { // NOLINT + +namespace plat = paddle::platform; +using float16 = plat::float16; + +#define MMHA_USE_FP32_ACUM_FOR_LOGITS +#define MMHA_USE_FP32_ACUM_FOR_OUT + +template +struct Masked_multihead_attention_params { + // output buffer, [B, 1(seq_len), num_head * dim_head] + T *out; + // qkv_out, [B, 1(seq_len), 3, num_head * 
dim_head] + const T *qkv; + // bias, [3, num_head, dim_head] + const T *qkv_bias; + // TODO(wangxi): optimize with input_lengths and max_input_len? + // [bsz, 1, 1, time_step(cache_seq_length)+1] + const T *attn_mask; + + // [2, B, num_head, max_seq_len(valid cache_seq_len), dim_head] + // k [B, num_head, dim_head/x, max_seq_len, x], that is `seq_len` first + // v [B, num_head, max_seq_len, dim_head] + T *cache_kv; + + int batch_size; + int num_head; + int timestep; // cache_seq_length + int max_seq_length; + + // 1.f / sqrt(Dh) + float inv_sqrt_dh; +}; + +struct Float8_ { + float2 x; + float2 y; + float2 z; + float2 w; +}; + +// clang-format off + +template struct Qk_vec_ {}; +template <> struct Qk_vec_ { using Type = float; }; +template <> struct Qk_vec_ { using Type = float2; }; +template <> struct Qk_vec_ { using Type = float4; }; +template <> struct Qk_vec_ { using Type = float4; }; +template <> struct Qk_vec_ { using Type = uint32_t; }; +template <> struct Qk_vec_ { using Type = uint32_t; }; +template <> struct Qk_vec_ { using Type = uint2; }; +template <> struct Qk_vec_ { using Type = uint4; }; + +template struct K_vec_ {}; +template <> struct K_vec_ { using Type = float; }; +template <> struct K_vec_ { using Type = float2; }; +template <> struct K_vec_ { using Type = float4; }; +template <> struct K_vec_ { using Type = uint32_t; }; +template <> struct K_vec_ { using Type = uint2; }; +template <> struct K_vec_ { using Type = uint4; }; + +template struct V_vec_ {}; +template <> struct V_vec_ { using Type = float; }; +template <> struct V_vec_ { using Type = float2; }; +template <> struct V_vec_ { using Type = float4; }; +template <> struct V_vec_ { using Type = uint32_t; }; +template <> struct V_vec_ { using Type = uint2; }; +template <> struct V_vec_ { using Type = uint4; }; + +#ifdef MMHA_USE_FP32_ACUM_FOR_OUT +template struct V_vec_acum_fp32_ {}; +// template <> struct V_vec_acum_fp32_ { using Type = float; }; +// template <> struct V_vec_acum_fp32_ { using Type = float2; }; +template <> struct V_vec_acum_fp32_ { using Type = float4; }; +// template <> struct V_vec_acum_fp32_ { using Type = float2; }; +// template <> struct V_vec_acum_fp32_ { using Type = Float4_; }; +template <> struct V_vec_acum_fp32_ { using Type = Float8_; }; +#endif + +// clang-format on + +inline __device__ float half_to_float(uint16_t h) { + float f; + asm volatile("cvt.f32.f16 %0, %1;\n" : "=f"(f) : "h"(h)); + return f; +} + +inline __device__ float2 half2_to_float2(uint32_t v) { + uint16_t lo, hi; + asm volatile("mov.b32 {%0, %1}, %2;\n" : "=h"(lo), "=h"(hi) : "r"(v)); + return make_float2(half_to_float(lo), half_to_float(hi)); +} + +inline __device__ uint32_t float2_to_half2(float2 f) { + union { + uint32_t u32; + uint16_t u16[2]; + } tmp; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" + : "=r"(tmp.u32) + : "f"(f.y), "f"(f.x)); +#else + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x)); + asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y)); +#endif + return tmp.u32; +} + +inline __device__ float add(float a, float b) { return a + b; } + +inline __device__ float2 add(float2 a, float2 b) { + float2 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +inline __device__ float4 add(float4 a, float4 b) { + float4 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} + +inline __device__ uint16_t add(uint16_t a, uint16_t b) { + uint16_t c; + asm 
volatile("add.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); + return c; +} + +inline __device__ uint32_t add(uint32_t a, uint32_t b) { + uint32_t c; + asm volatile("add.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +inline __device__ uint2 add(uint2 a, uint2 b) { + uint2 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + return c; +} + +inline __device__ uint4 add(uint4 a, uint4 b) { + uint4 c; + c.x = add(a.x, b.x); + c.y = add(a.y, b.y); + c.z = add(a.z, b.z); + c.w = add(a.w, b.w); + return c; +} + +inline __device__ float2 add(uint32_t a, float2 fb) { + float2 fa = half2_to_float2(a); + return add(fa, fb); +} + +inline __device__ Float8_ add(uint4 a, Float8_ fb) { + Float8_ fc; + fc.x = add(a.x, fb.x); + fc.y = add(a.y, fb.y); + fc.z = add(a.z, fb.z); + fc.w = add(a.w, fb.w); + return fc; +} + +template +inline __device__ Acc mul(A a, B b); + +template <> +inline __device__ float mul(float a, float b) { + return a * b; +} + +template <> +inline __device__ float2 mul(float2 a, float2 b) { + float2 c; + c.x = a.x * b.x; + c.y = a.y * b.y; + return c; +} + +template <> +inline __device__ float4 mul(float4 a, float4 b) { + float4 c; + c.x = a.x * b.x; + c.y = a.y * b.y; + c.z = a.z * b.z; + c.w = a.w * b.w; + return c; +} + +template <> +inline __device__ uint16_t mul(uint16_t a, uint16_t b) { + uint16_t c; + asm volatile("mul.f16 %0, %1, %2;\n" : "=h"(c) : "h"(a), "h"(b)); + return c; +} + +template <> +inline __device__ uint32_t mul(uint32_t a, uint32_t b) { + uint32_t c; + asm volatile("mul.f16x2 %0, %1, %2;\n" : "=r"(c) : "r"(a), "r"(b)); + return c; +} + +template <> +inline __device__ uint2 mul(uint2 a, uint2 b) { + uint2 c; + c.x = mul(a.x, b.x); + c.y = mul(a.y, b.y); + return c; +} + +template <> +inline __device__ uint4 mul(uint4 a, uint4 b) { + uint4 c; + c.x = mul(a.x, b.x); + c.y = mul(a.y, b.y); + c.z = mul(a.z, b.z); + c.w = mul(a.w, b.w); + return c; +} + +template <> +inline __device__ uint32_t mul(uint32_t a, float b) { + float2 tmp = half2_to_float2(a); + float2 tmp_res; + tmp_res.x = tmp.x * b; + tmp_res.y = tmp.y * b; + uint32_t res = float2_to_half2(tmp_res); + return res; +} + +template <> +inline __device__ uint2 mul(uint2 a, float b) { + uint2 res; + res.x = mul(a.x, b); + res.y = mul(a.y, b); + return res; +} + +template <> +inline __device__ uint4 mul(uint4 a, float b) { + uint4 res; + res.x = mul(a.x, b); + res.y = mul(a.y, b); + res.z = mul(a.z, b); + res.w = mul(a.w, b); + return res; +} + +template <> +inline __device__ float2 mul(float2 a, float b) { + float2 res; + res.x = a.x * b; + res.y = a.y * b; + return res; +} + +template <> +inline __device__ float4 mul(float4 a, float b) { + float4 res; + res.x = a.x * b; + res.y = a.y * b; + res.z = a.z * b; + res.w = a.w * b; + return res; +} + +inline __device__ float sum(float v) { return v; } +inline __device__ float sum(float2 v) { return v.x + v.y; } +inline __device__ float sum(float4 v) { return v.x + v.y + v.z + v.w; } +inline __device__ float sum(uint16_t v) { return half_to_float(v); } +inline __device__ float sum(uint32_t v) { + float2 tmp = half2_to_float2(v); + return tmp.x + tmp.y; +} + +inline __device__ float sum(uint2 v) { + uint32_t c = add(v.x, v.y); + return sum(c); +} + +inline __device__ float sum(uint4 v) { + uint32_t c = add(v.x, v.y); + c = add(c, v.z); + c = add(c, v.w); + return sum(c); +} + +template +inline __device__ float dot(T a, T b) { + return sum(mul(a, b)); +} + +template +inline __device__ float dot(T a, T b) { + return sum(mul(a, b)); +} + +inline 
__device__ constexpr uint32_t shfl_mask(int threads) { + return threads == 32 ? uint32_t(-1) : (1u << threads) - 1u; +} + +template +inline __device__ __host__ T div_up(T m, T n) { + return (m + n - 1) / n; +} + +inline __device__ float fma(float a, float b, float c) { return a * b + c; } + +inline __device__ float2 fma(float2 a, float2 b, float2 c) { + float2 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +inline __device__ float4 fma(float4 a, float4 b, float4 c) { + float4 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) { + uint32_t d; + asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" + : "=r"(d) + : "r"(a), "r"(b), "r"(c)); + return d; +} + +inline __device__ uint2 fma(uint2 a, uint2 b, uint2 c) { + uint2 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + return d; +} + +inline __device__ uint4 fma(uint4 a, uint4 b, uint4 c) { + uint4 d; + d.x = fma(a.x, b.x, c.x); + d.y = fma(a.y, b.y, c.y); + d.z = fma(a.z, b.z, c.z); + d.w = fma(a.w, b.w, c.w); + return d; +} + +inline __device__ float2 fma(float a, float2 b, float2 c) { + float2 d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + return d; +} + +inline __device__ float4 fma(float a, float4 b, float4 c) { + float4 d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + d.z = fma(a, b.z, c.z); + d.w = fma(a, b.w, c.w); + return d; +} + +inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) { + Float8_ d; + d.x = fma(a, b.x, c.x); + d.y = fma(a, b.y, c.y); + d.z = fma(a, b.z, c.z); + d.w = fma(a, b.w, c.w); + return d; +} + +inline __device__ uint32_t h0_h0(uint16_t a) { + uint32_t b; + asm volatile("mov.b32 %0, {%1, %1};" : "=r"(b) : "h"(a)); + return b; +} + +inline __device__ uint32_t fma(uint16_t a, uint32_t b, uint32_t c) { + return fma(h0_h0(a), b, c); +} + +inline __device__ uint2 fma(uint16_t a, uint2 b, uint2 c) { + uint32_t s = h0_h0(a); + uint2 d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + return d; +} + +inline __device__ uint4 fma(uint16_t a, uint4 b, uint4 c) { + uint32_t s = h0_h0(a); + uint4 d; + d.x = fma(s, b.x, c.x); + d.y = fma(s, b.y, c.y); + d.z = fma(s, b.z, c.z); + d.w = fma(s, b.w, c.w); + return d; +} + +inline __device__ float cast_to_float(float u) { return u; } + +inline __device__ float2 cast_to_float(float2 u) { return u; } + +inline __device__ float4 cast_to_float(float4 u) { return u; } + +inline __device__ Float8_ cast_to_float(uint4 u) { + Float8_ tmp; + tmp.x = half2_to_float2(u.x); + tmp.y = half2_to_float2(u.y); + tmp.z = half2_to_float2(u.z); + tmp.w = half2_to_float2(u.w); + return tmp; +} + +template +inline __device__ float qk_dot_(const K_vec (&q)[N], + const K_vec (&k)[N], + float inv_sqrt_dh) { + K_vec inv_q = mul(q[0], inv_sqrt_dh); + K_vec qk_vec = mul(inv_q, k[0]); +#pragma unroll + for (int ii = 1; ii < N; ++ii) { + inv_q = mul(q[ii], inv_sqrt_dh); + qk_vec = fma(inv_q, k[ii], qk_vec); + } + + float qk = sum(qk_vec); +#pragma unroll + for (int mask = THREADS_PER_KEY / 2; mask >= 1; mask /= 2) { + qk += __shfl_xor_sync(uint32_t(-1), qk, mask); + } + return qk; +} + +template +struct Qk_dot { + template + static inline __device__ float dot(const K_vec (&q)[N], + const K_vec (&k)[N], + float inv_sqrt_dh) { + return qk_dot_(q, k, inv_sqrt_dh); + } +}; + +template +inline __device__ float block_sum(float *red_smem, float sum) { + int warp = threadIdx.x / WARP_SIZE; + int lane = 
threadIdx.x % WARP_SIZE; + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= 1; mask /= 2) { + sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + } + + if (lane == 0) { + red_smem[warp] = sum; + } + __syncthreads(); + + if (lane < WARPS_PER_BLOCK) { + sum = red_smem[lane]; + } + +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + sum += __shfl_xor_sync(uint32_t(-1), sum, mask); + } + + return __shfl_sync(uint32_t(-1), sum, 0); +} + +inline __device__ void convert_from_float(float &dst, float src) { // NOLINT + dst = src; +} + +inline __device__ void convert_from_float(float4 &dst, float4 src) { // NOLINT + dst = src; +} + +inline __device__ void convert_from_float(plat::float16 &dst, // NOLINT + float src) { + dst = static_cast(src); +} + +inline __device__ void convert_from_float(uint4 &dst, Float8_ src) { // NOLINT + dst.x = float2_to_half2(src.x); + dst.y = float2_to_half2(src.y); + dst.z = float2_to_half2(src.z); + dst.w = float2_to_half2(src.w); +} + +inline __device__ void zero(uint16_t &dst) { dst = uint16_t(0); } // NOLINT + +template +inline __device__ void zero(T &dst) { // NOLINT + constexpr int WORDS = sizeof(T) / 4; + union { + T raw; + uint32_t words[WORDS]; + } tmp; +#pragma unroll + for (int ii = 0; ii < WORDS; ++ii) { + tmp.words[ii] = 0u; + } + dst = tmp.raw; +} + +template +__global__ void masked_multihead_attention_kernel( + Masked_multihead_attention_params params) { +#if CUDA_ARCH_FP16_SUPPORTED(__CUDA_ARCH__) + + static_assert(Dh_MAX % THREADS_PER_KEY == 0, ""); + static_assert(Dh_MAX % THREADS_PER_VALUE == 0, ""); + + constexpr int WARP_SIZE = 32; + constexpr int WARPS_PER_BLOCK = THREADS_PER_BLOCK / WARP_SIZE; + + extern __shared__ char smem_[]; + + float *qk_smem = reinterpret_cast(smem_); + + char *logits_smem_ = smem_; + // fp32 accum for logits + float *logits_smem = reinterpret_cast(logits_smem_); + + T *out_smem = reinterpret_cast(smem_); + + __shared__ float red_smem[WARPS_PER_BLOCK * 2]; + using Qk_vec = typename Qk_vec_::Type; + __shared__ __align__(sizeof(Qk_vec)) T q_smem[Dh_MAX]; + + const int bi = blockIdx.y; + const int hi = blockIdx.x; + const int bhi = bi * params.num_head + hi; + const int tid = threadIdx.x; + + float qk_max = -FLT_MAX; + float qk = 0; + + // qkv [B, S=1, 3, num_head, head_dim] + int qkv_base_offset = bi * 3 * params.num_head * Dh + hi * Dh; + + constexpr int QK_VEC_SIZE = sizeof(Qk_vec) / sizeof(T); + static_assert(Dh_MAX % QK_VEC_SIZE == 0, ""); + // Use block reduction if needed + // static_assert(Dh_MAX / QK_VEC_SIZE <= WARP_SIZE, ""); + constexpr int QK_VECS_PER_WARP = Dh_MAX / QK_VEC_SIZE; + + // cache_k, [B, num_head, head_dim / x, max_seq_len, x] + // x == 4/8 for FP32/FP16, 128bit, 16Byte + constexpr int QK_ELTS_IN_16B = 16 / sizeof(T); + constexpr int QK_VECS_IN_16B = 16 / sizeof(Qk_vec); + + const T *q_base = params.qkv; + const T *k_base = params.qkv + params.num_head * Dh; + const T *q_bias_base = params.qkv_bias; + const T *k_bias_base = params.qkv_bias + params.num_head * Dh; + + if (tid < QK_VECS_PER_WARP) { + int qk_offset = qkv_base_offset + tid * QK_VEC_SIZE; + int qk_bias_offset = hi * Dh + tid * QK_VEC_SIZE; + + Qk_vec q; + zero(q); + q = (Dh == Dh_MAX || tid * QK_VEC_SIZE < Dh) + ? *reinterpret_cast(&q_base[qk_offset]) + : q; + Qk_vec k; + zero(k); + k = (Dh == Dh_MAX || tid * QK_VEC_SIZE < Dh) + ? *reinterpret_cast(&k_base[qk_offset]) + : k; + + Qk_vec q_bias; + zero(q_bias); + q_bias = + (Dh == Dh_MAX || tid * QK_VEC_SIZE < Dh) + ? 
*reinterpret_cast(&q_bias_base[qk_bias_offset]) + : q_bias; + Qk_vec k_bias; + zero(k_bias); + k_bias = + (Dh == Dh_MAX || tid * QK_VEC_SIZE < Dh) + ? *reinterpret_cast(&k_bias_base[qk_bias_offset]) + : k_bias; + + q = add(q, q_bias); + // TODO(wangxi): See this https://github.com/microsoft/unilm/issues/510 + // we may not require k_bias. + k = add(k, k_bias); + + *reinterpret_cast(&q_smem[tid * QK_VEC_SIZE]) = q; + + int co = tid / QK_VECS_IN_16B; + int ci = (tid % QK_VECS_IN_16B) * QK_VEC_SIZE; + int offset = bhi * params.max_seq_length * Dh + + co * params.max_seq_length * QK_ELTS_IN_16B + + params.timestep * QK_ELTS_IN_16B + ci; + if (Dh == Dh_MAX || co < Dh / QK_ELTS_IN_16B) { + *reinterpret_cast(¶ms.cache_kv[offset]) = k; + } + + qk = dot(q, k); + + if (QK_VECS_PER_WARP <= WARP_SIZE) { +#pragma unroll + for (int mask = QK_VECS_PER_WARP / 2; mask >= 1; mask /= 2) { + qk += __shfl_xor_sync(shfl_mask(QK_VECS_PER_WARP), qk, mask); + } + } + } + if (QK_VECS_PER_WARP > WARP_SIZE) { + constexpr int WARPS_PER_RED = + (QK_VECS_PER_WARP + WARP_SIZE - 1) / WARP_SIZE; + qk = block_sum(&red_smem[WARPS_PER_RED], qk); + } + if (tid == 0) { + // NOTE(wangxi): mask must be 0.0 + // T mask = params.attn_mask[ + // bi * (params.timestep + 1) + params.timestep]; + // qk += static_cast(mask); + qk *= params.inv_sqrt_dh; + qk_max = qk; + qk_smem[params.timestep] = qk; + } + __syncthreads(); + +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + if (bi == 0 && hi == 0 && tid == 0) { + printf("=======q_out=======\n"); + for (int i = 0; i < Dh; ++i) printf("%f ", static_cast(q_smem[i])); + printf("\n"); + } + __syncthreads(); +#endif + + using K_vec = typename K_vec_::Type; + constexpr int K_VEC_SIZE = sizeof(K_vec) / sizeof(T); + static_assert(Dh_MAX % K_VEC_SIZE == 0, ""); + constexpr int K_ELTS_PER_THREAD = Dh_MAX / THREADS_PER_KEY; + constexpr int K_VECS_PER_THREAD = K_ELTS_PER_THREAD / K_VEC_SIZE; + + int ko = tid / THREADS_PER_KEY; + int ki = (tid % THREADS_PER_KEY) * K_VEC_SIZE; + + static_assert(Dh_MAX == THREADS_PER_KEY * K_VEC_SIZE * K_VECS_PER_THREAD, ""); + + K_vec q[K_VECS_PER_THREAD]; +#pragma unroll + for (int i = 0; i < K_VECS_PER_THREAD; ++i) { + q[i] = *reinterpret_cast( + &q_smem[ki + i * THREADS_PER_KEY * K_VEC_SIZE]); + } + + constexpr int K_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_KEY; + constexpr int K_PER_WARP = WARP_SIZE / THREADS_PER_KEY; + + T *k_cache = ¶ms.cache_kv[bhi * params.max_seq_length * Dh + ki]; + int ti_end = div_up(params.timestep, K_PER_WARP) * K_PER_WARP; + + for (int ti = ko; ti < ti_end; ti += K_PER_ITER) { + K_vec k[K_VECS_PER_THREAD]; + K_vec k_vec_zero; + zero(k_vec_zero); +#pragma unroll + for (int ii = 0; ii < K_VECS_PER_THREAD; ++ii) { + int jj = ii * params.max_seq_length + ti; + if (ti < params.timestep) { + k[ii] = + (Dh == Dh_MAX || jj * QK_ELTS_IN_16B < Dh * params.max_seq_length) + ? *reinterpret_cast( + &k_cache[jj * QK_ELTS_IN_16B]) + : k_vec_zero; + } + } + + // NOTE(liyurui): We should multiple q with inv_sqrt_dh first, for dot(q, k) + // may overflow with FP16 in large model. + float qk = Qk_dot::dot(q, k, params.inv_sqrt_dh); + + // bool is_mask = false; + if (ti < params.timestep && tid % THREADS_PER_KEY == 0) { + // qk_max = is_mask ? 
qk_max : fmaxf(qk_max, qk); + T mask = params.attn_mask[bi * (params.timestep + 1) + ti]; + qk += static_cast(mask); + qk_max = fmaxf(qk_max, qk); + + qk_smem[ti] = qk; + } + } + +#pragma unroll + for (int mask = WARP_SIZE / 2; mask >= THREADS_PER_KEY; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + const int warp = tid / WARP_SIZE; + const int lane = tid % WARP_SIZE; + + if (lane == 0) { + red_smem[warp] = qk_max; + } + + __syncthreads(); + + qk_max = lane < WARPS_PER_BLOCK ? red_smem[lane] : -FLT_MAX; +#pragma unroll + for (int mask = WARPS_PER_BLOCK / 2; mask >= 1; mask /= 2) { + qk_max = fmaxf(qk_max, __shfl_xor_sync(uint32_t(-1), qk_max, mask)); + } + + qk_max = __shfl_sync(uint32_t(-1), qk_max, 0); + +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + if (bi == 0 && hi == 0 && tid == 0) { + printf("=======qk_out=======\n"); + for (int i = 0; i <= params.timestep; ++i) printf("%f ", qk_smem[i]); + printf("qk_max=%f\n", qk_max); + } + __syncthreads(); +#endif + + float sum = 0.f; + for (int ti = tid; ti <= params.timestep; ti += THREADS_PER_BLOCK) { + // bool is_mask = false; + // float logit = is_mask ? 0.f : __expf(qk_smem[ti] - qk_max); + float logit = __expf(qk_smem[ti] - qk_max); + sum += logit; + qk_smem[ti] = logit; + } + + sum = block_sum(&red_smem[WARPS_PER_BLOCK], sum); + + // FIXME(wangxi): need add 1.e-6f? + float inv_sum = __fdividef(1.f, sum + 1.e-6f); + for (int ti = tid; ti <= params.timestep; ti += THREADS_PER_BLOCK) { + convert_from_float(logits_smem[ti], qk_smem[ti] * inv_sum); + } + __syncthreads(); + + constexpr int V_VEC_SIZE = Dh_MAX / THREADS_PER_VALUE; + using V_vec = typename V_vec_::Type; + + int vo = tid / THREADS_PER_VALUE; + int vi = (tid % THREADS_PER_VALUE) * V_VEC_SIZE; + + T *v_cache = ¶ms.cache_kv[params.batch_size * params.num_head * + params.max_seq_length * Dh + + bhi * params.max_seq_length * Dh + vi]; + +#ifdef MMHA_USE_FP32_ACUM_FOR_OUT + using V_vec_acum = typename V_vec_acum_fp32_::Type; +#else + using V_vec_acum = V_vec; +#endif + + V_vec_acum out; + zero(out); + + constexpr int V_PER_ITER = THREADS_PER_BLOCK / THREADS_PER_VALUE; + if (Dh == Dh_MAX || vi < Dh) { + for (int ti = vo; ti < params.timestep; ti += V_PER_ITER) { + V_vec v = *reinterpret_cast(&v_cache[ti * Dh]); +#if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) + float logit = logits_smem[ti]; + out = fma(logit, cast_to_float(v), out); +#else + T logit = logits_smem[ti]; + // Update the partial sums. 
+ out = fma(logit, v, out); +#endif + } + } + +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + if (bi == 0 && hi == 0 && tid == 0) { + printf("======logits_out=====\n"); + for (int i = 0; i <= params.timestep; ++i) printf("%f ", logits_smem[i]); + printf("\n"); + } + __syncthreads(); +#endif + + V_vec v_bias; + zero(v_bias); + if (vo == (params.timestep % V_PER_ITER) && (Dh == Dh_MAX || vi < Dh)) { + V_vec v = *reinterpret_cast( + ¶ms.qkv[2 * params.num_head * Dh + qkv_base_offset + vi]); + v_bias = *reinterpret_cast( + ¶ms.qkv_bias[2 * params.num_head * Dh + hi * Dh + vi]); + v = add(v, v_bias); + *reinterpret_cast(&v_cache[params.timestep * Dh]) = v; + +#if defined(MMHA_USE_FP32_ACUM_FOR_LOGITS) + out = fma(logits_smem[params.timestep], cast_to_float(v), out); +#else + out = fma(logits_smem[params.timestep], v, out); +#endif + } + + __syncthreads(); + + if (Dh == Dh_MAX || vi < Dh) { +#pragma unroll + for (int active_groups = V_PER_ITER; active_groups >= 2; + active_groups /= 2) { + int midpoint = active_groups / 2; + + if (vo >= midpoint && vo < active_groups && (Dh == Dh_MAX || vi < Dh)) { +#ifdef MMHA_USE_FP32_ACUM_FOR_OUT + convert_from_float( + *reinterpret_cast(&out_smem[(vo - midpoint) * Dh + vi]), + out); +#else + *reinterpret_cast(&out_smem[(vo - midpoint) * Dh + vi]) = out; +#endif + } + __syncthreads(); + if (vo < midpoint && (Dh == Dh_MAX || vi < Dh)) { + out = + add(*reinterpret_cast(&out_smem[vo * Dh + vi]), out); + } + __syncthreads(); + } + } + + if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { +#ifdef MMHA_USE_FP32_ACUM_FOR_OUT + convert_from_float(*reinterpret_cast(¶ms.out[bhi * Dh + vi]), + out); +#else + *reinterpret_cast(¶ms.out[bhi * Dh + vi]) = out; +#endif + } + +#ifdef _DEBUG_FUSED_MULTI_TRANSFORMER + __syncthreads(); + if (bi == 0 && hi == 0 && tid == 0) { + printf("======fmha_out=====\n"); + for (int i = 0; i < Dh; ++i) + printf("%f ", static_cast(params.out[i])); + printf("\n"); + } +#endif +#else + assert(false); +#endif +} + +template +inline size_t smem_size_in_bytes( + const Masked_multihead_attention_params ¶ms, + int dim_head, + int threads_per_value, + int threads_per_block) { + size_t qk_sz = div_up(params.timestep + 1, 4) * 16; + size_t logits_sz = 0; + +#ifndef MMHA_USE_FP32_ACUM_FOR_LOGITS // NOLINT + if (sizeof(T) != 4) { + logits_sz = div_up(params.max_seq_length, 4) * 4 * sizeof(T); + } +#endif // NOLINT + size_t softmax_sz = qk_sz + logits_sz; + + int rows_per_red = threads_per_block / threads_per_value; + size_t red_sz = rows_per_red * dim_head * sizeof(T) / 2; + + return max(softmax_sz, red_sz); +} + +#define MMHA_LAUNCH_KERNEL( \ + T, Dh, Dh_MAX, THDS_PER_KEY, THDS_PER_VALUE, THDS_PER_BLOCK, stream) \ + size_t smem_sz = \ + smem_size_in_bytes(params, Dh, THDS_PER_VALUE, THDS_PER_BLOCK); \ + dim3 grid(params.num_head, params.batch_size); \ + masked_multihead_attention_kernel \ + <<>>(params) + +template +void fmha_launch_kernel(const Masked_multihead_attention_params ¶ms, + const cudaStream_t &stream) { + constexpr int THREADS_PER_VALUE = Dh_MAX * sizeof(T) / 16; + if (params.timestep < 32) { + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 4, THREADS_PER_VALUE, 64, stream); + } else if (params.timestep < 2048) { + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 2, THREADS_PER_VALUE, 128, stream); + } else { + MMHA_LAUNCH_KERNEL(T, Dh, Dh_MAX, 1, THREADS_PER_VALUE, 256, stream); + } +} + +template +void fmha(const phi::GPUContext &dev_ctx, + const Tensor &qkv_tensor, + const Tensor &qkv_bias_tensor, + const Tensor &src_mask_tensor, + Tensor *cache_kv_tensor, + Tensor *out_tensor, + 
int batch_size, + int max_seq_length, + int num_head, + int dim_head, + int timestep, + float inv_sqrt_dh) { + Masked_multihead_attention_params params; + params.out = out_tensor->data(); + params.qkv = qkv_tensor.data(); + params.qkv_bias = qkv_bias_tensor.data(); + params.attn_mask = src_mask_tensor.data(); + params.cache_kv = cache_kv_tensor->data(); + + params.batch_size = batch_size; + params.num_head = num_head; + params.timestep = timestep; + params.max_seq_length = max_seq_length; + params.inv_sqrt_dh = inv_sqrt_dh; + + switch (dim_head) { + case 10: + fmha_launch_kernel(params, dev_ctx.stream()); + break; + case 26: + fmha_launch_kernel(params, dev_ctx.stream()); + break; + case 32: + fmha_launch_kernel(params, dev_ctx.stream()); + break; + case 64: + fmha_launch_kernel(params, dev_ctx.stream()); + break; + case 96: + fmha_launch_kernel(params, dev_ctx.stream()); + break; + case 128: + fmha_launch_kernel(params, dev_ctx.stream()); + break; + case 192: + fmha_launch_kernel(params, dev_ctx.stream()); + break; + default: + PADDLE_THROW(platform::errors::Unimplemented( + "Dim_head = %d is unsupport!", dim_head)); + } +} + +// NOTE: simd with 16Bytes(128bit), float is 4, float16 is 8 +constexpr int VEC_16B = 16; + +template +__global__ void write_cache_k_kernel(T *cache_k, + const T *k, + const int num_head, + const int dim_head, + const int seq_len, + const int max_seq_len) { + const int bi = blockIdx.y; + const int hi = blockIdx.z; + constexpr int X_ELEMS = VEC_16B / sizeof(T); + + // [bsz, num_head, seq_len, dim_head/x, x] + auto k_src = reinterpret_cast( + k + bi * num_head * seq_len * dim_head + hi * seq_len * dim_head); + // [bsz, num_head, dim_head/x, max_seq_len, x] + auto k_dst = reinterpret_cast( + cache_k + bi * num_head * max_seq_len * dim_head + + hi * max_seq_len * dim_head); + + const int out_idx = blockIdx.x * blockDim.x + threadIdx.x; + // vec size + int dim_head_div_x = dim_head / X_ELEMS; + + // FIXME(wangxi): num_head is not need? 
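+  // grid.y/grid.z already select the (batch, head) pair, so out_idx only has
+  // to cover the per-head destination layout of [dim_head/x, max_seq_len]
+  // vectors; num_head does not appear in the bound.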
+ // if (out_idx >= num_head * dim_head_div_x * max_seq_len) return; + if (out_idx >= dim_head_div_x * max_seq_len) return; + + int idx = out_idx; + const int k_seq_len_id = idx % max_seq_len; + // idx = (idx - k_seq_len_id) / max_seq_len; + idx = idx / max_seq_len; + const int k_vec_id = idx % dim_head_div_x; + + if (k_seq_len_id < seq_len) { + k_dst[out_idx] = k_src[k_seq_len_id * dim_head_div_x + k_vec_id]; + } +} + +template +__global__ void write_cache_v_kernel(T *cache_v, + const T *v, + const int num_head, + const int dim_head, + const int seq_len, + const int max_seq_len) { + const int bi = blockIdx.y; + const int hi = blockIdx.z; + + // [bsz, num_head, seq_len, dim_head/x, x] + auto v_src = reinterpret_cast( + v + bi * num_head * seq_len * dim_head + hi * seq_len * dim_head); + // [bsz, num_head, max_seq_len, dim_head/x, x] + auto v_dst = reinterpret_cast( + cache_v + bi * num_head * max_seq_len * dim_head + + hi * max_seq_len * dim_head); + + const int idx = blockIdx.x * blockDim.x + threadIdx.x; + constexpr int X_ELEMS = VEC_16B / sizeof(T); + const int dim_head_div_x = dim_head / X_ELEMS; + + if (idx >= dim_head_div_x * seq_len) return; + + v_dst[idx] = v_src[idx]; +} + +template +void write_cache_kv(const phi::GPUContext &dev_ctx, + T *cache_k, + T *cache_v, + const T *k, + const T *v, + const int bsz, + const int num_head, + const int seq_len, + const int max_seq_len, + const int dim_head) { + constexpr int block_sz = 128; + constexpr int x = VEC_16B / sizeof(T); + + assert(dim_head % x == 0); + PADDLE_ENFORCE_EQ( + dim_head % x, + 0, + platform::errors::PreconditionNotMet( + "dim_head=%d must be divisible by vec_size=%d", dim_head, x)); + + int max_size = max_seq_len * dim_head / x; + int size = seq_len * dim_head / x; + dim3 grid(div_up(max_size, block_sz), bsz, num_head); + dim3 grid_v(div_up(size, block_sz), bsz, num_head); + + // transpose [bsz, num_head, seq_len, dim_head/x, x]-> + // [bsz, num_head, dim_head/x, max_seq_len, x] + write_cache_k_kernel<<>>( + cache_k, k, num_head, dim_head, seq_len, max_seq_len); + + // copy [bsz, num_head, seq_len, dim_head/x, x]-> + // [bsz, num_head, max_seq_len, dim_head/x, x] + write_cache_v_kernel<<>>( + cache_v, v, num_head, dim_head, seq_len, max_seq_len); +} + +} // namespace + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h index c1131cae5d86f7..f162d200abfe1e 100644 --- a/paddle/fluid/operators/fused/fused_residual_dropout_bias.h +++ b/paddle/fluid/operators/fused/fused_residual_dropout_bias.h @@ -28,7 +28,9 @@ template + typename Functor, + typename InType = T, + typename OutType = T> __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( const int row_id, const int col_id, @@ -36,30 +38,45 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( curandStatePhilox4_32_10_t *state, const float dropout_prob, const T factor, - const T *__restrict__ src, + const InType *__restrict__ src, const T *__restrict__ residual, const T *__restrict__ bias, - T *dst, + OutType *dst, MaskType *mask, const bool is_test, typename details::MPTypeTrait::Type *mean_val, typename details::MPTypeTrait::Type *var_val, - Functor act_func) { + Functor act_func, + const float quant_last_in_scale = 1.0, + const float *dequant_out_scale_data = nullptr, + const int quant_out_scale_offset = 0, + const float quant_next_in_scale = 1.0, + const int quant_round_type = 1, + const float quant_max_bound = 
127.0, + const float quant_min_bound = -127.0) { using LoadT = phi::AlignedVector; + using LoadInType = phi::AlignedVector; + using LoadFloat = phi::AlignedVector; using StoreT = phi::AlignedVector; + using StoreOutType = phi::AlignedVector; + using MaskStoreT = phi::AlignedVector; using U = typename details::MPTypeTrait::Type; - LoadT src_vec; + LoadInType src_vec; LoadT residual_vec; LoadT bias_vec; + LoadFloat quant_out_scale_vec; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { bias_vec[ii] = static_cast(0); residual_vec[ii] = static_cast(0); } // vectorize load data from global - phi::Load(&src[row_id * cols + col_id], &src_vec); + phi::Load(&src[row_id * cols + col_id], &src_vec); + phi::Load( + &dequant_out_scale_data[quant_out_scale_offset + col_id], + &quant_out_scale_vec); if (residual) { phi::Load(&residual[row_id * cols + col_id], &residual_vec); } @@ -84,10 +101,18 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( } StoreT dest_vec; + StoreOutType dest_vec_out_type; #pragma unroll for (int ii = 0; ii < VecSize; ii++) { - T tmp = src_vec[ii] + bias_vec[ii]; + T tmp; + if (std::is_same::value) { + T tmp0 = static_cast(static_cast(src_vec[ii]) * + quant_last_in_scale / quant_out_scale_vec[ii]); + tmp = tmp0 + bias_vec[ii]; + } else { + tmp = static_cast(src_vec[ii]) + bias_vec[ii]; + } if (Activation) { tmp = act_func(tmp); } @@ -98,10 +123,23 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( *mean_val += tmp; *var_val += (tmp * tmp); } + if (std::is_same::value) { + dest_vec_out_type[ii] = quant_helper(dest_vec[ii], + quant_next_in_scale, + quant_round_type, + quant_max_bound, + quant_min_bound); + } } // store result to global - phi::Store(dest_vec, &dst[row_id * cols + col_id]); + if (std::is_same::value) { + phi::Store(dest_vec_out_type, + &dst[row_id * cols + col_id]); + } else { + phi::Store(dest_vec, + reinterpret_cast(&dst[row_id * cols + col_id])); + } if (!is_test) { phi::Store(mask_vec, &mask[row_id * cols + col_id]); } @@ -114,19 +152,28 @@ __forceinline__ __device__ void FusedResidualDropoutBiasOneThread( * is_test: only used in inference * mask: can be null if is_test=true */ -template -__global__ void FusedResidualDropoutBias(const size_t rows, - const size_t cols, - uint64_t seed, - const float dropout_prob, - const bool is_upscale_in_train, - const T *__restrict__ src, - const T *__restrict__ residual, - const T *__restrict__ bias, - MaskType *mask, - T *dst, - uint64_t increment, - const bool is_test) { +template +__global__ void FusedResidualDropoutBias( + const size_t rows, + const size_t cols, + uint64_t seed, + const float dropout_prob, + const bool is_upscale_in_train, + const InType *__restrict__ src, + const T *__restrict__ residual, + const T *__restrict__ bias, + MaskType *mask, + OutType *dst, + uint64_t increment, + const bool is_test, + const float quant_last_in_scale = 1.0, + const float *dequant_out_scale_data = nullptr, + const int quant_out_scale_offset = 0, + const float quant_next_in_scale = 1.0) { int col_id = blockDim.x * blockIdx.x + threadIdx.x; int row_id = blockIdx.y; int idx = row_id * cols + col_id; @@ -142,22 +189,27 @@ __global__ void FusedResidualDropoutBias(const size_t rows, VecSize, false, false, - phi::funcs::ReluFunctor>( - r, - i, - cols, - &state, - dropout_prob, - factor, - src, - residual, - bias, - dst, - mask, - is_test, - nullptr, - nullptr, - relu); + phi::funcs::ReluFunctor, + InType, + OutType>(r, + i, + cols, + &state, + dropout_prob, + factor, + src, + residual, + bias, + 
dst, + mask, + is_test, + nullptr, + nullptr, + relu, + quant_last_in_scale, + dequant_out_scale_data, + quant_out_scale_offset, + quant_next_in_scale); } } } @@ -165,7 +217,10 @@ __global__ void FusedResidualDropoutBias(const size_t rows, /** * @brief dst = residual + dropout(src + bias); */ -template +template void LaunchResidualDropoutBias(const uint32_t rows, const uint32_t cols, const int increment, @@ -173,14 +228,19 @@ void LaunchResidualDropoutBias(const uint32_t rows, const float dropout_prob, const bool is_test, bool is_upscale_in_train, - const T *src, + const InType *src, const T *residual, const T *bias, MaskType *mask_data, - T *dst, - const phi::GPUContext &ctx) { + OutType *dst, + const phi::GPUContext &ctx, + const float quant_last_in_scale = 1.0, + const float *dequant_out_scale_data = nullptr, + const int quant_out_scale_offset = 0, + const float quant_next_in_scale = 1.0) { // dropout_prob == 1.0f if (std::abs(dropout_prob - 1.0f) < 1e-5) { + // NOTE(minghaoBD): OutType should be T if dropout_prob == 1.0 if (residual == dst) return; if (residual) { memory::Copy(ctx.GetPlace(), @@ -202,7 +262,7 @@ void LaunchResidualDropoutBias(const uint32_t rows, const int real_vec_size = cols % VecSize == 0 ? VecSize : 1; auto config = Get1DBlocksAnd2DGrids(ctx, rows, cols, real_vec_size); if (cols % VecSize == 0) { - FusedResidualDropoutBias + FusedResidualDropoutBias <<>>( rows, cols, @@ -215,9 +275,13 @@ void LaunchResidualDropoutBias(const uint32_t rows, mask_data, dst, increment, - is_test); + is_test, + quant_last_in_scale, + dequant_out_scale_data, + quant_out_scale_offset, + quant_next_in_scale); } else { - FusedResidualDropoutBias + FusedResidualDropoutBias <<>>( rows, cols, @@ -230,7 +294,11 @@ void LaunchResidualDropoutBias(const uint32_t rows, mask_data, dst, increment, - is_test); + is_test, + quant_last_in_scale, + dequant_out_scale_data, + quant_out_scale_offset, + quant_next_in_scale); } } diff --git a/paddle/fluid/operators/fused/quant_dequant_kernel.h b/paddle/fluid/operators/fused/quant_dequant_kernel.h new file mode 100644 index 00000000000000..21b7b0f345466e --- /dev/null +++ b/paddle/fluid/operators/fused/quant_dequant_kernel.h @@ -0,0 +1,136 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include "paddle/fluid/operators/fake_quantize_op.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" +#include "paddle/fluid/platform/float16.h" + +namespace paddle { +namespace operators { + +template +__forceinline__ __device__ int8_t quant_helper(const T input, + const float scale, + const int round_type, + const float max_bound, + const float min_bound) { + float quant_value = max_bound * inverse(scale) * static_cast(input); + if (round_type == 0) { + quant_value = static_cast(roundWithTiesToEven(quant_value)); + } else { + quant_value = static_cast(round(quant_value)); + } + quant_value = quant_value > max_bound ? max_bound : quant_value; + quant_value = quant_value < min_bound ? 
min_bound : quant_value; + return static_cast(quant_value); +} + +template +__global__ void quantize_kernel(const T* input, + char4* output, + const float scale, + const int m, + const int n, + const int round_type, + const float max_bound, + const float min_bound) { + int n_id = (blockIdx.x * blockDim.x + threadIdx.x) << 2; + int m_id = blockIdx.y * blockDim.y + threadIdx.y; + + bool check = ((m_id < m) && (n_id < n)); + if (check) { + char4 tmp; + tmp.x = quant_helper( + input[m_id * n + n_id], scale, round_type, max_bound, min_bound); + tmp.y = quant_helper( + input[m_id * n + n_id + 1], scale, round_type, max_bound, min_bound); + tmp.z = quant_helper( + input[m_id * n + n_id + 2], scale, round_type, max_bound, min_bound); + tmp.w = quant_helper( + input[m_id * n + n_id + 3], scale, round_type, max_bound, min_bound); + output[(m_id * n + n_id) >> 2] = tmp; + } +} + +template +void quantize_kernel_launcher(const T* input, + int8_t* output, + const float scale, + const int m, + const int n, + const int round_type, + const float max_bound, + const float min_bound, + gpuStream_t stream) { + // TODO(minghaoBD): optimize the kennel launch times when m==1 or n==1 + dim3 grid((n + 31) / 32, (m + 31) / 32); + dim3 block(32, 32); + + quantize_kernel<<>>(input, + (char4*)output, // NOLINT + scale, + m, + n, + round_type, + max_bound, + min_bound); +} + +// dequantize using weight scales and input scales +template +__global__ void dequantize_kernel(T* output, + const int32_t* input, + const int m, // hidden + const int n, // batch size + const float quant_in_scale, + const float* dequant_out_scale_data, + const int quant_out_scale_offset) { + int m_id = blockIdx.x * blockDim.x + threadIdx.x; // hidden + int n_id = blockIdx.y * blockDim.y + threadIdx.y; // batch size + + bool check = ((m_id < m) && (n_id < n)); + if (check) { + float out_scale = dequant_out_scale_data[quant_out_scale_offset + m_id]; + output[n_id * m + m_id] = + static_cast(static_cast(input[n_id * m + m_id]) * + quant_in_scale / out_scale); + } +} + +template +void dequantize_kernel_launcher(const int32_t* input, + T* output, + const int batch_size, // m + const int hidden_units, // n + gpuStream_t stream, + const float quant_in_scale, + const float* dequant_out_scale_data, + const int quant_out_scale_offset) { + dim3 grid((hidden_units + 31) / 32, (batch_size + 31) / 32); + dim3 block(32, 32); + + dequantize_kernel<<>>(output, + input, + hidden_units, + batch_size, + quant_in_scale, + dequant_out_scale_data, + quant_out_scale_offset); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cc b/paddle/fluid/operators/fused_softmax_mask_op.cc index 11c1fa4af85603..604eaaaf3fc7cf 100644 --- a/paddle/fluid/operators/fused_softmax_mask_op.cc +++ b/paddle/fluid/operators/fused_softmax_mask_op.cc @@ -11,10 +11,14 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/fused_softmax_mask_op.h" #include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/infershape_utils.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -23,30 +27,6 @@ using framework::Tensor; class SoftmaxMaskFuseOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SoftmaxMaskFuse"); - OP_INOUT_CHECK(ctx->HasInput("Mask"), "Input", "Mask", "SoftmaxMaskFuse"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "SoftmaxMaskFuse"); - auto x_dims = ctx->GetInputDim("X"); - auto mask_dims = ctx->GetInputDim("Mask"); - - PADDLE_ENFORCE_EQ( - x_dims.size(), - 4, - platform::errors::InvalidArgument("Input x must be in 4D dimension but " - "received the dimension of X is %d", - x_dims.size())); - PADDLE_ENFORCE_EQ(mask_dims.size(), - 4, - platform::errors::InvalidArgument( - "Input mask must be in 4D dimension but " - "received the dimension of mask is %d", - mask_dims.size())); - - ctx->SetOutputDim("Out", x_dims); - ctx->ShareLoD("X", "Out"); - } }; class SoftmaxMaskFuseOpMaker : public framework::OpProtoAndCheckerMaker { @@ -80,17 +60,6 @@ By doing this fusion, we can optimize the training by class SoftmaxMaskFuseOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput(framework::GradVarName("Out")), - "Input", - framework::GradVarName("Out"), - "SoftmaxMaskFuseGrad"); - - auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); - ctx->SetOutputDim(framework::GradVarName("X"), out_dims); - ctx->ShareLoD(framework::GradVarName("Out"), framework::GradVarName("X")); - } }; template @@ -111,12 +80,18 @@ class SoftmaxMaskFuseGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(fused_softmax_mask, + SoftmaxMaskFuseInferShapeFunctor, + PD_INFER_META(phi::SoftmaxMaskFuseInferMeta)); +DECLARE_INFER_SHAPE_FUNCTOR(fused_softmax_mask_grad, + SoftmaxMaskFuseGradInferShapeFunctor, + PD_INFER_META(phi::GeneralUnaryGradInferMeta)); REGISTER_OPERATOR(fused_softmax_mask, ops::SoftmaxMaskFuseOp, ops::SoftmaxMaskFuseOpMaker, ops::SoftmaxMaskFuseGradOpMaker, - ops::SoftmaxMaskFuseGradOpMaker); -REGISTER_OPERATOR(fused_softmax_mask_grad, ops::SoftmaxMaskFuseOpGrad); -REGISTER_OP_CPU_KERNEL(fused_softmax_mask, - ops::SoftmaxMaskFuseCPUKernel, - ops::SoftmaxMaskFuseCPUKernel); + ops::SoftmaxMaskFuseGradOpMaker, + SoftmaxMaskFuseInferShapeFunctor); +REGISTER_OPERATOR(fused_softmax_mask_grad, + ops::SoftmaxMaskFuseOpGrad, + SoftmaxMaskFuseGradInferShapeFunctor); diff --git a/paddle/fluid/operators/fused_softmax_mask_op.cu b/paddle/fluid/operators/fused_softmax_mask_op.cu deleted file mode 100644 index c259d0efb490b2..00000000000000 --- a/paddle/fluid/operators/fused_softmax_mask_op.cu +++ /dev/null @@ -1,595 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -// this file is inspired by: -// https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/fused_kernels/scaled_masked_softmax.h -/* Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#ifdef PADDLE_WITH_CUDA -#include -#include -#endif -#ifdef PADDLE_WITH_HIP -#include -#include -#endif -#include -#include -#include -#include - -#include -#include - -#include "paddle/fluid/framework/generator.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/fused_softmax_mask_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; - -#ifdef PADDLE_WITH_HIP -#define WARP_SIZE 64 -#else -#define WARP_SIZE 32 -#endif - -#define MASK 0xffffffff - -namespace plat = paddle::platform; - -__device__ __inline__ void load_data(plat::float16* dst, - const plat::float16* src) { - *(reinterpret_cast(dst)) = *(reinterpret_cast(src)); -} - -__device__ __inline__ void load_data(float* dst, const float* src) { - *(reinterpret_cast(dst)) = *(reinterpret_cast(src)); -} - -int get_pow2(int value) { - // get next pow2 index - int pow2_index = 0; - while ((1 << pow2_index) < value) { - ++pow2_index; - } - return pow2_index; -} - -template -struct AddOP { - __device__ __forceinline__ T operator()(T a, T b) const { return a + b; } -}; - -template -struct MaxOP { - __device__ __forceinline__ T operator()(T a, T b) const { - return a < b ? b : a; - } -}; - -template -__device__ __forceinline__ T -warp_shfl_xor(T value, int laneMask, int width, unsigned int mask = MASK) { -#if CUDA_VERSION >= 9000 - return __shfl_xor_sync(mask, value, laneMask, width); -#else - return __shfl_xor(value, laneMask, width); -#endif -} - -template class ReduceOp> -__device__ __forceinline__ void warp_reduce(T* sum) { - ReduceOp r; -#pragma unroll - for (int offset = width / 2; offset > 0; offset /= 2) { -#pragma unroll - for (int i = 0; i < batch; ++i) { - T b = warp_shfl_xor(sum[i], offset, width); - sum[i] = r(sum[i], b); - } - } -} - -// T == fp16 -template -__global__ void SoftmaxMaskFuseGPUKernel(const T* x_data, - const T* mask_data, - T* y_data, - int batch_count, - int key_seq_len) { - // the forward gpu kernel - constexpr int next_pow2 = 1 << pow2_index; - constexpr int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; - constexpr int kLocalIterations = std::max(next_pow2 / warp_size, 4); - constexpr int kLocalBatchSize = (next_pow2 <= 128) ? 
2 : 1; - constexpr int kOneLoadingCounts = 4; - - int data_first_idx = - (blockDim.y * - (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z)) + - threadIdx.y) * - kLocalBatchSize; - - int mask_fist_idx = - (blockDim.y * (blockIdx.x + gridDim.x * blockIdx.z) + threadIdx.y) * - kLocalBatchSize; - - // batch_count might not be a multiple of kLocalBatchSize. Check how - // many batches have to computed within this WARP. - int local_batches = batch_count - data_first_idx; - if (local_batches > kLocalBatchSize) local_batches = kLocalBatchSize; - - // might be many batches per warp. compute the index within the batch - int local_idx = threadIdx.x; - - int x_offset = data_first_idx * key_seq_len + kOneLoadingCounts * local_idx; - int mask_offset = mask_fist_idx * key_seq_len + kOneLoadingCounts * local_idx; - x_data += x_offset; - mask_data += mask_offset; - y_data += x_offset; - - // using float for all inter compute - float data[kLocalBatchSize][kLocalIterations]; - T temp_data[kOneLoadingCounts]; - T temp_mask[kOneLoadingCounts]; - -#pragma unroll - for (int i = 0; i < kLocalBatchSize; ++i) { - int batch_data = (i >= local_batches) ? 0 : key_seq_len; - -#pragma unroll - for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int data_index = kOneLoadingCounts * local_idx + ii * warp_size; - - if (data_index < batch_data) { - int itr_idx = i * key_seq_len + ii * warp_size; - - // efficiently load data from global memory - load_data(temp_data, x_data + itr_idx); - load_data(temp_mask, mask_data + itr_idx); - -#pragma unroll - for (int counter = 0; counter < kOneLoadingCounts; ++counter) { - data[i][ii + counter] = static_cast(temp_data[counter]) + - static_cast(temp_mask[counter]); - } - } else { -#pragma unroll - for (int counter = 0; counter < kOneLoadingCounts; ++counter) { - data[i][ii + counter] = -std::numeric_limits::infinity(); - } - } - } - } - - // compute max_value - // max value for each batch for current warp - float samples_max_value[kLocalBatchSize]; -#pragma unroll - for (int i = 0; i < kLocalBatchSize; ++i) { - samples_max_value[i] = data[i][0]; -#pragma unroll - for (int ii = 1; ii < kLocalIterations; ++ii) { - samples_max_value[i] = (samples_max_value[i] > data[i][ii]) - ? 
samples_max_value[i] - : data[i][ii]; - } - } - // max value for each batch for all warp - warp_reduce(samples_max_value); - - // compute the sum for each batch for current warp - float samples_sum[kLocalBatchSize]{0.0f}; -#pragma unroll - for (int i = 0; i < kLocalBatchSize; ++i) { -#pragma unroll - for (int ii = 0; ii < kLocalIterations; ++ii) { - data[i][ii] = std::exp((data[i][ii] - samples_max_value[i])); - samples_sum[i] += data[i][ii]; - } - } - // samples_sum for each batch for all warp - warp_reduce(samples_sum); - - // load the result from device back to host - T samples_out[kOneLoadingCounts]; -#pragma unroll - for (int i = 0; i < kLocalBatchSize; ++i) { - if (i >= local_batches) break; -#pragma unroll - for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int idx = kOneLoadingCounts * local_idx + ii * warp_size; - if (idx < key_seq_len) { -#pragma unroll - for (int counter = 0; counter < kOneLoadingCounts; ++counter) { - samples_out[counter] = data[i][ii + counter] / samples_sum[i]; - } - load_data(y_data + i * key_seq_len + ii * warp_size, samples_out); - } else { - break; - } - } - } -} - -template -__global__ void SoftmaxMaskFuseGradGPUKernel(const T* grad_input, - T* grad_output, - const T* softmax_rst, - int batch_count, - int key_seq_len) { - constexpr int next_pow2 = 1 << pow2_index; - constexpr int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; - constexpr int kLocalIterations = std::max(next_pow2 / warp_size, 4); - constexpr int kLocalBatchSize = (next_pow2 <= 128) ? 2 : 1; - constexpr int kOneLoadingCounts = 4; - - int data_first_idx = - (blockDim.y * blockIdx.x + threadIdx.y) * kLocalBatchSize; - - // batch_count might not be a multiple of kLocalBatchSize. Check how - // many batches have to computed within this WARP. - int local_batches = batch_count - data_first_idx; - if (local_batches > kLocalBatchSize) local_batches = kLocalBatchSize; - - // might be many batches per warp. compute the index within the batch - int local_idx = threadIdx.x; - - // the first element to process by the current thread - int offset = data_first_idx * key_seq_len + kOneLoadingCounts * local_idx; - grad_input += offset; - grad_output += offset; - softmax_rst += offset; - - // using float for all inter compute - float grad_input_reg[kLocalBatchSize][kLocalIterations]{0.0f}; - float softmax_rst_reg[kLocalBatchSize][kLocalIterations]{0.0f}; - T temp_grad_input[kOneLoadingCounts]; - T temp_softmax_rst[kOneLoadingCounts]; - -#pragma unroll - for (int i = 0; i < kLocalBatchSize; ++i) { - int batch_data = (i >= local_batches) ? 
0 : key_seq_len; - -#pragma unroll - for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int data_index = kOneLoadingCounts * local_idx + ii * WARP_SIZE; - if (data_index < batch_data) { - load_data(temp_grad_input, - grad_input + i * key_seq_len + ii * warp_size); - load_data(temp_softmax_rst, - softmax_rst + i * key_seq_len + ii * warp_size); - -#pragma unroll - for (int counter = 0; counter < kOneLoadingCounts; ++counter) { - softmax_rst_reg[i][ii + counter] = - static_cast(temp_softmax_rst[counter]); - } -#pragma unroll - for (int counter = 0; counter < kOneLoadingCounts; ++counter) { - grad_input_reg[i][ii + counter] = - static_cast(temp_grad_input[counter]) * - softmax_rst_reg[i][ii + counter]; - } - } - } - } - - float samples_sum[kLocalBatchSize]; -#pragma unroll - for (int i = 0; i < kLocalBatchSize; ++i) { - samples_sum[i] = grad_input_reg[i][0]; -#pragma unroll - for (int ii = 1; ii < kLocalIterations; ++ii) { - samples_sum[i] += grad_input_reg[i][ii]; - } - } - warp_reduce(samples_sum); - -#pragma unroll - for (int i = 0; i < kLocalBatchSize; ++i) { - if (i >= local_batches) break; -#pragma unroll - for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int data_index = kOneLoadingCounts * local_idx + ii * warp_size; - if (data_index < key_seq_len) { - // compute gradients - T samples_out[kOneLoadingCounts]; -#pragma unroll - for (int counter = 0; counter < kOneLoadingCounts; ++counter) { - samples_out[counter] = - grad_input_reg[i][ii + counter] - - softmax_rst_reg[i][ii + counter] * samples_sum[i]; - } - load_data(grad_output + i * key_seq_len + ii * warp_size, samples_out); - } - } - } -} - -// T only supports fp16 -// leave as template only for future update -template -class SoftmaxMaskFuseKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* x = context.Input("X"); - auto* mask = context.Input("Mask"); - auto* y = context.Output("Out"); - - auto* x_data = x->data(); - auto* mask_data = mask->data(); - auto* y_data = y->mutable_data(context.GetPlace()); - - auto x_dim = x->dims(); - auto mask_dim = mask->dims(); - auto batches = x_dim[0]; - auto attn_heads = x_dim[1]; - auto query_seq_len = x_dim[2]; - auto key_seq_len = x_dim[3]; - - PADDLE_ENFORCE_GT(query_seq_len, - 1, - platform::errors::InvalidArgument( - "Input x's second last dim must be large than 1 but " - "received the second last dimension of x is %d", - query_seq_len)); - - PADDLE_ENFORCE_EQ(key_seq_len >= 32 && key_seq_len < 8192, - true, - platform::errors::InvalidArgument( - "Input x's last dim must be between [32, 8192) " - "received the last dimension of x is %d", - key_seq_len)); - - PADDLE_ENFORCE_EQ(mask_dim[1], - 1, - platform::errors::InvalidArgument( - "Input mask's second dim must be 1 " - "received the second dimension of mask is %d", - mask_dim[1])); - - // dim of x and mask must be equal - for (size_t idx = 0; idx < 4; ++idx) { - if (idx == 1) continue; - PADDLE_ENFORCE_EQ( - x_dim[idx], - mask_dim[idx], - platform::errors::InvalidArgument( - "Input x's %dth dim should be equal with input mask's %dth dim " - "but " - "received the %dth dimension of x and mask are not equal " - "the %dth dim of x is %d, while the %dth dim of mask is %d.", - idx, - idx, - idx, - idx, - x_dim[idx], - idx, - mask_dim[idx])); - } - - auto& place = *context.template device_context().eigen_device(); - auto stream = context.cuda_device_context().stream(); - - int pow2_index = get_pow2(key_seq_len); - const int 
next_pow2 = 1 << pow2_index; - int batch_count = batches * attn_heads * query_seq_len; - int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; - int batches_per_warp = (next_pow2 <= 128) ? 2 : 1; - // use 128 threads per block to maximum gpu utilization - constexpr int threads_per_block = 128; - - int warps_per_block = (threads_per_block / warp_size); - int batches_per_block = warps_per_block * batches_per_warp; - PADDLE_ENFORCE_EQ( - query_seq_len % batches_per_block, - 0, - platform::errors::InvalidArgument( - "The query seq len (third dim of input X) must can divide the " - "number of batches per block. The query seq len is %d, while " - "the number of batches per block is %d.", - query_seq_len, - batches_per_block)); - dim3 blocks(query_seq_len / batches_per_block, attn_heads, batches); - dim3 threads(warp_size, warps_per_block, 1); - - // launch the kernel based on the pow2_index - switch (pow2_index) { - case 5: // 32 - SoftmaxMaskFuseGPUKernel<<>>( - x_data, mask_data, y_data, batch_count, key_seq_len); - break; - case 6: // 64 - SoftmaxMaskFuseGPUKernel<<>>( - x_data, mask_data, y_data, batch_count, key_seq_len); - break; - case 7: // 128 - SoftmaxMaskFuseGPUKernel<<>>( - x_data, mask_data, y_data, batch_count, key_seq_len); - break; - case 8: // 256 - SoftmaxMaskFuseGPUKernel<<>>( - x_data, mask_data, y_data, batch_count, key_seq_len); - break; - case 9: // 512 - SoftmaxMaskFuseGPUKernel<<>>( - x_data, mask_data, y_data, batch_count, key_seq_len); - break; - case 10: // 1024 - SoftmaxMaskFuseGPUKernel<<>>( - x_data, mask_data, y_data, batch_count, key_seq_len); - break; - case 11: // 2048 - SoftmaxMaskFuseGPUKernel<<>>( - x_data, mask_data, y_data, batch_count, key_seq_len); - break; - case 12: // 4096 - SoftmaxMaskFuseGPUKernel<<>>( - x_data, mask_data, y_data, batch_count, key_seq_len); - break; - case 13: // 8192 - SoftmaxMaskFuseGPUKernel<<>>( - x_data, mask_data, y_data, batch_count, key_seq_len); - break; - default: - break; - } - } -}; - -template -class SoftmaxMaskFuseGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* grad_x = context.Output(framework::GradVarName("X")); - auto* grad_y = context.Input(framework::GradVarName("Out")); - auto* softmax_rst = context.Input("Softmax"); - - auto* grad_x_data = grad_x->mutable_data(context.GetPlace()); - auto* grad_y_data = grad_y->data(); - auto* softmax_rst_data = softmax_rst->data(); - - auto y_dim = grad_y->dims(); - auto batches = y_dim[0]; - auto attn_heads = y_dim[1]; - auto query_seq_len = y_dim[2]; - auto key_seq_len = y_dim[3]; - - auto& place = *context.template device_context().eigen_device(); - auto stream = context.cuda_device_context().stream(); - - int pow2_index = get_pow2(key_seq_len); - const int next_pow2 = 1 << pow2_index; - int batch_count = batches * attn_heads * query_seq_len; - int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; - int batches_per_warp = (next_pow2 <= 128) ? 
2 : 1; - // use 128 threads per block to maximum gpu utilization - constexpr int threads_per_block = 128; - - int warps_per_block = (threads_per_block / warp_size); - int batches_per_block = warps_per_block * batches_per_warp; - int blocks = batch_count / batches_per_block; - dim3 threads(warp_size, warps_per_block, 1); - - // launch the kernel based on the pow2_index - switch (pow2_index) { - case 5: // 32 - SoftmaxMaskFuseGradGPUKernel - <<>>(grad_y_data, - grad_x_data, - softmax_rst_data, - batch_count, - key_seq_len); - break; - case 6: // 64 - SoftmaxMaskFuseGradGPUKernel - <<>>(grad_y_data, - grad_x_data, - softmax_rst_data, - batch_count, - key_seq_len); - break; - case 7: // 128 - SoftmaxMaskFuseGradGPUKernel - <<>>(grad_y_data, - grad_x_data, - softmax_rst_data, - batch_count, - key_seq_len); - break; - case 8: // 256 - SoftmaxMaskFuseGradGPUKernel - <<>>(grad_y_data, - grad_x_data, - softmax_rst_data, - batch_count, - key_seq_len); - break; - case 9: // 512 - SoftmaxMaskFuseGradGPUKernel - <<>>(grad_y_data, - grad_x_data, - softmax_rst_data, - batch_count, - key_seq_len); - break; - case 10: // 1024 - SoftmaxMaskFuseGradGPUKernel - <<>>(grad_y_data, - grad_x_data, - softmax_rst_data, - batch_count, - key_seq_len); - break; - case 11: // 2048 - SoftmaxMaskFuseGradGPUKernel - <<>>(grad_y_data, - grad_x_data, - softmax_rst_data, - batch_count, - key_seq_len); - break; - case 12: // 4096 - SoftmaxMaskFuseGradGPUKernel - <<>>(grad_y_data, - grad_x_data, - softmax_rst_data, - batch_count, - key_seq_len); - break; - case 13: // 8192 - SoftmaxMaskFuseGradGPUKernel - <<>>(grad_y_data, - grad_x_data, - softmax_rst_data, - batch_count, - key_seq_len); - break; - default: - break; - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL( - fused_softmax_mask, - ops::SoftmaxMaskFuseKernel, - ops::SoftmaxMaskFuseKernel); -REGISTER_OP_CUDA_KERNEL( - fused_softmax_mask_grad, - ops::SoftmaxMaskFuseGradKernel, - ops::SoftmaxMaskFuseGradKernel); diff --git a/paddle/fluid/operators/fused_softmax_mask_op.h b/paddle/fluid/operators/fused_softmax_mask_op.h deleted file mode 100644 index 137dfb830dec08..00000000000000 --- a/paddle/fluid/operators/fused_softmax_mask_op.h +++ /dev/null @@ -1,33 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -template -class SoftmaxMaskFuseCPUKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_gpu_place(ctx.GetPlace()), - true, - platform::errors::Unimplemented( - "Softmax mask fuse op only supports GPU now.")); - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu index 54db576d3171b7..f92479888f8177 100644 --- a/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu +++ b/paddle/fluid/operators/fused_softmax_mask_upper_triangle_op.cu @@ -128,26 +128,27 @@ __device__ __forceinline__ void warp_reduce_upper_tri(T* sum) { template __global__ void SoftmaxMaskFuseUpperTriangleGPUKernel(const T* src, T* dst, - int batch_count, - int key_seq_len) { + int64_t batch_count, + int64_t key_seq_len) { constexpr int next_pow2 = 1 << pow2_index; constexpr int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; constexpr int kLocalIterations = std::max(next_pow2 / warp_size, 4); constexpr int kLocalBatchSize = (next_pow2 <= 128) ? 2 : 1; constexpr int kOneLoadingCounts = 4; - int key_seq_len_pow_2 = key_seq_len * key_seq_len; + int64_t key_seq_len_pow_2 = key_seq_len * key_seq_len; - int first_idx = - (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * kLocalBatchSize + + int64_t first_idx = + (static_cast(blockDim.y) * blockIdx.y + threadIdx.y) * + gridDim.x * kLocalBatchSize + blockIdx.x; - int local_block_idx = blockIdx.x + 1; - int warp_iter_upper_bound = + int64_t local_block_idx = blockIdx.x + 1; + int64_t warp_iter_upper_bound = (local_block_idx + kOneLoadingCounts * warp_size - 1) / warp_size; - int local_batches = batch_count - first_idx; + int64_t local_batches = batch_count - first_idx; if (local_batches > kLocalBatchSize) local_batches = kLocalBatchSize; - int local_idx = threadIdx.x; + int64_t local_idx = threadIdx.x; src += first_idx * key_seq_len + kOneLoadingCounts * local_idx; dst += first_idx * key_seq_len + kOneLoadingCounts * local_idx; @@ -157,11 +158,11 @@ __global__ void SoftmaxMaskFuseUpperTriangleGPUKernel(const T* src, #pragma unroll for (int i = 0; i < kLocalBatchSize; ++i) { - int batch_total_number = (i >= local_batches) ? 0 : local_block_idx; + auto batch_total_number = (i >= local_batches) ? 
0 : local_block_idx; #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int element_index = kOneLoadingCounts * local_idx + ii * warp_size; + auto element_index = kOneLoadingCounts * local_idx + ii * warp_size; if (element_index < batch_total_number) { load_data_upper_tri(temp_in, @@ -216,7 +217,7 @@ __global__ void SoftmaxMaskFuseUpperTriangleGPUKernel(const T* src, if (i >= local_batches) break; #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int element_index = kOneLoadingCounts * local_idx + ii * warp_size; + auto element_index = kOneLoadingCounts * local_idx + ii * warp_size; if (element_index < local_block_idx) { #pragma unroll @@ -242,31 +243,32 @@ template __global__ void SoftmaxMaskFuseUpperTriangleGradGPUKernel(const T* grad_input, T* grad_output, const T* softmax_rst, - int batch_count, - int key_seq_len) { + int64_t batch_count, + int64_t key_seq_len) { constexpr int next_pow2 = 1 << pow2_index; constexpr int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; constexpr int kLocalIterations = std::max(next_pow2 / warp_size, 4); constexpr int kLocalBatchSize = (next_pow2 <= 128) ? 2 : 1; constexpr int kOneLoadingCounts = 4; - int key_seq_len_pow_2 = key_seq_len * key_seq_len; + int64_t key_seq_len_pow_2 = key_seq_len * key_seq_len; - int first_idx = - (blockDim.y * blockIdx.y + threadIdx.y) * gridDim.x * kLocalBatchSize + + int64_t first_idx = + (static_cast(blockDim.y) * blockIdx.y + threadIdx.y) * + gridDim.x * kLocalBatchSize + blockIdx.x; - int local_block_idx = blockIdx.x + 1; + int64_t local_block_idx = blockIdx.x + 1; // micro_batch_size might not be a multiple of WARP_BATCH. Check how // many batches have to computed within this WARP. - int local_batches = batch_count - first_idx; + int64_t local_batches = batch_count - first_idx; if (local_batches > kLocalBatchSize) local_batches = kLocalBatchSize; // there might be multiple batches per warp. compute the index within the // batch - int local_idx = threadIdx.x; + int64_t local_idx = threadIdx.x; // the first element to process by the current thread - int offset = first_idx * key_seq_len + kOneLoadingCounts * local_idx; + int64_t offset = first_idx * key_seq_len + kOneLoadingCounts * local_idx; grad_input += offset; grad_output += offset; softmax_rst += offset; @@ -279,11 +281,11 @@ __global__ void SoftmaxMaskFuseUpperTriangleGradGPUKernel(const T* grad_input, #pragma unroll for (int i = 0; i < kLocalBatchSize; ++i) { - int batch_total_number = (i >= local_batches) ? 0 : local_block_idx; + auto batch_total_number = (i >= local_batches) ? 
0 : local_block_idx; #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int element_index = kOneLoadingCounts * local_idx + ii * warp_size; + auto element_index = kOneLoadingCounts * local_idx + ii * warp_size; if (element_index < batch_total_number) { load_data_upper_tri( temp_grad_input, @@ -328,7 +330,7 @@ __global__ void SoftmaxMaskFuseUpperTriangleGradGPUKernel(const T* grad_input, if (i >= local_batches) break; #pragma unroll for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { - int element_index = kOneLoadingCounts * local_idx + ii * warp_size; + auto element_index = kOneLoadingCounts * local_idx + ii * warp_size; if (element_index < key_seq_len) { // compute gradients T samples_out[kOneLoadingCounts]; @@ -369,10 +371,10 @@ class SoftmaxMaskFuseUpperTriangleKernel : public framework::OpKernel { key_seq_len, query_seq_len)); - PADDLE_ENFORCE_EQ(key_seq_len >= 32 && key_seq_len < 8192, + PADDLE_ENFORCE_EQ(key_seq_len >= 32 && key_seq_len <= 16384, true, platform::errors::InvalidArgument( - "Input x's last dim must be between [32, 8192) " + "Input x's last dim must be between [32, 16384] " "received the last dimension of x is %d", key_seq_len)); @@ -381,7 +383,7 @@ class SoftmaxMaskFuseUpperTriangleKernel : public framework::OpKernel { int pow2_index = get_pow2_index_value(key_seq_len); const int next_pow2 = 1 << pow2_index; - int batch_count = attn_mul_batch * query_seq_len; + int64_t batch_count = attn_mul_batch * query_seq_len; int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; int batches_per_warp = (next_pow2 <= 128) ? 2 : 1; constexpr int threads_per_block = 128; @@ -448,7 +450,13 @@ class SoftmaxMaskFuseUpperTriangleKernel : public framework::OpKernel { <<>>( x_data, y_data, batch_count, key_seq_len); break; + case 14: // 16384 + SoftmaxMaskFuseUpperTriangleGPUKernel + <<>>( + x_data, y_data, batch_count, key_seq_len); + break; default: + PADDLE_THROW(phi::errors::Unimplemented("Too large sequence length.")); break; } } @@ -478,7 +486,7 @@ class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel { int pow2_index = get_pow2_index_value(key_seq_len); const int next_pow2 = 1 << pow2_index; - int batch_count = attn_mul_batch * query_seq_len; + int64_t batch_count = attn_mul_batch * query_seq_len; int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; int batches_per_warp = (next_pow2 <= 128) ? 
2 : 1; // use 128 threads per block to maximum gpu utilization @@ -564,7 +572,16 @@ class SoftmaxMaskFuseUpperTriangleGradKernel : public framework::OpKernel { batch_count, key_seq_len); break; + case 14: + SoftmaxMaskFuseUpperTriangleGradGPUKernel + <<>>(grad_y_data, + grad_x_data, + softmax_rst_data, + batch_count, + key_seq_len); + break; default: + PADDLE_THROW(phi::errors::Unimplemented("Too large sequence length.")); break; } } diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index f993189f070da4..77e4adfeea7872 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -82,14 +82,6 @@ class GatherOpMaker : public framework::OpProtoAndCheckerMaker { "The Tensor which contains the axis that we do gather operation.") .AsDispensable(); AddOutput("Out", "The output of gather op"); - AddAttr( - "overwrite", - "(bool, default: False) " - "In backward process, calc the grad when has same index," - "If true, update the grad using the overwrite mode in same index," - "If false, using the accumulate mode in same index.") - .SetDefault(true) - .AsExtra(); AddAttr( "axis", "The Tensor which contains the axis that we do gather operation.") diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index add87fdd3c1121..e7debf896a2861 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -80,11 +80,11 @@ class GeluGradOp : public framework::OperatorWithKernel { framework::DataLayout layout = framework::DataLayout::kAnyLayout; auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); #ifdef PADDLE_WITH_MKLDNN - auto it = this->Attrs().find("use_mkldnn"); - if (library == framework::LibraryType::kPlain && - it != this->Attrs().end() && this->CanMKLDNNBeUsed(ctx, data_type)) { - library = framework::LibraryType::kMKLDNN; - layout = framework::DataLayout::kMKLDNN; + if (this->CanMKLDNNBeUsed(ctx, data_type)) { + return framework::OpKernelType(data_type, + ctx.GetPlace(), + framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); } #endif return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library); diff --git a/paddle/fluid/operators/huber_loss_op_mlu.cc b/paddle/fluid/operators/huber_loss_op_mlu.cc new file mode 100644 index 00000000000000..4387037ad01afb --- /dev/null +++ b/paddle/fluid/operators/huber_loss_op_mlu.cc @@ -0,0 +1,187 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/mlu/mlu_baseop.h" + +namespace paddle { +namespace operators { + +using Tensor = phi::DenseTensor; + +template +class HuberLossMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = GetDevCtxFromCTX(ctx); + auto* x = ctx.Input("X"); + auto* y = ctx.Input("Y"); + auto* residual = ctx.Output("Residual"); + auto* out = ctx.Output("Out"); + auto delta = ctx.Attr("delta"); + + auto place = ctx.GetPlace(); + + // compute y-x + cnnlDataType_t data_type = ToCnnlDataType(); + residual->mutable_data(x->dims(), place); + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlOpTensorDesc sub_op_desc( + CNNL_OP_TENSOR_SUB, data_type, CNNL_NOT_PROPAGATE_NAN); + MLUCnnl::OpTensor(ctx, + sub_op_desc.get(), + x_desc.get(), + GetBasePtr(y), + x_desc.get(), + GetBasePtr(x), + x_desc.get(), + GetBasePtr(residual), + data_type); + + // compute smoothl1loss + out->mutable_data(x->dims(), place); + cnnlSmoothL1LossAlgorithm_t smoothl1_algo = + CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction + // here + MLUCnnl::SmoothL1LossForward(ctx, + x_desc.get(), + GetBasePtr(x), + x_desc.get(), /* target has same shape as x */ + GetBasePtr(y), + static_cast(delta), + smoothl1_algo, + x_desc.get(), /* out has same shape as x */ + GetBasePtr(out)); + + // compute multiply by delta + Tensor scale_tensor, bias_tensor; + scale_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); + bias_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); + FillMLUTensorWithHostValue(ctx, static_cast(delta), &scale_tensor); + FillMLUTensorWithHostValue(ctx, static_cast(0.f), &bias_tensor); + const int axis = std::max(out->dims().size() - 1, 0); + + MLUCnnlTensorDesc scale_desc(scale_tensor); + MLUCnnlTensorDesc bias_desc(bias_tensor); + MLUCnnlTensorDesc out_desc(*out); + MLUCnnl::Scale(ctx, + axis, + out_desc.get(), + GetBasePtr(out), + scale_desc.get(), + GetBasePtr(&scale_tensor), + bias_desc.get(), + GetBasePtr(&bias_tensor), + out_desc.get(), + GetBasePtr(out)); + } +}; + +template +class HuberLossGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto& dev_ctx = GetDevCtxFromCTX(ctx); + auto* residual = ctx.Input("Residual"); + auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + auto* dy = ctx.Output(framework::GradVarName("Y")); + auto delta = ctx.Attr("delta"); + + auto place = ctx.GetPlace(); + + Tensor t_grad_rd; + t_grad_rd = + ctx.AllocateTmpTensor(residual->dims(), dev_ctx); + MLUCnnlTensorDesc t_grad_rd_desc(t_grad_rd); + if (dx || dy) { + Tensor t_zero; + t_zero = + ctx.AllocateTmpTensor(residual->dims(), dev_ctx); + FillMLUTensorWithHostValue(ctx, static_cast(0.f), &t_zero); + + MLUCnnlTensorDesc residual_desc(*residual); + MLUCnnlTensorDesc dout_desc(*dout); + + cnnlSmoothL1LossAlgorithm_t smoothl1_algo = + CNNL_SMOOTHL1LOSS_REDUCTION_NONE; // defines whether to do reduction + // here + MLUCnnl::SmoothL1LossBackward(ctx, + residual_desc.get(), + GetBasePtr(residual), + residual_desc.get(), + GetBasePtr(&t_zero), + dout_desc.get(), + GetBasePtr(dout), + static_cast(delta), + smoothl1_algo, + t_grad_rd_desc.get(), + GetBasePtr(&t_grad_rd)); + } + // compute multiply by delta + Tensor scale_tensor, bias_tensor; + scale_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); + bias_tensor = ctx.AllocateTmpTensor({1}, dev_ctx); + + 
FillMLUTensorWithHostValue(ctx, static_cast(0.f), &bias_tensor); + const int axis = std::max(t_grad_rd.dims().size() - 1, 0); + + MLUCnnlTensorDesc scale_desc(scale_tensor); + MLUCnnlTensorDesc bias_desc(bias_tensor); + + if (dx) { + dx->mutable_data(place); + FillMLUTensorWithHostValue(ctx, static_cast(-delta), &scale_tensor); + MLUCnnlTensorDesc out_desc(*dx); + MLUCnnl::Scale(ctx, + axis, + t_grad_rd_desc.get(), + GetBasePtr(&t_grad_rd), + scale_desc.get(), + GetBasePtr(&scale_tensor), + bias_desc.get(), + GetBasePtr(&bias_tensor), + out_desc.get(), + GetBasePtr(dx)); + } + if (dy) { + dy->mutable_data(place); + FillMLUTensorWithHostValue(ctx, static_cast(delta), &scale_tensor); + MLUCnnlTensorDesc out_desc(*dy); + MLUCnnl::Scale(ctx, + axis, + t_grad_rd_desc.get(), + GetBasePtr(&t_grad_rd), + scale_desc.get(), + GetBasePtr(&scale_tensor), + bias_desc.get(), + GetBasePtr(&bias_tensor), + out_desc.get(), + GetBasePtr(dy)); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_MLU_KERNEL(huber_loss, + ops::HuberLossMLUKernel, + ops::HuberLossMLUKernel); +REGISTER_OP_MLU_KERNEL(huber_loss_grad, + ops::HuberLossGradMLUKernel, + ops::HuberLossGradMLUKernel); diff --git a/paddle/fluid/operators/kron_op.cc b/paddle/fluid/operators/kron_op.cc index cede00d5b01ec1..5f7606265e4e79 100644 --- a/paddle/fluid/operators/kron_op.cc +++ b/paddle/fluid/operators/kron_op.cc @@ -39,7 +39,7 @@ class KronOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const { + const framework::OpKernelType& expected_kernel_type) const override { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input return framework::OpKernelType( @@ -121,7 +121,7 @@ class KronGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const { + const framework::OpKernelType& expected_kernel_type) const override { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input return framework::OpKernelType( diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index 0c41429c61e888..899eae3efb45bc 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -24,6 +24,7 @@ namespace cub = hipcub; #include +#include "paddle/fluid/operators/fused/quant_dequant_kernel.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/phi/core/ddim.h" @@ -338,16 +339,24 @@ using LayerNormScaleBiasT = template + bool ScaleBiasWithSameTypeX = false, + typename InType = T, + typename OutType = T> __global__ void LayerNormForward( - const T *x, + const InType *x, const LayerNormScaleBiasT *scale, const LayerNormScaleBiasT *bias, - T *y, + OutType *y, U *mean, U *var, float epsilon, - int64_t feature_size) { + int64_t feature_size, + const float *dequant_out_scale_data = nullptr, + const int quant_out_scale_offset = 0, + const float quant_in_scale = 1.0, + const int quant_round_type = 1, + const float quant_max_bound = 127.0, + const float 
quant_min_bound = -127.0) { __shared__ U mean_share; __shared__ U var_share; __shared__ U shared_mean[32]; // threadIdx.x / warpSize <= kMaxBlockDim / @@ -387,28 +396,72 @@ __global__ void LayerNormForward( if (bias != nullptr) { for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx; i += BlockDim, j += BlockDim) { - y[i] = static_cast(static_cast(scale[j]) * - (static_cast(x[i]) - mean_val) * invvar + - static_cast(bias[j])); + if (std::is_same::value) { + y[i] = quant_helper( + static_cast(static_cast(scale[j]) * + (static_cast(x[i]) - mean_val) * invvar + + static_cast(bias[j])), + quant_in_scale, + quant_round_type, + quant_max_bound, + quant_min_bound); + } else { + y[i] = static_cast(static_cast(scale[j]) * + (static_cast(x[i]) - mean_val) * + invvar + + static_cast(bias[j])); + } } } else { for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx; i += BlockDim, j += BlockDim) { - y[i] = static_cast(static_cast(scale[j]) * - (static_cast(x[i]) - mean_val) * invvar); + if (std::is_same::value) { + y[i] = quant_helper( + static_cast(static_cast(scale[j]) * + (static_cast(x[i]) - mean_val) * invvar), + quant_in_scale, + quant_round_type, + quant_max_bound, + quant_min_bound); + } else { + y[i] = + static_cast(static_cast(scale[j]) * + (static_cast(x[i]) - mean_val) * invvar); + } } } } else { // scale == nullptr if (bias != nullptr) { for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx; i += BlockDim, j += BlockDim) { - y[i] = static_cast((static_cast(x[i]) - mean_val) * invvar + - static_cast(bias[j])); + if (std::is_same::value) { + y[i] = quant_helper( + static_cast((static_cast(x[i]) - mean_val) * invvar + + static_cast(bias[j])), + quant_in_scale, + quant_round_type, + quant_max_bound, + quant_min_bound); + } else { + y[i] = + static_cast((static_cast(x[i]) - mean_val) * invvar + + static_cast(bias[j])); + } } } else { for (int64_t i = beg_idx, j = threadIdx.x; i < end_idx; i += BlockDim, j += BlockDim) { - y[i] = static_cast((static_cast(x[i]) - mean_val) * invvar); + if (std::is_same::value) { + y[i] = quant_helper( + static_cast((static_cast(x[i]) - mean_val) * invvar), + quant_in_scale, + quant_round_type, + quant_max_bound, + quant_min_bound); + } else { + y[i] = + static_cast((static_cast(x[i]) - mean_val) * invvar); + } } } } diff --git a/paddle/fluid/operators/lookup_table_v2_op.cc b/paddle/fluid/operators/lookup_table_v2_op.cc index 7baf76a1e10803..5f023fbad6a027 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cc +++ b/paddle/fluid/operators/lookup_table_v2_op.cc @@ -84,46 +84,12 @@ class LookupTableV2OpMaker : public framework::OpProtoAndCheckerMaker { "An input with type int64 " "contains the ids to be looked up in W."); AddOutput("Out", "The lookup results, which have the same type as W."); - AddAttr("is_sparse", - "(boolean, default false) " - "Sparse update.") - .SetDefault(false) - .AsExtra(); - AddAttr("is_distributed", - "(boolean, default false) distributed lookup table.") - .SetDefault(false) - .AsExtra(); AddAttr("padding_idx", "(int64, default -1) " "If the value is -1, it makes no effect to lookup. 
" "Otherwise the given value indicates padding the output " "with zeros whenever lookup encounters it in Ids.") .SetDefault(kNoPadding); - - // for parameter prefetch - AddAttr("remote_prefetch", "").SetDefault(false).AsExtra(); - AddAttr("trainer_id", "trainer id from 0 ~ worker_num.") - .SetDefault(0) - .AsExtra(); - AddAttr("slot", "slot of id").SetDefault(0).AsExtra(); - AddAttr>("height_sections", - "Height for each output SelectedRows.") - .SetDefault(std::vector({})) - .AsExtra(); - AddAttr>( - "epmap", - "(string vector, default 127.0.0.1:6164)" - "Server endpoints in the order of input variables for mapping") - .SetDefault({}) - .AsExtra(); - AddAttr>( - "table_names", - "(string vector, the split table names that will be fetched from " - "parameter server)" - "in the order of input variables for mapping") - .SetDefault({}) - .AsExtra(); - AddComment(R"DOC( Lookup Table V2 Operator. diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index a31c218307b9c1..a49ceb42559c5c 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -714,7 +714,7 @@ class MatMulOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const framework::Tensor &tensor, - const framework::OpKernelType &expected_kernel_type) const { + const framework::OpKernelType &expected_kernel_type) const override { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input return framework::OpKernelType( diff --git a/paddle/fluid/operators/matmul_v2_op.cc b/paddle/fluid/operators/matmul_v2_op.cc index 209bf6d1f6ccd3..b1483c3fd6e557 100644 --- a/paddle/fluid/operators/matmul_v2_op.cc +++ b/paddle/fluid/operators/matmul_v2_op.cc @@ -62,12 +62,12 @@ class MatMulV2Op : public framework::OperatorWithKernel { 0, platform::errors::InvalidArgument( "The Input(X) dims size must be greater than 0," - " but reviced dims size is 0. ")); + " but received dims size is 0. ")); PADDLE_ENFORCE_GT(ndims_y, 0, platform::errors::InvalidArgument( "The Input(Y) dims size must be greater than 0," - " but reviced dims size is 0. ")); + " but received dims size is 0. 
")); bool x_broadcasted = false, y_broadcasted = false; if (ndims_x == 1) { @@ -150,7 +150,7 @@ class MatMulV2Op : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const { + const framework::OpKernelType& expected_kernel_type) const override { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input return framework::OpKernelType( @@ -160,9 +160,7 @@ class MatMulV2Op : public framework::OperatorWithKernel { } else { #ifdef PADDLE_WITH_MKLDNN // When matmul_v2 is first oneDNN op in a chain (there was some non oneDNN - // op - // previously) - // then we also need to rotate shape NHWC -> NCWH + // op previously) then we also need to rotate shape NHWC -> NCWH if ((expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN) && (tensor.layout() != framework::DataLayout::kMKLDNN) && @@ -227,7 +225,7 @@ class MatMulV2OpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const framework::Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const { + const framework::OpKernelType& expected_kernel_type) const override { if (framework::IsComplexType(expected_kernel_type.data_type_)) { // only promote inputs’s types when contains complex input return framework::OpKernelType( diff --git a/paddle/fluid/operators/mean_iou_op.h b/paddle/fluid/operators/mean_iou_op.h index b5ac6fd677bacc..0ec92251a8e37e 100644 --- a/paddle/fluid/operators/mean_iou_op.h +++ b/paddle/fluid/operators/mean_iou_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/minus_op.cc b/paddle/fluid/operators/minus_op.cc index 310d28738fc756..398a254f45cbcc 100644 --- a/paddle/fluid/operators/minus_op.cc +++ b/paddle/fluid/operators/minus_op.cc @@ -130,6 +130,7 @@ class MinusGradMaker : public imperative::GradOpBaseMakerBase { op.SetInput("X", this->OutputGrad("Out")); op.SetOutput("Out", x_g); op.SetAttr("scale", 1.0f); + op.SetDefaultAttrsMap(DefaultAttrsMap()); } if (!y_g.empty()) { @@ -138,6 +139,7 @@ class MinusGradMaker : public imperative::GradOpBaseMakerBase { op.SetInput("X", this->OutputGrad("Out")); op.SetOutput("Out", y_g); op.SetAttr("scale", -1.0f); + op.SetDefaultAttrsMap(DefaultAttrsMap()); } return node; diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index 728d86cd94e33d..4909fdc32ba6fc 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -42,139 +42,6 @@ class MKLDNNActivationKernel } }; -template -class MKLDNNActivationGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - Functor functor; - functor(ctx); - } -}; - -template -void eltwise_forward(const framework::ExecutionContext &ctx, - dnnl::algorithm algorithm) { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), - true, - paddle::platform::errors::PreconditionNotMet( - "Operator DNNL eletwise_forward must use CPUPlace")); - auto &dev_ctx = ctx.template device_context(); - const auto &mkldnn_engine = dev_ctx.GetEngine(); - - const auto *x = ctx.Input("X"); - 
auto *out = ctx.Output("Out"); - - bool is_inplaced = x->IsSharedBufferWith(*out); - - platform::ActivationMKLDNNHandler handler( - algorithm, ctx, mkldnn_engine, ctx.GetPlace(), x); - - auto src_memory_p = handler.AcquireSrcMemory(x); - std::shared_ptr dst_memory_p = nullptr; - if (is_inplaced) { - dst_memory_p = src_memory_p; - out->mutable_data(ctx.GetPlace()); - } else { - dst_memory_p = handler.AcquireDstMemory(out); - } - auto activation_p = handler.AcquireForwardPrimitive(); - - auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); - activation_p->execute( - astream, {{DNNL_ARG_FROM, *src_memory_p}, {DNNL_ARG_TO, *dst_memory_p}}); - astream.wait(); - - out->set_mem_desc(dst_memory_p->get_desc()); -} - -template -void eltwise_grad(const framework::ExecutionContext &ctx, - dnnl::algorithm algorithm) { - auto &dev_ctx = ctx.template device_context(); - const auto &mkldnn_engine = dev_ctx.GetEngine(); - - const auto *x = ctx.Input("X"); - const auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - - platform::ActivationMKLDNNHandler handler( - algorithm, ctx, mkldnn_engine, ctx.GetPlace(), x, dout); - - auto src_memory_p = handler.AcquireBackwardSrcMemory(x); - auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout); - auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); - auto activation_backward_p = handler.AcquireBackwardPrimitive(); - - auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); - activation_backward_p->execute(astream, - {{DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, - {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); - astream.wait(); - - dx->set_mem_desc(diff_src_memory_p->get_desc()); -} - -template -void eltwise_grad_use_out(const framework::ExecutionContext &ctx, - dnnl::algorithm algorithm) { - auto &dev_ctx = ctx.template device_context(); - const auto &mkldnn_engine = dev_ctx.GetEngine(); - - const auto *out = ctx.Input("Out"); - const auto *dout = ctx.Input(framework::GradVarName("Out")); - auto *dx = ctx.Output(framework::GradVarName("X")); - - platform::ActivationMKLDNNHandler handler( - algorithm, ctx, mkldnn_engine, ctx.GetPlace(), out, dout); - - auto dst_memory_p = handler.AcquireBackwardSrcMemory(out); - auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout); - auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); - auto activation_backward_p = handler.AcquireBackwardPrimitive(); - - auto &astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); - activation_backward_p->execute(astream, - {{DNNL_ARG_DST, *dst_memory_p}, - {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, - {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); - astream.wait(); - - dx->set_mem_desc(diff_src_memory_p->get_desc()); -} - -template -struct MKLDNNActivationGradFunc : public BaseActivationFunctor { - void operator()(const framework::ExecutionContext &ctx) const { - eltwise_grad(ctx, algorithm); - } -}; - -template -struct GeluMKLDNNFunctor : public BaseActivationFunctor { - void operator()(const framework::ExecutionContext &ctx) const { - const bool approximate = ctx.Attr("approximate"); - if (approximate) { - eltwise_forward(ctx, dnnl::algorithm::eltwise_gelu_tanh); - } else { - eltwise_forward(ctx, dnnl::algorithm::eltwise_gelu_erf); - } - } -}; - -template -struct GeluMKLDNNGradFunctor : public BaseActivationFunctor { - void operator()(const framework::ExecutionContext &ctx) const { - const bool approximate = ctx.Attr("approximate"); - if 
(approximate) { - eltwise_grad(ctx, dnnl::algorithm::eltwise_gelu_tanh); - } else { - eltwise_grad(ctx, dnnl::algorithm::eltwise_gelu_erf); - } - } -}; - template struct SoftplusMKLDNNFunctor : public BaseActivationFunctor { void operator()(const framework::ExecutionContext &ctx) const { @@ -182,10 +49,6 @@ struct SoftplusMKLDNNFunctor : public BaseActivationFunctor { } }; -template -using Relu6MKLDNNGradFunctor = - MKLDNNActivationGradFunc; - } // namespace operators } // namespace paddle @@ -199,16 +62,4 @@ namespace ops = paddle::operators; ops::MKLDNNActivationKernel>, \ ops::MKLDNNActivationKernel>); -#define REGISTER_GRAD_ACTIVATION_MKLDNN_KERNEL(act_type, grad_functor) \ - REGISTER_OP_KERNEL( \ - act_type##_grad, \ - MKLDNN, \ - ::paddle::platform::CPUPlace, \ - ops::MKLDNNActivationGradKernel>, \ - ops::MKLDNNActivationGradKernel< \ - ops::grad_functor>); - REGISTER_FWD_ACTIVATION_MKLDNN_KERNEL(softplus, SoftplusMKLDNNFunctor); -REGISTER_FWD_ACTIVATION_MKLDNN_KERNEL(gelu, GeluMKLDNNFunctor); -REGISTER_GRAD_ACTIVATION_MKLDNN_KERNEL(gelu, GeluMKLDNNGradFunctor); -REGISTER_GRAD_ACTIVATION_MKLDNN_KERNEL(relu6, Relu6MKLDNNGradFunctor); diff --git a/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc deleted file mode 100644 index 7335693053fa02..00000000000000 --- a/paddle/fluid/operators/mkldnn/cast_mkldnn_op.cc +++ /dev/null @@ -1,75 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace paddle { -namespace operators { - -using paddle::framework::Tensor; - -template -class CastMKLDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - this->RunKernel(ctx); - } - - void RunKernel(const framework::ExecutionContext& ctx) const { - const auto& dev_ctx = - ctx.template device_context(); - - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - int in_dtype = ctx.Attr("in_dtype"); - int out_dtype = ctx.Attr("out_dtype"); - - auto x_paddle_type = framework::proto::VarType::Type(in_dtype); - auto out_paddle_type = framework::proto::VarType::Type(out_dtype); - - dnnl::memory::data_type x_type = framework::ToMKLDNNDataType(x_paddle_type); - dnnl::memory::data_type out_type = - framework::ToMKLDNNDataType(out_paddle_type); - - auto x_tz = phi::vectorize(x->dims()); - - platform::ReorderMKLDNNHandler reorder_handler(x_tz, - x_paddle_type, - x_type, - out_paddle_type, - out_type, - dev_ctx.GetEngine()); - - auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - x->mem_desc(), platform::to_void_cast(x->data())); - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - out, x->mem_desc(), dev_ctx.GetPlace()); - auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, - reorder_src_memory_p); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - out->set_layout(framework::DataLayout::kMKLDNN); - out->set_mem_desc(reorder_dst_memory_p->get_desc()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_KERNEL(cast, - MKLDNN, - paddle::platform::CPUPlace, - ops::CastMKLDNNKernel, - ops::CastMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/clip_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/clip_mkldnn_op.cc deleted file mode 100644 index f1a7ade2b4809c..00000000000000 --- a/paddle/fluid/operators/mkldnn/clip_mkldnn_op.cc +++ /dev/null @@ -1,109 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace { - -using paddle::framework::Tensor; - -template -class ClipMKLDNNKernel : public paddle::framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - this->RunKernel(ctx); - } - - void RunKernel(const paddle::framework::ExecutionContext& ctx) const { - const auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - paddle::platform::ActivationMKLDNNHandler handler( - dnnl::algorithm::eltwise_clip_v2, - ctx, - mkldnn_engine, - ctx.GetPlace(), - x); - - auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = handler.AcquireDstMemory(out); - auto activation_p = handler.AcquireForwardPrimitive(); - - auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); - activation_p->execute( - astream, - {{DNNL_ARG_FROM, *src_memory_p}, {DNNL_ARG_TO, *dst_memory_p}}); - astream.wait(); - - out->set_mem_desc(dst_memory_p->get_desc()); - } -}; - -template -class ClipGradMKLDNNKernel : public paddle::framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - this->RunKernel(ctx); - } - - void RunKernel(const paddle::framework::ExecutionContext& ctx) const { - const auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("X"); - auto* dx = ctx.Output(paddle::framework::GradVarName("X")); - auto* dout = ctx.Input(paddle::framework::GradVarName("Out")); - - paddle::platform::ActivationMKLDNNHandler handler( - dnnl::algorithm::eltwise_clip_v2, - ctx, - mkldnn_engine, - ctx.GetPlace(), - x, - dout); - - auto src_memory_p = handler.AcquireBackwardSrcMemory(x); - auto diff_dst_memory_p = handler.AcquireDiffDstMemory(dout); - auto diff_src_memory_p = handler.AcquireDiffSrcMemory(dx); - auto activation_backward_p = handler.AcquireBackwardPrimitive(); - - auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); - activation_backward_p->execute(astream, - {{DNNL_ARG_SRC, *src_memory_p}, - {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, - {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); - astream.wait(); - - dx->set_mem_desc(diff_dst_memory_p->get_desc()); - } -}; - -} // anonymous namespace - -REGISTER_OP_KERNEL(clip, - MKLDNN, - paddle::platform::CPUPlace, - ClipMKLDNNKernel, - ClipMKLDNNKernel); - -REGISTER_OP_KERNEL(clip_grad, - MKLDNN, - paddle::platform::CPUPlace, - ClipGradMKLDNNKernel, - ClipGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc deleted file mode 100644 index b16576505dfd3f..00000000000000 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ /dev/null @@ -1,264 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "paddle/fluid/operators/concat_op.h" -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/mkldnn_helper.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace paddle { -namespace operators { - -using dnnl::concat; -using dnnl::memory; -using dnnl::primitive; -using dnnl::stream; -using framework::DataLayout; -using framework::LoDTensor; -using framework::Tensor; -using platform::to_void_cast; - -template -class ConcatMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { - public: - ConcatMKLDNNHandler(const framework::ExecutionContext& ctx, - const dnnl::engine mkldnn_engine, - const std::vector& inputs, - Tensor* output) - : platform::MKLDNNHandlerNoCachingT(mkldnn_engine, - ctx.GetPlace()) { - int concat_axis = ctx.Attr("axis"); - const int rank = inputs[0]->dims().size(); - PADDLE_ENFORCE_EQ( - concat_axis >= -rank && concat_axis < rank, - true, - platform::errors::InvalidArgument( - "The axis is expected to be in range of [%d, %d), but got %d", - -rank, - rank, - concat_axis)); - - if (ctx.HasInput("AxisTensor")) { - auto* axis_tensor = ctx.Input("AxisTensor"); - concat_axis = GetDataFromTensor(axis_tensor)[0]; - auto out_dims = inputs[0]->dims(); - for (size_t i = 1; i < inputs.size(); ++i) { - out_dims[concat_axis] += inputs[i]->dims()[concat_axis]; - } - output->Resize(out_dims); - } - - if (concat_axis < 0) { - concat_axis = concat_axis + rank; - } - - memory::data_type dt = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(inputs[0]->dtype())); - std::vector srcs_md; - srcs_md.reserve(inputs.size()); - - // Create memory descriptors for each of inputs - for (size_t i = 0; i < inputs.size(); ++i) { - srcs_md.push_back(inputs[i]->mem_desc()); - } - - auto dst_dims = phi::vectorize(output->dims()); - - dnnl::memory::desc dst_md; - - // if concat is being used as a stack op(all source memories dims on - // concat_axis are equal to 1), then it may choose a non-optimal memory - // format tag for destination, because concat primitive is chosing it based - // on source memory descriptors and f.e.200x1x10 can be described as both - // abc and bac and both would be using exact same physical layout, but in - // that scenario bac will be chosen for destination no matter which - // formats are being set in inputs. 
In that scenario we are enforcing using - // a dense format, because it is the most common one and should be the best - // in terms of the performance - const auto src0_tz = srcs_md[0].dims(); - if (std::find(src0_tz.begin(), src0_tz.end(), 1) != src0_tz.end()) { - dst_md = memory::desc( - dst_dims, dt, platform::GetPlainMKLDNNFormat(dst_dims.size())); - } else { - dst_md = memory::desc(dst_dims, dt, MKLDNNMemoryFormat::any); - } - - this->AcquireForwardPrimitiveDescriptor(dst_md, concat_axis, srcs_md); - } - - // (jczaja) concat oneDNN prim is not having .desc attribute so - // we cannot use base AcquireForwardPrimitiveDescriptor - void AcquireForwardPrimitiveDescriptor( - const memory::desc& dst_md, - const int concat_axis, - const std::vector& srcs_md) { - this->fwd_pd_.reset(new dnnl::concat::primitive_desc( - dst_md, concat_axis, srcs_md, this->engine_)); - } - - std::shared_ptr AcquireSrcMemory(const Tensor& input, int i) { - const T* input_data = input.data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src_desc(i), - to_void_cast(input_data)); - } -}; - -static void EnforceLayouts(const std::vector inputs) { - for (auto* input : inputs) { - PADDLE_ENFORCE_EQ( - input->layout(), - DataLayout::kMKLDNN, - platform::errors::InvalidArgument("Wrong layout set for Input tensor")); - } -} - -// From a multi-input, gather only nonempty inputs -static const std::vector ReduceMultiInput( - const std::vector& inputs) { - std::vector reduced(inputs.size()); - auto end_it = std::copy_if( - inputs.begin(), inputs.end(), reduced.begin(), [](const Tensor* t) { - return t->numel() > 0; - }); - reduced.resize(std::distance(reduced.begin(), end_it)); - return reduced; -} - -template -class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - // If any of the multiple inputs of concat has an input size of 0, the - // actual size of the multi_input will change - auto multi_input = ReduceMultiInput(ctx.MultiInput("X")); - EnforceLayouts(multi_input); - Tensor* output = ctx.Output("Out"); - - ConcatMKLDNNHandler handler(ctx, mkldnn_engine, multi_input, output); - - std::vector> srcs; - srcs.reserve(multi_input.size()); - - auto dst_mem = handler.AcquireDstMemory(output); - auto concat_p = handler.AcquireForwardPrimitive(); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - std::unordered_map args; - for (size_t i = 0; i < multi_input.size(); ++i) { - srcs.push_back(handler.AcquireSrcMemory(*(multi_input[i]), i)); - args.insert({DNNL_ARG_MULTIPLE_SRC + i, *(srcs.at(i))}); - } - args.insert({DNNL_ARG_DST, *dst_mem}); - - concat_p->execute(astream, args); - astream.wait(); - - output->set_mem_desc(dst_mem->get_desc()); - } -}; - -template -class ConcatGradMKLDNNOpKernel : public paddle::framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - const auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - auto out_var_names = ctx.OutputNames(framework::GradVarName("X")); - - const auto x = ctx.MultiInput("X"); - const auto* dout = ctx.Input(framework::GradVarName("Out")); - auto dx = ctx.MultiOutput(framework::GradVarName("X")); - - for (size_t i = 0; i < dx.size(); ++i) { - if (dx[i] != nullptr) { - 
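- // Each non-null dX slice inherits the LoD of the corresponding forward
- // input; its values are produced further below by reordering a sub-memory
- // of dOut taken at a running offset along the (normalized) concat axis.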
dx[i]->set_lod(x[i]->lod()); - } - } - - int axis = ctx.Attr("axis"); - if (ctx.HasInput("AxisTensor")) { - auto* axis_tensor = ctx.Input("AxisTensor"); - axis = GetDataFromTensor(axis_tensor)[0]; - } - - auto dout_vec_dims = phi::vectorize(dout->dims()); - - axis = ComputeAxis(axis, dout_vec_dims.size()); - - std::vector offset(dout_vec_dims.size(), 0); - - dnnl::memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - platform::ReorderMKLDNNHandler reorder_handler( - dout_vec_dims, - framework::TransToProtoVarType(dout->dtype()), - dout_type, - onednn_engine); - auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - dout->mem_desc(), platform::to_void_cast(dout->data())); - - for (size_t i = 0; i < dx.size(); ++i) { - if (out_var_names[i] != framework::kEmptyVarName && - dx[i]->numel() != 0UL) { - auto dx_vec_dims = phi::vectorize(dx[i]->dims()); - auto slice_mem_p = reorder_handler.AcquireSubmemory( - dx_vec_dims, offset, reorder_src_memory_p); - - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - dx[i], - dx_vec_dims, - platform::GetPlainMKLDNNFormat(dx_vec_dims.size()), - ctx.GetPlace()); - auto reorder_p = - reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); - - reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); - - offset[axis] += dx[i]->dims()[axis]; - - dx[i]->set_mem_desc(reorder_dst_memory_p->get_desc()); - } - } - astream.wait(); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_KERNEL(concat, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::ConcatMKLDNNOpKernel, - ops::ConcatMKLDNNOpKernel, - ops::ConcatMKLDNNOpKernel, - ops::ConcatMKLDNNOpKernel); - -REGISTER_OP_KERNEL(concat_grad, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::ConcatGradMKLDNNOpKernel, - ops::ConcatGradMKLDNNOpKernel); diff --git a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc deleted file mode 100644 index d477fa0b2bf2c4..00000000000000 --- a/paddle/fluid/operators/mkldnn/expand_v2_mkldnn_op.cc +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/operators/expand_v2_op.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace { - -using paddle::framework::ExecutionContext; -using paddle::framework::GradVarName; -using paddle::framework::Tensor; -using paddle::platform::MKLDNNDeviceContext; -using phi::vectorize; - -template -class ExpandMKLDNNKernel : public paddle::framework::OpKernel { - public: - void Compute(const ExecutionContext& ctx) const override { - this->RunKernel(ctx); - } - - void RunKernel(const ExecutionContext& ctx) const { - const auto& dev_ctx = ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - const auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - - auto x_vec_dims = vectorize(x->dims()); - - auto out_new_dims = paddle::operators::get_expand_shape(ctx); - for (size_t i = 0; i < out_new_dims.size(); ++i) { - out_new_dims[i] = out_new_dims[i] > 0 ? out_new_dims[i] : x_vec_dims[i]; - } - - if (x_vec_dims.size() != out_new_dims.size()) { - x_vec_dims = GetExtendedXDims(x_vec_dims, out_new_dims.size()); - } - - out->Resize(phi::make_ddim(out_new_dims)); - paddle::platform::BroadcastDataMKLDNNHandler handler( - dnnl::algorithm::binary_add, - onednn_engine, - ctx.GetPlace(), - x, - out, - 0.0f, - 1.0f, - x_vec_dims); - - auto src_memory_p = handler.AcquireSrcMemory(x); - auto dst_memory_p = handler.AcquireZeroedDstMemory(out); - auto binary_p = handler.AcquireForwardPrimitive(); - - const std::unordered_map args = { - {DNNL_ARG_SRC_0, *dst_memory_p}, - {DNNL_ARG_SRC_1, *src_memory_p}, - {DNNL_ARG_DST, *dst_memory_p}}; - - auto& astream = MKLDNNDeviceContext::tls().get_stream(); - binary_p->execute(astream, args); - astream.wait(); - - out->set_mem_desc(dst_memory_p->get_desc()); - } - - private: - std::vector GetExtendedXDims(const std::vector& x_vec_dims, - int new_size) const { - std::vector extended_x_dims(new_size, 1); - std::copy(x_vec_dims.begin(), - x_vec_dims.end(), - extended_x_dims.begin() + new_size - x_vec_dims.size()); - - return extended_x_dims; - } -}; - -template -class ExpandGradMKLDNNKernel : public paddle::framework::OpKernel { - public: - void Compute(const ExecutionContext& ctx) const override { - this->RunKernel(ctx); - } - - void RunKernel(const ExecutionContext& ctx) const { - const auto& dev_ctx = ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(GradVarName("Out")); - auto* dx = ctx.Output(GradVarName("X")); - - auto dx_vec_dims = vectorize(dx->dims()); - auto dout_vec_dims = vectorize(dout->dims()); - - if (dx_vec_dims.size() != dout_vec_dims.size()) { - dx_vec_dims.insert( - dx_vec_dims.begin(), dout_vec_dims.size() - dx_vec_dims.size(), 1); - } - - auto& astream = MKLDNNDeviceContext::tls().get_stream(); - if (dout_vec_dims == dx_vec_dims) { - dnnl::memory::data_type dout_type = paddle::framework::ToMKLDNNDataType( - paddle::framework::TransToProtoVarType(dout->dtype())); - paddle::platform::ReorderMKLDNNHandler reorder_handler( - dout_vec_dims, - paddle::framework::TransToProtoVarType(dout->dtype()), - dout_type, - onednn_engine); - - auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - dout->mem_desc(), paddle::platform::to_void_cast(dout->data())); - - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - dx, - paddle::platform::GetPlainMKLDNNFormat(dx_vec_dims.size()), - ctx.GetPlace()); - - auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, - 
reorder_dst_memory_p); - - reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); - astream.wait(); - - dx->set_mem_desc(reorder_dst_memory_p->get_desc()); - } else { - paddle::platform::ReductionMKLDNNHandler handler( - dnnl::algorithm::reduction_sum, - 0.0f, - 0.0f, - onednn_engine, - ctx.GetPlace(), - dout, - dx, - dx_vec_dims); - - auto src_memory_p = handler.AcquireSrcMemory(dout); - auto dst_memory_p = handler.AcquireDstMemory(dx); - - std::unordered_map reduction_args = { - {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; - - auto reduction_p = handler.AcquireForwardPrimitive(); - - reduction_p->execute(astream, reduction_args); - astream.wait(); - dx->set_layout(paddle::framework::DataLayout::kMKLDNN); - dx->set_mem_desc( - dst_memory_p->get_desc().reshape(vectorize(dx->dims()))); - } - } -}; -} // anonymous namespace - -REGISTER_OP_KERNEL(expand_v2, - MKLDNN, - paddle::platform::CPUPlace, - ExpandMKLDNNKernel, - ExpandMKLDNNKernel); - -REGISTER_OP_KERNEL(expand_v2_grad, - MKLDNN, - paddle::platform::CPUPlace, - ExpandGradMKLDNNKernel, - ExpandGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index 7404972ea7cca0..1cd3e883b42bca 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -16,10 +16,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fc_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" - -namespace phi { -class DenseTensor; -} // namespace phi +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace operators { @@ -29,393 +26,131 @@ using dnnl::memory; using dnnl::primitive; using dnnl::prop_kind; using dnnl::stream; -using framework::DataLayout; using framework::DDim; using framework::ExecutionContext; -using framework::LoDTensor; -using framework::Tensor; +using LoDTensor = phi::DenseTensor; using platform::GetMKLDNNFormat; using platform::MKLDNNDeviceContext; +using platform::MKLDNNGetDataType; using platform::to_void_cast; +template +constexpr bool IsInt8() { + return std::is_same::value || std::is_same::value; +} + +struct InnerProductCache { + dnnl::inner_product_forward inner_product_p; + dnnl::memory src_mem; + dnnl::memory weights_mem; + dnnl::memory bias_mem; + dnnl::memory dst_mem; +}; template -class FCPrimitiveFactory { +class FCMKLDNNHandler + : public platform::MKLDNNHandlerNoCachingT { public: - explicit FCPrimitiveFactory(const dnnl::engine& engine) : engine_(engine) {} - - void ExecuteFcPrimitive(const LoDTensor* input, - const Tensor* weights, - const Tensor* bias, - LoDTensor* output, - const MKLDNNDeviceContext& dev_ctx, - const ExecutionContext& ctx) { - RecomputeOutputDims(ctx, input, weights, output); - // If primitive has already been created and cached, don't create new one, - // but update input and output data pointers and return it. - if (fc_) { - UpdateDataPointers(ctx, output, input); - this->Execute(); - return; - } // Otherwise, create a new one. 
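- // (On such a cache hit only the src/dst data handles are swapped in
- // UpdateDataPointers; the factory object itself lives in the device
- // context's blob map, see GetPrimitiveFactory defined later in this file.)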
- - auto in_col_dims = ctx.Attr("in_num_col_dims"); - PADDLE_ENFORCE_LE( - in_col_dims, - 2, - platform::errors::Unimplemented( - "DNNL FC doesn't support in_num_col_dims parameter to " - "be higher than " - "2.")); - if (in_col_dims == 2) { - PADDLE_ENFORCE_EQ( - input->dims().size(), - 3, - platform::errors::Unimplemented( - "DNNL FC only supports in_num_col_dims equal to 2 when " - "3 dim input is provided.")); - PADDLE_ENFORCE_EQ( - input->format(), - MKLDNNMemoryFormat::ncw, - platform::errors::Unimplemented( - "DNNL FC only supports in_num_col_dims equal to 2 when " - "input format is equal to ncw.")); + FCMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, + const platform::MKLDNNDeviceContext& dev_ctx, + const phi::DenseTensor* x, + const phi::DenseTensor* weights, + const phi::DenseTensor* bias, + phi::DenseTensor* out, + const int in_num_col_dims, + dnnl::engine mkldnn_engine, + platform::Place cpu_place) + : platform::MKLDNNHandlerNoCachingT( + mkldnn_engine, cpu_place), + dev_ctx_(dev_ctx) { + this->memory_key_ = ctx.InputName("W"); + + auto x_vec_dims = phi::vectorize(x->dims()); + auto weights_vec_dims = phi::vectorize(weights->dims()); + + int MB = 1; + for (int i = 0; i < in_num_col_dims; ++i) { + MB *= x_vec_dims[i]; } - weights_ = CreateWeightsMemory(weights); - - // Since MKL-DNN has a lot of limitations on what the input/weights/output - // dimensions should be, to simplify the code, the creation of primitive - // descriptor has been divided into separate cases, based on the number - // of input dimensions. - size_t input_dim_num = input->dims().size(); - paddle::optional fc_prim_desc; - memory::desc usr_weights_desc = {}; - switch (input_dim_num) { - case 2: - fc_prim_desc = - Create2DFcPrimDescriptor(input, weights, bias, output, ctx); - usr_weights_desc = Create2DUserWeightsDesc(); - break; - case 3: - fc_prim_desc = - Create3DFcPrimDescriptor(input, weights, bias, output, ctx); - usr_weights_desc = Create3DUserWeightsDesc(weights); - break; - case 4: - fc_prim_desc = - Create4DFcPrimDescriptor(input, weights, bias, output, ctx); - usr_weights_desc = Create4DUserWeightsDesc(input, weights); - break; - default: - PADDLE_THROW(platform::errors::Unimplemented( - "DNNL FC doesn't support input dims different than 2, 3, 4.")); - break; + int IC = 1; + for (size_t i = in_num_col_dims; i < x_vec_dims.size(); ++i) { + IC *= x_vec_dims[i]; } - input_ = CreateMemory(fc_prim_desc->src_desc(), input); - // Update weights format inside of its memory - weights_ = Reorder( - usr_weights_desc, usr_weights_desc, weights_->get_data_handle()); - // Quantize weights and reorder to format chosen by FC primitive descriptor. 
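- // Rough sketch of the int8 scale bookkeeping applied here (and kept in the
- // same form by ComputeBiasScales / ComputeOutputShiftScale below):
- //   bias_scale[i]         = Scale_in * Scale_weights[i]
- //   output_shift_scale[i] = inner_scale / (Scale_in * Scale_weights[i]),
- // where inner_scale is 1.0 when force_fp32_output is set and Scale_out
- // otherwise (a zero weight scale falls back to bias_scale = 1 and
- // output_shift_scale = inner_scale). E.g. with Scale_in = 127,
- // Scale_weights[i] = 63.5 and Scale_out = 254, the bias is requantized by
- // 8064.5 and the s32 accumulator is rescaled by 254 / 8064.5 ~= 0.0315
- // before being written to the quantized output. The per-output-channel
- // case is expressed through the mask returned by CreateMask(1, ...),
- // i.e. 1 << 1, which tells oneDNN to apply one scale per output channel.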
- QuantizeWeights(ctx, fc_prim_desc->weights_desc()); + int OC = weights_vec_dims[1]; - bias_ = CreateMemoryToBeCached(fc_prim_desc->bias_desc(), bias); - // If int8 is desired, quantize bias into 32-bit signed int - QuantizeBias(*fc_prim_desc, ctx); - - // Store weights and bias in the mkldnn cache - CacheWeightsAndBias(dev_ctx, ctx); - - // Based on format determined by inner_product, create output in desired - // memory format - output_ = CreateDstMemory(*fc_prim_desc, ctx, output); - - // Return MKL-DNN primitive ready to be fed into pipeline and executed - fc_ = inner_product_forward(*fc_prim_desc); - this->Execute(); - } - - void Execute() { - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - if (bias_) { - fc_->execute(astream, - {{DNNL_ARG_SRC, *input_}, - {DNNL_ARG_WEIGHTS, *weights_}, - {DNNL_ARG_BIAS, *bias_}, - {DNNL_ARG_DST, *output_}}); - } else { - fc_->execute(astream, - {{DNNL_ARG_SRC, *input_}, - {DNNL_ARG_WEIGHTS, *weights_}, - {DNNL_ARG_DST, *output_}}); - } - astream.wait(); - } - - private: - // DNNL always returns 2-dimensional data block as a result of computing - // inner product. Hence the format 'nc' is always set for its output - // primitive. Therefore, function SetOutputFormat is needed to choose - // an appropriate format based on the number of input dimensions and - // format of an input tensor. - void SetOutputFormat(MKLDNNMemoryFormat in_format, Tensor* out) { - int dim_num = out->dims().size(); - // In case of 2 dims, we set the only possible format, nc - if (dim_num == 2) { - out->set_format(MKLDNNMemoryFormat::nc); - out->set_mem_desc({phi::vectorize(out->dims()), - platform::MKLDNNGetDataType(), - out->format()}); - // In case of 3 dims, we generate a format that is based on number - // of output dims and the layout of input format (nchw or nhwc). - } else if (dim_num == 3) { - if (in_format == MKLDNNMemoryFormat::nwc || - in_format == MKLDNNMemoryFormat::nhwc) { - out->set_format( - platform::MKLDNNFormatForSize(dim_num, MKLDNNMemoryFormat::nhwc)); - } else { - out->set_format( - platform::MKLDNNFormatForSize(dim_num, MKLDNNMemoryFormat::nchw)); - } - // In any other case we overwrite the output format with the input one. - } else { - out->set_format(in_format); - } - } + dnnl::memory::desc bias_md; - void UpdateDataPointers(const ExecutionContext& ctx, - Tensor* out, - const Tensor* in) { - input_->set_data_handle(to_void_cast(in->data())); - output_->set_data_handle(out->mutable_data(ctx.GetPlace())); - // If the primitive exists, but the output tensor has changed its - // variable, update its format to what has been determined in first - // call to CreateFcPrimitive method. 
- if (out->format() == MKLDNNMemoryFormat::undef) { - SetOutputFormat(in->format(), out); + auto src_md = dnnl::memory::desc( + {MB, IC}, MKLDNNGetDataType(), dnnl::memory::format_tag::any); + auto weights_md = dnnl::memory::desc( + {OC, IC}, MKLDNNGetDataType(), dnnl::memory::format_tag::any); + auto dst_md = dnnl::memory::desc( + {MB, OC}, MKLDNNGetDataType(), dnnl::memory::format_tag::any); + if (bias) { + bias_md = dnnl::memory::desc({bias->numel()}, + MKLDNNGetDataType(), + dnnl::memory::format_tag::a); } - } - - dnnl::inner_product_forward::primitive_desc Create2DFcPrimDescriptor( - const LoDTensor* input, - const Tensor* weights, - const Tensor* bias, - LoDTensor* output, - const ExecutionContext& ctx) { - auto src_desc = CreateMemDescriptor(input, MKLDNNMemoryFormat::any); - auto weight_dims = Get2DWeightDimsForDNNL(weights); - auto weights_desc = - CreateMemDescriptor(weight_dims, MKLDNNMemoryFormat::any); - auto bias_desc = CreateMemDescriptor(bias, MKLDNNMemoryFormat::x); - auto dst_desc = CreateMemDescriptor(output, MKLDNNMemoryFormat::any); - const auto attrs = CreateFCAttrs(ctx); - return CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs); - } - - std::vector Get2DWeightDimsForDNNL(const Tensor* weights) { - auto dims = phi::vectorize(weights->dims()); - std::swap(dims[0], dims[1]); // swap input dim with output dim - return dims; - } - - memory::desc Create2DUserWeightsDesc() { return weights_->get_desc(); } - - dnnl::inner_product_forward::primitive_desc Create3DFcPrimDescriptor( - const LoDTensor* input, - const Tensor* weights, - const Tensor* bias, - LoDTensor* output, - const ExecutionContext& ctx) { - auto input_dims = phi::vectorize(input->dims()); - std::vector new_input_dims = { - input_dims[0] * input_dims[1], input_dims[2], 1}; - auto src_desc = - CreateMemDescriptor(new_input_dims, MKLDNNMemoryFormat::any); - - auto weight_dims = Get3DWeightDimsForDNNL(weights); - auto weights_desc = - CreateMemDescriptor(weight_dims, MKLDNNMemoryFormat::any); - - auto bias_desc = CreateMemDescriptor(bias, MKLDNNMemoryFormat::x); - auto dst_dims = {input_dims[0] * input_dims[1], weight_dims[0]}; - auto dst_desc = - CreateMemDescriptor(dst_dims, MKLDNNMemoryFormat::any); const auto attrs = CreateFCAttrs(ctx); - return CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs); - } - - std::vector Get3DWeightDimsForDNNL(const Tensor* weights) { - auto paddle_w_dims = phi::vectorize(weights->dims()); - return {paddle_w_dims[1], paddle_w_dims[0], 1}; - } - - memory::desc Create3DUserWeightsDesc(const Tensor* weights) { - auto dims = Get3DWeightDimsForDNNL(weights); - return CreateMemDescriptor(dims, MKLDNNMemoryFormat::oiw); - } - - dnnl::inner_product_forward::primitive_desc Create4DFcPrimDescriptor( - const LoDTensor* input, - const Tensor* weights, - const Tensor* bias, - LoDTensor* output, - const ExecutionContext& ctx) { - auto src_desc = CreateMemDescriptor(input, MKLDNNMemoryFormat::any); - // Since MKL-DNN doesn't support 4D column-major data formats in - // inner_product primitive, transpose the weights to be in - // row-major format - auto dims = Get4DWeightDimsForDNNL(input, weights); - auto weights_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::any); - auto bias_desc = CreateMemDescriptor(bias, MKLDNNMemoryFormat::x); - auto dst_desc = CreateMemDescriptor(output, MKLDNNMemoryFormat::any); - const auto attrs = CreateFCAttrs(ctx); - return CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs); - } - - std::vector 
Get4DWeightDimsForDNNL(const LoDTensor* input, - const Tensor* weights) { - auto old_w_dims = phi::vectorize(weights->dims()); - auto old_in_dims = phi::vectorize(input->dims()); - auto dims = {old_w_dims[1], old_in_dims[1], old_in_dims[2], old_in_dims[3]}; - return dims; - } - memory::desc Create4DUserWeightsDesc(const LoDTensor* input, - const Tensor* weights) { - auto dims = Get4DWeightDimsForDNNL(input, weights); - return CreateMemDescriptor(dims, MKLDNNMemoryFormat::oihw); + this->AcquireForwardPrimitiveDescriptor(attrs, + prop_kind::forward_inference, + src_md, + weights_md, + bias_md, + dst_md); } - // Convert data from one data format to another - std::shared_ptr Reorder(const memory::desc& src_desc, - const memory::desc& dst_desc, - void* src_data) { - auto src_mem = memory(src_desc, engine_, src_data); - auto dst_mem = std::make_shared(dst_desc, engine_); - - auto reorder = dnnl::reorder(src_mem, *dst_mem); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - { - platform::RecordEvent record_reorder( - "int_reorder", - platform::TracerEventType::UserDefined, - 2, - platform::EventRole::kUniqueOp); - reorder.execute(astream, src_mem, *dst_mem); - astream.wait(); - } - - return dst_mem; - } - - // Convert data from one data format to another and rescale it. - // If the desired data type is (un)signed int8, quantization occurs here. - std::shared_ptr ReorderWithScale( - const std::shared_ptr src_mem, - const memory::desc& dst_md, - const std::vector& scale_data) { - auto dst_mem = std::make_shared(dst_md, engine_); + private: + dnnl::primitive_attr CreateFCAttrs(const ExecutionContext& ctx) { dnnl::primitive_attr attributes; - // According to MKL-DNN's documentation mask determines along which - // dimensions should the scale be applied. - // 0 - Single scale applied to whole tensor - // 1 - Apply Scale along a slice of each dimension which index is 1. 
- // In case of weights quantization, that dimension is output, - // becuase we perform per-output-channel quantization - int mask = CreateMask(0, scale_data.size() > 1); - attributes.set_output_scales(mask, scale_data); - auto reorder = dnnl::reorder(*src_mem, *dst_mem, attributes); + dnnl::post_ops post_operations; - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - { - platform::RecordEvent record_reorder( - "int_reorder", - platform::TracerEventType::UserDefined, - 2, - platform::EventRole::kUniqueOp); - reorder.execute(astream, - {{DNNL_ARG_FROM, *src_mem}, {DNNL_ARG_TO, *dst_mem}}); - astream.wait(); + std::vector output_shift_scale; + float scale = 1.0f; + if (IsInt8()) { + std::tie(output_shift_scale, scale) = ComputeOutputShiftScale(ctx); + int mask = CreateMask(1, output_shift_scale.size() > 1); + attributes.set_output_scales(mask, output_shift_scale); } - return dst_mem; - } - - template - static dnnl::memory::desc CreateMemDescriptor( - const std::vector& dims, MKLDNNMemoryFormat format) { - return platform::MKLDNNMemDesc( - dims, platform::MKLDNNGetDataType(), format); - } - - template - static dnnl::memory::desc CreateMemDescriptor(const Tensor* tensor, - MKLDNNMemoryFormat format) { - auto dims = phi::vectorize(tensor->dims()); - return CreateMemDescriptor(dims, format); - } - - template - dnnl::memory CreateMemory(const dnnl::memory::desc& desc, - const Tensor* tensor) { - return CreateMemory(desc, platform::to_void_cast(tensor->data())); - } - - dnnl::memory CreateMemory(const dnnl::memory::desc& desc, void* data) { - return memory(desc, engine_, data); - } - - template - std::shared_ptr CreateMemoryToBeCached( - const dnnl::memory::desc& desc, const Tensor* tensor) { - return CreateMemoryToBeCached(desc, - platform::to_void_cast(tensor->data())); - } - - std::shared_ptr CreateMemoryToBeCached( - const dnnl::memory::desc& desc, void* data) { - return std::make_shared(desc, engine_, data); - } + float sum_scale = 1.0f; + if (ctx.HasAttr("fuse_residual_connection") && + ctx.Attr("fuse_residual_connection")) { + post_operations.append_sum(sum_scale); + } - // Create weights memory and transform to default MKL-DNN format - std::shared_ptr CreateWeightsMemory(const Tensor* weights) { - auto dims = phi::vectorize(weights->dims()); - std::swap(dims[0], dims[1]); // Correct output dimensions - auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io); - auto dst_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oi); - // Transpose weights through MKL-DNN's reorder from io to oi format. 
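- // (Paddle stores FC weights as [input_channels, output_channels];
- // describing the same buffer as `io` and reordering it to `oi` yields the
- // [output_channels, input_channels] layout that inner_product expects.
- // The handler-based path later in this file achieves the same effect by
- // declaring the user memory with format_tag::io and letting
- // AcquireMemoryWithReorder convert it to whatever format the primitive
- // descriptor picked.)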
- return Reorder(src_desc, - dst_desc, - platform::to_void_cast(weights->data())); - } + // ReLU from "fc_fuse_pass" + if (ctx.Attr("activation_type") == "relu") { + post_operations.append_eltwise( + scale, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f); + } + platform::AppendActivation(ctx, post_operations, scale); - void CacheWeightsAndBias(const MKLDNNDeviceContext& dev_ctx, - const ExecutionContext& ctx) { - std::string key = platform::CreateKey(dev_ctx); - key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key); + if (ctx.HasAttr("fused_output_scale")) { + float scale_alpha = ctx.Attr("fused_output_scale"); + post_operations.append_eltwise( + 1.0, dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f); + } - const std::string weights_key = key + ctx.InputName("W"); - const std::string bias_key = key + ctx.InputName("Bias"); - dev_ctx.SetBlob(weights_key, weights_); - dev_ctx.SetBlob(bias_key, bias_); + attributes.set_post_ops(post_operations); + return attributes; } // Compute the bias scales so that its values correspond to the // scale of data being an output of weights and input multiplication - std::vector ComputeBiasScales(const ExecutionContext& ctx) { - auto scale_in_data = ctx.Attr("Scale_in"); - auto scale_weights_data = ctx.Attr>("Scale_weights"); - const size_t weight_scales_num = scale_weights_data.size(); - std::vector bias_scales(weight_scales_num); + std::vector ComputeBiasScales( + const float scale_in, const std::vector& scale_weights) { + std::vector bias_scales(scale_weights.size()); -#pragma omp parallel for - for (size_t i = 0; i < weight_scales_num; i++) { - if (scale_weights_data[i] == 0.0) + for (size_t i = 0; i < bias_scales.size(); ++i) { + if (scale_weights[i] == 0.0) bias_scales[i] = 1.0f; else - bias_scales[i] = scale_in_data * scale_weights_data[i]; + bias_scales[i] = scale_in * scale_weights[i]; } return bias_scales; @@ -442,18 +177,16 @@ class FCPrimitiveFactory { ? 1.0f : ctx.Attr("Scale_out"); const size_t weight_scales_num = scale_weights_data.size(); - std::vector output_shift_scale(weight_scales_num); -#pragma omp parallel for - for (size_t i = 0; i < weight_scales_num; i++) { + for (size_t i = 0; i < weight_scales_num; ++i) { if (scale_weights_data[i] == 0.0) - output_shift_scale[i] = inner_scale; + scale_weights_data[i] = inner_scale; else - output_shift_scale[i] = + scale_weights_data[i] = inner_scale / (scale_in_data * scale_weights_data[i]); } - return make_tuple(output_shift_scale, scale); + return make_tuple(scale_weights_data, scale); } // Computing MKL-DNN's scaling mask which determines along which dimension @@ -464,131 +197,300 @@ class FCPrimitiveFactory { return is_multi_channel_quantizied ? 
1 << slice_dimension : 0; } - void QuantizeWeights(const ExecutionContext& ctx, memory::desc dst) { - weights_ = ReorderWithScale( - weights_, dst, ctx.Attr>("Scale_weights")); - } + std::shared_ptr AcquireMemoryWithReorderAndAttrs( + const dnnl::memory::desc& user_md, + const dnnl::memory::desc& target_md, + void* ptr, + const dnnl::primitive_attr& attrs) { + std::shared_ptr target_memory_p; - void QuantizeBias(const inner_product_forward::primitive_desc& fc_prim_desc, - const ExecutionContext& ctx) { - auto bias_scales = ComputeBiasScales(ctx); - bias_ = ReorderWithScale(bias_, fc_prim_desc.bias_desc(), bias_scales); - } + auto user_memory_p = + std::make_shared(user_md, this->engine_, ptr); + target_memory_p = std::make_shared(target_md, this->engine_); + auto reorder_p = std::make_shared( + *user_memory_p, *target_memory_p, attrs); - dnnl::primitive_attr CreateFCAttrs(const ExecutionContext& ctx) { - dnnl::primitive_attr attributes; - dnnl::post_ops post_operations; + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + reorder_p->execute( + astream, + {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}}); + astream.wait(); - std::vector output_shift_scale; - float scale; - std::tie(output_shift_scale, scale) = ComputeOutputShiftScale(ctx); - int mask = CreateMask(1, output_shift_scale.size() > 1); - attributes.set_output_scales(mask, output_shift_scale); + return target_memory_p; + } - float sum_scale = 1.0f; - if (ctx.HasAttr("fuse_residual_connection") && - ctx.Attr("fuse_residual_connection")) { - post_operations.append_sum(sum_scale); - } + std::string memory_key_; + const platform::MKLDNNDeviceContext& dev_ctx_; - if (ctx.Attr("activation_type") == "relu") { - constexpr float negative_slope = 0.0f; - constexpr float placeholder = 1.0f; // beta - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_relu, negative_slope, placeholder); - } else if (ctx.Attr("activation_type") == "gelu") { - constexpr float alpha = 0.0f; - constexpr float beta = 0.0f; - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_gelu, alpha, beta); - } else if (ctx.Attr("activation_type") == "gelu_tanh") { - constexpr float alpha = 0.0f; - constexpr float beta = 0.0f; - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_gelu_tanh, alpha, beta); - } else if (ctx.Attr("activation_type") == "gelu_erf") { - constexpr float alpha = 0.0f; - constexpr float beta = 0.0f; - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_gelu_erf, alpha, beta); - } else if (ctx.Attr("activation_type") == "tanh") { - constexpr float alpha = 0.0f; - constexpr float beta = 0.0f; - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_tanh, alpha, beta); - } else if (ctx.Attr("activation_type") == "sigmoid") { - constexpr float alpha = 0.0f; - constexpr float beta = 0.0f; - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_logistic, alpha, beta); - } else if (ctx.Attr("activation_type") == "mish") { - constexpr float alpha = 0.0f; - constexpr float beta = 0.0f; - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_mish, alpha, beta); - } else if (ctx.Attr("activation_type") == "hard_swish") { - constexpr float alpha = 0.0f; - constexpr float beta = 0.0f; - post_operations.append_eltwise( - scale, dnnl::algorithm::eltwise_hardswish, alpha, beta); + public: + std::shared_ptr AcquireSrcMemoryWithReorder( + const phi::DenseTensor* x) { + const T_in* x_data = x->data(); + + auto user_md = x->mem_desc(); + if 
(x->dims().size() != 2) { + // reshape restrictions are always satisfied because in case of 3 or 4 dim + // input, plain layout is enforced + user_md = user_md.reshape(this->fwd_pd_->src_desc().dims()); } - attributes.set_post_ops(post_operations); - return attributes; + return this->AcquireMemoryWithReorder( + user_md, this->fwd_pd_->src_desc(), to_void_cast(x_data)); } - dnnl::inner_product_forward::primitive_desc CreateFcPrimDesc( - const dnnl::memory::desc& input_desc, - const dnnl::memory::desc& weights_desc, - const dnnl::memory::desc& bias_desc, - const dnnl::memory::desc& dst_desc, - const dnnl::primitive_attr& attrs) { - auto fc_desc = inner_product_forward::desc(prop_kind::forward_scoring, - input_desc, - weights_desc, - bias_desc, - dst_desc); + std::shared_ptr AcquireBiasMemoryWithReorder( + const phi::DenseTensor* bias, + const float scale_in, + const std::vector& scale_weights) { + const float* bias_data = bias->data(); - return inner_product_forward::primitive_desc(fc_desc, attrs, engine_); + if (IsInt8() == false) { + // for BF16/FP32 bias is 1D and has no scales, so reorder is not needed + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->bias_desc(), + to_void_cast(bias_data)); + } else { + const std::string bias_key = this->memory_key_ + "@bias"; + auto memory_p = std::static_pointer_cast( + this->dev_ctx_.GetBlob(bias_key)); + + if (!memory_p) { + const auto& scale_data = ComputeBiasScales(scale_in, scale_weights); + dnnl::primitive_attr attrs; + + int mask = CreateMask(0, scale_data.size() > 1); + attrs.set_output_scales(mask, scale_data); + + auto user_md = dnnl::memory::desc({bias->dims()[0]}, + MKLDNNGetDataType(), + dnnl::memory::format_tag::a); + + memory_p = this->AcquireMemoryWithReorderAndAttrs( + user_md, + this->fwd_pd_->bias_desc(), + to_void_cast(bias_data), + attrs); + this->dev_ctx_.SetBlob(bias_key, memory_p); + } + return memory_p; + } } - // Create output memory based on output tensor and inner_product - // primitive descriptor format chosen for output - dnnl::memory CreateDstMemory( - const dnnl::inner_product_forward::primitive_desc& fc_prim_desc, - const ExecutionContext& ctx, - Tensor* output) { + std::shared_ptr AcquireWeightsMemoryWithReorder( + const phi::DenseTensor* weights, const std::vector& scale_data) { + const std::string weights_key = this->memory_key_ + "@weights"; + auto memory_p = std::static_pointer_cast( + this->dev_ctx_.GetBlob(weights_key)); + + if (!memory_p) { + const float* weights_data = weights->data(); + auto weights_dims = this->fwd_pd_->weights_desc().dims(); + + auto user_md = dnnl::memory::desc(weights_dims, + MKLDNNGetDataType(), + dnnl::memory::format_tag::io); + + if (IsInt8()) { + dnnl::primitive_attr attrs; + int mask = CreateMask(0, scale_data.size() > 1); + attrs.set_output_scales(mask, scale_data); + + memory_p = this->AcquireMemoryWithReorderAndAttrs( + user_md, + this->fwd_pd_->weights_desc(), + to_void_cast(weights_data), + attrs); + } else { + memory_p = + this->AcquireMemoryWithReorder(user_md, + this->fwd_pd_->weights_desc(), + to_void_cast(weights_data)); + } + + this->dev_ctx_.SetBlob(weights_key, memory_p); + } + return memory_p; + } + + std::shared_ptr AcquireCustomDstMemory( + const ExecutionContext& ctx, phi::DenseTensor* out) { if (ctx.HasAttr("fuse_residual_connection") && ctx.Attr("fuse_residual_connection")) { - auto* residual_param = ctx.Output("ResidualData"); + auto* residual_param = ctx.Output("ResidualData"); PADDLE_ENFORCE_EQ( - output->dims(), + out->dims(), residual_param->dims(), 
platform::errors::InvalidArgument( "Output and elementwise parameter need to have the " "same dimension sizes, but got output's dimension = %d" " and residual param's dimension =%d .", - output->dims().size(), + out->dims().size(), residual_param->dims().size())); - output->ShareDataWith(*residual_param); + out->ShareDataWith(*residual_param); + } + return this->template AcquireDstMemory(out); + } // namespace operators +}; // namespace paddle + +template +class FCMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + bool force_fp32_output = ctx.Attr("force_fp32_output"); + bool fuse_relu = ctx.Attr("activation_type") == "relu"; + + if (force_fp32_output) { + this->RunKernel(ctx); + } else if (IsInt8()) { + if (fuse_relu) { + this->RunKernel(ctx); + } else { + this->RunKernel(ctx); + } + } else { + this->RunKernel(ctx); + } + } + + void PrepareSrcMem(const std::shared_ptr& fc_p, + const std::shared_ptr& src_mem, + const LoDTensor* x, + const dnnl::engine& engine) const { + auto x_md = x->mem_desc().reshape(src_mem->get_desc().dims()); + if (x_md != src_mem->get_desc()) { + dnnl::memory x_mem(x_md, engine, to_void_cast(x->data())); + auto reorder_p = dnnl::reorder(x_mem, *src_mem); + + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); + reorder_p.execute(astream, x_mem, *src_mem); + astream.wait(); + } else { + src_mem->set_data_handle(to_void_cast(x->data())); } + } + + template + void RunKernel(const framework::ExecutionContext& ctx) const { + const auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const auto* x = ctx.Input("Input"); + const auto* weights = ctx.Input("W"); + const auto* bias = ctx.Input("Bias"); + auto out = ctx.Output("Out"); + + const float scale_in = ctx.Attr("Scale_in"); + const auto& scale_weights = ctx.Attr>("Scale_weights"); + + std::shared_ptr fc_p; + std::shared_ptr src_memory_p; + std::shared_ptr weights_memory_p; + std::shared_ptr bias_memory_p; + std::shared_ptr dst_memory_p; + + std::string cache_key; + cache_key.reserve(64); + cache_key = platform::ExtendKeyWithThreadInfoIfNeeded( + dev_ctx, + platform::CreateKey(dev_ctx, + ctx.InputName("Input"), + ctx.InputName("W"), + phi::vectorize(x->dims()))); + + auto inner_product_cache = + std::static_pointer_cast(dev_ctx.GetBlob(cache_key)); + + RecomputeOutputDims(ctx, x, weights, out); + + if (inner_product_cache) { + fc_p = std::make_shared( + inner_product_cache->inner_product_p); + src_memory_p = + std::make_shared(inner_product_cache->src_mem); + PrepareSrcMem(fc_p, src_memory_p, x, mkldnn_engine); + + weights_memory_p = + std::make_shared(inner_product_cache->weights_mem); + + dst_memory_p = + std::make_shared(inner_product_cache->dst_mem); + if (ctx.HasAttr("fuse_residual_connection") && + ctx.Attr("fuse_residual_connection")) { + auto* residual_param = ctx.Output("ResidualData"); + out->ShareDataWith(*residual_param); + } + auto out_ptr = out->mutable_data( + ctx.GetPlace(), dst_memory_p->get_desc().get_size()); + dst_memory_p->set_data_handle(out_ptr); + + if (bias) { + bias_memory_p = + std::make_shared(inner_product_cache->bias_mem); + } + } else { + auto in_col_dims = ctx.Attr("in_num_col_dims"); + + FCMKLDNNHandler handler(ctx, + dev_ctx, + x, + weights, + bias, + out, + in_col_dims, + mkldnn_engine, + ctx.GetPlace()); + + src_memory_p = handler.AcquireSrcMemoryWithReorder(x); + weights_memory_p = + handler.AcquireWeightsMemoryWithReorder(weights, 
scale_weights); + dst_memory_p = handler.AcquireCustomDstMemory(ctx, out); + + if (bias) { + bias_memory_p = + handler.AcquireBiasMemoryWithReorder(bias, scale_in, scale_weights); + } + + fc_p = handler.AcquireForwardPrimitive(); + } + + auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); + + std::unordered_map fc_args = { + {DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_WEIGHTS, *weights_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + if (bias) { + fc_args.insert({DNNL_ARG_BIAS, *bias_memory_p}); + } + + fc_p->execute(astream, fc_args); + astream.wait(); - auto dst_desc = fc_prim_desc.dst_desc(); - auto buffer_size = dst_desc.get_size(); - T_out* output_data = - output->mutable_data(ctx.GetPlace(), buffer_size); - memory dst_mem(dst_desc, engine_, to_void_cast(output_data)); - SetOutputFormat(ctx.Input("Input")->format(), output); + if (!inner_product_cache) { + auto ip_cache = std::make_shared(); + ip_cache->inner_product_p = *fc_p; + ip_cache->src_mem = *src_memory_p; + ip_cache->weights_mem = *weights_memory_p; + ip_cache->dst_mem = *dst_memory_p; + if (bias) { + ip_cache->bias_mem = *bias_memory_p; + } + dev_ctx.SetBlob(cache_key, ip_cache); + } - return dst_mem; + platform::SetOutMemDescWithLogicalLayoutFusesSupport( + ctx, + out, + dst_memory_p->get_desc().reshape(phi::vectorize(out->dims()))); } void RecomputeOutputDims(const ExecutionContext& ctx, - const LoDTensor* input, - const Tensor* w, - LoDTensor* output) { + const LoDTensor* x, + const phi::DenseTensor* weights, + LoDTensor* out) const { int in_num_col_dims = ctx.Attr("in_num_col_dims"); bool padding_weights = ctx.Attr("padding_weights"); PADDLE_ENFORCE_EQ(padding_weights, @@ -596,102 +498,16 @@ class FCPrimitiveFactory { platform::errors::PermissionDenied( "Weight padding in fc can not be used in MKLDNN.")); std::vector output_dims; - FCOutputSize(input->dims(), - w->dims(), + FCOutputSize(x->dims(), + weights->dims(), output_dims, in_num_col_dims, padding_weights); - output->Resize(phi::make_ddim(output_dims)); - output->set_lod(input->lod()); + out->Resize(phi::make_ddim(output_dims)); + out->set_lod(x->lod()); } - - private: - const dnnl::engine& engine_; - paddle::optional input_; - paddle::optional output_; - std::shared_ptr bias_; - std::shared_ptr weights_; - paddle::optional fc_; }; -// Attempt to fetch cached primitive factory based on provided parameters -// of input format, weight dimensions and output name. -// If not cached, create a new one. -template -static std::shared_ptr> -GetPrimitiveFactory(const MKLDNNDeviceContext& dev_ctx, - const std::string& key) { - auto prim_creator = - std::static_pointer_cast>( - dev_ctx.GetBlob(key)); - if (prim_creator == nullptr) { - prim_creator = std::make_shared>( - dev_ctx.GetEngine()); - dev_ctx.SetBlob(key, prim_creator); - } - - return prim_creator; -} - -// Choose appropriate primitive factory implementation based on inferred -// output type (uint8, int8 or float). 
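// ---------------------------------------------------------------------------
// Standalone illustrative sketch (not taken from this diff): the new
// FCMKLDNNKernel::Compute above and the removed ExecuteFc below both pick the
// FC output type from the same three inputs - force_fp32_output, the input
// data type, and whether relu is fused. The enum and function names here are
// made up for illustration only and are not Paddle APIs.
// ---------------------------------------------------------------------------
#include <cstdio>

enum class FcOutType { kFp32, kBf16, kU8, kS8 };

// int8 inputs keep a quantized output unless fp32 output is forced; a fused
// relu guarantees non-negative results, so u8 is chosen instead of s8.
// Non-quantized inputs keep their own type (fp32 or bf16).
FcOutType PickFcOutType(bool is_int8, bool is_bf16,
                        bool force_fp32_output, bool fuse_relu) {
  if (force_fp32_output || (!is_int8 && !is_bf16)) return FcOutType::kFp32;
  if (is_bf16) return FcOutType::kBf16;
  return fuse_relu ? FcOutType::kU8 : FcOutType::kS8;
}

int main() {
  // int8 FC with fused relu -> unsigned 8-bit output (prints 2).
  std::printf("%d\n",
              static_cast<int>(PickFcOutType(true, false, false, true)));
  return 0;
}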
-template -static void ExecuteFc(const ExecutionContext& ctx, - const LoDTensor* input, - const Tensor* w, - const Tensor* bias, - LoDTensor* output, - bool fuse_relu, - bool force_fp32_output) { - auto& dev_ctx = ctx.template device_context(); - std::string prim_key = platform::CreateKey(dev_ctx, - input->format(), - input->dims()[0], - phi::vectorize(w->dims()), - ctx.OutputName("Out")); - prim_key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, prim_key); - - constexpr bool is_int8 = - std::is_same::value || std::is_same::value; - bool is_bfloat16 = std::is_same::value; - if ((!is_int8 && !is_bfloat16) || force_fp32_output) { - GetPrimitiveFactory(dev_ctx, prim_key) - ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx); - } else if (is_bfloat16) { - GetPrimitiveFactory(dev_ctx, prim_key) - ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx); - } else if (fuse_relu) { - GetPrimitiveFactory(dev_ctx, prim_key) - ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx); - } else { - GetPrimitiveFactory(dev_ctx, prim_key) - ->ExecuteFcPrimitive(input, w, bias, output, dev_ctx, ctx); - } -} - -template -class FCMKLDNNOpKernel : public framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ( - platform::is_cpu_place(ctx.GetPlace()), - true, - platform::errors::PreconditionNotMet("FC MKL-DNN must use CPUPlace.")); - platform::MKLDNNDeviceContext::tls().log_lib_version(); - auto input = ctx.Input("Input"); - auto w = ctx.Input("W"); - auto bias = ctx.Input("Bias"); - auto output = ctx.Output("Out"); - - bool fuse_relu = ctx.Attr("activation_type") == "relu"; - bool force_fp32_output = ctx.Attr("force_fp32_output"); - - ExecuteFc( - ctx, input, w, bias, output, fuse_relu, force_fp32_output); - - output->set_layout(DataLayout::kMKLDNN); - } -}; } // namespace operators } // namespace paddle @@ -704,7 +520,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(fc, ::paddle::platform::CPUPlace, FP32, ops::kFCMKLDNNFP32, - ops::FCMKLDNNOpKernel); + ops::FCMKLDNNKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( fc, @@ -712,19 +528,19 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( ::paddle::platform::CPUPlace, BF16, ops::kFCMKLDNNFP32, - ops::FCMKLDNNOpKernel); + ops::FCMKLDNNKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, ::paddle::platform::CPUPlace, U8, ops::kFCMKLDNNINT8, - ops::FCMKLDNNOpKernel); + ops::FCMKLDNNKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, ::paddle::platform::CPUPlace, S8, ops::kFCMKLDNNINT8, - ops::FCMKLDNNOpKernel); + ops::FCMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc deleted file mode 100644 index 615f43bb32c0fb..00000000000000 --- a/paddle/fluid/operators/mkldnn/fill_constant_mkldnn_op.cc +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace paddle { -namespace operators { - -using framework::Tensor; - -template -class FillConstantMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { - public: - FillConstantMKLDNNHandler(Tensor* out, - dnnl::engine engine, - platform::Place cpu_place) - : platform::MKLDNNHandlerNoCachingT(engine, cpu_place) { - const auto src0_md = - dnnl::memory::desc({out->numel(), sizeof(T)}, - platform::MKLDNNGetDataType(), - dnnl::memory::format_tag::ab); - - dnnl::primitive_attr attrs; - attrs.set_scales(DNNL_ARG_SRC_0, /* mask = */ 0, {0.0f}); - - this->AcquireForwardPrimitiveDescriptor( - attrs, dnnl::algorithm::binary_add, src0_md, src1_md, src0_md); - } - - static const dnnl::memory::desc src1_md; -}; - -template -const dnnl::memory::desc FillConstantMKLDNNHandler::src1_md( - {1, sizeof(T)}, - platform::MKLDNNGetDataType(), - dnnl::memory::format_tag::ab); - -template -class FillConstantMKLDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - this->RunKernel(ctx); - } - - void RunKernel(const framework::ExecutionContext& ctx) const { - const auto& dev_ctx = - ctx.template device_context(); - const auto& dnnl_engine = dev_ctx.GetEngine(); - - auto* out = ctx.Output("Out"); - T fill_value = CalculateFillValue(ctx); - - auto shape = GetShape(ctx); - out->Resize(shape); - - FillConstantMKLDNNHandler handler(out, dnnl_engine, ctx.GetPlace()); - - dnnl::memory constant_value_memory = - dnnl::memory(FillConstantMKLDNNHandler::src1_md, - dnnl_engine, - reinterpret_cast(&fill_value)); - - auto src0_memory_p = handler.AcquireDstMemory(out); - auto fill_constant_p = handler.AcquireForwardPrimitive(); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - fill_constant_p->execute(astream, - {{DNNL_ARG_SRC_0, *src0_memory_p}, - {DNNL_ARG_SRC_1, constant_value_memory}, - {DNNL_ARG_DST, *src0_memory_p}}); - astream.wait(); - - // src0_memory_p's md was just to allow the usage of a binary - // primitive as a memset, and now we need to create a real one - out->set_mem_desc({phi::vectorize(shape), - platform::MKLDNNGetDataType(), - platform::GetPlainMKLDNNFormat(shape.size())}); - } - - T CalculateFillValue(const framework::ExecutionContext& ctx) const { - const auto str_value = ctx.Attr("str_value"); - const auto float_value = ctx.Attr("value"); - - T value; - - if (str_value.empty()) { - value = static_cast(float_value); - } else { - // handle NaN/Inf first, which cannot be read from stream - if (str_value == "inf") { - value = static_cast(std::numeric_limits::infinity()); - } else if (str_value == "-inf") { - value = static_cast(-std::numeric_limits::infinity()); - } else if (str_value == "nan") { - value = static_cast(std::numeric_limits::quiet_NaN()); - } else { - std::stringstream convert_stream(str_value); - double tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); - } - } - - if (ctx.HasInput("ValueTensor")) { - const auto* value_tensor = ctx.Input("ValueTensor"); - PADDLE_ENFORCE_EQ( - value_tensor->numel(), - 1, - platform::errors::InvalidArgument( - "When use Tensor as value to set Tensor value in fill_constant, " - "value input(ValueTensor) size must be 1, but got %d", - value_tensor->numel())); - value = value_tensor->data()[0]; - } - - return value; - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_KERNEL(fill_constant, 
- MKLDNN, - paddle::platform::CPUPlace, - ops::FillConstantMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc index 64d7bca4d06469..c5d67e567b2b3b 100644 --- a/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/interpolate_mkldnn_op.cc @@ -181,15 +181,3 @@ REGISTER_OP_KERNEL(bilinear_interp, MKLDNN, ::paddle::platform::CPUPlace, ops::InterpolateMKLDNNKernel); - -REGISTER_OP_KERNEL(nearest_interp_v2, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::InterpolateMKLDNNKernel, - ops::InterpolateMKLDNNKernel, - ops::InterpolateMKLDNNKernel, - ops::InterpolateMKLDNNKernel); -REGISTER_OP_KERNEL(bilinear_interp_v2, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::InterpolateMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc index f8c9c9d86a9953..000e31aad9ac9a 100644 --- a/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/matmul_v2_mkldnn_op.cc @@ -214,10 +214,7 @@ class MatMulMKLDNNHandler } astream.wait(); - auto format = - MKLDNNFormatForSize(out->dims().size(), dnnl::memory::format_tag::nchw); - out->set_format(format); - out->set_layout(DataLayout::kMKLDNN); + out->set_mem_desc(dst_memory_p->get_desc().reshape(out->dims())); } std::shared_ptr AcquireDstMemory( @@ -651,10 +648,18 @@ void ExecuteMatMulV2(const ExecutionContext &ctx, auto &astream = MKLDNNDeviceContext::tls().get_stream(); matmul_p->execute(astream, matmul_args); astream.wait(); - auto format = - MKLDNNFormatForSize(out->dims().size(), dnnl::memory::format_tag::nchw); - out->set_format(format); - out->set_layout(DataLayout::kMKLDNN); + + // TODO(jczaja): Explain why int8 format of dst is ABCD and do not need + // permute + if (IsOutputFused(ctx) && !IsInt8()) { + auto axis = ctx.Attr>("fused_transpose_Out"); + auto permuted_md = dst_memory_p->get_desc().permute_axes(axis); + out->set_mem_desc( + permuted_md.reshape(phi::vectorize(out->dims()))); + } else { + out->set_mem_desc( + dst_memory_p->get_desc().reshape(phi::vectorize(out->dims()))); + } } template @@ -836,8 +841,7 @@ class MatMulV2GradMKLDNNKernel : public paddle::framework::OpKernel { reduction_p->execute(astream, reduction_args); astream.wait(); - dx->set_format(paddle::platform::GetMKLDNNFormat( - dst_memory_p->get_desc().reshape(squeezed_dims))); + dx->set_mem_desc(dst_memory_p->get_desc().reshape(squeezed_dims)); } std::vector ExtendDimsWithOnes(const std::vector &dims, @@ -1119,9 +1123,8 @@ void MatMulGradMKLDNNKernel::ExecuteMatMulGrad( matmul_p->execute(astream, matmul_args); astream.wait(); - out->set_layout(framework::DataLayout::kMKLDNN); - out->set_format(platform::GetMKLDNNFormat( - dst_memory_p->get_desc().reshape(vectorize(out->dims())))); + out->set_mem_desc( + dst_memory_p->get_desc().reshape(vectorize(out->dims()))); } template @@ -1184,13 +1187,13 @@ void MatMulGradMKLDNNKernel::RunKernel(const ExecutionContext &ctx) const { if (dx) { if (dx_dims != x.dims()) { dx->Resize(dx_dims); - dx->set_format(x.format()); + dx->set_mem_desc(x.mem_desc()); } } if (dy) { if (dy_dims != y.dims()) { dy->Resize(dy_dims); - dy->set_format(y.format()); + dy->set_mem_desc(y.mem_desc()); } } } diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index e727a4fe9fb488..e9150b0c58f76d 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ 
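(The limit of 7 reorders comes from the loop over the paddings further down:
each padded face - left, right, top, bottom, front and back - gets its own
reorder when its padding is non-zero, and one final reorder copies X into Out,
so 6 + 1 = 7 for 3D and 4 + 1 = 5 for 2D. Every padded dimension of Out simply
grows by the sum of its two paddings, so the 2D example below produces a
(3 + 3 + 4) x (3 + 1 + 2) = 10 x 6 output grid.)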
b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -221,7 +221,7 @@ class MulPrimitiveFactory { to_void_cast(x_tmp.data())); x_tmp.Resize(data->dims()); - x_tmp.set_format(platform::GetMKLDNNFormat(dst_mdesc)); + x_tmp.set_mem_desc(dst_mdesc); data_matrix = framework::ReshapeToMatrix(x_tmp, num_col_dims); } else { data_matrix = framework::ReshapeToMatrix(*data, num_col_dims); @@ -235,11 +235,7 @@ class MulPrimitiveFactory { const Tensor *in) { x_input_->set_data_handle(to_void_cast(in->data())); output_->set_data_handle(out->mutable_data(ctx.GetPlace())); - - if (out->format() == MKLDNNMemoryFormat::undef) { - auto output_format = platform::GetMKLDNNFormat(*output_); - out->set_format((MKLDNNMemoryFormat)output_format); - } + out->set_mem_desc(output_->get_desc()); } template @@ -272,7 +268,7 @@ class MulPrimitiveFactory { auto buffer_size = dst_desc.get_size(); OT *output_data = output->mutable_data(ctx.GetPlace(), buffer_size); - output->set_format(paddle::platform::GetMKLDNNFormat(dst_desc)); + output->set_mem_desc(dst_desc); return memory(dst_desc, engine_, to_void_cast(output_data)); } @@ -392,9 +388,10 @@ class MulMKLDNNINT8Kernel : public framework::OpKernel { if (out_dims.size() != 2) { out->Resize(out_dims); } - out->set_layout(DataLayout::kMKLDNN); - out->set_format(platform::MKLDNNFormatForSize(out_dims.size(), - MKLDNNMemoryFormat::nchw)); + + auto in_md = dnnl::memory::desc(*dnnl_primitive_desc_query_md( + mul.get_primitive_desc(), dnnl_query_dst_md, 0)); + out->set_mem_desc(in_md.reshape(phi::vectorize(out->dims()))); } }; @@ -442,10 +439,11 @@ class MulMKLDNNKernel : public framework::OpKernel { matmul_p->execute(astream, matmul_args); astream.wait(); - out->set_layout(framework::DataLayout::kMKLDNN); - // plain output formats are enforced inside handler - out->set_format(platform::MKLDNNFormatForSize( - out->dims().size(), dnnl::memory::format_tag::nchw)); + // This kernel is flattening dims so then we need to unflattened version + // that should be set in out reshape require plain layout, but + // MatmulV2MKLDNNHanlder enforces one so it should work + out->set_mem_desc( + dst_memory_p->get_desc().reshape(phi::vectorize(out->dims()))); } private: diff --git a/paddle/fluid/operators/mkldnn/pad3d_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pad3d_mkldnn_op.cc deleted file mode 100644 index e7a528c452b8df..00000000000000 --- a/paddle/fluid/operators/mkldnn/pad3d_mkldnn_op.cc +++ /dev/null @@ -1,223 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace operators { - -using framework::Tensor; - -/* -Pad3D is done by using up to 7 reorders. Following example is done -on 2D data for simplicity, but it is straightforward to extend it to 3D case. 
- -Let us consider following example: - - N C H W L R T B -X_dims = (1, 1, 3, 3), paddings = (1, 2, 3, 4) in order Left, Right, Top, Bottom - -We have to copy the X tensor into Out tensor, but except from that we have to -fill the rest of the memory with an additional padding. To avoid looping through -the whole Out memory two times, only these parts of Out memory that won't store -X's memory are filled with pad value. That behavior is achieved by using -oneDNN's submemory descriptors which allows us to set offsets for each dimension -and skip some parts of the memory. For 2D case up to 5 reorders will be used in -Pad3D kernel(if padding=0 reorder is skipped). In the following example i'th -number means, that this part of memory was filled by i'th reorder. 4'th reorder -is copying X memory into Out memory. i&j means that both i'th and j'th reorder -will set the padding at that location: - - INDEX - | 0 1 2 3 4 5 - |_______________________ - 0 |0&2 2 2 2 1&2 1&2 - 1 |0&2 2 2 2 1&2 1&2 -I 2 |0&2 2 2 2 1&2 1&2 -N 3 | 0 4 4 4 1 1 -D 4 | 0 4 4 4 1 1 -E 5 | 0 4 4 4 1 1 -X 6 |0&3 3 3 3 1&3 1&3 - 7 |0&3 3 3 3 1&3 1&3 - 8 |0&3 3 3 3 1&3 1&3 - 9 |0&3 3 3 3 1&3 1&3 - -Since oneDNN's reorder cannot set the pad value to the memory by itself, we have -to prefill Out's memory and use it as a temporary buffer, which later is copied -into the rest of Out's memory. At the end last reorder is done which copies X -memory into Out memory. - -*/ -template -class PadMKLDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - this->RunKernel(ctx); - } - - void RunKernel(const framework::ExecutionContext& ctx) const { - const auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - auto* x = ctx.Input("X"); - auto* out = ctx.Output("Out"); - auto* paddings_tensor = ctx.Input("Paddings"); - std::vector paddings(ctx.Attr>("paddings")); - if (paddings_tensor) { - std::copy(paddings_tensor->data(), - paddings_tensor->data() + paddings_tensor->numel(), - paddings.data()); - } - // pad2d has paddings in order top, bottom, left, right, so we need - // to swap some of them to unify paddings between pad2d and pad3d - if (ctx.Type() == "pad2d") { - std::swap(paddings[0], paddings[2]); - std::swap(paddings[1], paddings[3]); - } - - const std::string pad_attr_name = - ctx.Type() == "pad3d" ? 
"value" : "pad_value"; - T pad_value = static_cast(ctx.Attr(pad_attr_name)); - - std::vector x_tz = phi::vectorize(x->dims()); - // due to the need of supporting NDHWC, inferring out shape - // must be done inside the kernel - std::vector out_tz(x_tz); - - for (size_t i = 0; i < paddings.size() / 2; ++i) { - out_tz[out_tz.size() - 1 - i] += paddings[2 * i] + paddings[2 * i + 1]; - } - out->Resize(phi::make_ddim(out_tz)); - - auto paddle_dtype = framework::TransToProtoVarType(x->dtype()); - - platform::ReorderMKLDNNHandler reorder_handler( - x_tz, - paddle_dtype, - framework::ToMKLDNNDataType(paddle_dtype), - onednn_engine); - - auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - x->mem_desc(), platform::to_void_cast(x->data())); - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - out, - out_tz, - platform::GetPlainMKLDNNFormat(out_tz.size()), - ctx.GetPlace()); - - // to avoid allocating new temporary memory, Out's memory is used as a tmp - // buffer for storing a contiguous memory consisting of pad_value, which - // later is used as a SRC for reorders that are filling Out with padding - T* out_ptr = out->data(); - std::fill(out_ptr, - out_ptr + CalculateNumOfPrefillElems(out_tz, paddings), - pad_value); - - // paddings are in order: left, right, top, bottom, front, back - for (size_t i = 0; i < paddings.size(); ++i) { - if (paddings[i] != 0) { - std::vector offsets(out_tz.size(), 0); - std::vector chunk_tz(out_tz.begin(), out_tz.end()); - - chunk_tz[out_tz.size() - 1 - i / 2] = paddings[i]; - if (i % 2 == 1) { - offsets[out_tz.size() - 1 - i / 2] = - paddings[i - 1] + x_tz[out_tz.size() - 1 - i / 2]; - } - - FillPartOfPadding(paddle_dtype, - onednn_engine, - out_ptr, - reorder_dst_memory_p, - chunk_tz, - offsets); - } - } - astream.wait(); - - std::vector offsets(out_tz.size(), 0); - for (size_t i = 0; i < paddings.size() / 2; ++i) { - offsets[out_tz.size() - 1 - i] = paddings[2 * i]; - } - - auto slice_mem_p = - reorder_handler.AcquireSubmemory(x_tz, offsets, reorder_dst_memory_p); - - auto reorder_p = - reorder_handler.AcquireReorder(slice_mem_p, reorder_src_memory_p); - reorder_p->execute(astream, *reorder_src_memory_p, *slice_mem_p); - astream.wait(); - - out->set_mem_desc(reorder_dst_memory_p->get_desc()); - } - - int64_t CalculateNumOfPrefillElems(const std::vector& out_tz, - const std::vector& paddings) const { - int64_t max_elems = 0; - int64_t independent_dims = out_tz[0] * out_tz[1]; - - for (size_t i = 0; i < paddings.size() / 2; ++i) { - int64_t elems = std::max(paddings[2 * i], paddings[2 * i + 1]); - for (size_t j = 0; j < paddings.size() / 2; ++j) { - if (j != i) { - elems *= out_tz[out_tz.size() - 1 - j]; - } - } - - if (max_elems < elems) { - max_elems = elems; - } - } - return independent_dims * max_elems; - } - - void FillPartOfPadding(framework::proto::VarType::Type paddle_dtype, - const dnnl::engine& onednn_engine, - T* prefilled_mem_ptr, - const std::shared_ptr& out_mem_p, - const std::vector& chunk_tz, - const std::vector& offsets) const { - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - dnnl::memory::desc prefilled_mem_desc( - chunk_tz, - platform::MKLDNNGetDataType(), - platform::GetPlainMKLDNNFormat(chunk_tz.size())); - dnnl::memory prefilled_mem( - prefilled_mem_desc, onednn_engine, prefilled_mem_ptr); - - dnnl::memory::desc out_slice_md = - out_mem_p->get_desc().submemory_desc(chunk_tz, {offsets}); - dnnl::memory out_slice_mem( - out_slice_md, onednn_engine, out_mem_p->get_data_handle()); - - auto reorder_p = 
dnnl::reorder(prefilled_mem, out_slice_mem); - reorder_p.execute(astream, prefilled_mem, out_slice_mem); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_KERNEL(pad3d, - MKLDNN, - paddle::platform::CPUPlace, - ops::PadMKLDNNKernel); - -REGISTER_OP_KERNEL(pad2d, - MKLDNN, - paddle::platform::CPUPlace, - ops::PadMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc deleted file mode 100644 index 6a05585a37c6f3..00000000000000 --- a/paddle/fluid/operators/mkldnn/shape_mkldnn_op.cc +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = phi::SelectedRows; - -template -class ShapeMKLDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_var = ctx.InputVar("Input"); - framework::DDim in_dims; - if (in_var->IsType()) { - in_dims = in_var->Get().value().dims(); - } else { - in_dims = in_var->Get().dims(); - // Output of shape op is often fed as input to fill_constant ops - // and we need to rotate a shape otherwise Tensors of wrong shape may be - // allocated - if (platform::MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() == - framework::DataLayout::kNHWC && - in_dims.size() >= 3) { - auto rdims = phi::vectorize(in_dims); - std::rotate(rdims.begin() + 1, rdims.begin() + 2, rdims.end()); - in_dims = phi::make_ddim(rdims); - } - } - auto* out_t = ctx.Output("Out"); - out_t->Resize({in_dims.size()}); - auto out_data = out_t->mutable_data(platform::CPUPlace()); - for (int i = 0; i < in_dims.size(); ++i) { - out_data[i] = in_dims[i]; - } - - dnnl::memory::desc out_mem_desc( - phi::vectorize(out_t->dims()), - framework::ToMKLDNNDataType( - framework::TransToProtoVarType(out_t->dtype())), - platform::GetPlainMKLDNNFormat(out_t->dims().size())); - - out_t->set_mem_desc(out_mem_desc); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_KERNEL(shape, - MKLDNN, - paddle::platform::CPUPlace, - ops::ShapeMKLDNNKernel, - ops::ShapeMKLDNNKernel, - ops::ShapeMKLDNNKernel, - ops::ShapeMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc deleted file mode 100644 index e23971c86ada87..00000000000000 --- a/paddle/fluid/operators/mkldnn/slice_mkldnn_op.cc +++ /dev/null @@ -1,228 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace paddle { -namespace operators { - -using paddle::framework::Tensor; - -template -class SliceMKLDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - this->RunKernel(ctx); - } - - void RunKernel(const framework::ExecutionContext& ctx) const { - const auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* x = ctx.Input("Input"); - auto* out = ctx.Output("Out"); - - auto x_vec_dims = phi::vectorize(x->dims()); - - auto axes_int = ctx.Attr>("axes"); - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - - std::vector axes(ctx.Attr>("axes").begin(), - ctx.Attr>("axes").end()); - std::vector starts(ctx.Attr>("starts").begin(), - ctx.Attr>("starts").end()); - std::vector ends(ctx.Attr>("ends").begin(), - ctx.Attr>("ends").end()); - - auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); - if (ctx.HasInput("StartsTensor")) { - starts = GetDataFromTensor(ctx.Input("StartsTensor")); - } else if (starts_tensor_list.size() > 0) { - starts = GetDataFromTensorList(starts_tensor_list); - } - - auto decrease_axis = ctx.Attr>("decrease_axis"); - - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - if (ctx.HasInput("EndsTensor")) { - ends = GetDataFromTensor(ctx.Input("EndsTensor")); - } else if (ends_tensor_list.size() > 0) { - ends = GetDataFromTensorList(ends_tensor_list); - } - - std::vector offsets(x_vec_dims.size(), 0); - std::vector slice_dims(x_vec_dims); - - for (size_t i = 0; i < axes.size(); ++i) { - starts[i] = starts[i] < 0 ? x_vec_dims[axes[i]] + starts[i] : starts[i]; - ends[i] = ends[i] < 0 ? 
x_vec_dims[axes[i]] + ends[i] - : std::min(ends[i], x_vec_dims[axes[i]]); - offsets[axes[i]] = starts[i]; - slice_dims[axes[i]] = ends[i] - starts[i]; - } - - out->Resize(phi::make_ddim(slice_dims)); - - dnnl::memory::data_type x_type = - framework::ToMKLDNNDataType(framework::TransToProtoVarType(x->dtype())); - - platform::ReorderMKLDNNHandler reorder_handler( - x_vec_dims, - framework::TransToProtoVarType(x->dtype()), - x_type, - onednn_engine); - - auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - x->mem_desc(), platform::to_void_cast(x->data())); - auto slice_mem_p = reorder_handler.AcquireSubmemory( - slice_dims, offsets, reorder_src_memory_p); - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - out, - slice_dims, - platform::GetPlainMKLDNNFormat(x_vec_dims.size()), - ctx.GetPlace()); - - auto reorder_p = - reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); - - std::vector new_out_dims(slice_dims.size() - decrease_axis.size()); - - if (new_out_dims.size() == 0) { - new_out_dims.emplace_back(1); - } else { - for (const auto& axis : decrease_axis) { - slice_dims[axis] = 0; - } - - int i = 0; - for (const auto& slice_dim : slice_dims) { - if (slice_dim != 0) new_out_dims[i++] = slice_dim; - } - } - - astream.wait(); - out->Resize(phi::make_ddim(new_out_dims)); - out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape(new_out_dims)); - } -}; -template -class SliceGradMKLDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - this->RunKernel(ctx); - } - - void RunKernel(const framework::ExecutionContext& ctx) const { - const auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - auto* dout = ctx.Input(framework::GradVarName("Out")); - auto* dx = ctx.Output(framework::GradVarName("Input")); - - auto dx_vec_dims = phi::vectorize(dx->dims()); - auto dout_vec_dims = phi::vectorize(dout->dims()); - - auto axes_int = ctx.Attr>("axes"); - auto starts_int = ctx.Attr>("starts"); - auto ends_int = ctx.Attr>("ends"); - - std::vector axes(ctx.Attr>("axes").begin(), - ctx.Attr>("axes").end()); - std::vector starts(ctx.Attr>("starts").begin(), - ctx.Attr>("starts").end()); - std::vector ends(ctx.Attr>("ends").begin(), - ctx.Attr>("ends").end()); - - auto starts_tensor_list = ctx.MultiInput("StartsTensorList"); - if (ctx.HasInput("StartsTensor")) { - starts = GetDataFromTensor(ctx.Input("StartsTensor")); - } else if (starts_tensor_list.size() > 0) { - starts = GetDataFromTensorList(starts_tensor_list); - } - - auto ends_tensor_list = ctx.MultiInput("EndsTensorList"); - if (ctx.HasInput("EndsTensor")) { - ends = GetDataFromTensor(ctx.Input("EndsTensor")); - } else if (ends_tensor_list.size() > 0) { - ends = GetDataFromTensorList(ends_tensor_list); - } - - auto decrease_axis = ctx.Attr>("decrease_axis"); - - std::vector offsets(dx_vec_dims.size(), 0); - std::vector slice_dims(dx_vec_dims); - - for (size_t i = 0; i < axes.size(); ++i) { - starts[i] = starts[i] < 0 ? dx_vec_dims[axes[i]] + starts[i] : starts[i]; - ends[i] = ends[i] < 0 ? 
dx_vec_dims[axes[i]] + ends[i] - : std::min(ends[i], dx_vec_dims[axes[i]]); - offsets[axes[i]] = starts[i]; - slice_dims[axes[i]] = ends[i] - starts[i]; - } - - dnnl::memory::data_type dout_type = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(dout->dtype())); - - platform::ReorderMKLDNNHandler reorder_handler( - slice_dims, - framework::TransToProtoVarType(dout->dtype()), - dout_type, - onednn_engine); - - auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - dout->mem_desc().reshape(slice_dims), - platform::to_void_cast(dout->data())); - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - dx, - dx_vec_dims, - platform::GetPlainMKLDNNFormat(dx_vec_dims.size()), - ctx.GetPlace()); - memset(dx->data(), 0, reorder_dst_memory_p->get_desc().get_size()); - - auto slice_mem_p = reorder_handler.AcquireSubmemory( - slice_dims, offsets, reorder_dst_memory_p); - - auto reorder_p = - reorder_handler.AcquireReorder(slice_mem_p, reorder_src_memory_p); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - reorder_p->execute(astream, *reorder_src_memory_p, *slice_mem_p); - astream.wait(); - - dx->set_mem_desc(reorder_dst_memory_p->get_desc()); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_KERNEL(slice, - MKLDNN, - paddle::platform::CPUPlace, - ops::SliceMKLDNNKernel, - ops::SliceMKLDNNKernel, - ops::SliceMKLDNNKernel, - ops::SliceMKLDNNKernel); - -namespace ops = paddle::operators; -REGISTER_OP_KERNEL(slice_grad, - MKLDNN, - paddle::platform::CPUPlace, - ops::SliceGradMKLDNNKernel, - ops::SliceGradMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 2bb82186483da8..659539e5e39b8f 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -1,11 +1,8 @@ /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
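// ---------------------------------------------------------------------------
// Standalone illustrative sketch (not taken from this diff): the start/end
// normalization used by the slice forward and grad kernels removed above.
// Negative indices count back from the end of the axis and non-negative ends
// are clamped to the axis size; NormalizeSlice is a made-up helper name.
// ---------------------------------------------------------------------------
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <utility>

// Returns {offset, length} of the slice taken on one axis of size dim.
static std::pair<int64_t, int64_t> NormalizeSlice(int64_t start, int64_t end,
                                                  int64_t dim) {
  start = start < 0 ? dim + start : start;
  end = end < 0 ? dim + end : std::min(end, dim);
  return {start, end - start};
}

int main() {
  // Axis of size 10, slice [2, -3): offset 2, length 5 (elements 2..6).
  auto r = NormalizeSlice(2, -3, 10);
  std::printf("offset=%lld length=%lld\n", static_cast<long long>(r.first),
              static_cast<long long>(r.second));
  return 0;
}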
@@ -55,36 +52,6 @@ class SoftmaxMKLDNNHandler this->AcquireForwardPrimitiveDescriptor( prop_kind::forward_scoring, input->mem_desc(), axis); } - - SoftmaxMKLDNNHandler(const framework::ExecutionContext& ctx, - const dnnl::engine mkldnn_engine, - platform::Place cpu_place, - const Tensor* out, - const Tensor* out_grad, - Tensor* in_x_grad, - const std::string& unique_name) - : platform::MKLDNNHandlerNoCachingT(mkldnn_engine, - cpu_place) { - PADDLE_ENFORCE_EQ(out_grad->dims(), - in_x_grad->dims(), - platform::errors::InvalidArgument( - "The shape of softmax_grad's input " - "and output must be identical, but shapes differ, " - "out_grad: %s in_grad: %s", - out_grad->dims(), - in_x_grad->dims())); - - auto dims = out_grad->dims(); // input and output share the same shape - const int axis = - phi::funcs::CanonicalAxis(ctx.Attr("axis"), dims.size()); - - this->AcquireForwardPrimitiveDescriptor( - prop_kind::forward_scoring, out->mem_desc(), axis); - this->AcquireBackwardPrimitiveDescriptor( - out_grad->mem_desc(), out->mem_desc(), axis); - } }; template @@ -133,44 +100,6 @@ class SoftmaxMKLDNNKernel : public paddle::framework::OpKernel { } }; -template -class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), - true, - paddle::platform::errors::PreconditionNotMet( - "Operator DNNL SoftmaxGrad must use CPUPlace")); - auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - const Tensor* output = ctx.Input("Out"); - auto* out_grad = ctx.template Input(framework::GradVarName("Out")); - auto* in_x_grad = ctx.template Output(framework::GradVarName("X")); - - SoftmaxMKLDNNHandler handler(ctx, - mkldnn_engine, - ctx.GetPlace(), - output, - out_grad, - in_x_grad, - ctx.InputName("Out")); - - auto dst_memory_p = handler.AcquireDstMemory(output); - auto diff_dst_memory_p = handler.AcquireDiffDstMemory(out_grad); - auto diff_src_memory_p = handler.AcquireDiffSrcMemory(in_x_grad); - - auto softmax_bwd_p = handler.AcquireBackwardPrimitive(); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - softmax_bwd_p->execute(astream, - {{DNNL_ARG_DST, *dst_memory_p}, - {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, - {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); - astream.wait(); - - in_x_grad->set_mem_desc(diff_src_memory_p->get_desc()); - } -}; } // namespace operators } // namespace paddle @@ -181,7 +110,3 @@ REGISTER_OP_KERNEL(softmax, ::paddle::platform::CPUPlace, ops::SoftmaxMKLDNNKernel, ops::SoftmaxMKLDNNKernel); -REGISTER_OP_KERNEL(softmax_grad, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::SoftmaxMKLDNNGradKernel); diff --git a/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc deleted file mode 100644 index f71931ad1ecc73..00000000000000 --- a/paddle/fluid/operators/mkldnn/split_mkldnn_op.cc +++ /dev/null @@ -1,140 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" - -namespace paddle { -namespace operators { - -using paddle::framework::Tensor; - -static inline std::vector> CalculateOutsDims( - const framework::DDim& in_dims, - const size_t num, - const std::vector& sections, - const size_t axis, - const int outs_number) { - std::vector> outs_dims(outs_number, - phi::vectorize(in_dims)); - - if (num > 0) { - PADDLE_ENFORCE_EQ(in_dims[axis] % num, - 0, - platform::errors::InvalidArgument( - "The input's size along the split dimension " - "must be evenly divisible by Attr(num_or_sections). " - "But received Attr(num_or_sections) " - "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", - num, - in_dims, - axis)); - - const size_t out_axis_dim = in_dims[axis] / num; - - for (auto& out_dim : outs_dims) out_dim[axis] = out_axis_dim; - } else { - for (size_t i = 0; i < outs_dims.size(); ++i) - outs_dims[i][axis] = sections[i]; - } - return outs_dims; -} - -template -class SplitMKLDNNKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - this->RunKernel(ctx); - } - - void RunKernel(const framework::ExecutionContext& ctx) const { - const auto& dev_ctx = - ctx.template device_context(); - const auto& onednn_engine = dev_ctx.GetEngine(); - - const auto* x = ctx.Input("X"); - auto outs = ctx.MultiOutput("Out"); - - int num = ctx.Attr("num"); - auto sections = ctx.Attr>("sections"); - int axis = ctx.Attr("axis"); - auto outs_number = outs.size(); - const auto x_dims = x->dims(); - - bool need_resize = false; - if (ctx.HasInput("AxisTensor")) { - auto* axis_tensor = ctx.Input("AxisTensor"); - axis = GetDataFromTensor(axis_tensor)[0]; - need_resize = true; - } - - auto sections_tensor_list = ctx.MultiInput("SectionsTensorList"); - if (sections_tensor_list.size() > 0) { - sections = GetDataFromTensorList(sections_tensor_list); - need_resize = true; - } - - if (need_resize) { - const auto outs_dims = - CalculateOutsDims(x->dims(), num, sections, axis, outs_number); - for (size_t i = 0; i < outs.size(); ++i) { - outs[i]->Resize(phi::make_ddim(outs_dims[i])); - } - } - - auto x_vec_dims = phi::vectorize(x_dims); - - dnnl::memory::data_type x_type = - framework::ToMKLDNNDataType(framework::TransToProtoVarType(x->dtype())); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - - std::vector offset(x_vec_dims.size(), 0); - - platform::ReorderMKLDNNHandler reorder_handler( - x_vec_dims, - framework::TransToProtoVarType(x->dtype()), - x_type, - onednn_engine); - auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( - x->mem_desc(), platform::to_void_cast(x->data())); - - for (size_t i = 0; i < outs_number; ++i) { - auto out_vec_dims = phi::vectorize(outs[i]->dims()); - auto slice_mem_p = reorder_handler.AcquireSubmemory( - out_vec_dims, offset, reorder_src_memory_p); - - auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( - outs[i], out_vec_dims, x->format(), ctx.GetPlace()); - auto reorder_p = - reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); - - reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); - - offset[axis] += num > 0 ? 
x->dims()[axis] / num : sections[i]; - - outs[i]->set_mem_desc(reorder_dst_memory_p->get_desc()); - } - astream.wait(); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_KERNEL(split, - MKLDNN, - paddle::platform::CPUPlace, - ops::SplitMKLDNNKernel, - ops::SplitMKLDNNKernel); diff --git a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc deleted file mode 100644 index 1e546e44fa2416..00000000000000 --- a/paddle/fluid/operators/mkldnn/stack_mkldnn_op.cc +++ /dev/null @@ -1,146 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/operators/utils.h" -#include "paddle/fluid/platform/mkldnn_reuse.h" -namespace paddle { -namespace operators { - -using dnnl::concat; -using dnnl::memory; -using dnnl::primitive; -using dnnl::stream; -using framework::DataLayout; -using framework::LoDTensor; -using framework::Tensor; -using platform::to_void_cast; - -template -class StackMKLDNNHandler - : public platform::MKLDNNHandlerNoCachingT { - public: - StackMKLDNNHandler(const framework::ExecutionContext& ctx, - const dnnl::engine mkldnn_engine, - const std::vector& inputs, - Tensor* output) - : platform::MKLDNNHandlerNoCachingT(mkldnn_engine, - ctx.GetPlace()) { - int stack_axis = ctx.Attr("axis"); - - int ndims = inputs[0]->dims().size(); - - if (stack_axis < 0) { - stack_axis = ndims + 1 + stack_axis; // +1 to match output's ndims - } - - // in stack op all inputs must have same dims - auto input_dims = phi::vectorize(inputs[0]->dims()); - - memory::data_type dt = framework::ToMKLDNNDataType( - framework::TransToProtoVarType(inputs[0]->dtype())); - std::vector srcs_md; - memory::desc dst_md; - MKLDNNMemoryFormat dst_fmt; - - srcs_md.reserve(inputs.size()); - - // if stack is not done on last(non existing) axis, then we can optimize - // concat primitive by not adding additional dimension, since it causes - // wrong output format deduction and suboptimal performance as a result - if (stack_axis != ndims) { - for (size_t i = 0; i < inputs.size(); ++i) { - srcs_md.push_back(inputs[i]->mem_desc()); - } - - input_dims[stack_axis] *= inputs.size(); - dst_md = memory::desc(input_dims, dt, MKLDNNMemoryFormat::any); - } else { - auto extended_input_dims = phi::vectorize(output->dims()); - extended_input_dims[stack_axis] = 1; - - for (size_t i = 0; i < inputs.size(); ++i) { - srcs_md.push_back(inputs[i]->mem_desc().reshape(extended_input_dims)); - } - - // concat primitive choses suboptimal format tag because it cannot - // distinguish between f.e. 
abcd and abdc if last dim is equal to 1 so - // enforcing is needed for better performance - dst_fmt = platform::GetPlainMKLDNNFormat(extended_input_dims.size()); - dst_md = memory::desc(phi::vectorize(output->dims()), dt, dst_fmt); - } - - this->AcquireForwardPrimitiveDescriptor(dst_md, stack_axis, srcs_md); - } - - // concat oneDNN prim is not having .desc attribute so we cannot use default - // AcquireForwardPrimitiveDescriptor - void AcquireForwardPrimitiveDescriptor( - const memory::desc& dst_md, - const int stack_axis, - const std::vector& srcs_md) { - this->fwd_pd_.reset(new dnnl::concat::primitive_desc( - dst_md, stack_axis, srcs_md, this->engine_)); - } - - std::shared_ptr AcquireSrcMemory(const Tensor& input, int i) { - const T* input_data = input.data(); - return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src_desc(i), - to_void_cast(input_data)); - } -}; - -template -class StackMKLDNNOpKernel : public paddle::framework::OpKernel { - public: - void Compute(const paddle::framework::ExecutionContext& ctx) const override { - auto& dev_ctx = - ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - - auto multi_input = ctx.MultiInput("X"); - - Tensor* output = ctx.Output("Y"); - - StackMKLDNNHandler handler(ctx, mkldnn_engine, multi_input, output); - - std::vector> srcs; - srcs.reserve(multi_input.size()); - - auto dst_mem = handler.AcquireDstMemory(output); - auto concat_p = handler.AcquireForwardPrimitive(); - - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - std::unordered_map args; - for (size_t i = 0; i < multi_input.size(); ++i) { - srcs.push_back(handler.AcquireSrcMemory(*(multi_input[i]), i)); - args.insert({DNNL_ARG_MULTIPLE_SRC + i, *(srcs.at(i))}); - } - args.insert({DNNL_ARG_DST, *dst_mem}); - - concat_p->execute(astream, args); - astream.wait(); - - output->set_mem_desc( - dst_mem->get_desc().reshape(phi::vectorize(output->dims()))); - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; - -REGISTER_OP_KERNEL(stack, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::StackMKLDNNOpKernel); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index f71785e72cd4df..072016d729cdb6 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -24,7 +24,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/sum_op.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/mkldnn_reuse.h" namespace phi { @@ -37,6 +38,9 @@ namespace operators { using paddle::platform::MKLDNNDeviceContext; using phi::CPUContext; using platform::to_void_cast; +using Tensor = framework::Tensor; +using SelectedRows = phi::SelectedRows; +using LoDTensor = framework::LoDTensor; template class SumMKLDNNHandler diff --git a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc index db590807179d91..ad2b53de1ed915 100644 --- a/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc +++ b/paddle/fluid/operators/mkldnn/test_mkldnn_op_nhwc.cc @@ -34,7 +34,7 @@ PD_DECLARE_KERNEL(relu, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(transpose); USE_OP_DEVICE_KERNEL(transpose, MKLDNN); USE_OP_ITSELF(shape); -USE_OP_DEVICE_KERNEL(shape, MKLDNN); +PD_DECLARE_KERNEL(shape, OneDNN, ALL_LAYOUT); USE_OP_ITSELF(crop); USE_OP_DEVICE_KERNEL(crop, CPU); diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index a01901950bc417..246a9a772b0869 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -21,72 +21,8 @@ namespace paddle { namespace operators { -using Tensor = framework::Tensor; -using framework::DataLayout; - -template -class TransposeMKLDNNHandler { - public: - TransposeMKLDNNHandler(std::vector& dims, // NOLINT - std::vector& axis, // NOLINT - dnnl::engine engine) - : dims_(dims), - axis_(axis), - logical_axis_(dims.size(), 0), - engine_(engine) {} - - std::shared_ptr AcquireSrcMemory(const MKLDNNMemoryFormat& fmt, - void* ptr) { - // Make memory descriptor using input format, unless it - // cannot be trusted (nchw) then make up memory fmt manually - for (size_t i = 0; i < this->logical_axis_.size(); ++i) { - this->logical_axis_[i] = i; - } - - auto src_md = fmt != MKLDNNMemoryFormat::nchw - ? 
platform::MKLDNNMemDesc( - dims_, platform::MKLDNNGetDataType(), fmt) - : Axis2MemoryDesc(dims_, logical_axis_); - return std::make_shared(src_md, engine_, ptr); - } - - std::shared_ptr AcquireDstMemory(framework::Tensor* output, - platform::Place place) { - auto dst_md = Axis2MemoryDesc(dims_, axis_); - auto dst_data = output->mutable_data(place, dst_md.get_size()); - return std::make_shared(dst_md, engine_, dst_data); - } - - std::shared_ptr AcquireTranspose( - std::shared_ptr dst_memory_p, - std::shared_ptr src_memory_p) { - return std::make_shared(*(src_memory_p), *(dst_memory_p)); - } - - protected: - dnnl::memory::desc Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT - std::vector& axis // NOLINT - ) { - size_t ndims = axis.size(); - - std::vector strides(ndims); - unsigned int total_stride = 1; - for (int i = ndims - 1; i >= 0; --i) { - strides[axis[i]] = total_stride; - total_stride *= nchw_tz[axis[i]]; - } - dnnl::memory::desc mem_d( - nchw_tz, platform::MKLDNNGetDataType(), strides); - - return mem_d; - } - - private: - std::vector dims_; - std::vector axis_; - std::vector logical_axis_; - dnnl::engine engine_; -}; +using Tensor = phi::DenseTensor; +using phi::DataLayout; template class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { @@ -98,37 +34,90 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { "Operator DNNL Transpose must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - std::vector axis = ctx.Attr>("axis"); - int ndims = axis.size(); - auto* input = ctx.Input("X"); - auto* output = ctx.Output("Out"); - const T* input_data = input->data(); + const auto& dnnl_engine = dev_ctx.GetEngine(); + std::vector transpose_axis = ctx.Attr>("axis"); + int ndims = transpose_axis.size(); + auto* x = ctx.Input("X"); + auto* out = ctx.Output("Out"); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + platform::SetInMemDescWithLogicalLayoutFusesSupport( + ctx, const_cast(x), x->mem_desc()); if (ndims == 1) { - framework::TensorCopy(*input, input->place(), output); - output->set_format(input->format()); + framework::TensorCopy(*x, x->place(), out); + out->set_mem_desc(x->mem_desc()); return; } - auto nchw_tz = phi::vectorize(input->dims()); + auto x_vec_dims = phi::vectorize(x->dims()); - TransposeMKLDNNHandler handler(nchw_tz, axis, mkldnn_engine); + framework::proto::VarType::Type x_paddle_type = + framework::TransToProtoVarType(x->dtype()); + dnnl::memory::data_type x_type = framework::ToMKLDNNDataType(x_paddle_type); + platform::ReorderMKLDNNHandler reorder_handler( + x_vec_dims, x_paddle_type, x_type, dnnl_engine); - auto transpose_src_memory_p = handler.AcquireSrcMemory( - input->format(), platform::to_void_cast(input_data)); - auto transpose_dst_memory_p = - handler.AcquireDstMemory(output, ctx.GetPlace()); - auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, - transpose_src_memory_p); + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x->mem_desc(), platform::to_void_cast(x->data())); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - transpose_p->execute( - astream, *transpose_src_memory_p, *transpose_dst_memory_p); + auto dst_md = + dnnl::memory::desc(x_vec_dims, + x->mem_desc().data_type(), + platform::GetPlainMKLDNNFormat(x_vec_dims.size())); + // a trick is used here to fake transpose of out_md, so later it will be + // "untransposed", leaving output data in plain format tag + auto dst_strides = 
FakeTranposeStrides(dst_md, transpose_axis); + + dst_md = + dnnl::memory::desc(x_vec_dims, x->mem_desc().data_type(), dst_strides); + auto dst_data = + out->mutable_data(ctx.GetPlace(), x->type(), dst_md.get_size()); + + auto reorder_dst_memory_p = + std::make_shared(dst_md, dnnl_engine, dst_data); + + auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, + reorder_src_memory_p); + + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); - output->set_layout(DataLayout::kNCHW); - output->set_format(MKLDNNMemoryFormat::undef); + platform::SetOutMemDescWithLogicalLayoutFusesSupport( + ctx, + out, + reorder_dst_memory_p->get_desc().permute_axes( + TransposeToPermuteAxis(transpose_axis))); + } + + private: + // it is needed because oneDNN's permute axis understand axes order in + // different way PaddlePaddle's transpose + std::vector TransposeToPermuteAxis( + const std::vector& transpose_axis) const { + std::vector permute_axis(transpose_axis.size()); + + for (size_t i = 0; i < transpose_axis.size(); ++i) { + permute_axis[transpose_axis[i]] = i; + } + return permute_axis; + } + + std::vector FakeTranposeStrides( + const dnnl::memory::desc& dst_md, + const std::vector& transpose_axis) const { + std::vector fake_strides(transpose_axis.size()); + auto dims = dst_md.dims(); + int total_stride = 1; + int ndims = static_cast(dims.size()); + + for (int i = ndims - 1; i >= 0; --i) { + fake_strides[transpose_axis[i]] = total_stride; + total_stride *= dims[transpose_axis[i]]; + } + + return fake_strides; } }; @@ -140,44 +129,47 @@ class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { true, paddle::platform::errors::PreconditionNotMet( "Operator DNNL TransposeGrad must use CPUPlace")); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* x_grad = ctx.Output(framework::GradVarName("X")); - if (!x_grad) return; + + const auto* dout = ctx.Input(framework::GradVarName("Out")); + auto* dx = ctx.Output(framework::GradVarName("X")); + if (!dx) return; auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); - std::vector axis = ctx.Attr>("axis"); - std::vector reversed_axis(axis); - int ndims = axis.size(); + const auto& dnnl_engine = dev_ctx.GetEngine(); + std::vector transpose_axis = ctx.Attr>("axis"); + + auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); + + int ndims = transpose_axis.size(); if (ndims == 1) { - framework::TensorCopy(*out_grad, out_grad->place(), x_grad); - x_grad->set_format(out_grad->format()); + framework::TensorCopy(*dout, dout->place(), dx); + dx->set_mem_desc(dout->mem_desc()); return; } - for (size_t i = 0; i < axis.size(); i++) { - reversed_axis[axis[i]] = i; - } + auto dout_vec_dims = phi::vectorize(dout->dims()); - const T* out_grad_data = out_grad->data(); - x_grad->mutable_data(ctx.GetPlace()); + framework::proto::VarType::Type dout_paddle_type = + framework::TransToProtoVarType(dout->dtype()); + dnnl::memory::data_type dout_type = + framework::ToMKLDNNDataType(dout_paddle_type); - auto nchw_tz = phi::vectorize(out_grad->dims()); + platform::ReorderMKLDNNHandler reorder_handler( + dout_vec_dims, dout_paddle_type, dout_type, dnnl_engine); - TransposeMKLDNNHandler handler(nchw_tz, reversed_axis, mkldnn_engine); + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + dout->mem_desc(), platform::to_void_cast(dout->data())); - auto transpose_src_memory_p = handler.AcquireSrcMemory( - out_grad->format(), 
platform::to_void_cast(out_grad_data)); - auto transpose_dst_memory_p = - handler.AcquireDstMemory(x_grad, ctx.GetPlace()); - auto transpose_p = handler.AcquireTranspose(transpose_dst_memory_p, - transpose_src_memory_p); + auto reorder_dst_memory_p = + reorder_handler.AcquireDstMemory(dx, dout->mem_desc(), ctx.GetPlace()); - auto& astream = platform::MKLDNNDeviceContext::tls().get_stream(); - transpose_p->execute( - astream, *transpose_src_memory_p, *transpose_dst_memory_p); + auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, + reorder_src_memory_p); + + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); astream.wait(); + dx->set_mem_desc( + reorder_dst_memory_p->get_desc().permute_axes(transpose_axis)); } }; @@ -224,8 +216,3 @@ REGISTER_OP_KERNEL(transpose_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::TransposeMKLDNNGradOpKernel); - -REGISTER_OP_KERNEL(transpose2_grad, - MKLDNN, - ::paddle::platform::CPUPlace, - ops::TransposeMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/mlu/mlu_baseop.cc b/paddle/fluid/operators/mlu/mlu_baseop.cc index 4cd754775d9c0a..a09d79e8d08a5c 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.cc +++ b/paddle/fluid/operators/mlu/mlu_baseop.cc @@ -256,6 +256,186 @@ MLUCnnlTensorDesc::~MLUCnnlTensorDesc() { } } +class MLUOpTensorDescPool { + public: + mluOpTensorDescriptor_t Pop() { + mluOpTensorDescriptor_t raw_desc; + if (q_.try_dequeue(raw_desc)) { + return raw_desc; + } else { + mluOpCreateTensorDescriptor(&raw_desc); + return raw_desc; + } + } + + void Recycle(mluOpTensorDescriptor_t desc) { + mluOpResetTensorDescriptor(desc); + q_.enqueue(desc); + } + + ~MLUOpTensorDescPool() { + auto size = q_.size_approx(); + if (size > 0) { + std::vector vec(size); + q_.try_dequeue_bulk(vec.data(), size); + for (auto desc : vec) { + mluOpDestroyTensorDescriptor(desc); + } + } + } + + private: + moodycamel::ConcurrentQueue q_; +}; + +static MLUOpTensorDescPool g_mluop_tensor_desc_pool; + +MLUOpTensorDesc& MLUOpTensorDesc::operator=(MLUOpTensorDesc&& rhs) { + if (raw_tensor_desc) { + g_mluop_tensor_desc_pool.Recycle(raw_tensor_desc); + } + raw_tensor_desc = rhs.raw_tensor_desc; + rhs.raw_tensor_desc = nullptr; + return *this; +} + +MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim, + const int dim_sizes[], + const mluOpDataType_t tensor_dtype) { + raw_tensor_desc = g_mluop_tensor_desc_pool.Pop(); + PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(raw_tensor_desc, + MLUOP_LAYOUT_ARRAY, + tensor_dtype, + tensor_dim, + dim_sizes)); +} + +MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim, + const int dim_sizes[], + const mluOpDataType_t tensor_dtype, + const mluOpTensorLayout_t layout) { + raw_tensor_desc = g_mluop_tensor_desc_pool.Pop(); + PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor( + raw_tensor_desc, layout, tensor_dtype, tensor_dim, dim_sizes)); +} + +MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim, + const int dim_sizes[], + const mluOpDataType_t tensor_dtype, + int position) + : MLUOpTensorDesc(tensor_dim, dim_sizes, tensor_dtype) { + PADDLE_ENFORCE_MLU_SUCCESS( + mluOpSetTensorDescriptorPosition(raw_tensor_desc, position)); +} + +MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim, + const int64_t dim_sizes[], + const mluOpDataType_t tensor_dtype) { + std::vector dim_sizes_int32(tensor_dim); + std::vector::const_iterator int64_cbegin(dim_sizes); + std::vector::const_iterator int64_cend(dim_sizes + tensor_dim); + std::transform(int64_cbegin, + int64_cend, + dim_sizes_int32.begin(), + 
&CheckedNarrowing); + raw_tensor_desc = g_mluop_tensor_desc_pool.Pop(); + PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(raw_tensor_desc, + MLUOP_LAYOUT_ARRAY, + tensor_dtype, + tensor_dim, + dim_sizes_int32.data())); +} + +MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim, + const int64_t dim_sizes[], + const mluOpDataType_t tensor_dtype, + const mluOpTensorLayout_t layout) { + std::vector dim_sizes_int32(tensor_dim); + std::vector::const_iterator int64_cbegin(dim_sizes); + std::vector::const_iterator int64_cend(dim_sizes + tensor_dim); + std::transform(int64_cbegin, + int64_cend, + dim_sizes_int32.begin(), + &CheckedNarrowing); + raw_tensor_desc = g_mluop_tensor_desc_pool.Pop(); + PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(raw_tensor_desc, + layout, + tensor_dtype, + tensor_dim, + dim_sizes_int32.data())); +} + +MLUOpTensorDesc::MLUOpTensorDesc(const int tensor_dim, + const int64_t dim_sizes[], + const mluOpDataType_t tensor_dtype, + int position) { + std::vector dim_sizes_int32(tensor_dim); + std::vector::const_iterator int64_cbegin(dim_sizes); + std::vector::const_iterator int64_cend(dim_sizes + tensor_dim); + std::transform(int64_cbegin, + int64_cend, + dim_sizes_int32.begin(), + &CheckedNarrowing); + raw_tensor_desc = g_mluop_tensor_desc_pool.Pop(); + PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor(raw_tensor_desc, + MLUOP_LAYOUT_ARRAY, + tensor_dtype, + tensor_dim, + dim_sizes_int32.data())); + PADDLE_ENFORCE_MLU_SUCCESS( + mluOpSetTensorDescriptorPosition(raw_tensor_desc, position)); +} + +MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor, + const mluOpTensorLayout_t layout, + const mluOpDataType_t tensor_dtype) { + auto dims = phi::vectorize(tensor.dims()); + int tensor_dim = dims.size(); + raw_tensor_desc = g_mluop_tensor_desc_pool.Pop(); + if (tensor_dim == 0) { + int scalar_dims[1] = {1}; + PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptor( + raw_tensor_desc, layout, tensor_dtype, 1, scalar_dims)); + } else { + std::vector tensor_dim_sizes_int(dims.begin(), dims.end()); + PADDLE_ENFORCE_MLU_SUCCESS( + mluOpSetTensorDescriptor(raw_tensor_desc, + layout, + tensor_dtype, + tensor_dim, + tensor_dim_sizes_int.data())); + } +} + +MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor) + : MLUOpTensorDesc( + tensor, MLUOP_LAYOUT_ARRAY, ToMluOpDataType(tensor.dtype())) {} + +MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor, + mluOpTensorLayout_t layout, + const mluOpDataType_t tensor_dtype, + int position) + : MLUOpTensorDesc(tensor, layout, tensor_dtype) { + PADDLE_ENFORCE_MLU_SUCCESS( + mluOpSetTensorDescriptorPosition(raw_tensor_desc, position)); +} + +MLUOpTensorDesc::MLUOpTensorDesc(const Tensor& tensor, + mluOpTensorLayout_t layout, + const mluOpDataType_t tensor_dtype, + int position, + float scale) + : MLUOpTensorDesc(tensor, layout, tensor_dtype) { + PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetTensorDescriptorPositionAndScale( + raw_tensor_desc, position, scale)); +} + +MLUOpTensorDesc::~MLUOpTensorDesc() { + if (raw_tensor_desc) { + g_mluop_tensor_desc_pool.Recycle(raw_tensor_desc); + } +} + MLUCnnlActivationDesc::MLUCnnlActivationDesc( const cnnlActivationMode_t act_mode, const float ceof) { PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreateActivationDescriptor(&active_desc_)); @@ -1563,17 +1743,35 @@ MLURNNDesc::~MLURNNDesc() { void* indices_out) { cnnlHandle_t handle = GetHandleFromCTX(ctx); - PADDLE_ENFORCE_MLU_SUCCESS(cnnlTopKTensor(handle, - input_desc, - input, - k, - dim, - largest, - sorted, - values_output_desc, - values_out, - indices_output_desc, - 
indices_out)); + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetTopKTensorWorkspaceSize(handle, + input_desc, + k, + dim, + largest, + values_output_desc, + indices_output_desc, + &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlTopKTensor_v3(handle, + input_desc, + input, + k, + dim, + largest, + sorted, + false /*lower_index_first*/, + workspace_ptr, + workspace_size, + values_output_desc, + values_out, + indices_output_desc, + indices_out)); } /* static */ void MLUCnnl::StridedSlice( @@ -4527,6 +4725,78 @@ MLURNNDesc::~MLURNNDesc() { output)); } +/* static */ void MLUCnnl::SmoothL1LossForward( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t t_desc, + const void* target, + const float beta, + const cnnlSmoothL1LossAlgorithm_t algorithm, + const cnnlTensorDescriptor_t y_desc, + void* y) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSmoothL1LossForwardWorkspaceSize( + handle, x_desc, algorithm, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSmoothL1LossForward_v2(handle, + x_desc, + x, + t_desc, + target, + beta, + algorithm, + workspace_ptr, + workspace_size, + y_desc, + y)); +} + +/* static */ void MLUCnnl::SmoothL1LossBackward( + const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t target_desc, + const void* target, + const cnnlTensorDescriptor_t dy_desc, + const void* dy, + const float beta, + const cnnlSmoothL1LossAlgorithm_t algorithm, + const cnnlTensorDescriptor_t dx_desc, + void* dx) { + cnnlHandle_t handle = GetHandleFromCTX(ctx); + + size_t workspace_size; + PADDLE_ENFORCE_MLU_SUCCESS(cnnlGetSmoothL1LossBackwardWorkspaceSize( + handle, x_desc, algorithm, &workspace_size)); + + auto& dev_ctx = GetDevCtxFromCTX(ctx); + Tensor workspace = ctx.AllocateTmpTensor( + {static_cast(workspace_size)}, dev_ctx); + void* workspace_ptr = workspace.mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE_MLU_SUCCESS(cnnlSmoothL1LossBackward_v2(handle, + x_desc, + x, + target_desc, + target, + dy_desc, + dy, + beta, + algorithm, + workspace_ptr, + workspace_size, + dx_desc, + dx)); +} + /* static */ void MLUCnnl::EmbeddingForward( const ExecutionContext& ctx, const int padding_idx, @@ -5148,5 +5418,94 @@ MLURNNDesc::~MLURNNDesc() { diff_x)); } +/* static */ void MLUOP::OpYoloBox(const ExecutionContext& ctx, + const mluOpTensorDescriptor_t x_desc, + const void* x, + const mluOpTensorDescriptor_t img_size_desc, + const void* img_size, + const mluOpTensorDescriptor_t anchors_desc, + const void* anchors, + const int class_num, + const float conf_thresh, + const int downsample_ratio, + const bool clip_bbox, + const float scale, + const bool iou_aware, + const float iou_aware_factor, + const mluOpTensorDescriptor_t boxes_desc, + void* boxes, + const mluOpTensorDescriptor_t scores_desc, + void* scores) { + mluOpHandle_t handle = GetMLUOpHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(mluOpYoloBox(handle, + x_desc, + x, + img_size_desc, + img_size, + anchors_desc, + anchors, + class_num, + conf_thresh, + 
downsample_ratio, + clip_bbox, + scale, + iou_aware, + iou_aware_factor, + boxes_desc, + boxes, + scores_desc, + scores)); +} + +/* static */ void MLUOP::OpPriorBox( + const ExecutionContext& ctx, + const mluOpTensorDescriptor_t min_sizes_desc, + const void* min_sizes, + const mluOpTensorDescriptor_t aspect_ratios_desc, + const void* aspect_ratios, + const mluOpTensorDescriptor_t variances_desc, + const void* variances, + const mluOpTensorDescriptor_t max_sizes_desc, + const void* max_sizes, + const int height, + const int width, + const int im_height, + const int im_width, + const float step_h, + const float step_w, + const float offset, + const bool clip, + const bool min_max_aspect_ratios_order, + const mluOpTensorDescriptor_t output_desc, + void* output, + const mluOpTensorDescriptor_t var_desc, + void* var) { + mluOpHandle_t handle = GetMLUOpHandleFromCTX(ctx); + + PADDLE_ENFORCE_MLU_SUCCESS(mluOpPriorBox(handle, + min_sizes_desc, + min_sizes, + aspect_ratios_desc, + aspect_ratios, + variances_desc, + variances, + max_sizes_desc, + max_sizes, + height, + width, + im_height, + im_width, + step_h, + step_w, + offset, + clip, + min_max_aspect_ratios_order, + output_desc, + output, + var_desc, + var)); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/mlu/mlu_baseop.h b/paddle/fluid/operators/mlu/mlu_baseop.h index e56331b2728c43..f2c6a792ece513 100644 --- a/paddle/fluid/operators/mlu/mlu_baseop.h +++ b/paddle/fluid/operators/mlu/mlu_baseop.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include +#include #include #include @@ -138,6 +139,54 @@ inline cnnlDataType_t ToCnnlDataType() { return ToCnnlDataType(type); } +inline mluOpDataType_t ToMluOpDataType( + const paddle::experimental::DataType& dtype) { + mluOpDataType_t type = MLUOP_DTYPE_FLOAT; + switch (dtype) { + case DataType::FLOAT16: + type = MLUOP_DTYPE_HALF; + break; + case DataType::FLOAT32: + type = MLUOP_DTYPE_FLOAT; + break; + case DataType::FLOAT64: + type = MLUOP_DTYPE_DOUBLE; + break; + case DataType::INT8: + type = MLUOP_DTYPE_INT8; + break; + case DataType::INT16: + type = MLUOP_DTYPE_INT16; + break; + case DataType::INT32: + type = MLUOP_DTYPE_INT32; + break; + case DataType::INT64: + type = MLUOP_DTYPE_INT64; + break; + case DataType::BOOL: + type = MLUOP_DTYPE_BOOL; + break; + case DataType::UINT8: + type = MLUOP_DTYPE_UINT8; + break; + default: + break; + } + return type; +} + +inline mluOpDataType_t ToMluOpDataType( + const paddle::framework::proto::VarType::Type& type) { + return ToMluOpDataType(framework::TransToPhiDataType(type)); +} + +template +inline mluOpDataType_t ToMluOpDataType() { + auto type = framework::ToDataType(std::type_index(typeid(T))); + return ToMluOpDataType(type); +} + // Converts (via narrowing) a type T value to a type U, and checks that the // value has no value change due to the conversion. 
template @@ -152,6 +201,10 @@ inline static cnnlHandle_t GetHandleFromCTX(const ExecutionContext& ctx) { return ctx.template device_context().cnnl_handle(); } +inline static mluOpHandle_t GetMLUOpHandleFromCTX(const ExecutionContext& ctx) { + return ctx.template device_context().mluOp_handle(); +} + inline static const MLUDeviceContext& GetDevCtxFromCTX( const ExecutionContext& ctx) { return ctx.template device_context(); @@ -281,6 +334,74 @@ class MLUCnnlTensorDesc { cnnlTensorDescriptor_t raw_tensor_desc = nullptr; }; +class MLUOpTensorDesc { + public: + MLUOpTensorDesc() {} + + // SE_DISALLOW_COPY_AND_ASSIGN + MLUOpTensorDesc(const MLUOpTensorDesc& desc) = delete; + MLUOpTensorDesc& operator=(const MLUOpTensorDesc&) = delete; + + MLUOpTensorDesc(MLUOpTensorDesc&& rhs) + : raw_tensor_desc(rhs.raw_tensor_desc) { + rhs.raw_tensor_desc = nullptr; + } + + MLUOpTensorDesc& operator=(MLUOpTensorDesc&& rhs); + + MLUOpTensorDesc(const int tensor_dim, + const int dim_sizes[], + const mluOpDataType_t tensor_dtype); + + MLUOpTensorDesc(const int tensor_dim, + const int dim_sizes[], + const mluOpDataType_t tensor_dtype, + const mluOpTensorLayout_t layout); + + MLUOpTensorDesc(const int tensor_dim, + const int dim_sizes[], + const mluOpDataType_t tensor_dtype, + int position); + + MLUOpTensorDesc(const int tensor_dim, + const int64_t dim_sizes[], + const mluOpDataType_t tensor_dtype); + + MLUOpTensorDesc(const int tensor_dim, + const int64_t dim_sizes[], + const mluOpDataType_t tensor_dtype, + const mluOpTensorLayout_t layout); + + MLUOpTensorDesc(const int tensor_dim, + const int64_t dim_sizes[], + const mluOpDataType_t tensor_dtype, + int position); + + MLUOpTensorDesc(const Tensor& tensor, + const mluOpTensorLayout_t layout, + const mluOpDataType_t tensor_dtype); + + explicit MLUOpTensorDesc(const Tensor& tensor); + + MLUOpTensorDesc(const Tensor& tensor, + mluOpTensorLayout_t layout, + const mluOpDataType_t tensor_dtype, + int position); + + MLUOpTensorDesc(const Tensor& tensor, + mluOpTensorLayout_t layout, + const mluOpDataType_t tensor_dtype, + int position, + float scale); + + ~MLUOpTensorDesc(); + + const mluOpTensorDescriptor_t get() const { return raw_tensor_desc; } + + private: + mluOpTensorDescriptor_t raw_tensor_desc = nullptr; +}; + class MLUCnnlActivationDesc { public: MLUCnnlActivationDesc(const MLUCnnlActivationDesc& desc) = delete; @@ -1921,6 +2042,28 @@ class MLUCnnl { const cnnlTensorDescriptor_t output_desc, void* output); + static void SmoothL1LossForward(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t t_desc, + const void* target, + const float beta, + const cnnlSmoothL1LossAlgorithm_t algorithm, + const cnnlTensorDescriptor_t y_desc, + void* y); + + static void SmoothL1LossBackward(const ExecutionContext& ctx, + const cnnlTensorDescriptor_t x_desc, + const void* x, + const cnnlTensorDescriptor_t target_desc, + const void* target, + const cnnlTensorDescriptor_t dy_desc, + const void* dy, + const float beta, + const cnnlSmoothL1LossAlgorithm_t algorithm, + const cnnlTensorDescriptor_t dx_desc, + void* dx); + static void EmbeddingForward(const ExecutionContext& ctx, const int padding_idx, const cnnlTensorDescriptor_t weight_desc, @@ -2149,6 +2292,50 @@ class MLUCnnl { void* diff_x); }; +class MLUOP { + public: + static void OpYoloBox(const ExecutionContext& ctx, + const mluOpTensorDescriptor_t x_desc, + const void* x, + const mluOpTensorDescriptor_t img_size_desc, + const void* img_size, + const 
mluOpTensorDescriptor_t anchors_desc, + const void* anchors, + const int class_num, + const float conf_thresh, + const int downsample_ratio, + const bool clip_bbox, + const float scale, + const bool iou_aware, + const float iou_aware_factor, + const mluOpTensorDescriptor_t boxes_desc, + void* boxes, + const mluOpTensorDescriptor_t scores_desc, + void* scores); + + static void OpPriorBox(const ExecutionContext& ctx, + const mluOpTensorDescriptor_t min_sizes_desc, + const void* min_sizes, + const mluOpTensorDescriptor_t aspect_ratios_desc, + const void* aspect_ratios, + const mluOpTensorDescriptor_t variances_desc, + const void* variances, + const mluOpTensorDescriptor_t max_sizes_desc, + const void* max_sizes, + const int height, + const int width, + const int im_height, + const int im_width, + const float step_h, + const float step_w, + const float offset, + const bool clip, + const bool min_max_aspect_ratios_order, + const mluOpTensorDescriptor_t output_desc, + void* output, + const mluOpTensorDescriptor_t var_desc, + void* var); +}; const std::map, std::vector>> TransPermMap = { // trans_mode, (forward_perm, backward_perm) diff --git a/paddle/fluid/operators/mul_op.cc b/paddle/fluid/operators/mul_op.cc index f26b4b948817bb..2d4ca62955eb14 100644 --- a/paddle/fluid/operators/mul_op.cc +++ b/paddle/fluid/operators/mul_op.cc @@ -79,10 +79,6 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "(Tensor), The first input tensor of mul op."); AddInput("Y", "(Tensor), The second input tensor of mul op."); AddOutput("Out", "(Tensor), The output tensor of mul op."); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); AddAttr( "x_num_col_dims", R"DOC((int, default 1), The mul_op can take tensors with more than two @@ -113,31 +109,6 @@ class MulOpMaker : public framework::OpProtoAndCheckerMaker { )DOC") .SetDefault(1) .EqualGreaterThan(1); - AddAttr( - "scale_x", - "scale_x to be used for int8 mul input data x. scale_x has the" - "same purpose as scale_in in OPs that support quantization." - "Only to be used with MKL-DNN INT8") - .SetDefault(1.0f) - .AsExtra(); - AddAttr>( - "scale_y", - "scale_y to be used for int8 mul input data y. scale_y has the" - "same purpose as scale_weights in OPs that support quantization." - "Only to be used with MKL-DNN INT8") - .SetDefault({1.0f}) - .AsExtra(); - AddAttr("scale_out", - "scale_out to be used for int8 output data." - "Only used with MKL-DNN INT8") - .SetDefault(1.0f) - .AsExtra(); - AddAttr( - "force_fp32_output", - "(bool, default false) Force quantize kernel output FP32, only " - "used in quantized MKL-DNN.") - .SetDefault(false) - .AsExtra(); AddComment(R"DOC( Mul Operator. 
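Reviewer note on the oneDNN transpose rewrite earlier in this patch: the new kernel leans on two small helpers, TransposeToPermuteAxis (which inverts Paddle's transpose "axis" attribute into the axis order handed to oneDNN's permute_axes) and FakeTranposeStrides (which builds destination strides so the reorder leaves the output in plain layout). Below is a minimal standalone sketch of both computations, assuming plain std::vector inputs instead of dnnl::memory::desc and using a corrected FakeTransposeStrides spelling; it is an illustration only, not code taken from the patch.

// Standalone illustration of the two helpers used by the oneDNN transpose
// kernel in this patch; no Paddle or oneDNN headers are required here.
#include <cstdint>
#include <iostream>
#include <vector>

// Paddle's transpose attribute says, for each output axis i, which input axis
// it reads from; the kernel passes the inverse of that mapping to oneDNN's
// permute_axes.
std::vector<int> TransposeToPermuteAxis(const std::vector<int>& transpose_axis) {
  std::vector<int> permute_axis(transpose_axis.size());
  for (size_t i = 0; i < transpose_axis.size(); ++i) {
    permute_axis[transpose_axis[i]] = static_cast<int>(i);
  }
  return permute_axis;
}

// Builds strides over the input dims so that, after the reorder, the
// destination buffer holds the transposed tensor in plain row-major layout
// (the "fake transpose" trick described in the kernel comment).
std::vector<int64_t> FakeTransposeStrides(const std::vector<int64_t>& dims,
                                          const std::vector<int>& transpose_axis) {
  std::vector<int64_t> fake_strides(transpose_axis.size());
  int64_t total_stride = 1;
  for (int i = static_cast<int>(dims.size()) - 1; i >= 0; --i) {
    fake_strides[transpose_axis[i]] = total_stride;
    total_stride *= dims[transpose_axis[i]];
  }
  return fake_strides;
}

int main() {
  const std::vector<int64_t> dims = {2, 3, 4, 5};  // sample NCHW shape
  const std::vector<int> axis = {0, 2, 3, 1};      // NCHW -> NHWC transpose

  const auto permute = TransposeToPermuteAxis(axis);
  const auto strides = FakeTransposeStrides(dims, axis);

  std::cout << "permute_axes:";
  for (int a : permute) std::cout << ' ' << a;      // prints: 0 3 1 2
  std::cout << "\nfake strides:";
  for (int64_t s : strides) std::cout << ' ' << s;  // prints: 60 1 15 3
  std::cout << '\n';
  return 0;
}

For this sample the sketch prints permute_axes 0 3 1 2 and strides 60 1 15 3, which matches what the patch's helpers compute for the same dims and axis.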
diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index c9c4d1a4c74f33..dd093729d1913b 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -207,34 +207,6 @@ class NCEOpMaker : public framework::OpProtoAndCheckerMaker { // for parameter prefetch AddAttr("remote_prefetch", "").SetDefault(false); - AddAttr("trainer_id", "trainer id from 0 ~ worker_num.") - .SetDefault(0) - .AsExtra(); - AddAttr>("height_sections", - "Height for each output SelectedRows.") - .SetDefault(std::vector({})) - .AsExtra(); - AddAttr>( - "epmap", - "(string vector, default 127.0.0.1:6164)" - "Server endpoints in the order of input variables for mapping") - .SetDefault({}) - .AsExtra(); - AddAttr>( - "table_names", - "(string vector, the split table names that will be fetched from " - "parameter server)" - "in the order of input variables for mapping") - .SetDefault({}) - .AsExtra(); - - AddAttr>("custom_neg_classes", - "This attribute only be used in unitest. Classes " - "in this list wiil be used as negative classes " - "for every samples. Under normal conditions, " - "user should avoid setting this attribute.") - .SetDefault({}) - .AsExtra(); AddAttr("is_test", "(bool, default false) Set to true for inference " "only, false for training.") diff --git a/paddle/fluid/operators/one_hot_v2_op_mlu.cc b/paddle/fluid/operators/one_hot_v2_op_mlu.cc index f574cc525f142b..21e8975e37dcb3 100644 --- a/paddle/fluid/operators/one_hot_v2_op_mlu.cc +++ b/paddle/fluid/operators/one_hot_v2_op_mlu.cc @@ -97,4 +97,6 @@ class OneHotV2MLUKernel : public framework::OpKernel { namespace ops = paddle::operators; namespace plat = paddle::platform; -REGISTER_OP_MLU_KERNEL(one_hot_v2, ops::OneHotV2MLUKernel); +REGISTER_OP_MLU_KERNEL(one_hot_v2, + ops::OneHotV2MLUKernel, + ops::OneHotV2MLUKernel); diff --git a/paddle/fluid/operators/op_debug_string_test.cc b/paddle/fluid/operators/op_debug_string_test.cc index 372a71706ab5ec..fd8e027092410d 100644 --- a/paddle/fluid/operators/op_debug_string_test.cc +++ b/paddle/fluid/operators/op_debug_string_test.cc @@ -41,8 +41,6 @@ TEST(op_debug_str, test_unknown_dtype) { desc.SetOutput(framework::GradVarName("Y"), {framework::GradVarName("Y")}); desc.SetAttr("axis", -1); desc.SetAttr("use_mkldnn", false); - desc.SetAttr("x_data_format", ""); - desc.SetAttr("y_data_format", ""); auto x_tensor = scope.Var("X")->GetMutable(); x_tensor->Resize(dim); diff --git a/paddle/fluid/operators/optimizers/adam_op_mlu.cc b/paddle/fluid/operators/optimizers/adam_op_mlu.cc index ecc527d5c72bf0..aff468cc3c8808 100644 --- a/paddle/fluid/operators/optimizers/adam_op_mlu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_mlu.cc @@ -291,11 +291,38 @@ class AdamWMLUKernel : public AdamMLUKernel { skip_update = skip_update_vec[0]; } bool with_decay = ctx.Attr("with_decay"); + const bool multi_precision = ctx.Attr("multi_precision"); + auto* param_out = ctx.Output("ParamOut"); + auto* master_param_out = ctx.Output("MasterParamOut"); + const auto* master_param = ctx.Input("MasterParam"); + VLOG(3) << "Skip update: " << skip_update << ", With decay: " << with_decay; if (!skip_update && with_decay) { - if (ctx.HasInput("MasterParam")) { - PADDLE_THROW(platform::errors::Unimplemented( - "Master Param is not supported on MLU")); + auto* param = ctx.Input("Param"); + MLUCnnlTensorDesc param_desc(*param); + if (multi_precision) { + VLOG(3) << "[adamw] multi_precision, cast masterparam to param."; + bool has_master = + ctx.HasInput("MasterParam") && 
ctx.HasOutput("MasterParamOut"); + PADDLE_ENFORCE_EQ( + has_master, + true, + platform::errors::InvalidArgument( + "The Input(MasterParam) and Output(MasterParamOut) " + "should not be null when " + "the attr `multi_precision` is true")); + // cast masterparam (fp32) to param (fp16), then paramout (fp16) to + // masterparamout (fp32) + MLUCnnlTensorDesc master_param_desc(*master_param); + cnnlCastDataType_t cast_type = GetCastDataType( + framework::TransToProtoVarType(master_param->dtype()), + framework::TransToProtoVarType(param->dtype())); + MLUCnnl::Cast(ctx, + cast_type, + master_param_desc.get(), + GetBasePtr(master_param), + param_desc.get(), + const_cast(GetBasePtr(param))); } else { const auto* param_var = ctx.InputVar("Param"); PADDLE_ENFORCE_EQ(param_var->IsType(), @@ -305,13 +332,12 @@ class AdamWMLUKernel : public AdamMLUKernel { "but the received is %s", ctx.InputNames("Param").front(), framework::ToTypeName(param_var->Type()))); - auto* param = ctx.Input("Param"); + auto* lr = ctx.Input("LearningRate"); float coeff = ctx.Attr("coeff"); // update param with decay coeff: mul(-1 * lr, coeff * param) + param MLUCnnlTensorDesc lr_desc(*lr); - MLUCnnlTensorDesc param_desc(*param); MLUCnnlOpTensorDesc mul_op_desc( CNNL_OP_TENSOR_MUL, ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN); @@ -330,9 +356,244 @@ class AdamWMLUKernel : public AdamMLUKernel { } } AdamMLUKernel::Compute(ctx); + if (multi_precision) { + VLOG(3) << "[adamw] multi_precision, cast paramout to masterparamout."; + // cast paramout to masterparamout + master_param_out->mutable_data(ctx.GetPlace()); + cnnlCastDataType_t cast_type = GetCastDataType( + framework::TransToProtoVarType(param_out->dtype()), + framework::TransToProtoVarType(master_param_out->dtype())); + MLUCnnlTensorDesc param_out_desc(*param_out); + MLUCnnlTensorDesc master_param_out_desc(*master_param_out); + + MLUCnnl::Cast(ctx, + cast_type, + param_out_desc.get(), + GetBasePtr(param_out), + master_param_out_desc.get(), + GetBasePtr(master_param_out)); + } } }; +template +class MergedAdamMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + // Get inputs and outputs + auto params = ctx.MultiInput("Param"); + auto grads = ctx.MultiInput("Grad"); + auto lrs = ctx.MultiInput("LearningRate"); + auto mom1s = ctx.MultiInput("Moment1"); + auto mom2s = ctx.MultiInput("Moment2"); + auto beta1_pows = ctx.MultiInput("Beta1Pow"); + auto beta2_pows = ctx.MultiInput("Beta2Pow"); + auto master_params = ctx.MultiInput("MasterParam"); + auto param_outs = ctx.MultiOutput("ParamOut"); + auto mom1_outs = ctx.MultiOutput("Moment1Out"); + auto mom2_outs = ctx.MultiOutput("Moment2Out"); + auto beta1_pow_outs = ctx.MultiOutput("Beta1PowOut"); + auto beta2_pow_outs = ctx.MultiOutput("Beta2PowOut"); + + // Check validation of inputs and outputs + size_t param_num = params.size(); + PADDLE_ENFORCE_EQ(param_num, + param_outs.size(), + platform::errors::InvalidArgument( + "The size of Output(ParamOut) must be equal to " + "Input(Param), but got the size of Output(ParamOut) " + "is %d, the size of Input(Param) is %d.", + param_outs.size(), + param_num)); + + bool skip_update = false; + if (ctx.HasInput("SkipUpdate")) { + auto* skip_update_tensor = ctx.Input("SkipUpdate"); + PADDLE_ENFORCE_EQ(skip_update_tensor->numel(), + 1, + platform::errors::InvalidArgument( + "Input(SkipUpdate) size must be 1, but get %d", + skip_update_tensor->numel())); + std::vector skip_update_vec; + paddle::framework::TensorToVector( + 
*skip_update_tensor, ctx.device_context(), &skip_update_vec); + ctx.device_context().Wait(); + skip_update = skip_update_vec[0]; + } + // skip_update=true, just copy input to output, and TensorCopy will call + // mutable_data + + if (skip_update) { + VLOG(4) << "MergedAdam skip update"; + for (size_t i = 0; i < param_num; ++i) { + framework::TensorCopy( + *params[i], + ctx.GetPlace(), + ctx.template device_context(), + param_outs[i]); + framework::TensorCopy( + *mom1s[i], + ctx.GetPlace(), + ctx.template device_context(), + mom1_outs[i]); + framework::TensorCopy( + *mom2s[i], + ctx.GetPlace(), + ctx.template device_context(), + mom2_outs[i]); + framework::TensorCopy( + *beta1_pows[i], + beta1_pows[i]->place(), + ctx.template device_context(), + beta1_pow_outs[i]); + framework::TensorCopy( + *beta2_pows[i], + beta2_pows[i]->place(), + ctx.template device_context(), + beta2_pow_outs[i]); + } + return; + } + + bool use_global_beta_pow = ctx.Attr("use_global_beta_pow"); + VLOG(4) << "use_global_beta_pow:" << use_global_beta_pow; + + // Get beta1, beta2 and epsilon from attribute. + const Tensor* beta1_tensor = nullptr; + const Tensor* beta2_tensor = nullptr; + const Tensor* epsilon_tensor = nullptr; + + Tensor beta1_tmp(experimental::DataType::FLOAT32); + Tensor beta2_tmp(experimental::DataType::FLOAT32); + Tensor epsilon_tmp(experimental::DataType::FLOAT32); + + T beta1 = static_cast(ctx.Attr("beta1")); + T beta2 = static_cast(ctx.Attr("beta2")); + T epsilon = static_cast(ctx.Attr("epsilon")); + beta1_tmp.mutable_data({1}, ctx.GetPlace()); + beta2_tmp.mutable_data({1}, ctx.GetPlace()); + epsilon_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta1_tmp_desc(beta1_tmp); + MLUCnnlTensorDesc beta2_tmp_desc(beta2_tmp); + MLUCnnlTensorDesc epsilon_tmp_desc(epsilon_tmp); + MLUCnnl::Fill(ctx, + CNNL_POINTER_MODE_HOST, + &beta1, + beta1_tmp_desc.get(), + GetBasePtr(&beta1_tmp)); + MLUCnnl::Fill(ctx, + CNNL_POINTER_MODE_HOST, + &beta2, + beta2_tmp_desc.get(), + GetBasePtr(&beta2_tmp)); + MLUCnnl::Fill(ctx, + CNNL_POINTER_MODE_HOST, + &epsilon, + epsilon_tmp_desc.get(), + GetBasePtr(&epsilon_tmp)); + beta1_tensor = &beta1_tmp; + beta2_tensor = &beta2_tmp; + epsilon_tensor = &epsilon_tmp; + + // Loop to compute + for (size_t i = 0; i < param_num; ++i) { + VLOG(4) << "[MergedAdam] loop: " << i; + param_outs[i]->ShareDataWith(*params[i]); + mom1_outs[i]->ShareDataWith(*mom1s[i]); + mom2_outs[i]->ShareDataWith(*mom2s[i]); + + LoDTensor beta1_pow_tmp; + LoDTensor beta2_pow_tmp; + if (beta1_pows[i]->place() == platform::CPUPlace()) { + T beta1 = *beta1_pows[i]->data(); + beta1_pow_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta1_pow_tmp_desc(beta1_pow_tmp); + MLUCnnl::Fill(ctx, + CNNL_POINTER_MODE_HOST, + &beta1, + beta1_pow_tmp_desc.get(), + GetBasePtr(&beta1_pow_tmp)); + beta1_pows[i] = &beta1_pow_tmp; + } + if (beta2_pows[i]->place() == platform::CPUPlace()) { + T beta2 = *beta2_pows[i]->data(); + beta2_pow_tmp.mutable_data({1}, ctx.GetPlace()); + MLUCnnlTensorDesc beta2_pow_tmp_desc(beta2_pow_tmp); + MLUCnnl::Fill(ctx, + CNNL_POINTER_MODE_HOST, + &beta2, + beta2_pow_tmp_desc.get(), + GetBasePtr(&beta2_pow_tmp)); + beta2_pows[i] = &beta2_pow_tmp; + } + + VLOG(3) << "beta1_pow.numel() : " << beta1_pows[i]->numel() + << "beta2_pow.numel() : " << beta2_pows[i]->numel(); + VLOG(3) << "param.numel(): " << params[i]->numel(); + PADDLE_ENFORCE_EQ(beta1_pow_outs[i]->numel(), + 1, + platform::errors::InvalidArgument( + "beta1 pow output size should be 1, but received " + "value is:%d.", 
+ beta1_pow_outs[i]->numel())); + + PADDLE_ENFORCE_EQ(beta2_pow_outs[i]->numel(), + 1, + platform::errors::InvalidArgument( + "beta2 pow output size should be 1, but received " + "value is:%d.", + beta2_pow_outs[i]->numel())); + MLUCnnlTensorDesc param_desc(*params[i]); + MLUCnnlTensorDesc mom1_desc(*mom1s[i]); + MLUCnnlTensorDesc mom2_desc(*mom2s[i]); + MLUCnnlTensorDesc grad_desc(*grads[i]); + MLUCnnl::ApplyAdam(ctx, + param_desc.get(), + GetBasePtr(param_outs[i]), + mom1_desc.get(), + GetBasePtr(mom1_outs[i]), + mom2_desc.get(), + GetBasePtr(mom2_outs[i]), + grad_desc.get(), + GetBasePtr(grads[i]), + GetBasePtr(lrs[i]), + GetBasePtr(beta1_tensor), + GetBasePtr(beta2_tensor), + GetBasePtr(beta1_pows[i]), + GetBasePtr(beta2_pows[i]), + GetBasePtr(epsilon_tensor), + /*use_nesterov*/ false); + if (!use_global_beta_pow) { + beta1_pow_outs[i]->mutable_data(ctx.GetPlace()); + beta2_pow_outs[i]->mutable_data(ctx.GetPlace()); + + MLUCnnlTensorDesc beta1_desc(*beta1_tensor); + MLUCnnlOpTensorDesc mul_op_desc( + CNNL_OP_TENSOR_MUL, ToCnnlDataType(), CNNL_NOT_PROPAGATE_NAN); + + MLUCnnl::OpTensor(ctx, + mul_op_desc.get(), + beta1_desc.get(), + GetBasePtr(beta1_pows[i]), + beta1_desc.get(), + GetBasePtr(beta1_tensor), + beta1_desc.get(), + GetBasePtr(beta1_pow_outs[i]), + ToCnnlDataType()); + + MLUCnnl::OpTensor(ctx, + mul_op_desc.get(), + beta1_desc.get(), + GetBasePtr(beta2_pows[i]), + beta1_desc.get(), + GetBasePtr(beta2_tensor), + beta1_desc.get(), + GetBasePtr(beta2_pow_outs[i]), + ToCnnlDataType()); + } + } + } +}; } // namespace operators } // namespace paddle @@ -346,3 +607,7 @@ REGISTER_OP_MLU_KERNEL(adam, REGISTER_OP_MLU_KERNEL(adamw, ops::AdamWMLUKernel, ops::AdamWMLUKernel); + +REGISTER_OP_MLU_KERNEL(merged_adam, + ops::MergedAdamMLUKernel, + ops::MergedAdamMLUKernel); diff --git a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc b/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc deleted file mode 100644 index e332972f7576ab..00000000000000 --- a/paddle/fluid/operators/optimizers/mkldnn/sgd_mkldnn_op.cc +++ /dev/null @@ -1,90 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include - -#include "paddle/fluid/operators/mkldnn/axpy_handler.h" -#include "paddle/fluid/operators/optimizers/sgd_op.h" - -namespace pplat = paddle::platform; - -namespace paddle { -namespace operators { - -template -class SGDOneDNNKernel : public SGDOpKernel { - protected: - void dense_param_and_grad_kernel( - const framework::ExecutionContext &ctx) const override { - VLOG(4) << "[ONEDNN]: sgd_dense_param_kernel"; - const auto *learning_rate = ctx.Input("LearningRate"); - const auto *param = ctx.Input("Param"); - auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); - - auto *out_data = param_out->mutable_data(ctx.GetPlace()); - const T *param_data = param->data(); - const auto *grad_data = grad->data(); - const auto *lr = learning_rate->data(); - // Since denese SGD is not in place operation, first copy params to output - // tensor and then update it. - std::memcpy(out_data, param_data, param->memory_size()); - OneDNNAXPYHandler(param_out->numel(), -lr[0])(grad_data, out_data); - } - - void dense_param_sparse_grad_kernel( - const framework::ExecutionContext &ctx) const override { - VLOG(4) << "[ONEDNN]: sgd_dense_param_kernel"; - const auto *learning_rate = ctx.Input("LearningRate"); - auto *param_out = ctx.Output("ParamOut"); - const auto *grad = ctx.Input("Grad"); - - const auto &grad_value = grad->value(); - const auto &grad_rows = grad->rows(); - const auto grad_height = grad->height(); - const int64_t grad_val_height = static_cast(grad_rows.size()); - const auto grad_width = grad_value.numel() / grad_val_height; - - const auto *grad_data = grad_value.data(); - auto *out_data = param_out->data(); - const auto *lr = learning_rate->data(); - - OneDNNAXPYHandler axpy_handler(grad_width, -lr[0]); - - for (size_t i = 0; i < grad_rows.size(); ++i) { - PADDLE_ENFORCE_LT( - grad_rows[i], - grad_height, - pplat::errors::OutOfRange( - "Grad rows index value should be less than grad height." - "Got [%s], but expected less than [%s]", - grad_rows[i], - grad_height)); - const int64_t row = grad_rows[i]; - const auto *src = grad_data + i * grad_width; - auto *dst = out_data + row * grad_width; - axpy_handler(src, dst); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_KERNEL(sgd, - MKLDNN, - pplat::CPUPlace, - ops::SGDOneDNNKernel, - ops::SGDOneDNNKernel); diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index 301c21b2fcdcf3..02d2eab181bcf8 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -54,7 +54,7 @@ class Pad3dOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const Tensor& tensor, - const framework::OpKernelType& expected_kernel_type) const { + const framework::OpKernelType& expected_kernel_type) const override { #ifdef PADDLE_WITH_MKLDNN if ((expected_kernel_type.data_layout_ == framework::DataLayout::kMKLDNN) && (tensor.layout() != framework::DataLayout::kMKLDNN)) { diff --git a/paddle/fluid/operators/partial_concat_op.h b/paddle/fluid/operators/partial_concat_op.h index affe06f20956ad..927ffbede6e6c6 100644 --- a/paddle/fluid/operators/partial_concat_op.h +++ b/paddle/fluid/operators/partial_concat_op.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include +#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" #include "paddle/fluid/operators/strided_memcpy.h" diff --git a/paddle/fluid/operators/partial_sum_op.h b/paddle/fluid/operators/partial_sum_op.h index 58ac0671dde10c..a5956303192201 100644 --- a/paddle/fluid/operators/partial_sum_op.h +++ b/paddle/fluid/operators/partial_sum_op.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" namespace paddle { diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index c5b1ce12f17fee..e8b35b89157a31 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -186,34 +186,12 @@ void Pool2dOpMaker::Make() { "pooling in each grid area to get output pooling value. " "Default False.") .SetDefault(false); - - AddAttr( - "use_cudnn", - "(bool) Only used in cudnn kernel, need install cudnn. Default False") - .SetDefault(false) - .AsExtra(); AddAttr( "ceil_mode", "(bool) Whether to use the ceil function to calculate " "output height and width. False is the default. If it is set to False, " "the floor function will be used. Default False") .SetDefault(false); - AddAttr("use_mkldnn", - "(bool) Only used in mkldnn kernel. Default False") - .SetDefault(false) - .AsExtra(); - AddAttr( - "use_quantizer", - "(bool, default false) " - "This parameter is no longer used. Use 'mkldnn_data_type' instead.") - .SetDefault(false) - .AsExtra(); - AddAttr( - "mkldnn_data_type", - "(string, default \"float32\"). Data type of mkldnn kernel") - .SetDefault("float32") - .InEnum({"float32", "int8", "bfloat16"}) - .AsExtra(); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -221,12 +199,6 @@ void Pool2dOpMaker::Make() { "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. ") .SetDefault("NCHW"); - AddAttr("is_test", - "(bool, default false) Set to true for inference only, false " - "for training. Some layers may run faster when this is true.") - .SetDefault(false) - .AsExtra(); - AddAttr( "padding_algorithm", "(string, default \"EXPLICIT\") An optional string from: \"EXPLICIT\"," @@ -234,7 +206,11 @@ void Pool2dOpMaker::Make() { "Set to \"SAME\" or \"VALID\" for algorithm of padding. ") .SetDefault("EXPLICIT"); // TODO(dzhwinter): need to registered layout transform function - + AddAttr( + "use_cudnn", + "(bool) Only used in cudnn kernel, need install cudnn. Default False") + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( This operation calculates the pooling output based on the input, pooling_type and pool_size, pool_stride, pool_padding parameters. @@ -407,22 +383,12 @@ void Pool3dOpMaker::Make() { "pooling in each grid area to get output pooling value. " "Default False") .SetDefault(false); - - AddAttr( - "use_cudnn", - "(bool) Only used in cudnn kernel, need install cudnn. Default False") - .SetDefault(false) - .AsExtra(); AddAttr( "ceil_mode", "(bool) Whether to use the ceil function to calculate " "output height and width. False is the default. If it is set to False, " "the floor function will be used. Default False") .SetDefault(false); - AddAttr("use_mkldnn", - "(bool) Only used in mkldnn kernel. 
Default False") - .SetDefault(false) - .AsExtra(); AddAttr( "data_format", "(string, default NCDHW) Only used in " @@ -436,8 +402,11 @@ void Pool3dOpMaker::Make() { "\"SAME\",\"VALID\". Set to \"EXPLICIT\" for explicit padding. " "Set to \"SAME\" or \"VALID\" for algorithm of padding. ") .SetDefault("EXPLICIT"); - // TODO(dzhwinter): need to registered layout transform function - + AddAttr( + "use_cudnn", + "(bool) Only used in cudnn kernel, need install cudnn. Default False") + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( This operation calculates the output based on the input, pooling_type, pool_size, pool_stride, and pool_padding parameters. diff --git a/paddle/fluid/operators/pool_op_mlu.cc b/paddle/fluid/operators/pool_op_mlu.cc index 5eaf8dbff88ab6..988eb182a16f04 100644 --- a/paddle/fluid/operators/pool_op_mlu.cc +++ b/paddle/fluid/operators/pool_op_mlu.cc @@ -141,10 +141,9 @@ class MLUPoolOpKernel : public framework::OpKernel { handle, pool_mode, out_w, out_h, &extra_input_size); if (extra_input_size > 0) { - phi::CPUContext cpu_ctx; - framework::Tensor extra_host_tensor = - ctx.AllocateTmpTensor( - {static_cast(extra_input_size)}, cpu_ctx); + framework::Tensor extra_host_tensor; + extra_host_tensor.mutable_data( + {static_cast(extra_input_size)}, platform::CPUPlace()); cnnlInitPoolingExtraInput(handle, pool_desc.get(), trans_in_x_desc.get(), diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index df58a2abe87b53..f7abaf648ebcfd 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -71,7 +71,7 @@ class PReluOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const Tensor &tensor, - const framework::OpKernelType &expected_kernel_type) const { + const framework::OpKernelType &expected_kernel_type) const override { return innerGetKernelTypeForVar(tensor, expected_kernel_type); } }; @@ -150,7 +150,7 @@ class PReluGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string &var_name, const Tensor &tensor, - const framework::OpKernelType &expected_kernel_type) const { + const framework::OpKernelType &expected_kernel_type) const override { return innerGetKernelTypeForVar(tensor, expected_kernel_type); } }; diff --git a/paddle/fluid/operators/prim_ops/CMakeLists.txt b/paddle/fluid/operators/prim_ops/CMakeLists.txt index 1f63d5d1721b5c..30e162a4dd2a96 100644 --- a/paddle/fluid/operators/prim_ops/CMakeLists.txt +++ b/paddle/fluid/operators/prim_ops/CMakeLists.txt @@ -8,7 +8,7 @@ register_operators() set(PRIM_OP_SRCS reshape_p_op.cc broadcast_p_op.cc - reduce_p_op.cc + reduce_sum_p_op.cc transpose_p_op.cc split_p_op.cc concat_p_op.cc @@ -30,9 +30,14 @@ set(PRIM_OP_SRCS log_p_op.cc select_p_op.cc eq_p_op.cc + gt_p_op.cc + ge_p_op.cc + ne_p_op.cc pow_p_op.cc max_p_op.cc - erf_p_op.cc) + erf_p_op.cc + abs_p_op.cc + cast_p_op.cc) cc_test( prim_op_test diff --git a/paddle/fluid/operators/prim_ops/abs_p_op.cc b/paddle/fluid/operators/prim_ops/abs_p_op.cc new file mode 100644 index 00000000000000..8ad9d131689e70 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/abs_p_op.cc @@ -0,0 +1,71 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class AbsPrimOp : public framework::OperatorBase { + public: + AbsPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator abs_p should not be excuted directly")); + } +}; + +class AbsPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of abs_p op."); + AddOutput("Y", "(Tensor), The output tensor of abs_p op."); + AddComment(R"DOC(Autograd primitive abs_p operator.)DOC"); + } +}; + +class AbsPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); + PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); + } +}; + +class AbsPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Output(ctx, "Y")[0]; + SetType(ctx, y_name, GetType(ctx, x_name)); + SetDataType(ctx, y_name, GetDataType(ctx, x_name)); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(abs_p, + paddle::operators::AbsPrimOp, + paddle::operators::AbsPrimOpMaker, + paddle::operators::AbsPrimOpShapeInference, + paddle::operators::AbsPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/cast_p_op.cc b/paddle/fluid/operators/prim_ops/cast_p_op.cc new file mode 100644 index 00000000000000..5c8b9ab45c6bca --- /dev/null +++ b/paddle/fluid/operators/prim_ops/cast_p_op.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace framework { +class InferShapeContext; +class VarDesc; +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace operators { +class CastPrimOp : public framework::OperatorBase { + public: + CastPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator cast_p should not be excuted directly")); + } +}; + +class CastPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of cast_p op."); + AddOutput("Y", "(Tensor), The output tensor of cast_p op."); + AddAttr("dtype", "output data type"); + AddComment(R"DOC(Autograd primitive cast_p operator.)DOC"); + } +}; + +class CastPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetOutputVarPtrs("Y")[0]; + framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); + PADDLE_GET(framework::VarDesc *, y_var_ptr)->SetShape(x_var->GetShape()); + } +}; + +class CastPrimOpVarTypeInference + : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto out_type = static_cast( + PADDLE_GET_CONST(int, ctx->GetAttr("dtype"))); + ctx->SetOutputDataType("Y", out_type); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(cast_p, + paddle::operators::CastPrimOp, + paddle::operators::CastPrimOpMaker, + paddle::operators::CastPrimOpShapeInference, + paddle::operators::CastPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/ge_p_op.cc b/paddle/fluid/operators/prim_ops/ge_p_op.cc new file mode 100644 index 00000000000000..33fbd4cd71497f --- /dev/null +++ b/paddle/fluid/operators/prim_ops/ge_p_op.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class GePrimOp : public framework::OperatorBase { + public: + GePrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator ge_p should not be excuted directly")); + } +}; + +class GePrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of ge_p op."); + AddInput("Y", "(Tensor), The input tensor of ge_p op."); + AddOutput("Z", "(Tensor), The output tensor of ge_p op."); + AddComment(R"DOC( +Autograd primitive ge_p operator. +)DOC"); + } +}; + +class GePrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + + framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, + y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, + y_rank)); + for (size_t i = 0; i < x_rank; ++i) { + PADDLE_ENFORCE_EQ( + x_shape[i], + y_shape[i], + platform::errors::InvalidArgument( + "The shape of two input tensor at dimension %d should be same, " + "but get %d and %d", + i, + x_shape[i], + y_shape[i])); + } + + PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); + } +}; + +class GePrimOpVarTypeInference : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, + y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, + y_type)); + PADDLE_ENFORCE_EQ(x_dtype, + y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, + y_dtype)); + + SetType(ctx, z_name, x_type); + SetDataType(ctx, z_name, framework::proto::VarType::BOOL); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(ge_p, + paddle::operators::GePrimOp, + paddle::operators::GePrimOpMaker, + paddle::operators::GePrimOpShapeInference, + paddle::operators::GePrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/gt_p_op.cc b/paddle/fluid/operators/prim_ops/gt_p_op.cc new file mode 100644 index 00000000000000..baacab62d8c3eb --- /dev/null +++ b/paddle/fluid/operators/prim_ops/gt_p_op.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2022 
PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class GtPrimOp : public framework::OperatorBase { + public: + GtPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator gt_p should not be excuted directly")); + } +}; + +class GtPrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of gt_p op."); + AddInput("Y", "(Tensor), The input tensor of gt_p op."); + AddOutput("Z", "(Tensor), The output tensor of gt_p op."); + AddComment(R"DOC( +Autograd primitive gt_p operator. +)DOC"); + } +}; + +class GtPrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + + framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, + y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, + y_rank)); + for (size_t i = 0; i < x_rank; ++i) { + PADDLE_ENFORCE_EQ( + x_shape[i], + y_shape[i], + platform::errors::InvalidArgument( + "The shape of two input tensor at dimension %d should be same, " + "but get %d and %d", + i, + x_shape[i], + y_shape[i])); + } + + PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); + } +}; + +class GtPrimOpVarTypeInference : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, + y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, + y_type)); + PADDLE_ENFORCE_EQ(x_dtype, + y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, + y_dtype)); 
+ + SetType(ctx, z_name, x_type); + SetDataType(ctx, z_name, framework::proto::VarType::BOOL); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(gt_p, + paddle::operators::GtPrimOp, + paddle::operators::GtPrimOpMaker, + paddle::operators::GtPrimOpShapeInference, + paddle::operators::GtPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/ne_p_op.cc b/paddle/fluid/operators/prim_ops/ne_p_op.cc new file mode 100644 index 00000000000000..fac503309de1b7 --- /dev/null +++ b/paddle/fluid/operators/prim_ops/ne_p_op.cc @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { +class NePrimOp : public framework::OperatorBase { + public: + NePrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : framework::OperatorBase(type, inputs, outputs, attrs) {} + void RunImpl(const framework::Scope &scope, + const platform::Place &dev_place) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "Prim operator ne_p should not be excuted directly")); + } +}; + +class NePrimOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor), The input tensor of ne_p op."); + AddInput("Y", "(Tensor), The input tensor of ne_p op."); + AddOutput("Z", "(Tensor), The output tensor of ne_p op."); + AddComment(R"DOC( +Autograd primitive ne_p operator. 
+)DOC"); + } +}; + +class NePrimOpShapeInference : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *ctx) const override { + framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; + framework::InferShapeVarPtr y_var_ptr = ctx->GetInputVarPtrs("Y")[0]; + framework::InferShapeVarPtr z_var_ptr = ctx->GetOutputVarPtrs("Z")[0]; + + framework::VarDesc *x_var = PADDLE_GET(framework::VarDesc *, x_var_ptr); + framework::VarDesc *y_var = PADDLE_GET(framework::VarDesc *, y_var_ptr); + auto x_shape = x_var->GetShape(); + auto y_shape = y_var->GetShape(); + size_t x_rank = x_shape.size(); + size_t y_rank = y_shape.size(); + PADDLE_ENFORCE_EQ(x_rank, + y_rank, + platform::errors::InvalidArgument( + "The dimensions of two input tensor should be same, " + "but get %d and %d", + x_rank, + y_rank)); + for (size_t i = 0; i < x_rank; ++i) { + PADDLE_ENFORCE_EQ( + x_shape[i], + y_shape[i], + platform::errors::InvalidArgument( + "The shape of two input tensor at dimension %d should be same, " + "but get %d and %d", + i, + x_shape[i], + y_shape[i])); + } + + PADDLE_GET(framework::VarDesc *, z_var_ptr)->SetShape(x_shape); + } +}; + +class NePrimOpVarTypeInference : public framework::StaticGraphVarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_name = Input(ctx, "X")[0]; + auto y_name = Input(ctx, "Y")[0]; + auto z_name = Output(ctx, "Z")[0]; + auto x_type = GetType(ctx, x_name); + auto y_type = GetType(ctx, y_name); + auto x_dtype = GetDataType(ctx, x_name); + auto y_dtype = GetDataType(ctx, y_name); + PADDLE_ENFORCE_EQ(x_type, + y_type, + platform::errors::InvalidArgument( + "The type of two input tensor should be same, " + "but get %d and %d", + x_type, + y_type)); + PADDLE_ENFORCE_EQ(x_dtype, + y_dtype, + platform::errors::InvalidArgument( + "The datatype of two input tensor should be same, " + "but get %d and %d", + x_dtype, + y_dtype)); + + SetType(ctx, z_name, x_type); + SetDataType(ctx, z_name, framework::proto::VarType::BOOL); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(ne_p, + paddle::operators::NePrimOp, + paddle::operators::NePrimOpMaker, + paddle::operators::NePrimOpShapeInference, + paddle::operators::NePrimOpVarTypeInference); diff --git a/paddle/fluid/operators/prim_ops/prim_op_test.cc b/paddle/fluid/operators/prim_ops/prim_op_test.cc index 44872f9060bfe3..153a4575463bc8 100644 --- a/paddle/fluid/operators/prim_ops/prim_op_test.cc +++ b/paddle/fluid/operators/prim_ops/prim_op_test.cc @@ -18,7 +18,7 @@ USE_OP_ITSELF(reshape_p); USE_OP_ITSELF(broadcast_p); -USE_OP_ITSELF(reduce_p); +USE_OP_ITSELF(reduce_sum_p); USE_OP_ITSELF(transpose_p); USE_OP_ITSELF(split_p); USE_OP_ITSELF(concat_p); @@ -130,7 +130,7 @@ TEST(PrimOp, broadcast_p) { ASSERT_EQ(shapes[2], 5L); } -TEST(PrimOp, reduce_p) { +TEST(PrimOp, reduce_sum_p) { ProgramDesc program; auto *block = program.MutableBlock(0); std::vector shape{3, 4, 5}; @@ -141,7 +141,7 @@ TEST(PrimOp, reduce_p) { NewVar(block, x0, shape); AppendOp(block, - "reduce_p", + "reduce_sum_p", {{"X", {x0}}}, {{"Y", {x1}}}, {{"axis", std::vector{0, 2}}, {"keepdim", false}}); @@ -151,7 +151,7 @@ TEST(PrimOp, reduce_p) { ASSERT_EQ(shapes.size(), 1UL); ASSERT_EQ(shapes[0], 4L); AppendOp(block, - "reduce_p", + "reduce_sum_p", {{"X", {x0}}}, {{"Y", {x2}}}, {{"axis", std::vector{0, 2}}, {"keepdim", true}}); diff --git a/paddle/fluid/operators/prim_ops/reduce_p_op.cc b/paddle/fluid/operators/prim_ops/reduce_sum_p_op.cc similarity 
index 74% rename from paddle/fluid/operators/prim_ops/reduce_p_op.cc rename to paddle/fluid/operators/prim_ops/reduce_sum_p_op.cc index 3c18ce46f9d937..b31b4934706a93 100644 --- a/paddle/fluid/operators/prim_ops/reduce_p_op.cc +++ b/paddle/fluid/operators/prim_ops/reduce_sum_p_op.cc @@ -24,25 +24,25 @@ class VarDesc; namespace paddle { namespace operators { -class ReducePrimOp : public framework::OperatorBase { +class ReduceSumPrimOp : public framework::OperatorBase { public: - ReducePrimOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) + ReduceSumPrimOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) : framework::OperatorBase(type, inputs, outputs, attrs) {} void RunImpl(const framework::Scope &scope, const platform::Place &dev_place) const override { PADDLE_THROW(platform::errors::Unimplemented( - "Prim operator reduce_p should not be excuted directly")); + "Prim operator reduce_sum_p should not be executed directly")); } }; -class ReducePrimOpMaker : public framework::OpProtoAndCheckerMaker { +class ReduceSumPrimOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - AddInput("X", "(Tensor), The input tensor of reduce_p op."); - AddOutput("Y", "(Tensor), The output tensor of reduce_p op."); + AddInput("X", "(Tensor), The input tensor of reduce_sum_p op."); + AddOutput("Y", "(Tensor), The output tensor of reduce_sum_p op."); AddAttr>( "axis", "(std::vector) The axis along which to reduce on. Must be in " @@ -53,12 +53,12 @@ class ReducePrimOpMaker : public framework::OpProtoAndCheckerMaker { "If true, retain the reduced axis with length 1.") .SetDefault(false); AddComment(R"DOC( -Autograd primitive reduce_p operator. +Autograd primitive reduce_sum_p operator.
)DOC"); } }; -class ReducePrimOpShapeInference : public framework::InferShapeBase { +class ReduceSumPrimOpShapeInference : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override { framework::InferShapeVarPtr x_var_ptr = ctx->GetInputVarPtrs("X")[0]; @@ -87,7 +87,7 @@ class ReducePrimOpShapeInference : public framework::InferShapeBase { } }; -class ReducePrimOpVarTypeInference +class ReduceSumPrimOpVarTypeInference : public framework::StaticGraphVarTypeInference { public: void operator()(framework::InferVarTypeContext *ctx) const override { @@ -101,8 +101,8 @@ class ReducePrimOpVarTypeInference } // namespace operators } // namespace paddle -REGISTER_OPERATOR(reduce_p, - paddle::operators::ReducePrimOp, - paddle::operators::ReducePrimOpMaker, - paddle::operators::ReducePrimOpShapeInference, - paddle::operators::ReducePrimOpVarTypeInference); +REGISTER_OPERATOR(reduce_sum_p, + paddle::operators::ReduceSumPrimOp, + paddle::operators::ReduceSumPrimOpMaker, + paddle::operators::ReduceSumPrimOpShapeInference, + paddle::operators::ReduceSumPrimOpVarTypeInference); diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc index 840e33939897fd..a2bf63da10bd2f 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc @@ -113,11 +113,6 @@ class DistributedPushSparseOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_cvm_op", "(boolean, default false) Use cvm op or not.") .SetDefault(false); - AddAttr>("slots", - "[slot_id1, slot_id2] Slots array of Ids.") - .SetDefault({}) - .AsExtra(); - AddComment(R"DOC( Lookup Tablel Prefetch Operator. This operator is used to perform lookup on parameter W, diff --git a/paddle/fluid/operators/quantize_linear_op.cc b/paddle/fluid/operators/quantize_linear_op.cc index 7012da3aeda94a..d4cd685575eecc 100644 --- a/paddle/fluid/operators/quantize_linear_op.cc +++ b/paddle/fluid/operators/quantize_linear_op.cc @@ -134,10 +134,6 @@ class QuantizeLinearOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("OutScale", "(Tensor) Current scale") .AsDispensable() .AsExtra(); // only qat use - AddAttr("moving_rate", - "(float, default 0.9) moving rate.") // only qat use - .SetDefault(0.9) - .AsExtra(); AddAttr("quant_axis", "(int, default 0) The axis for quantization. 
" "For conv2d, depthwise_conv2d, conv2d_transpose " diff --git a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc index 310c1db205da68..75b0c1f16deae9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_max_op_mlu.cc @@ -92,6 +92,112 @@ class ReduceMaxMLUKernel : public framework::OpKernel { } }; +template +class ReduceMaxGradMLUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Input("Out"); + auto* out_grad = context.Input(framework::GradVarName("Out")); + auto reduce_dims = context.Attr>("dim"); + bool reduce_all = context.Attr("reduce_all"); + int in_dtype = context.Attr("in_dtype"); + + PADDLE_ENFORCE_EQ( + in_dtype == -1, + true, + platform::errors::InvalidArgument( + "MLU only support in_dtype == -1 in reduce_max_grad op.")); + auto* x_grad = context.Output(framework::GradVarName("X")); + x_grad->mutable_data(context.GetPlace()); + + auto place = context.GetPlace(); + + // broadcast + auto x_dims_vec = phi::vectorize(x->dims()); + if (reduce_all) { + reduce_dims.clear(); + for (size_t d = 0; d < x_dims_vec.size(); ++d) { + reduce_dims.push_back(static_cast(d)); + } + } + + Tensor tmp_out, tmp_out_grad; + auto tmp_out_dims_vec = x_dims_vec; + for (auto d : reduce_dims) { + if (d < 0) { + d += x_dims_vec.size(); + } + tmp_out_dims_vec[d] = 1; + } + + tmp_out.ShareDataWith(*out); + tmp_out.Resize(phi::make_ddim(tmp_out_dims_vec)); + tmp_out_grad.ShareDataWith(*out_grad); + tmp_out_grad.Resize(phi::make_ddim(tmp_out_dims_vec)); + + Tensor transformed_out(x->type()); + transformed_out.Resize(phi::make_ddim(x_dims_vec)); + transformed_out.mutable_data(place); + + MLUCnnlTensorDesc tmp_out_desc(tmp_out); + MLUCnnlTensorDesc transformed_out_desc(transformed_out); + + MLUCnnl::BroadcastTo(context, + tmp_out_desc.get(), + GetBasePtr(&tmp_out), + transformed_out_desc.get(), + GetBasePtr(&transformed_out)); + + Tensor transformed_out_grad(x->type()); + transformed_out_grad.Resize(phi::make_ddim(x_dims_vec)); + transformed_out_grad.mutable_data(place); + MLUCnnlTensorDesc tmp_out_grad_desc(tmp_out_grad); + MLUCnnlTensorDesc transformed_out_grad_desc(transformed_out_grad); + + MLUCnnl::BroadcastTo(context, + tmp_out_grad_desc.get(), + GetBasePtr(&tmp_out_grad), + transformed_out_grad_desc.get(), + GetBasePtr(&transformed_out_grad)); + + // compare + Tensor equal_cond; + equal_cond.mutable_data(x_grad->dims(), place); + + MLUCnnlTensorDesc x_desc(*x); + MLUCnnlTensorDesc equal_cond_desc(equal_cond); + + MLUCnnl::Logic(context, + CNNL_LOGIC_OP_EQ, + x_desc.get(), + GetBasePtr(x), + transformed_out_desc.get(), + GetBasePtr(&transformed_out), + equal_cond_desc.get(), + GetBasePtr(&equal_cond)); + + // select + Tensor t_zero; + t_zero.mutable_data(x_grad->dims(), place); + FillMLUTensorWithHostValue(context, static_cast(0), &t_zero); + t_zero.Resize(x_grad->dims()); + + MLUCnnlTensorDesc t_zero_desc(t_zero); + MLUCnnlTensorDesc x_grad_desc(*x_grad); + + MLUCnnl::Select(context, + equal_cond_desc.get(), + GetBasePtr(&equal_cond), + transformed_out_grad_desc.get(), + GetBasePtr(&transformed_out_grad), + t_zero_desc.get(), + GetBasePtr(&t_zero), + x_grad_desc.get(), + GetBasePtr(x_grad)); + } +}; + } // namespace operators } // namespace paddle @@ -102,3 +208,7 @@ REGISTER_OP_MLU_KERNEL(reduce_max, ops::ReduceMaxMLUKernel, ops::ReduceMaxMLUKernel, 
ops::ReduceMaxMLUKernel); +REGISTER_OP_MLU_KERNEL(reduce_max_grad, + ops::ReduceMaxGradMLUKernel, + ops::ReduceMaxGradMLUKernel, + ops::ReduceMaxGradMLUKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index d305a65e0d133d..22a251706a97d5 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -634,20 +634,26 @@ class ReduceGradOp : public framework::OperatorWithKernel { "ReduceOp"); auto x_dims = ctx->GetInputDim("X"); auto x_rank = x_dims.size(); - auto dims = ctx->Attrs().Get>("dim"); - for (size_t i = 0; i < dims.size(); ++i) { - PADDLE_ENFORCE_LT(dims[i], - x_rank, - platform::errors::InvalidArgument( - "The reduce dim index %d should be in the " - "range [-dimension(X), dimension(X)], " - "which dimesion = %d. But received dim index = %d.", - i, - x_rank, - dims[i])); - if (dims[i] < 0) dims[i] = x_rank + dims[i]; + // TODO(dev): We should delete Infershape and migrate it into + // UnchangeInferMeta.In case of 'dim' is Variable, it will + // not exist in Attrs but in Inputs. + if (ctx->HasAttr("dim")) { + auto dims = ctx->Attrs().Get>("dim"); + for (size_t i = 0; i < dims.size(); ++i) { + PADDLE_ENFORCE_LT( + dims[i], + x_rank, + platform::errors::InvalidArgument( + "The reduce dim index %d should be in the " + "range [-dimension(X), dimension(X)], " + "which dimesion = %d. But received dim index = %d.", + i, + x_rank, + dims[i])); + if (dims[i] < 0) dims[i] = x_rank + dims[i]; + } } - sort(dims.begin(), dims.end()); + auto x_grad_name = framework::GradVarName("X"); if (ctx->HasOutput(x_grad_name)) { ctx->SetOutputDim(x_grad_name, x_dims); diff --git a/paddle/fluid/operators/rnn_op.cc b/paddle/fluid/operators/rnn_op.cc index 4a97afdfc4a069..aba720a99ba273 100644 --- a/paddle/fluid/operators/rnn_op.cc +++ b/paddle/fluid/operators/rnn_op.cc @@ -103,6 +103,9 @@ class RNNOpMaker : public framework::OpProtoAndCheckerMaker { "mode", "(string) rnn types, including: LSTM, GRU, RNN_RELU, RNN_TANH."); AddAttr("seed", "seed to used if fix_seed is True").SetDefault(0); + AddAttr("is_test", "True if in test phase.") + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( )DOC"); } diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc deleted file mode 100644 index 740285fb6505d0..00000000000000 --- a/paddle/fluid/operators/roi_align_op_xpu.cc +++ /dev/null @@ -1,278 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef PADDLE_WITH_XPU -#include -#include - -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -class XPUROIAlignOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - auto* out = ctx.Output("Out"); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - const auto& in_dims = in->dims(); - int batch_size = in_dims[0]; - int channels = in_dims[1]; - int height = in_dims[2]; - int width = in_dims[3]; - - int rois_num = rois->dims()[0]; - - if (rois_num == 0) return; - - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - int* roi_batch_id_data = roi_batch_id_list.mutable_data(cplace); - auto& dev_ctx = ctx.template device_context(); - auto xplace = ctx.GetPlace(); - int rois_batch_size = 0; - int* cpu_lod = nullptr; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - PADDLE_ENFORCE_EQ( - rois_batch_size, - batch_size, - platform::errors::InvalidArgument( - "The rois_batch_size and imgs " - "batch_size must be the same. But received rois_batch_size = %d, " - "batch_size = %d", - rois_batch_size, - batch_size)); - - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, - rois_num_list.data(), - xplace, - rois_num_t->data(), - sizeof(int) * rois_batch_size); - cpu_lod = new int[rois_batch_size + 1]; - cpu_lod[0] = 0; - for (int i = 0; i < rois_batch_size; i++) { - cpu_lod[i + 1] = cpu_lod[i] + rois_num_list[i]; - } - } else { - auto lod = rois->lod(); - PADDLE_ENFORCE_EQ( - lod.empty(), - false, - platform::errors::InvalidArgument("Input(ROIs) in ROIAlignOp does " - "not contain LoD information.")); - auto rois_lod = lod.back(); - rois_batch_size = rois_lod.size() - 1; - PADDLE_ENFORCE_EQ( - rois_batch_size, - batch_size, - platform::errors::InvalidArgument( - "The batch size of rois and batch size " - "of images must be the same. But received rois batch size = %d, " - "and images batch size = %d", - rois_batch_size, - batch_size)); - int rois_num_with_lod = rois_lod[rois_batch_size]; - PADDLE_ENFORCE_EQ( - rois_num, - rois_num_with_lod, - platform::errors::InvalidArgument( - "The actual number of rois and the number of rois " - "provided from Input(RoIsLoD) in RoIAlign must be the same." 
- " But received actual number of rois is %d, and the number " - "of rois from RoIsLoD is %d", - rois_num, - rois_num_with_lod)); - for (int n = 0; n < rois_batch_size; ++n) { - for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { - roi_batch_id_data[i] = n; - } - } - cpu_lod = new int[rois_batch_size + 1]; - for (int i = 0; i < rois_batch_size + 1; i++) { - cpu_lod[i] = rois_lod[i]; - } - } - - int* roi_id_data = nullptr; - int r = xpu_malloc(reinterpret_cast(&roi_id_data), - (rois_batch_size + 1) * sizeof(int)); - PADDLE_ENFORCE_EQ(r, - xpu::Error_t::SUCCESS, - platform::errors::External("no enough memory in xpu")); - memory::Copy(xplace, - roi_id_data, - cplace, - cpu_lod, - (rois_batch_size + 1) * sizeof(int)); - delete[] cpu_lod; - r = xpu::roi_align(dev_ctx.x_context(), - in->data(), - out->mutable_data(ctx.GetPlace()), - rois->data(), - roi_id_data, - batch_size, - channels, - height, - width, - out->dims()[0], - pooled_height, - pooled_width, - spatial_scale, - sampling_ratio, - true, - aligned); - PADDLE_ENFORCE_EQ(r, - xpu::Error_t::SUCCESS, - platform::errors::External( - "The roi_align XPU OP return wrong value[%d %s]", - r, - XPUAPIErrorMsg[r])); - if (dev_ctx.x_context()->xpu_stream) { - dev_ctx.Wait(); - } - xpu_free(roi_id_data); - } -}; - -template -class XPUROIAlignGradOpKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* rois = ctx.Input("ROIs"); - - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - - auto pooled_height = ctx.Attr("pooled_height"); - auto pooled_width = ctx.Attr("pooled_width"); - auto spatial_scale = ctx.Attr("spatial_scale"); - auto sampling_ratio = ctx.Attr("sampling_ratio"); - auto aligned = ctx.Attr("aligned"); - - int rois_num = rois->dims()[0]; - int channels = in->dims()[1]; - int height = in->dims()[2]; - int width = in->dims()[3]; - - if (!in_grad) { - return; - } - Tensor roi_batch_id_list; - roi_batch_id_list.Resize({rois_num}); - auto cplace = platform::CPUPlace(); - - auto& dev_ctx = ctx.template device_context(); - auto xplace = ctx.GetPlace(); - - int rois_batch_size = 0; - int* cpu_lod = nullptr; - if (ctx.HasInput("RoisNum")) { - auto* rois_num_t = ctx.Input("RoisNum"); - rois_batch_size = rois_num_t->numel(); - std::vector rois_num_list(rois_batch_size); - memory::Copy(cplace, - rois_num_list.data(), - xplace, - rois_num_t->data(), - sizeof(int) * rois_batch_size); - cpu_lod = new int[rois_batch_size + 1]; - cpu_lod[0] = 0; - for (int i = 0; i < rois_batch_size; i++) { - cpu_lod[i + 1] = cpu_lod[i] + rois_num_list[i]; - } - } else { - auto rois_lod = rois->lod().back(); - rois_batch_size = rois_lod.size() - 1; - cpu_lod = new int[rois_batch_size + 1]; - for (int i = 0; i < rois_batch_size + 1; i++) { - cpu_lod[i] = rois_lod[i]; - } - } - int* roi_id_data = nullptr; - int r = xpu_malloc(reinterpret_cast(&roi_id_data), - (rois_batch_size + 1) * sizeof(int)); - PADDLE_ENFORCE_EQ(r, - xpu::Error_t::SUCCESS, - platform::errors::External("no enough memory in xpu")); - memory::Copy(xplace, - roi_id_data, - cplace, - cpu_lod, - (rois_batch_size + 1) * sizeof(int)); - in_grad->mutable_data(ctx.GetPlace()); - - int output_grad_size = out_grad->numel(); - - delete[] cpu_lod; - if (output_grad_size > 0) { - r = xpu::roi_align_grad(dev_ctx.x_context(), - out_grad->data(), - in_grad->data(), - rois->data(), - roi_id_data, - in->dims()[0], - channels, - height, - width, - 
out_grad->dims()[0], - pooled_height, - pooled_width, - spatial_scale, - sampling_ratio, - true, - aligned); - PADDLE_ENFORCE_EQ( - r, - xpu::Error_t::SUCCESS, - platform::errors::External( - "The roi_align_grad XPU OP return wrong value[%d %s]", - r, - XPUAPIErrorMsg[r])); - } - if (dev_ctx.x_context()->xpu_stream) { - dev_ctx.Wait(); - } - xpu_free(roi_id_data); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_XPU_KERNEL( - roi_align, - ops::XPUROIAlignOpKernel); -REGISTER_OP_XPU_KERNEL( - roi_align_grad, - ops::XPUROIAlignGradOpKernel); - -#endif diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 3c2b5363d81ce2..cab04e43e8681f 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -75,10 +75,6 @@ if bias_after_scale=True: "Apply bias addition after or before scaling. It is useful for " "numeric stability in some circumstances.") .SetDefault(true); - AddAttr("use_mkldnn", - "(bool, default false) Only used in mkldnn kernel") - .SetDefault(false) - .AsExtra(); } }; @@ -108,11 +104,6 @@ class ScaleGradMaker : public framework::SingleGradOpMaker { VLOG(6) << "Finish Set Attr bias"; grad_op->SetAttr("bias_after_scale", true); VLOG(6) << "Finish Set Attr bias_after_scale"; - if (grad_op->HasAttr("use_mkldnn")) { - VLOG(6) << "Finish Check Attr use_mkldnn"; - grad_op->SetAttr("use_mkldnn", this->GetAttr("use_mkldnn")); - VLOG(6) << "Finish Set Attr use_mkldnn"; - } VLOG(6) << "Finish Apply"; } }; diff --git a/paddle/fluid/operators/sequence_ops/CMakeLists.txt b/paddle/fluid/operators/sequence_ops/CMakeLists.txt index fe36afd96c5e88..06281b6f376fde 100644 --- a/paddle/fluid/operators/sequence_ops/CMakeLists.txt +++ b/paddle/fluid/operators/sequence_ops/CMakeLists.txt @@ -4,3 +4,7 @@ if(WITH_UNITY_BUILD) include(unity_build_rule.cmake) endif() register_operators() + +if(WITH_UNITY_BUILD) + target_link_libraries(paddle_operators_sequence_ops_unity sequence_pooling) +endif() diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc index 5417c20f3d4196..5a6d2ab0820e25 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_op.cc @@ -73,14 +73,6 @@ class SequenceSoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(false) .AsExtra(); - AddAttr( - "data_format", - "(string, default NCHW) Only used in " - "An optional string from: \"NHWC\", \"NCHW\". " - "Defaults to \"NHWC\". Specify the data format of the output data, " - "the input will be transformed automatically. ") - .SetDefault("AnyLayout") - .AsExtra(); AddComment(R"DOC( Sequence Softmax Operator. 
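A minimal standalone C++ sketch (not Paddle code; the toy data and full-reduction setup are assumptions for illustration) of the gradient rule that the ReduceMaxGradMLUKernel added above expresses with BroadcastTo, Logic(EQ) and Select: broadcast the reduced maximum back to the input shape, compare it elementwise with the input, and route the upstream gradient only to positions that hold the maximum. As in the kernel, every position tied for the maximum receives the full upstream gradient.

#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // Toy data: reduce_max over the whole (flattened) input.
  std::vector<float> x = {1.f, 3.f, 2.f, 3.f};
  float out = *std::max_element(x.begin(), x.end());   // forward result, 3.0
  float out_grad = 1.f;                                 // upstream gradient

  // "broadcast" + "compare" + "select" collapsed into one loop.
  std::vector<float> x_grad(x.size());
  for (size_t i = 0; i < x.size(); ++i) {
    x_grad[i] = (x[i] == out) ? out_grad : 0.f;
  }
  for (float g : x_grad) std::printf("%.1f ", g);       // prints: 0.0 1.0 0.0 1.0
  return 0;
}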
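Similarly, the XPU roi_align kernels removed above spend much of their setup turning the ROIs' LoD offsets into a per-ROI batch-id array and a plain offset array before calling the XPU runtime. A small standalone sketch of that mapping, with illustrative values only:

#include <cstdio>
#include <vector>

int main() {
  // LoD offsets {0, 3, 5}: image 0 owns ROIs [0, 3), image 1 owns ROIs [3, 5).
  std::vector<int> rois_lod = {0, 3, 5};
  int rois_num = rois_lod.back();

  std::vector<int> roi_batch_id(rois_num);
  for (size_t n = 0; n + 1 < rois_lod.size(); ++n) {
    for (int i = rois_lod[n]; i < rois_lod[n + 1]; ++i) {
      roi_batch_id[i] = static_cast<int>(n);            // ROI i belongs to image n
    }
  }
  for (int id : roi_batch_id) std::printf("%d ", id);   // prints: 0 0 0 1 1
  return 0;
}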
diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index 074642e1b0241d..12cdccc50317c9 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -104,7 +104,8 @@ class SetValueMaker : public framework::OpProtoAndCheckerMaker { framework::proto::VarType::INT32, framework::proto::VarType::INT64, framework::proto::VarType::FP32, - framework::proto::VarType::FP64}) + framework::proto::VarType::FP64, + framework::proto::VarType::FP16}) .SetDefault(framework::proto::VarType::FP32); AddAttr>( "axes", "(list) Axes that `starts` and `ends` apply to."); @@ -135,6 +136,8 @@ class SetValueMaker : public framework::OpProtoAndCheckerMaker { .SetDefault({}); AddAttr>("fp64_values", "Store the float64 values.") .SetDefault({}); + AddAttr>("fp16_values", "Store the float16 values.") + .SetDefault({}); AddAttr>("shape", "(vector) Shape of values.") .SetDefault({}); diff --git a/paddle/fluid/operators/slice_op.cc b/paddle/fluid/operators/slice_op.cc old mode 100644 new mode 100755 index 9d9e5816db7028..c26589677b9fc9 --- a/paddle/fluid/operators/slice_op.cc +++ b/paddle/fluid/operators/slice_op.cc @@ -155,8 +155,10 @@ class SliceOp : public framework::OperatorWithKernel { #ifdef PADDLE_WITH_MKLDNN auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(ctx, "Input"); - - if (this->CanMKLDNNBeUsed(ctx, input_data_type)) { + auto vec_dims = phi::vectorize(in_tensor.dims()); + bool all_zero_dims = std::all_of( + vec_dims.cbegin(), vec_dims.cend(), [](int64_t i) { return i == 0; }); + if (!all_zero_dims && this->CanMKLDNNBeUsed(ctx, input_data_type)) { // OneDNN uses blocking format, which cannot be always supported with // reorders, because if blocked dimension is not divisible by 8 or // 16(depending on which blocking format is used) submemory cannot be diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 3966b850c7b830..b8bc13ef075e59 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -92,6 +92,11 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. ") .SetDefault("AnyLayout"); + AddAttr( + "use_cudnn", + "(bool, default false) Only used in cudnn kernel, need install cudnn") + .SetDefault(false) + .AsExtra(); AddComment(R"DOC( Softmax Operator. diff --git a/paddle/fluid/operators/sparse_manual_op.cc b/paddle/fluid/operators/sparse_manual_op.cc new file mode 100644 index 00000000000000..f95d5250c62f6f --- /dev/null +++ b/paddle/fluid/operators/sparse_manual_op.cc @@ -0,0 +1,59 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/multiary.h" +#include "paddle/phi/infermeta/sparse/binary.h" +#include "paddle/phi/infermeta/sparse/unary.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace operators { + +class SparseIndicesOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("x", "(Tensor), input 0 of sparse_indices op."); + AddOutput("out", "(Tensor), output 0 of sparse_indices op."); + AddComment(R"DOC( +TODO: Documentation of sparse_indices op. +)DOC"); + } +}; + +class SparseIndicesOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; +}; + +DECLARE_INFER_SHAPE_FUNCTOR(sparse_indices, + SparseIndicesInferShapeFunctor, + PD_INFER_META(phi::sparse::IndicesInferMeta)); + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(sparse_indices, + ops::SparseIndicesOp, + ops::SparseIndicesOpMaker, + ops::SparseIndicesInferShapeFunctor); diff --git a/paddle/fluid/operators/stack_op.cc b/paddle/fluid/operators/stack_op.cc index a0351b41a244ba..e9706f00ce889a 100644 --- a/paddle/fluid/operators/stack_op.cc +++ b/paddle/fluid/operators/stack_op.cc @@ -56,11 +56,6 @@ class StackOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("axis", "The axis along which all of the Inputs(X) should be stacked.") .SetDefault(0); - AddAttr( - "use_mkldnn", - "(bool, default false) Indicates if MKL-DNN kernel will be used") - .SetDefault(false) - .AsExtra(); AddComment(R"DOC( Stack Operator. Stack all of the Inputs(X) into one tensor along Attr(axis). The dims of all Inputs(X) must be the same. diff --git a/paddle/fluid/operators/strided_slice_op_mlu.cc b/paddle/fluid/operators/strided_slice_op_mlu.cc index 95972d8159267e..81d5b9089a9189 100644 --- a/paddle/fluid/operators/strided_slice_op_mlu.cc +++ b/paddle/fluid/operators/strided_slice_op_mlu.cc @@ -19,6 +19,11 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = phi::DenseTensor; +using Variable = framework::Variable; +using LoDTensorArray = framework::LoDTensorArray; +using DDim = framework::DDim; + static void ProcessStridedSliceParams( const std::vector& axes, const DDim& input_dims, diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index ca851b8ee75b1e..ec570f709c35c8 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -9,15 +9,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/sum_op.h" - #include #include #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/var_type_inference.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" @@ -32,94 +34,6 @@ class SumOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "sum"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "sum"); - - if (ctx->IsRuntime() && ctx->GetOutputsVarType("Out")[0] == - framework::proto::VarType::LOD_TENSOR_ARRAY) { - return; // skip runtime infershape when is tensor array; - } - - auto x_var_types = ctx->GetInputsVarType("X"); - auto x_dims = ctx->GetInputsDim("X"); - - auto N = x_dims.size(); - PADDLE_ENFORCE_GT( - N, - 0, - platform::errors::InvalidArgument( - "The input tensor X's dimensions of SumOp " - "should be larger than 0. But received X's dimensions %d, " - "X's shape = [%s].", - N, - &x_dims)); - if (N == 1) { - VLOG(3) << "Warning: SumOp have only one input, may waste memory"; - } - - framework::DDim in_dim({0}); - for (size_t i = 0; i < x_dims.size(); ++i) { - auto& x_dim = x_dims[i]; - // x_dim.size() == 1 means the real dim of selected rows is [0] - if (x_var_types[i] == framework::proto::VarType::SELECTED_ROWS && - x_dim.size() == 1) { - continue; - } - if (phi::product(x_dim) == 0) { - continue; - } - if (phi::product(in_dim) == 0) { - in_dim = x_dim; - } else { - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(in_dim, - x_dim, - platform::errors::InvalidArgument( - "The input tensor X of SumOp must" - " have same shape. But received X[0]'s shape = " - "[%s], X[%d]'s shape = [%s].", - in_dim, - i, - x_dim)); - } else { - PADDLE_ENFORCE_EQ( - in_dim.size(), - x_dim.size(), - platform::errors::InvalidArgument( - "The input tensor X of SumOp must have same " - "dimensions. But received X[0]'s dimensions = %d, X[0]'s " - "shape = " - "[%s], X[%d]'s dimensions = %d, X[%d]'s shape = [%s].", - in_dim.size(), - in_dim, - i, - x_dim.size(), - i, - x_dim)); - // if in_dim or x_dim has -1, not check equal - for (int j = 0; j < x_dim.size(); ++j) { - if (x_dim[j] == -1 || in_dim[j] == -1) { - continue; - } - PADDLE_ENFORCE_EQ( - in_dim[j], - x_dim[j], - platform::errors::InvalidArgument( - "The input tensor X of SumOp must have same shape " - "if not -1." 
- "But received X[0]'s shape = [%s], X[%d]'s shape = [%s].", - in_dim, - i, - x_dim)); - } - } - } - } - ctx->SetOutputDim("Out", in_dim); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -334,6 +248,7 @@ class SumGradOpBaseMaker : public imperative::GradOpBaseMakerBase { op.SetInput("X", og); op.SetOutput("Out", InputGradsType{x_grad}); op.SetAttr("scale", 1.0f); + op.SetDefaultAttrsMap(DefaultAttrsMap()); } return node; } else { @@ -349,18 +264,16 @@ DECLARE_INPLACE_OP_INFERER(SumInplaceInferer, {"X", "Out"}); namespace ops = paddle::operators; +namespace ops = paddle::operators; +DECLARE_INFER_SHAPE_FUNCTOR(sum, + AddNInferShapeFunctor, + PD_INFER_META(phi::AddNTensorArrayInferMeta)); + REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradDescMaker, ops::SumGradOpBaseMaker, ops::SumOpVarTypeInference, - ops::SumInplaceInferer); - -REGISTER_OP_CPU_KERNEL( - sum, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); + ops::SumInplaceInferer, + AddNInferShapeFunctor); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu deleted file mode 100644 index 35a1680d84d81c..00000000000000 --- a/paddle/fluid/operators/sum_op.cu +++ /dev/null @@ -1,280 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/sum_op.h" -#include "paddle/fluid/platform/float16.h" - -namespace plat = paddle::platform; - -namespace paddle { -namespace operators { - -#define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) - -using LoDTensor = framework::LoDTensor; - -template -__global__ void Sum2CUDAKernel(const T *in_0, - const T *in_1, - T *out, - int64_t N) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - while (id < N) { - out[id] = in_0[id] + in_1[id]; - id += blockDim.x * gridDim.x; - } -} - -template -__global__ void SumArrayCUDAKernel( - T **in, T *out, int64_t N, size_t in_size, bool read_dst) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - while (id < N) { - T total(read_dst ? 
out[id] : static_cast(0)); - for (int i = 0; i < in_size; ++i) { - const T *tmp = in[i]; - if (tmp) { - total += tmp[id]; - } - } - out[id] = total; - id += blockDim.x * gridDim.x; - } -} - -template -__global__ void SumSelectedRowsCUDAKernel(T **sr_in_out, - int64_t N, - size_t rows) { - int id = blockIdx.x * blockDim.x + threadIdx.x; - while (id < N) { - for (int i = 0; i < 2 * rows; i += 2) { - const T *tmp = sr_in_out[i]; - T *tmp_out = sr_in_out[i + 1]; - if (tmp && tmp_out) { - tmp_out[id] += tmp[id]; - } - } - id += blockDim.x * gridDim.x; - } -} - -template -void SumToLoDTensor(const framework::ExecutionContext &context) { - auto in_vars = context.MultiInputVar("X"); - const size_t in_num = in_vars.size(); - - constexpr size_t theory_sm_threads = 1024; - auto &dev_ctx = context.template device_context(); - auto stream = dev_ctx.stream(); - - auto max_threads = dev_ctx.GetMaxPhysicalThreadCount(); - auto sm_count = max_threads / theory_sm_threads; - size_t tile_size = 0; - dim3 grids; - dim3 blocks; - - auto ComputeKernelParameter = [&](size_t length) { - if (length >= max_threads) - tile_size = 1024; - else if (length < max_threads && length > sm_count * 128) - tile_size = 512; - else if (length <= sm_count * 128) - tile_size = 256; - grids = dim3(CEIL_DIV(length, tile_size), 1, 1); - blocks = dim3(tile_size, 1, 1); - }; - - auto *out = context.Output("Out"); - bool in_place = in_vars[0] == context.OutputVar("Out"); - - if (!in_place) { - auto *out_ptr = out->mutable_data(context.GetPlace()); - if (in_num >= 1 && in_vars[0]->IsType()) { - auto &in_0_tensor = in_vars[0]->Get(); - if (in_0_tensor.numel() > 0) { - in_place = (in_0_tensor.data() == out_ptr); - } - } - } - - // Sum of two tensors - if (in_num == 2 && in_vars[0]->IsType() && - in_vars[1]->IsType()) { - auto &in_0 = in_vars[0]->Get(); - auto &in_1 = in_vars[1]->Get(); - int64_t length_0 = in_0.numel(); - int64_t length_1 = in_1.numel(); - if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) { - auto result = EigenVector::Flatten(*out); - auto &place = *dev_ctx.eigen_device(); - auto in_0_e = EigenVector::Flatten(in_0); - auto in_1_e = EigenVector::Flatten(in_1); - result.device(place) = in_0_e + in_1_e; - } else if (length_0 && in_0.IsInitialized()) { - auto result = EigenVector::Flatten(*out); - auto &place = *dev_ctx.eigen_device(); - result.device(place) = EigenVector::Flatten(in_0); - } else if (length_1 && in_1.IsInitialized()) { - auto result = EigenVector::Flatten(*out); - auto &place = *dev_ctx.eigen_device(); - result.device(place) = EigenVector::Flatten(in_1); - } - return; - } - - int start = in_place ? 1 : 0; - if (!in_place) { - phi::funcs::SetConstant constant_functor; - constant_functor(context.template device_context(), - out, - static_cast(0)); - } - - std::vector in_data; - std::vector selectrow_index; - int64_t lod_length = 0; - bool dst_write = false; - for (int i = start; i < in_num; ++i) { - if (in_vars[i]->IsType()) { - auto &in_i = in_vars[i]->Get(); - lod_length = in_i.numel(); - if (lod_length && in_i.IsInitialized()) { - in_data.emplace_back(in_i.data()); - } - } else if (in_vars[i]->IsType()) { - selectrow_index.push_back(i); - } - } - - // compute select rows separately. 
- if (!selectrow_index.empty()) { - std::vector sr_in_out_data; - size_t rows = 0; - int64_t length = 0; - for (auto index : selectrow_index) { - auto &sr = in_vars[index]->Get(); - auto &sr_value = sr.value(); - auto &sr_rows = sr.rows(); - - auto row_numel = sr_value.numel() / sr_rows.size(); - auto out_dims = out->dims(); - - PADDLE_ENFORCE_EQ(sr.height(), - out_dims[0], - platform::errors::InvalidArgument( - "The table height of input must be same as output, " - "but received input height is %d" - ", output height is %d", - sr.height(), - out_dims[0])); - PADDLE_ENFORCE_EQ(row_numel, - out->numel() / sr.height(), - platform::errors::InvalidArgument( - "The table width of input must be same as output, " - "but received input width is %d" - ", output width is %d", - row_numel, - out->numel() / sr.height())); - - auto *sr_data = sr_value.data(); - auto *sr_out_data = out->data(); - rows += sr_rows.size(); - length = row_numel; - - for (size_t i = 0; i < sr_rows.size(); ++i) { - sr_in_out_data.emplace_back(&sr_data[i * row_numel]); - sr_in_out_data.emplace_back(&sr_out_data[sr_rows[i] * row_numel]); - } - } - if (!sr_in_out_data.empty()) { - auto tmp_sr_in_out_array = memory::Alloc( - dev_ctx.GetPlace(), - sr_in_out_data.size() * sizeof(T *), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - - memory::Copy(dev_ctx.GetPlace(), - tmp_sr_in_out_array->ptr(), - platform::CPUPlace(), - reinterpret_cast(sr_in_out_data.data()), - sr_in_out_data.size() * sizeof(T *), - dev_ctx.stream()); - - T **sr_in_out_array_data = - reinterpret_cast(tmp_sr_in_out_array->ptr()); - - ComputeKernelParameter(length); - SumSelectedRowsCUDAKernel - <<>>(sr_in_out_array_data, length, rows); - dst_write = true; - } - } - // if indata not null, merge into one kernel call. - if (!in_data.empty()) { - auto tmp_in_array = memory::Alloc( - dev_ctx.GetPlace(), - in_data.size() * sizeof(T *), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); - - memory::Copy(dev_ctx.GetPlace(), - tmp_in_array->ptr(), - platform::CPUPlace(), - reinterpret_cast(in_data.data()), - in_data.size() * sizeof(T *), - dev_ctx.stream()); - - T **in_array_data = reinterpret_cast(tmp_in_array->ptr()); - ComputeKernelParameter(lod_length); - SumArrayCUDAKernel<<>>(in_array_data, - out->data(), - lod_length, - in_data.size(), - dst_write | in_place); - } -} - -template -class SumKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto out_var = context.OutputVar("Out"); - - if (out_var->IsType()) { - SumToLoDTensor(context); - } else if (out_var->IsType()) { - SelectedRowsCompute(context); - } else if (out_var->IsType()) { - LodTensorArrayCompute(context); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) must be Tensor, SelectedRows or " - "LodTensorArray. But got " - "unsupport type: %s.", - framework::ToTypeName(out_var->Type()))); - } - } -}; -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; -REGISTER_OP_CUDA_KERNEL(sum, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel, - ops::SumKernel); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h deleted file mode 100644 index 804bfbb2099722..00000000000000 --- a/paddle/fluid/operators/sum_op.h +++ /dev/null @@ -1,222 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
-Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at -http://www.apache.org/licenses/LICENSE-2.0 -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/lod_tensor_array.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/selected_rows_functor.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using SelectedRows = phi::SelectedRows; -using LoDTensor = framework::LoDTensor; -template -using EigenVector = framework::EigenVector; - -template -void SelectedRowsCompute(const framework::ExecutionContext &context) { - auto in_vars = context.MultiInputVar("X"); - auto out_var = context.OutputVar("Out"); - bool in_place = out_var == in_vars[0]; - - if (in_place && in_vars.size() < 2) { - return; - } - - std::vector inputs; - SelectedRows temp_in0; - - if (in_place) { - auto &in0 = in_vars[0]->Get(); - temp_in0.set_height(in0.height()); - temp_in0.set_rows(in0.rows()); - framework::TensorCopy(in0.value(), - in0.place(), - context.device_context(), - temp_in0.mutable_value()); - inputs.push_back(&temp_in0); - for (size_t i = 1; i < in_vars.size(); ++i) { - auto &in = in_vars[i]->Get(); - if (in.rows().size() > 0) { - inputs.push_back(&in); - } - } - } else { - for (auto &in_var : in_vars) { - auto &in = in_var->Get(); - if (in.rows().size() > 0) { - inputs.push_back(&in_var->Get()); - } - } - } - - auto *out = context.Output("Out"); - out->mutable_rows()->clear(); - - bool has_data = false; - for (auto &in : inputs) { - if (in->rows().size() > 0) { - has_data = true; - break; - } - } - if (has_data) { - math::scatter::MergeAdd merge_add; - merge_add(context.template device_context(), inputs, out); - - out->SyncIndex(); - - } else { - // no data, just set a empty out tensor. - out->mutable_value()->mutable_data(phi::make_ddim({0}), - context.GetPlace()); - } -} - -template -void LodTensorArrayCompute(const framework::ExecutionContext &context) { - auto in_vars = context.MultiInputVar("X"); - auto out_var = context.OutputVar("Out"); - bool in_place = out_var == in_vars[0]; - auto &out_array = *out_var->GetMutable(); - for (size_t i = in_place ? 
1 : 0; i < in_vars.size(); ++i) { - PADDLE_ENFORCE_EQ(in_vars[i]->IsType(), - true, - platform::errors::InvalidArgument( - "Only support all inputs are TensorArray, " - "but inputs[%d] is not TensorArray.", - i)); - auto &in_array = in_vars[i]->Get(); - - for (size_t i = 0; i < in_array.size(); ++i) { - if (in_array[i].IsInitialized() && (in_array[i].numel() != 0)) { - if (i >= out_array.size()) { - out_array.resize(i + 1); - } - if (!out_array[i].IsInitialized() || (out_array[i].numel() == 0)) { - framework::TensorCopy(in_array[i], - in_array[i].place(), - context.device_context(), - &out_array[i]); - out_array[i].set_lod(in_array[i].lod()); - } else { - PADDLE_ENFORCE_EQ( - out_array[i].lod(), - in_array[i].lod(), - platform::errors::InvalidArgument( - "The lod message between inputs[%d] and" - " outputs[%d] must be same, but now is not same.", - i, - i)); - auto in = EigenVector::Flatten(in_array[i]); - auto result = EigenVector::Flatten(out_array[i]); - result.device(*context.template device_context() - .eigen_device()) = result + in; - } - } - } - } -} - -template -class SumKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - VLOG(10) << "start sum kernel"; - auto in_vars = context.MultiInputVar("X"); - size_t in_num = in_vars.size(); - auto out_var = context.OutputVar("Out"); - - bool in_place = out_var == in_vars[0]; - - if (out_var->IsType()) { - auto *out = out_var->GetMutable(); - auto *out_ptr = out->mutable_data(context.GetPlace()); - if (in_num >= 1 && in_vars[0]->IsType() && - in_vars[0]->Get().IsInitialized()) { - auto &in_0_tensor = in_vars[0]->Get(); - if (in_0_tensor.numel() > 0) { - in_place = (in_0_tensor.data() == out_ptr); - } - } - - auto result = EigenVector::Flatten(*out); - auto &place = - *context.template device_context().eigen_device(); - int start = in_place ? 1 : 0; - if (!in_place) { - if ((in_num >= 2) && in_vars[0]->IsType() && - in_vars[1]->IsType() && - in_vars[0]->Get().IsInitialized() && - in_vars[1]->Get().IsInitialized()) { - auto &in_0 = in_vars[0]->Get(); - auto &in_1 = in_vars[1]->Get(); - if (in_0.numel() && in_1.numel()) { - auto in_0_e = EigenVector::Flatten(in_0); - auto in_1_e = EigenVector::Flatten(in_1); - result.device(place) = in_0_e + in_1_e; - start = 2; - } - } - if (start != 2) { - VLOG(10) << "Fill with constant = 0 in sum kernel."; - phi::funcs::SetConstant constant_functor; - constant_functor(context.template device_context(), - out, - static_cast(0)); - } - } - - math::SelectedRowsAddToTensor functor; - // If in_place, just skip the first tensor - for (size_t i = start; i < in_num; i++) { - if (in_vars[i]->IsType()) { - auto &in_t = in_vars[i]->Get(); - if (!in_t.IsInitialized() || in_t.numel() == 0) { - continue; - } - auto in = EigenVector::Flatten(in_t); - result.device(place) = result + in; - } else if (in_vars[i]->IsType()) { - auto &in_t = in_vars[i]->Get(); - functor(context.template device_context(), in_t, out); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Input(X) of %d-th must be Tensor, " - "SelectedRows. But got " - "unsupport type: %s.", - framework::ToTypeName(in_vars[i]->Type()))); - } - } - } else if (out_var->IsType()) { - SelectedRowsCompute(context); - } else if (out_var->IsType()) { - LodTensorArrayCompute(context); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Expected type of Output(out) must be Tensor, SelectedRows, " - "LoDTensorArray. 
But got " - "unsupport type: %s.", - framework::ToTypeName(out_var->Type()))); - } - VLOG(10) << "end sum kernel"; - } -}; -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/sum_op_mlu.cc b/paddle/fluid/operators/sum_op_mlu.cc index 6195a6c9b7af08..0bb51581e9360a 100644 --- a/paddle/fluid/operators/sum_op_mlu.cc +++ b/paddle/fluid/operators/sum_op_mlu.cc @@ -12,13 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/mlu/mlu_baseop.h" -#include "paddle/fluid/operators/sum_op.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using SelectedRows = phi::SelectedRows; +using LoDTensor = framework::LoDTensor; template class SumMLUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/sum_op_npu.cc b/paddle/fluid/operators/sum_op_npu.cc index 8beecb70e7fcbb..a7bb442fa650c4 100644 --- a/paddle/fluid/operators/sum_op_npu.cc +++ b/paddle/fluid/operators/sum_op_npu.cc @@ -16,13 +16,16 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/sum_op.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { namespace operators { using Tensor = framework::Tensor; +using SelectedRows = phi::SelectedRows; +using LoDTensor = framework::LoDTensor; template class SumNPUKernel : public framework::OpKernel { diff --git a/paddle/fluid/operators/sum_op_xpu.cc b/paddle/fluid/operators/sum_op_xpu.cc index b73677b59cee90..a445868153452e 100644 --- a/paddle/fluid/operators/sum_op_xpu.cc +++ b/paddle/fluid/operators/sum_op_xpu.cc @@ -13,14 +13,16 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/sum_op.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/device_wrapper.h" #include "paddle/fluid/platform/device/xpu/xpu_header.h" namespace paddle { namespace operators { using framework::Tensor; - +using SelectedRows = phi::SelectedRows; +using LoDTensor = framework::LoDTensor; template class SumXPUKernel : public framework::OpKernel { using XPUType = typename XPUTypeTrait::Type; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 0f8a3d12062646..3296f9c4395537 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -503,6 +503,18 @@ class TensorRTEngineOp : public framework::OperatorBase { // convert input and copy to TRT engine's buffer auto &t = inference::analysis::GetFromScope(scope, x); + PADDLE_ENFORCE_GT( + t.numel(), + 0, + phi::errors::InvalidArgument( + "The input tensor named %s of trt-subgraph must " + "have >0 elements, but now have %d elements. 
" + "It's likely that this tensor is connected to a Concat op inside " + "a trt-subgraph, " + "try to ues API to forbid this op into trt-subgraph.", + x, + t.numel())); + // check the input_tensor if (!platform::is_gpu_place(t.place())) { framework::Tensor out; @@ -563,6 +575,18 @@ class TensorRTEngineOp : public framework::OperatorBase { #if IS_TRT_VERSION_GE(6000) trt_context->setBindingDimensions( bind_index, inference::tensorrt::Vec2TRT_Dims(t_shape, x, true)); + // If this x is a shape tensor, we need call setInputShapeBinding + if (engine->engine()->isShapeBinding(bind_index) && + engine->engine()->bindingIsInput(bind_index)) { + std::vector shape_v(t.numel()); + paddle::memory::Copy(platform::CPUPlace(), + shape_v.data(), + platform::CUDAPlace(), + t.data(), + t.numel() * sizeof(int), + nullptr); + trt_context->setInputShapeBinding(bind_index, shape_v.data()); + } #endif } runtime_batch = t_shape[0]; diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc index 7b58a1bb7d6d27..eab919135c7bbc 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op_test.cc @@ -159,6 +159,8 @@ void DynamicShapeTest(bool allow_build_at_runtime) { // Execute them. LOG(INFO) << "engine_op run"; + inference::tensorrt::OpTeller::Global().SetOpConverterType( + "fc", inference::tensorrt::OpConverterType::Default); engine_op->Run(scope, place); } diff --git a/paddle/fluid/operators/transpose_op.cc b/paddle/fluid/operators/transpose_op.cc index 610d6e1f48aadb..535300c826d24d 100644 --- a/paddle/fluid/operators/transpose_op.cc +++ b/paddle/fluid/operators/transpose_op.cc @@ -39,11 +39,13 @@ class TransposeOp : public framework::OperatorWithKernel { size_t x_rank = x_dims.size(); size_t axis_size = axis.size(); - PADDLE_ENFORCE_EQ(x_rank, + // Note: x_rank > axis_size when fuse squeeze2 + transpose2, else x_rank == + // axis_size + PADDLE_ENFORCE_GE(x_rank, axis_size, platform::errors::InvalidArgument( "The input tensor's dimension " - "should be equal to the axis's size. " + "should be equal to or greater than the axis's size. " "But received input tensor's dimension is %d, " "axis's size is %d", x_rank, @@ -275,13 +277,49 @@ class Transpose2Op : public TransposeOp { } }; -class Transpose2OpMaker : public TransposeOpMaker { +class Transpose2OpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { - TransposeOpMaker::Make(); + AddInput( + "X", + "(Tensor) The input tensor, tensors with rank up to 6 are supported."); + AddOutput("Out", "(Tensor)The output tensor."); + AddAttr>( + "axis", + "(vector) A list of values, and the size of the list should be " + "the same with the input tensor rank. This operator permutes the input " + "tensor's axes according to the values given."); AddOutput("XShape", "(Tensor)The output tensor.") .AsIntermediate() .AsExtra(); + AddComment(R"DOC( +Transpose Operator. + +The input tensor will be permuted according to the axes given. +The behavior of this operator is similar to how `numpy.transpose` works. 
+ +- suppose the input `X` is a 2-D tensor: + $$ + X = \begin{pmatrix} + 0 &1 &2 \\ + 3 &4 &5 + \end{pmatrix}$$ + + the given `axes` is: $[1, 0]$, and $Y$ = transpose($X$, axis) + + then the output $Y$ is: + + $$ + Y = \begin{pmatrix} + 0 &3 \\ + 1 &4 \\ + 2 &5 + \end{pmatrix}$$ + +- Given a input tensor with shape $(N, C, H, W)$ and the `axes` is +$[0, 2, 3, 1]$, then shape of the output tensor will be: $(N, H, W, C)$. + +)DOC"); } }; diff --git a/paddle/fluid/operators/unity_build_rule.cmake b/paddle/fluid/operators/unity_build_rule.cmake index 62aa990ca7bc82..7cc02a0f527a9f 100644 --- a/paddle/fluid/operators/unity_build_rule.cmake +++ b/paddle/fluid/operators/unity_build_rule.cmake @@ -108,7 +108,6 @@ register_unity_group( register_unity_group( cc flatten_op.cc - flip_op.cc fsp_op.cc gather_nd_op.cc gather_op.cc @@ -424,7 +423,6 @@ register_unity_group(cu expand_v2_op.cu fake_dequantize_op.cu fill_any_like_op.cu) register_unity_group( cu - flip_op.cu fsp_op.cu gather_nd_op.cu gather_op.cu diff --git a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h index d1d33d50a5dbb9..fa21a5f096611b 100644 --- a/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h +++ b/paddle/fluid/platform/device/gpu/cuda/cuda_helper.h @@ -70,11 +70,12 @@ namespace platform { * */ -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += blockDim.x * gridDim.x, i = __index__) +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) class CublasHandleHolder { public: diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index b99d6de5dbbb42..96eddf09237d98 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -198,61 +198,6 @@ __device__ __forceinline__ void fastAtomicAdd(T *arr, T value) { CudaAtomicAdd(arr + index, value); } - -#ifdef PADDLE_WITH_CUDA -/* - * One thead block deals with elementwise atomicAdd for vector of len. - * @in: [x1, x2, x3, ...] - * @out:[y1+x1, y2+x2, y3+x3, ...] - * */ -template ::value>::type * = nullptr> -__device__ __forceinline__ void VectorizedAtomicAddPerBlock( - const int64_t len, int tid, int threads_per_block, const T *in, T *out) { - for (int i = tid; i < len; i += threads_per_block) { - CudaAtomicAdd(&out[i], in[i]); - } -} - -// Note: assume that len is even. If len is odd, call fastAtomicAdd directly. 
-template ::value>::type * = nullptr> -__device__ __forceinline__ void VectorizedAtomicAddPerBlock( - const int64_t len, int tid, int threads_per_block, const T *in, T *out) { -#if ((CUDA_VERSION < 10000) || \ - (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) - for (int i = tid; i < len; i += threads_per_block) { - CudaAtomicAdd(&out[i], in[i]); - } -#else - int i = 0; - int loops = len / 2 * 2; - - bool aligned_half2 = - (reinterpret_cast(out) % sizeof(__half2) == 0); - - if (aligned_half2) { - for (i = tid * 2; i < loops; i += threads_per_block * 2) { - __half2 value2; - T value_1 = in[i]; - T value_2 = in[i + 1]; - value2.x = *reinterpret_cast<__half *>(&value_1); - value2.y = *reinterpret_cast<__half *>(&value_2); - atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2); - } - for (; i < len; i += threads_per_block) { - fastAtomicAdd(out, i, len, in[i]); - } - } else { - for (int i = tid; i < len; i += threads_per_block) { - fastAtomicAdd(out, i, len, in[i]); - } - } -#endif -} -#endif #endif // NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16. @@ -601,5 +546,61 @@ CUDA_ATOMIC_WRAPPER(Min, float16) { } #endif +#ifdef PADDLE_CUDA_FP16 +#ifdef PADDLE_WITH_CUDA +/* + * One thead block deals with elementwise atomicAdd for vector of len. + * @in: [x1, x2, x3, ...] + * @out:[y1+x1, y2+x2, y3+x3, ...] + * */ +template ::value>::type * = nullptr> +__device__ __forceinline__ void VectorizedAtomicAddPerBlock( + const int64_t len, int tid, int threads_per_block, const T *in, T *out) { + for (int i = tid; i < len; i += threads_per_block) { + CudaAtomicAdd(&out[i], in[i]); + } +} + +// Note: assume that len is even. If len is odd, call fastAtomicAdd directly. +template ::value>::type * = nullptr> +__device__ __forceinline__ void VectorizedAtomicAddPerBlock( + const int64_t len, int tid, int threads_per_block, const T *in, T *out) { +#if ((CUDA_VERSION < 10000) || \ + (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700))) + for (int i = tid; i < len; i += threads_per_block) { + CudaAtomicAdd(&out[i], in[i]); + } +#else + int i = 0; + int loops = len / 2 * 2; + + bool aligned_half2 = + (reinterpret_cast(out) % sizeof(__half2) == 0); + + if (aligned_half2) { + for (i = tid * 2; i < loops; i += threads_per_block * 2) { + __half2 value2; + T value_1 = in[i]; + T value_2 = in[i + 1]; + value2.x = *reinterpret_cast<__half *>(&value_1); + value2.y = *reinterpret_cast<__half *>(&value_2); + atomicAdd(reinterpret_cast<__half2 *>(&out[i]), value2); + } + for (; i < len; i += threads_per_block) { + fastAtomicAdd(out, i, len, in[i]); + } + } else { + for (int i = tid; i < len; i += threads_per_block) { + fastAtomicAdd(out, i, len, in[i]); + } + } +#endif +} +#endif +#endif } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index a5d89f6001fa18..5d89da86efa6cf 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -59,7 +59,7 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { return ncclUint8; } else if (type == framework::proto::VarType::BOOL) { return ncclUint8; -#if CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 } else if (type == framework::proto::VarType::BF16) { return ncclBfloat16; #endif @@ -86,7 +86,7 @@ inline ncclDataType_t ToNCCLDataType(experimental::DataType type) { return ncclInt8; } else if (type == experimental::DataType::BOOL) { return ncclUint8; -#if 
CUDNN_VERSION_MIN(8, 1, 0) && NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 } else if (type == experimental::DataType::BFLOAT16) { return ncclBfloat16; #endif diff --git a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h index 8bcae15d3517eb..45eba2b1537c87 100644 --- a/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h +++ b/paddle/fluid/platform/device/gpu/rocm/rocm_helper.h @@ -70,8 +70,9 @@ namespace platform { #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = \ static_cast(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \ + int64_t __stride__ = static_cast(hipBlockDim_x) * hipGridDim_x; \ for (index_type i = __index__; __index__ < (num); \ - __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) + __index__ += __stride__, i = __index__) class CublasHandleHolder { public: diff --git a/paddle/fluid/platform/device/ipu/ipu_backend.cc b/paddle/fluid/platform/device/ipu/ipu_backend.cc index 9e960a99123c07..ed4053e0aea9ec 100644 --- a/paddle/fluid/platform/device/ipu/ipu_backend.cc +++ b/paddle/fluid/platform/device/ipu/ipu_backend.cc @@ -52,9 +52,20 @@ void IpuBackend::Compile(framework::ir::Graph* graph, if (ipu_strategy_->is_training) { compiler_->LowerOptimizer(scope_); } + + // environment variable IPU_ONNX_DUMP_PATH have higher priority + std::string onnx_dump_path; if (!ipu_strategy_->onnx_dump_path.empty()) { - SaveModelProto(ipu_strategy_->onnx_dump_path); + onnx_dump_path = ipu_strategy_->onnx_dump_path; + } + auto* ipu_onnx_dump_path = getenv("IPU_ONNX_DUMP_PATH"); + if (ipu_onnx_dump_path) { + onnx_dump_path = std::string{ipu_onnx_dump_path}; } + if (!onnx_dump_path.empty()) { + SaveModelProto(onnx_dump_path); + } + executor_->SetCompilerResources(compiler_->GetResources()); executor_->Prepare(compiler_->GetModelProto()); is_compiled_ = true; diff --git a/paddle/fluid/platform/device/mlu/CMakeLists.txt b/paddle/fluid/platform/device/mlu/CMakeLists.txt index 43a8f175047502..1d4d87b9ec5326 100644 --- a/paddle/fluid/platform/device/mlu/CMakeLists.txt +++ b/paddle/fluid/platform/device/mlu/CMakeLists.txt @@ -9,7 +9,7 @@ cc_test( cc_library( mlu_info SRCS mlu_info.cc - DEPS enforce glog monitor neuware_lib) + DEPS enforce glog malloc monitor neuware_lib) cc_library( mlu_stream SRCS mlu_stream.cc diff --git a/paddle/fluid/platform/device/mlu/device_context.cc b/paddle/fluid/platform/device/mlu/device_context.cc index 087b4803320e50..796d7006834e4d 100644 --- a/paddle/fluid/platform/device/mlu/device_context.cc +++ b/paddle/fluid/platform/device/mlu/device_context.cc @@ -28,11 +28,13 @@ MLUContext::MLUContext(const MLUPlace& place, const int priority) { MLUDeviceGuard guard(place_.device); stream_.reset(new stream::MLUStream(place_, priority)); InitCNNLContext(); + InitMLUOPContext(); } MLUContext::~MLUContext() { MLUDeviceGuard guard(place_.device); DestoryCNNLContext(); + DestoryMLUOPContext(); } MLUDeviceContext::MLUDeviceContext(MLUPlace place) : place_(place) { @@ -41,6 +43,7 @@ MLUDeviceContext::MLUDeviceContext(MLUPlace place) : place_(place) { driver_version_ = GetMLUDriverVersion(place_.device); runtime_version_ = GetMLURuntimeVersion(place_.device); cnnl_version_ = GetMLUCnnlVersion(place_.device); + mluOp_version_ = GetMLUOpVersion(place_.device); LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << static_cast(place_.device) @@ -51,7 +54,9 @@ MLUDeviceContext::MLUDeviceContext(MLUPlace place) : place_(place) { << ", Runtime API Version: " << runtime_version_ / 10000 << "." 
<< (runtime_version_ / 100) % 100 << "." << runtime_version_ % 100 << ", Cnnl API Version: " << cnnl_version_ / 10000 << "." - << (cnnl_version_ / 100) % 100 << "." << cnnl_version_ % 100; + << (cnnl_version_ / 100) % 100 << "." << cnnl_version_ % 100 + << ", MluOp API Version: " << mluOp_version_ / 10000 << "." + << (mluOp_version_ / 100) % 100 << "." << mluOp_version_ % 100; default_ctx_.reset(new MLUContext(place_)); } @@ -70,6 +75,10 @@ mluCnnlHandle MLUDeviceContext::cnnl_handle() const { return context()->CnnlHandle(); } +mluOpHandle MLUDeviceContext::mluOp_handle() const { + return context()->MluOpHandle(); +} + mluStream MLUDeviceContext::stream() const { return context()->RawStream(); } #endif diff --git a/paddle/fluid/platform/device/mlu/device_context.h b/paddle/fluid/platform/device/mlu/device_context.h index d8bb762315948a..e1028667bc2070 100644 --- a/paddle/fluid/platform/device/mlu/device_context.h +++ b/paddle/fluid/platform/device/mlu/device_context.h @@ -53,12 +53,19 @@ class MLUContext { const mluCnnlHandle& CnnlHandle() const { return cnnl_handle_; } + const mluOpHandle& MluOpHandle() const { return mluOp_handle_; } + private: void InitCNNLContext() { PADDLE_ENFORCE_MLU_SUCCESS(cnnlCreate(&cnnl_handle_)); PADDLE_ENFORCE_MLU_SUCCESS(cnnlSetQueue(cnnl_handle_, RawStream())); } + void InitMLUOPContext() { + PADDLE_ENFORCE_MLU_SUCCESS(mluOpCreate(&mluOp_handle_)); + PADDLE_ENFORCE_MLU_SUCCESS(mluOpSetQueue(mluOp_handle_, RawStream())); + } + void DestoryCNNLContext() { if (cnnl_handle_) { PADDLE_ENFORCE_MLU_SUCCESS(cnnlDestroy(cnnl_handle_)); @@ -66,10 +73,18 @@ class MLUContext { cnnl_handle_ = nullptr; } + void DestoryMLUOPContext() { + if (mluOp_handle_) { + PADDLE_ENFORCE_MLU_SUCCESS(mluOpDestroy(mluOp_handle_)); + } + mluOp_handle_ = nullptr; + } + MLUPlace place_; std::unique_ptr eigen_device_; std::unique_ptr stream_; mluCnnlHandle cnnl_handle_; + mluOpHandle mluOp_handle_; DISABLE_COPY_AND_ASSIGN(MLUContext); }; @@ -89,6 +104,9 @@ class MLUDeviceContext : public DeviceContext { /*! \brief Return cnnl handle in the device context. */ mluCnnlHandle cnnl_handle() const; + /*! \brief Return mluOp handle in the device context. */ + mluOpHandle mluOp_handle() const; + /*! \brief Return mlu stream in the device context. */ mluStream stream() const; @@ -135,6 +153,7 @@ class MLUDeviceContext : public DeviceContext { int driver_version_; int runtime_version_; int cnnl_version_; + int mluOp_version_; MLUPlace place_; std::shared_ptr default_ctx_; diff --git a/paddle/fluid/platform/device/mlu/enforce.h b/paddle/fluid/platform/device/mlu/enforce.h index 05327a771d8c92..8b0d0bb36f5008 100644 --- a/paddle/fluid/platform/device/mlu/enforce.h +++ b/paddle/fluid/platform/device/mlu/enforce.h @@ -41,6 +41,7 @@ struct MLUStatusType {}; DEFINE_MLU_STATUS_TYPE(cnrtStatus, cnrtSuccess, CNRT); DEFINE_MLU_STATUS_TYPE(cnnlStatus, CNNL_STATUS_SUCCESS, CNNL); +DEFINE_MLU_STATUS_TYPE(mluOpStatus, MLUOP_STATUS_SUCCESS, MLUOP); DEFINE_MLU_STATUS_TYPE(cnStatus, CN_SUCCESS, CN); #ifdef PADDLE_WITH_CNCL DEFINE_MLU_STATUS_TYPE(cnclStatus, CNCL_RET_SUCCESS, CNCL); @@ -68,6 +69,15 @@ inline std::string build_mlu_error_msg(cnnlStatus stat) { return sout.str(); } +/*************** MLU OP ERROR ***************/ +inline bool is_error(mluOpStatus stat) { return stat != MLUOP_STATUS_SUCCESS; } + +inline std::string build_mlu_error_msg(mluOpStatus stat) { + std::ostringstream sout; + sout << "MLU OP error(" << stat << "), " << mluOpGetErrorString(stat) << ". 
"; + return sout.str(); +} + /*************** CN API ERROR ***************/ inline bool is_error(cnStatus stat) { return stat != CN_SUCCESS; } diff --git a/paddle/fluid/platform/device/mlu/mlu_info.cc b/paddle/fluid/platform/device/mlu/mlu_info.cc index e27720849e0f3e..a2e063397bd3a9 100644 --- a/paddle/fluid/platform/device/mlu/mlu_info.cc +++ b/paddle/fluid/platform/device/mlu/mlu_info.cc @@ -126,6 +126,13 @@ int GetMLUCnnlVersion(int id) { return x * 10000 + y * 100 + z; } +int GetMLUOpVersion(int id) { + CheckDeviceId(id); + int x, y, z; + mluOpGetLibVersion(&x, &y, &z); + return x * 10000 + y * 100 + z; +} + int GetMLUCurrentDeviceId() { int device_id; PADDLE_ENFORCE_MLU_SUCCESS(cnrtGetDevice(&device_id)); diff --git a/paddle/fluid/platform/device/mlu/mlu_info.h b/paddle/fluid/platform/device/mlu/mlu_info.h index 14f37879ef070d..c0cd24f00fbb6e 100644 --- a/paddle/fluid/platform/device/mlu/mlu_info.h +++ b/paddle/fluid/platform/device/mlu/mlu_info.h @@ -16,10 +16,11 @@ limitations under the License. */ #ifdef PADDLE_WITH_MLU #include -#include #include #include +#include #include +#include #ifdef PADDLE_WITH_CNCL #include #endif @@ -30,11 +31,13 @@ namespace paddle { using cnStatus = CNresult; using cnrtStatus = cnrtRet_t; using cnnlStatus = cnnlStatus_t; +using mluOpStatus = mluOpStatus_t; #ifdef PADDLE_WITH_CNCL using cnclStatus = cnclResult_t; #endif using mluStream = cnrtQueue_t; using mluCnnlHandle = cnnlHandle_t; +using mluOpHandle = mluOpHandle_t; using mluEventHandle = cnrtNotifier_t; using mluDeviceHandle = CNdev; @@ -49,6 +52,9 @@ int GetMLURuntimeVersion(int id); //! Get the cnnl version of the ith MLU. int GetMLUCnnlVersion(int id); +//! Get the mluOp version of the ith MLU. +int GetMLUOpVersion(int id); + //! Get the total number of MLU devices in system. 
int GetMLUDeviceCount(); diff --git a/paddle/fluid/platform/device/xpu/xpu2_op_list.h b/paddle/fluid/platform/device/xpu/xpu2_op_list.h index 0aa0e2049180fb..ba03a3b53dba37 100644 --- a/paddle/fluid/platform/device/xpu/xpu2_op_list.h +++ b/paddle/fluid/platform/device/xpu/xpu2_op_list.h @@ -654,7 +654,14 @@ XPUOpMap& get_kl2_ops() { {"resnet_basic_block_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, {"resnet_basic_block", - XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}}; + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})}, + {"fused_gemm_epilogue", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + {"fused_gemm_epilogue_grad", + XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()), + pOpKernelType(vartype::FP16, XPUPlace())})}, + }; return s_xpu2_kernels; } diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index e2fec11c190d33..f7c715d7905a52 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -65,6 +65,8 @@ DeviceType Place2DeviceType(const platform::Place& place) { return platform::DeviceType::NPU; } else if (platform::is_mlu_place(place)) { return platform::DeviceType::MLU; + } else if (platform::is_custom_place(place)) { + return platform::DeviceType::CUSTOM_DEVICE; } else { PADDLE_THROW(platform::errors::Unavailable( "Unsupported place %s to convert into platform::DeviceType.", place)); diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 4b8833f9a6cd6d..7939f8ff7c0660 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -117,8 +117,9 @@ enum DeviceType { XPU = 3, IPU = 4, MLU = 5, + CUSTOM_DEVICE = 6, - MAX_DEVICE_TYPES = 6, + MAX_DEVICE_TYPES = 7, }; DeviceType Place2DeviceType(const platform::Place& place); @@ -129,6 +130,7 @@ constexpr DeviceType kXPU = DeviceType::XPU; constexpr DeviceType kNPU = DeviceType::NPU; constexpr DeviceType kIPU = DeviceType::IPU; constexpr DeviceType kMLU = DeviceType::MLU; +constexpr DeviceType kCUSOTM_DEVICE = DeviceType::CUSTOM_DEVICE; using DeviceContext = phi::DeviceContext; diff --git a/paddle/fluid/platform/dynload/cublasLt.h b/paddle/fluid/platform/dynload/cublasLt.h index 3a1d28072c591c..c3425ac6048581 100644 --- a/paddle/fluid/platform/dynload/cublasLt.h +++ b/paddle/fluid/platform/dynload/cublasLt.h @@ -40,26 +40,28 @@ namespace dynload { // APIs available after CUDA 10.1 // #if CUDA_VERSION >= 10100 -#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasLtCreate); \ - __macro(cublasLtDestroy); \ - __macro(cublasLtMatmul); \ - __macro(cublasLtMatmulDescCreate); \ - __macro(cublasLtMatmulDescDestroy); \ - __macro(cublasLtMatmulDescSetAttribute); \ - __macro(cublasLtMatmulDescGetAttribute); \ - __macro(cublasLtMatrixLayoutCreate); \ - __macro(cublasLtMatrixLayoutDestroy); \ - __macro(cublasLtMatrixLayoutSetAttribute); \ - __macro(cublasLtMatrixLayoutGetAttribute); \ - __macro(cublasLtMatmulPreferenceCreate); \ - __macro(cublasLtMatmulPreferenceDestroy); \ - __macro(cublasLtMatmulPreferenceSetAttribute); \ - __macro(cublasLtMatmulAlgoGetHeuristic); \ - __macro(cublasLtMatrixTransform); \ - __macro(cublasLtMatrixTransformDescCreate); \ - __macro(cublasLtMatrixTransformDescDestroy); \ - __macro(cublasLtMatrixTransformDescSetAttribute); +#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasLtCreate); \ + __macro(cublasLtDestroy); \ + 
__macro(cublasLtMatmul); \ + __macro(cublasLtMatmulDescCreate); \ + __macro(cublasLtMatmulDescDestroy); \ + __macro(cublasLtMatmulDescSetAttribute); \ + __macro(cublasLtMatmulDescGetAttribute); \ + __macro(cublasLtMatrixLayoutCreate); \ + __macro(cublasLtMatrixLayoutDestroy); \ + __macro(cublasLtMatrixLayoutSetAttribute); \ + __macro(cublasLtMatrixLayoutGetAttribute); \ + __macro(cublasLtMatmulPreferenceCreate); \ + __macro(cublasLtMatmulPreferenceDestroy); \ + __macro(cublasLtMatmulPreferenceSetAttribute); \ + __macro(cublasLtMatmulAlgoGetHeuristic); \ + __macro(cublasLtMatrixTransform); \ + __macro(cublasLtMatrixTransformDescCreate); \ + __macro(cublasLtMatrixTransformDescDestroy); \ + __macro(cublasLtMatrixTransformDescSetAttribute); \ + __macro(cublasLtMatmulAlgoInit); \ + __macro(cublasLtMatmulAlgoConfigSetAttribute); CUBLASLT_BLAS_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) // #endif diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc index 8d700faac0c148..d3f3eeadee1eba 100644 --- a/paddle/fluid/platform/dynload/tensorrt.cc +++ b/paddle/fluid/platform/dynload/tensorrt.cc @@ -41,21 +41,11 @@ void* GetDsoHandle(const std::string& dso_name) { void* dso_handle = dlopen(dso_name.c_str(), dynload_flags); - if (nullptr == dso_handle) { - auto error_msg = - "You are using Paddle compiled with TensorRT, but TensorRT dynamic " - "library is not found. Ignore this if TensorRT is not needed.\n" - "The TensorRT that Paddle depends on is not configured correctly.\n" - " Suggestions:\n" - " 1. Check if the TensorRT is installed correctly and its version" - " is matched with paddlepaddle you installed.\n" - " 2. Configure environment variables as " - "follows:\n" - " - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n" - " - Windows: set PATH by `set PATH=XXX;%PATH%`\n" - " - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...`\n"; - LOG(WARNING) << error_msg; - } + PADDLE_ENFORCE_NOT_NULL(dso_handle, + paddle::platform::errors::NotFound( + "TensorRT is needed, " + "but TensorRT dynamic library is not found.")); + return dso_handle; } diff --git a/paddle/fluid/platform/flags.cc b/paddle/fluid/platform/flags.cc index 28dddc1fbebdd6..518aabbb09ead8 100644 --- a/paddle/fluid/platform/flags.cc +++ b/paddle/fluid/platform/flags.cc @@ -388,11 +388,7 @@ PADDLE_DEFINE_EXPORTED_int32( * enable garbage collection strategy when training large networks. */ // Disable gc by default when inference library is built -#ifdef PADDLE_ON_INFERENCE -static const double kDefaultEagerDeleteTensorGB = -1; -#else static const double kDefaultEagerDeleteTensorGB = 0; -#endif PADDLE_DEFINE_EXPORTED_double( eager_delete_tensor_gb, @@ -663,7 +659,7 @@ PADDLE_DEFINE_EXPORTED_bool(use_mkldnn, false, "Use MKLDNN to run"); * If FLAGS_call_stack_level == 2, the python stack, c++ stack, and error * message summary will be shown. 
*/ -#ifdef PADDLE_ON_INFERENCE +#ifdef PADDLE_NO_PYTHON static const int32_t kDefaultCallStackLevel = 2; #else static const int32_t kDefaultCallStackLevel = 1; @@ -1014,12 +1010,13 @@ PADDLE_DEFINE_EXPORTED_bool( * Name: FLAGS_jit_engine_type * Since Version: 2.3.0 * Value Range: string, {Executor, PE}, - * default=PE + * default=Predictor * Example: * Note: * FLAGS_jit_engine_type == Executor, using ExecutorEngine by default * FLAGS_jit_engine_type == PE, using PEEngine by default + * FLAGS_jit_engine_type == Predictor, using inference Predictor by default */ PADDLE_DEFINE_EXPORTED_string(jit_engine_type, - "PE", + "Predictor", "Choose default funciton type in JitLayer."); diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 933ac4f12e3c4e..221eab737d95a3 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -111,6 +111,111 @@ static void AppendActivation(const framework::ExecutionContext& ctx, } } +static void SetOutMemDescWithUnsqueeze2FuseSupport( + const framework::ExecutionContext& ctx, + phi::DenseTensor* out, + const dnnl::memory::desc& out_md) { + const std::vector& fused_unsqueeze2_axes = + ctx.Attr>("fused_unsqueeze2_axes"); + const std::vector& op_tz = out_md.dims(); + std::vector unsqueezed_op_tz( + op_tz.size() + fused_unsqueeze2_axes.size(), 0); + for (const auto& axis : fused_unsqueeze2_axes) { + int positive_axis = axis < 0 ? unsqueezed_op_tz.size() + axis : axis; + unsqueezed_op_tz[positive_axis] = 1; + } + int j = 0; + for (size_t i = 0; i < unsqueezed_op_tz.size(); ++i) { + if (unsqueezed_op_tz[i] == 0) { + unsqueezed_op_tz[i] = op_tz[j++]; + } + } + out->set_mem_desc(out_md.reshape(unsqueezed_op_tz)); + out->Resize(phi::make_ddim(unsqueezed_op_tz)); +} + +static void SetOutMemDescWithReshape2FuseSupport( + const framework::ExecutionContext& ctx, + phi::DenseTensor* out, + const dnnl::memory::desc& out_md) { + std::vector fused_reshape2_shape( + ctx.Attr>("fused_reshape2_shape").begin(), + ctx.Attr>("fused_reshape2_shape").end()); + const int out_shape_numel = out->numel(); + const int new_shape_numel = std::accumulate(fused_reshape2_shape.begin(), + fused_reshape2_shape.end(), + 1, + std::multiplies()); + for (size_t i = 0; i < fused_reshape2_shape.size(); ++i) { + if (fused_reshape2_shape[i] == -1) { + fused_reshape2_shape[i] = -out_shape_numel / new_shape_numel; + break; + } + } + out->set_mem_desc(out_md.reshape(fused_reshape2_shape)); + out->Resize(phi::make_ddim(fused_reshape2_shape)); +} + +static void SetOutMemDescWithLogicalLayoutFusesSupport( + const framework::ExecutionContext& ctx, + phi::DenseTensor* out, + const dnnl::memory::desc& out_md) { + if (ctx.HasAttr("fused_unsqueeze2_axes")) { + SetOutMemDescWithUnsqueeze2FuseSupport(ctx, out, out_md); + } else if (ctx.HasAttr("fused_reshape2_shape")) { + SetOutMemDescWithReshape2FuseSupport(ctx, out, out_md); + } else if (ctx.HasAttr("fused_squeeze2_axes")) { + out->set_mem_desc(out_md); + out->Resize(phi::make_ddim(out_md.dims())); + } else { + out->set_mem_desc(out_md); + } +} + +static void SetInMemDescWithSqueeze2FuseSupport( + const framework::ExecutionContext& ctx, + phi::DenseTensor* in, + const dnnl::memory::desc& in_md) { + const std::vector fused_squeeze2_axes = + ctx.Attr>("fused_squeeze2_axes"); + const std::set squeeze2_axes_set(fused_squeeze2_axes.begin(), + fused_squeeze2_axes.end()); + const std::vector& x_vec_dims = in_md.dims(); + std::vector squeezed_op_tz( + x_vec_dims.size() - fused_squeeze2_axes.size(), 0); 
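
SetOutMemDescWithUnsqueeze2FuseSupport above rebuilds the output dims by marking each fused unsqueeze2 axis with a placeholder 1 and then streaming the original extents into the remaining slots. A standalone sketch of that dim expansion; the function name is hypothetical and, like the helper itself, it assumes none of the original extents is 0:

#include <cstdint>
#include <vector>

std::vector<int64_t> UnsqueezeDims(const std::vector<int64_t>& dims,
                                   const std::vector<int>& axes) {
  std::vector<int64_t> out(dims.size() + axes.size(), 0);
  for (int axis : axes) {
    int pos = axis < 0 ? static_cast<int>(out.size()) + axis : axis;
    out[pos] = 1;  // inserted singleton dimension
  }
  size_t j = 0;
  for (size_t i = 0; i < out.size(); ++i) {
    if (out[i] == 0) out[i] = dims[j++];  // copy the original extents in order
  }
  return out;
}

// UnsqueezeDims({3, 5}, {0, 2}) -> {1, 3, 1, 5}
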
+ + int j = 0; + for (size_t i = 0; i < x_vec_dims.size(); ++i) { + if (squeeze2_axes_set.count(i) || + squeeze2_axes_set.count(i - x_vec_dims.size())) { + PADDLE_ENFORCE_EQ( + x_vec_dims[i], + 1, + platform::errors::InvalidArgument( + "Squeeze2 input '%d' dim should be equal to one, but get '%d'.", + i, + x_vec_dims[i])); + continue; + } + squeezed_op_tz[j++] = x_vec_dims[i]; + } + + in->set_mem_desc(in_md.reshape(squeezed_op_tz)); + in->Resize(phi::make_ddim(squeezed_op_tz)); +} + +static void SetInMemDescWithLogicalLayoutFusesSupport( + const framework::ExecutionContext& ctx, + phi::DenseTensor* in, + const dnnl::memory::desc& in_md) { + if (ctx.HasAttr("fused_squeeze2_axes")) { + SetInMemDescWithSqueeze2FuseSupport(ctx, in, in_md); + } else { + in->set_mem_desc(in_md); + in->Resize(phi::make_ddim(in_md.dims())); + } +} + template constexpr bool IsInt8() { return std::is_same::value || std::is_same::value; @@ -196,7 +301,8 @@ class MatMulV2MKLDNNHandler out_strides[i] = out_ddims[i + 1] * out_strides[i + 1]; } - if (!IsInt8() && !IsBfloat16() && is_output_fused) { + // TODO(jczaja): Why not for int8?? + if (!IsInt8() && is_output_fused) { out_strides = FakeTransposeStrides(out_ddims); } @@ -250,6 +356,12 @@ class MatMulV2MKLDNNHandler AppendActivation(ctx, post_operations); + if (ctx.HasAttr("fused_output_scale")) { + float scale_alpha = ctx.Attr("fused_output_scale"); + post_operations.append_eltwise( + 1.0, dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f); + } + matmul_attrs.set_post_ops(post_operations); return matmul_attrs; } @@ -293,103 +405,6 @@ class MatMulV2MKLDNNHandler } }; -template -class ActivationMKLDNNHandler - : public MKLDNNHandlerNoCachingT { - public: - ActivationMKLDNNHandler(dnnl::algorithm algorithm, - const framework::ExecutionContext& ctx, - const dnnl::engine engine, - Place cpu_place, - const framework::Tensor* x) - : platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; - - if (ctx.Type() == "scale") { - bool bias_after_scale = ctx.Attr("bias_after_scale"); - auto* scale_tensor = ctx.Input("ScaleTensor"); - alpha = (scale_tensor == nullptr) - ? ctx.Attr("scale") - : static_cast(*(scale_tensor->data())); - beta = ctx.Attr("bias"); - // if bias_after_scale == true - // out = scale*X + bias - // else - // out = scale*(X + bias) = scale*X + scale*bias - if (!bias_after_scale) { - beta *= alpha; - } - } else if (ctx.Type() == "clip") { - alpha = ctx.HasInput("Min") ? ctx.Input("Min")->data()[0] - : ctx.Attr("min"); - beta = ctx.HasInput("Max") ? ctx.Input("Max")->data()[0] - : ctx.Attr("max"); - } else { - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == dnnl::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); - } - } - - this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, - algorithm, - x->mem_desc(), - alpha, - beta); - } - - ActivationMKLDNNHandler(dnnl::algorithm algorithm, - const framework::ExecutionContext& ctx, - const dnnl::engine engine, - Place cpu_place, - const framework::Tensor* x, - const Tensor* dout) - : platform::MKLDNNHandlerNoCachingT(engine, - cpu_place) { - float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - float beta = ctx.HasAttr("beta") ? 
ctx.Attr("beta") : 0; - - // paddle uses beta but mkldnn uses alpha for swish - if (algorithm == dnnl::algorithm::eltwise_swish) { - std::swap(alpha, beta); - } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); - } - - if (ctx.Type() == "clip_grad") { - alpha = ctx.HasInput("Min") ? ctx.Input("Min")->data()[0] - : ctx.Attr("min"); - beta = ctx.HasInput("Max") ? ctx.Input("Max")->data()[0] - : ctx.Attr("max"); - } - - this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, - algorithm, - x->mem_desc(), - alpha, - beta); - this->AcquireBackwardPrimitiveDescriptor( - algorithm, dout->mem_desc(), x->mem_desc(), alpha, beta); - } - - std::shared_ptr AcquireBackwardSrcMemory( - const framework::Tensor* input) { - const T* input_data = input->data(); - return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(), - to_void_cast(input_data)); - } -}; - static std::unordered_map GetAttributeMap( std::string act_type) { std::unordered_map attr_map; diff --git a/paddle/fluid/platform/profiler/chrometracing_logger.cc b/paddle/fluid/platform/profiler/chrometracing_logger.cc index 1e22ffe1a8dcf7..287628c85e5041 100644 --- a/paddle/fluid/platform/profiler/chrometracing_logger.cc +++ b/paddle/fluid/platform/profiler/chrometracing_logger.cc @@ -546,10 +546,10 @@ void ChromeTracingLogger::HandleTypeMemset( } void ChromeTracingLogger::StartLog() { - output_file_stream_ << string_format(std::string( + output_file_stream_ << std::string( R"JSON( { - "displayTimeUnit": "ms",)JSON")); + "displayTimeUnit": "ms",)JSON"); } void ChromeTracingLogger::LogMetaInfo(const std::string& version, diff --git a/paddle/fluid/platform/profiler/profiler.cc b/paddle/fluid/platform/profiler/profiler.cc index 5957c4c24ca3be..72fb647a04efbd 100644 --- a/paddle/fluid/platform/profiler/profiler.cc +++ b/paddle/fluid/platform/profiler/profiler.cc @@ -29,7 +29,10 @@ #include "paddle/fluid/platform/profiler/custom_device/custom_tracer.h" #include "paddle/fluid/platform/profiler/extra_info.h" #include "paddle/fluid/platform/profiler/host_tracer.h" +#ifdef PADDLE_WITH_MLU +#include "paddle/fluid/platform/device/mlu/enforce.h" #include "paddle/fluid/platform/profiler/mlu/mlu_tracer.h" +#endif #include "paddle/fluid/platform/profiler/trace_event_collector.h" #include "paddle/fluid/platform/profiler/utils.h" @@ -80,9 +83,11 @@ Profiler::Profiler(const ProfilerOptions& options, if (trace_switch.test(kProfileGPUOptionBit)) { tracers_.emplace_back(&CudaTracer::GetInstance(), false); } +#ifdef PADDLE_WITH_MLU if (trace_switch.test(kProfileMLUOptionBit)) { tracers_.emplace_back(&MluTracer::GetInstance(), false); } +#endif if (trace_switch.test(kProfileCustomDeviceOptionBit)) { for (const auto& dev_type : custom_device_types) { tracers_.emplace_back(&CustomTracer::GetInstance(dev_type), false); diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 6aa2e7394c90bd..0a59caae2bbe81 100755 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -137,9 +137,10 @@ set(PYBIND_SRCS if(WITH_CUSTOM_DEVICE) set(PYBIND_DEPS ${PYBIND_DEPS} phi_capi) + set(PYBIND_DEPS ${PYBIND_DEPS} custom_device_common_op_registry) endif() -if(NOT ON_INFER) +if(WITH_PYTHON) set(PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) if(WITH_NCCL OR WITH_RCCL) set(PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) @@ -571,25 +572,33 @@ if(WITH_PYTHON) list(APPEND PYBIND_DEPS custom_operator_node) endif() + # On Linux, cc_library(paddle SHARED ..) 
will generate the libpaddle.so, + # add a prefix `lib` by default, but on Windows, cc_library(paddle SHARED ..) + # will not add prefix, so it generate paddle.lib and paddle.pyd, + # we need to pay attention to the difference + set(SHARD_LIB_NAME paddle) + if(WIN32) + set(SHARD_LIB_NAME libpaddle) + endif() cc_library( - paddle_pybind SHARED + ${SHARD_LIB_NAME} SHARED SRCS ${PYBIND_SRCS} DEPS ${PYBIND_DEPS} ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) if(NOT ((NOT WITH_PYTHON) AND ON_INFER)) - add_dependencies(paddle_pybind legacy_eager_codegen) - add_dependencies(paddle_pybind eager_legacy_op_function_generator_cmd) + add_dependencies(${SHARD_LIB_NAME} legacy_eager_codegen) + add_dependencies(${SHARD_LIB_NAME} eager_legacy_op_function_generator_cmd) endif() if(NOT APPLE AND NOT WIN32) - target_link_libraries(paddle_pybind rt) + target_link_libraries(${SHARD_LIB_NAME} rt) endif() if(WITH_ROCM) - target_link_libraries(paddle_pybind ${ROCM_HIPRTC_LIB}) + target_link_libraries(${SHARD_LIB_NAME} ${ROCM_HIPRTC_LIB}) endif() get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES) - target_link_libraries(paddle_pybind ${os_dependency_modules}) - add_dependencies(paddle_pybind op_function_generator_cmd) + target_link_libraries(${SHARD_LIB_NAME} ${os_dependency_modules}) + add_dependencies(${SHARD_LIB_NAME} op_function_generator_cmd) endif() diff --git a/paddle/fluid/pybind/bind_fleet_executor.cc b/paddle/fluid/pybind/bind_fleet_executor.cc index b4a6432e9e58b2..616dbada4d227f 100644 --- a/paddle/fluid/pybind/bind_fleet_executor.cc +++ b/paddle/fluid/pybind/bind_fleet_executor.cc @@ -65,6 +65,7 @@ struct npy_format_descriptor { namespace paddle { namespace pybind { +using paddle::distributed::DependType; using paddle::distributed::DistModel; using paddle::distributed::DistModelConfig; using paddle::distributed::DistModelDataBuf; @@ -164,18 +165,17 @@ void BindFleetExecutor(py::module* m) { .def( "run", &FleetExecutor::Run, py::call_guard()); + py::enum_(*m, "DependType") + .value("NORMAL", DependType::NORMAL) + .value("LOOP", DependType::LOOP) + .value("STOP_LOOP", DependType::STOP_LOOP); + py::class_(*m, "TaskNode") - .def(py::init()) .def(py::init()) .def(py::init&, int64_t, int64_t, - int64_t, int64_t>()) .def("task_id", &TaskNode::task_id) .def("add_upstream_task", &TaskNode::AddUpstreamTask) @@ -183,7 +183,10 @@ void BindFleetExecutor(py::module* m) { .def("set_run_pre_steps", &TaskNode::SetRunPerSteps) .def("set_run_at_offset", &TaskNode::SetRunAtOffset) .def("set_type", &TaskNode::SetType) + .def("set_cond_var_name", &TaskNode::SetCondVarName) .def("role", &TaskNode::role) + .def("set_vars_to_shape", &TaskNode::SetVarsToShape) + .def("set_vars_to_dtype", &TaskNode::SetVarsToDtype) .def("init", [](TaskNode& self) { self.Init(); }) .def("set_program", &TaskNode::SetProgram); diff --git a/paddle/fluid/pybind/cuda_streams_py.cc b/paddle/fluid/pybind/cuda_streams_py.cc index 66cd20340ca857..8fa1df2a53d3a3 100644 --- a/paddle/fluid/pybind/cuda_streams_py.cc +++ b/paddle/fluid/pybind/cuda_streams_py.cc @@ -321,8 +321,8 @@ void BindCudaStream(py::module *m_ptr) { Parameters: enable_timing(bool, optional): Whether the event will measure time. Default: False. blocking(bool, optional): Whether the wait() func will be blocking. Default: False; - interprocess(bool, optional): Whether the event can be shared between processes. Defalut: False. - + interprocess(bool, optional): Whether the event can be shared between processes. Default: False. + Examples: .. 
code-block:: python diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index 5a7e2355f64ebc..29e6e9e5d1e79c 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -24,6 +24,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/distributed/collective/ProcessGroupStream.h" #include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/distributed/collective/Utils.h" #include "paddle/fluid/distributed/collective/reducer.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" @@ -171,6 +172,24 @@ void BindDistributed(py::module *m) { py::arg("source_rank"), py::call_guard()) + .def( + "broadcast", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int src, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::BroadcastOptions opts{src}; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Broadcast(tensors, tensors, opts, sync_op); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("sync_op"), + py::call_guard()) + .def( "barrier", [](distributed::ProcessGroup &self, std::vector place_ids) { @@ -196,6 +215,23 @@ void BindDistributed(py::module *m) { py::arg("dst"), py::call_guard()) + .def( + "send", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int dst, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Send(tensors, dst, sync_op); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("sync_op"), + py::call_guard()) + .def( "send_partial", [](distributed::ProcessGroup &self, @@ -206,9 +242,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int send_numel = numel / nranks; - int offset = send_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; return self.Send_Partial(*dense, dst_rank, offset, send_numel); }, py::arg("tensor"), @@ -217,6 +253,30 @@ void BindDistributed(py::module *m) { py::arg("id"), py::call_guard()) + .def( + "send_partial", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int dst_rank, + int nranks, + int rank_id, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int64_t numel = (*dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; + return self.Send_Partial( + *dense, dst_rank, offset, send_numel, sync_op); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("num"), + py::arg("id"), + py::arg("sync_op"), + py::call_guard()) + .def( "recv", [](distributed::ProcessGroup &self, @@ -232,6 +292,23 @@ void BindDistributed(py::module *m) { py::arg("src"), py::call_guard()) + .def( + "recv", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int src, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Recv(tensors, src, sync_op); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("sync_op"), + py::call_guard()) + .def( "recv_partial", 
[](distributed::ProcessGroup &self, @@ -242,9 +319,9 @@ void BindDistributed(py::module *m) { auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); auto dense = std::dynamic_pointer_cast(tensor.impl()); - int numel = (*dense).numel(); - int recv_numel = numel / nranks; - int offset = recv_numel * rank_id; + int64_t numel = (*dense).numel(); + int64_t recv_numel = numel / nranks; + int64_t offset = recv_numel * rank_id; return self.Recv_Partial(*dense, src_rank, offset, recv_numel); }, py::arg("tensor"), @@ -253,6 +330,30 @@ void BindDistributed(py::module *m) { py::arg("id"), py::call_guard()) + .def( + "recv_partial", + [](distributed::ProcessGroup &self, + py::handle py_tensor, + int src_rank, + int nranks, + int rank_id, + bool sync_op) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int64_t numel = (*dense).numel(); + int64_t recv_numel = numel / nranks; + int64_t offset = recv_numel * rank_id; + return self.Recv_Partial( + *dense, src_rank, offset, recv_numel, sync_op); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("num"), + py::arg("id"), + py::arg("sync_op"), + py::call_guard()) + .def( "all_gather", [](distributed::ProcessGroup &self, @@ -272,6 +373,57 @@ void BindDistributed(py::module *m) { py::arg("out"), py::call_guard()) + .def( + "allgather", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor_list, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor_list = + CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); + Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); + auto out_dense = std::dynamic_pointer_cast( + concat_out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + const auto *dev_ctx = self.GetDeviceContext(in_tensor.place()); + auto task = self.AllGather(in_wrapper, out_wrapper, sync_op); + distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + return task; + }, + py::arg("in"), + py::arg("out"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "allgather_into_tensor", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllGather(in_wrapper, out_wrapper, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("sync_op"), + py::call_guard()) + .def( "all_gather_partial", [](distributed::ProcessGroup &self, @@ -287,9 +439,9 @@ void BindDistributed(py::module *m) { out_tensor.impl()); std::vector in_tensors = {*in_dense}; std::vector out_tensors = {*out_dense}; - int numel = (*in_dense).numel(); - int send_numel = numel / nranks; - int offset = send_numel * rank_id; + int64_t numel = (*in_dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; return self.AllGather_Partial( in_tensors, out_tensors, offset, send_numel); }, @@ -318,6 +470,61 @@ void BindDistributed(py::module *m) { py::arg("out"), py::call_guard()) + .def( + "alltoall", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor_list, + py::handle 
py_out_tensor_list, + bool sync_op) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor_list = + CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); + Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); + auto out_dense = std::dynamic_pointer_cast( + concat_out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + // in_tensor_list should not be empty + const auto *dev_ctx = + self.GetDeviceContext(in_tensor_list.back().place()); + auto task = self.AllToAll(in_wrapper, out_wrapper, sync_op); + distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + return task; + }, + py::arg("in"), + py::arg("out"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "alltoall_tensor", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllToAll(in_wrapper, out_wrapper, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("sync_op"), + py::call_guard()) + .def( "alltoall_single", [](distributed::ProcessGroup &self, @@ -342,6 +549,34 @@ void BindDistributed(py::module *m) { py::arg("out_sizes"), py::call_guard()) + .def( + "alltoall_single", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + std::vector &in_sizes, + std::vector &out_sizes, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllToAllSingle( + in_wrapper, out_wrapper, in_sizes, out_sizes, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("in_sizes"), + py::arg("out_sizes"), + py::arg("sync_op"), + py::call_guard()) + .def( "reduce", [](distributed::ProcessGroup &self, @@ -361,6 +596,83 @@ void BindDistributed(py::module *m) { py::arg("dst"), py::arg("op") = distributed::ReduceOp::SUM, py::call_guard()) + + .def( + "reduce", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + int dst, + distributed::ReduceOp op, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + distributed::ReduceOptions opts{op, dst}; + auto dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector tensors = {*dense}; + return self.Reduce(tensors, tensors, opts, sync_op); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("op"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "reduce_scatter", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor, + distributed::ReduceOp op, + bool sync_op) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper 
= {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ReduceScatterOptions opts{op}; + return self.ReduceScatter( + in_wrapper, out_wrapper, opts, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("op"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "reduce_scatter_tensor", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + distributed::ReduceOp op, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ReduceScatterOptions opts{op}; + return self.ReduceScatter( + in_wrapper, out_wrapper, opts, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("op"), + py::arg("sync_op"), + py::call_guard()) + .def( "scatter", [](distributed::ProcessGroup &self, @@ -383,31 +695,171 @@ void BindDistributed(py::module *m) { py::arg("out"), py::arg("src"), py::call_guard()) + .def( - "_reduce_scatter_base", + "scatter", [](distributed::ProcessGroup &self, + py::handle py_in_tensor_list, py::handle py_out_tensor, - py::handle py_in_tensor, - distributed::ReduceOp op) { - auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + int src, + bool sync_op) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); - distributed::ReduceScatterOptions opts; - opts.reduce_op = op; - auto dense_out = std::dynamic_pointer_cast( + auto out_dense = std::dynamic_pointer_cast( out_tensor.impl()); - auto dense_in = std::dynamic_pointer_cast( - in_tensor.impl()); - return self._ReduceScatterBase(*dense_out, *dense_in, opts); + std::vector out_wrapper = {*out_dense}; + + distributed::ScatterOptions opts{src}; + return self.Scatter(in_wrapper, out_wrapper, opts, sync_op); }, - py::arg("out_tensor"), - py::arg("in_tensor"), - py::arg("op") = distributed::ReduceOp::SUM, - py::call_guard()); + py::arg("in"), + py::arg("out"), + py::arg("src"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "scatter_tensor", + [](distributed::ProcessGroup &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + int src, + bool sync_op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ScatterOptions opts{src}; + return self.Scatter(in_wrapper, out_wrapper, opts, sync_op); + }, + py::arg("in"), + py::arg("out"), + py::arg("src"), + py::arg("sync_op"), + py::call_guard()) + + .def( + "_reduce_scatter_base", + [](distributed::ProcessGroup &self, + py::handle py_out_tensor, + py::handle py_in_tensor, + distributed::ReduceOp op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = 
CastPyArg2Tensor(py_out_tensor.ptr(), 0); + distributed::ReduceScatterOptions opts; + opts.reduce_op = op; + auto dense_out = std::dynamic_pointer_cast( + out_tensor.impl()); + auto dense_in = std::dynamic_pointer_cast( + in_tensor.impl()); + return self._ReduceScatterBase(*dense_out, *dense_in, opts); + }, + py::arg("out_tensor"), + py::arg("in_tensor"), + py::arg("op") = distributed::ReduceOp::SUM, + py::call_guard()); auto ProcessGroupStream = py::class_>( *m, "ProcessGroupStream", ProcessGroup) + .def( + "allgather_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor_list) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor_list = + CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); + Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); + auto out_dense = std::dynamic_pointer_cast( + concat_out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + const auto *dev_ctx = + self.GetDeviceContext(in_tensor.place(), true); + auto task = self.AllGather(in_wrapper, + out_wrapper, + /*sync_op*/ true, + /*use_calc_stream*/ true); + distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + return task; + }, + py::arg("in"), + py::arg("out"), + py::call_guard()) + + .def( + "allgather_into_tensor_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllGather(in_wrapper, + out_wrapper, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::call_guard()) + + .def( + "all_gather_partial_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + int nranks, + int rank_id) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector in_tensors = {*in_dense}; + std::vector out_tensors = {*out_dense}; + int64_t numel = (*in_dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; + return self.AllGather_Partial(in_tensors, + out_tensors, + offset, + send_numel, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("num"), + py::arg("id"), + py::call_guard()) + .def( "allreduce_on_calc_stream", [](distributed::ProcessGroupStream &self, @@ -427,6 +879,339 @@ void BindDistributed(py::module *m) { }, py::arg("tensor"), py::arg("op"), + py::call_guard()) + + .def( + "alltoall_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor_list) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor_list = + 
CastPyArg2VectorOfTensor(py_out_tensor_list.ptr(), 0); + Tensor concat_out_tensor = paddle::concat(out_tensor_list, 0); + auto out_dense = std::dynamic_pointer_cast( + concat_out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + // in_tensor_list must not be empty + const auto *dev_ctx = self.GetDeviceContext( + in_tensor_list.back().place(), /*use_calc_stream*/ true); + auto task = self.AllToAll(in_wrapper, + out_wrapper, + /*sync_op*/ true, + /*use_calc_stream*/ true); + distributed::SplitTensor(dev_ctx, *out_dense, &out_tensor_list); + return task; + }, + py::arg("in"), + py::arg("out"), + py::call_guard()) + + .def( + "alltoall_tensor_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllToAll(in_wrapper, + out_wrapper, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::call_guard()) + + .def( + "alltoall_single_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + std::vector &in_sizes, + std::vector &out_sizes) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + return self.AllToAllSingle(in_wrapper, + out_wrapper, + in_sizes, + out_sizes, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("in_sizes"), + py::arg("out_sizes"), + py::call_guard()) + + .def( + "broadcast_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int src) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + distributed::BroadcastOptions opts{src}; + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Broadcast(tensors, + tensors, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("src"), + py::call_guard()) + + .def( + "reduce_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + int dst, + distributed::ReduceOp op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + distributed::ReduceOptions opts{op, dst}; + auto dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector tensors = {*dense}; + return self.Reduce(tensors, + tensors, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("op"), + py::call_guard()) + + .def( + "reduce_scatter_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor, + distributed::ReduceOp op) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 
0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ReduceScatterOptions opts{op}; + return self.ReduceScatter(in_wrapper, + out_wrapper, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("op"), + py::call_guard()) + + .def( + "reduce_scatter_tensor_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + distributed::ReduceOp op) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ReduceScatterOptions opts{op}; + return self.ReduceScatter(in_wrapper, + out_wrapper, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("op"), + py::call_guard()) + + .def( + "scatter_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor_list, + py::handle py_out_tensor, + int src) { + auto in_tensor_list = + CastPyArg2VectorOfTensor(py_in_tensor_list.ptr(), 0); + Tensor concat_in_tensor = paddle::concat(in_tensor_list, 0); + auto in_dense = std::dynamic_pointer_cast( + concat_in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ScatterOptions opts{src}; + return self.Scatter(in_wrapper, + out_wrapper, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("src"), + py::call_guard()) + + .def( + "scatter_tensor_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_in_tensor, + py::handle py_out_tensor, + int src) { + auto in_tensor = CastPyArg2Tensor(py_in_tensor.ptr(), 0); + auto in_dense = std::dynamic_pointer_cast( + in_tensor.impl()); + std::vector in_wrapper = {*in_dense}; + + auto out_tensor = CastPyArg2Tensor(py_out_tensor.ptr(), 0); + auto out_dense = std::dynamic_pointer_cast( + out_tensor.impl()); + std::vector out_wrapper = {*out_dense}; + + distributed::ScatterOptions opts{src}; + return self.Scatter(in_wrapper, + out_wrapper, + opts, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("in"), + py::arg("out"), + py::arg("src"), + py::call_guard()) + + .def( + "send_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int dst) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Send(tensors, + dst, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("dst"), + py::call_guard()) + + .def( + "send_partial_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int dst_rank, + int nranks, + int rank_id) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int64_t numel = (*dense).numel(); + int64_t send_numel = numel / nranks; + int64_t offset = send_numel * rank_id; + return self.Send_Partial(*dense, + dst_rank, + offset, + send_numel, + /*sync_op*/ true, + 
/*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("dst"), + py::arg("num"), + py::arg("id"), + py::call_guard()) + + .def( + "recv_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int src) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + std::vector tensors = {*dense}; + return self.Recv(tensors, + src, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("src"), + py::call_guard()) + + .def( + "recv_partial_on_calc_stream", + [](distributed::ProcessGroupStream &self, + py::handle py_tensor, + int src_rank, + int nranks, + int rank_id) { + auto tensor = CastPyArg2Tensor(py_tensor.ptr(), 0); + auto dense = + std::dynamic_pointer_cast(tensor.impl()); + int64_t numel = (*dense).numel(); + int64_t recv_numel = numel / nranks; + int64_t offset = recv_numel * rank_id; + return self.Recv_Partial(*dense, + src_rank, + offset, + recv_numel, + /*sync_op*/ true, + /*use_calc_stream*/ true); + }, + py::arg("tensor"), + py::arg("src"), + py::arg("num"), + py::arg("id"), py::call_guard()); #if defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) diff --git a/paddle/fluid/pybind/eager_custom_python_api.h b/paddle/fluid/pybind/eager_custom_python_api.h index 1bb8fdd9360645..85afc274623ea5 100644 --- a/paddle/fluid/pybind/eager_custom_python_api.h +++ b/paddle/fluid/pybind/eager_custom_python_api.h @@ -30,13 +30,13 @@ static PyObject *eager_api_linear(PyObject *self, auto bias = GetTensorFromArgs("linear", "Bias", args, 2, true); tstate = PyEval_SaveThread(); if (bias.initialized()) { - auto mm_out = matmul_dygraph_function(x, weight, false, false); - auto out = add_dygraph_function(mm_out, bias); + auto mm_out = matmul_ad_func(x, weight, false, false); + auto out = add_ad_func(mm_out, bias); PyEval_RestoreThread(tstate); tstate = nullptr; return ToPyObject(out); } else { - auto mm_out = matmul_dygraph_function(x, weight, false, false); + auto mm_out = matmul_ad_func(x, weight, false, false); PyEval_RestoreThread(tstate); tstate = nullptr; return ToPyObject(mm_out); diff --git a/paddle/fluid/pybind/eager_functions.cc b/paddle/fluid/pybind/eager_functions.cc index 16a5cff031d65a..0db9e943f3f42d 100644 --- a/paddle/fluid/pybind/eager_functions.cc +++ b/paddle/fluid/pybind/eager_functions.cc @@ -903,7 +903,7 @@ static PyObject* eager_api_to_uva_tensor(PyObject* self, "Input object type error or incompatible array data type. 
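The partial send/recv bindings above move only one rank's slice of a tensor: the slice length is numel / nranks and it starts at slice_length * rank_id. A tiny sketch of just that arithmetic (it assumes, as the bindings implicitly do, that numel divides evenly across ranks):

#include <cstdint>
#include <cstdio>

// Compute the [offset, offset + count) window one rank sends or receives.
void partial_window(int64_t numel, int nranks, int rank_id,
                     int64_t* offset, int64_t* count) {
  *count = numel / nranks;     // elements per rank
  *offset = *count * rank_id;  // this rank's starting element
}

int main() {
  int64_t offset = 0, count = 0;
  partial_window(/*numel=*/1024, /*nranks=*/8, /*rank_id=*/3, &offset, &count);
  std::printf("rank 3 window: offset=%lld count=%lld\n",
              static_cast<long long>(offset), static_cast<long long>(count));
  return 0;
}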
" "tensor.set() supports array with bool, float16, float32, " "float64, int8, int16, int32, int64," - "please check your input or input array data type.")); + "please check your input or input array data type..")); } return ToPyObject(*(new_tensor.get())); diff --git a/paddle/fluid/pybind/eager_legacy_custom_python_api.h b/paddle/fluid/pybind/eager_legacy_custom_python_api.h index 7ed58a1e956f6a..c599346bdb7a89 100644 --- a/paddle/fluid/pybind/eager_legacy_custom_python_api.h +++ b/paddle/fluid/pybind/eager_legacy_custom_python_api.h @@ -38,7 +38,7 @@ static PyObject *eager_api_run_program(PyObject *self, "run_program", args, 6, PyTuple_GET_SIZE(args), attrs); tstate = PyEval_SaveThread(); - run_program_dygraph_function(X, Params, Out, OutScope, DOut, attrs); + run_program_ad_func(X, Params, Out, OutScope, DOut, attrs); PyEval_RestoreThread(tstate); tstate = nullptr; Py_RETURN_NONE; diff --git a/paddle/fluid/pybind/eager_legacy_op_function_generator.cc b/paddle/fluid/pybind/eager_legacy_op_function_generator.cc index 1d27d45beb7368..fff811e84ba6f6 100644 --- a/paddle/fluid/pybind/eager_legacy_op_function_generator.cc +++ b/paddle/fluid/pybind/eager_legacy_op_function_generator.cc @@ -416,6 +416,11 @@ GenerateOpFunctions() { if (CUSTOM_HANDWRITE_OPS_SET.count(op_type)) { continue; } + // Skip the sparse op + if (op_type.compare(0, 7, "sparse_") == 0 && op_type != "sparse_momentum" && + op_type != "sparse_attention") { + continue; + } // Skip operator which is not inherit form OperatorWithKernel, like while, // since only OperatorWithKernel can run in dygraph mode. // if the phi lib contains op kernel, we still generate ops method diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc index 0e8bf1d0f88619..74c66bfb0ac700 100644 --- a/paddle/fluid/pybind/eager_method.cc +++ b/paddle/fluid/pybind/eager_method.cc @@ -47,7 +47,9 @@ typedef SSIZE_T ssize_t; #include "pybind11/numpy.h" #include "pybind11/pybind11.h" #pragma GCC diagnostic ignored "-Wmissing-field-initializers" +#include "paddle/fluid/eager/amp_utils.h" #include "paddle/fluid/eager/api/generated/eager_generated/forwards/dygraph_functions.h" +#include "paddle/fluid/eager/eager_amp_auto_cast.h" #include "paddle/fluid/framework/python_headers.h" #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/pybind/tensor_py.h" @@ -759,8 +761,10 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, decrease_axis, none_axes, infer_flags, list_select_idxs; // if index is a list, list_select_flag will be true bool list_select_flag = false; + // Note(0x45f): Using defined() instead of initialized() + // to support slice tensor which shape like [0, 0, 0]. 
PADDLE_ENFORCE_EQ( - self->tensor.initialized(), + self->tensor.defined(), true, platform::errors::InvalidArgument( "tensor %s has not been initialized, we can only slice initialized " @@ -806,14 +810,14 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, decrease_axis.end()); if (op_type == "slice") { - out = slice_dygraph_function(self->tensor, - slice_axes_tmp, - slice_starts, - slice_ends, - infer_flags_tmp, - decrease_axis_tmp); + out = slice_ad_func(self->tensor, + slice_axes_tmp, + slice_starts, + slice_ends, + infer_flags_tmp, + decrease_axis_tmp); } else if (op_type == "strided_slice") { - out = strided_slice_dygraph_function( + out = strided_slice_ad_func( self->tensor, slice_axes, slice_starts, slice_ends, slice_strides); } else { PADDLE_THROW(platform::errors::InvalidArgument( @@ -852,7 +856,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, } paddle::experimental::Tensor new_out; - new_out = unsqueeze_dygraph_function(out, none_axes); + new_out = unsqueeze_ad_func(out, none_axes); return ToPyObject(new_out); } } @@ -868,7 +872,7 @@ static PyObject* tensor__getitem_index_not_tensor(TensorObject* self, paddle::framework::TensorFromVector( list_select_idxs, *dev_ctx, idx_tensor.get()); framework::AttributeMap attrs = {{"dim", 0}}; - out = index_select_dygraph_function(self->tensor, select_index, 0); + out = index_select_ad_func(self->tensor, select_index, 0); } return ToPyObject(out); @@ -1100,27 +1104,11 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, "please check the type of tensor.")); } - if (!value_tensor_tmp.initialized()) { -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - SetTensorFromPyArray( - static_cast(value_tensor_tmp.impl().get()), - value, - platform::Place(platform::CUDAPlace(0)), - false); -#else - SetTensorFromPyArray( - static_cast(value_tensor_tmp.impl().get()), - value, - platform::Place(platform::CPUPlace()), - false); -#endif - } else { - SetTensorFromPyArray( - static_cast(value_tensor_tmp.impl().get()), - value, - value_tensor_tmp.place(), - false); - } + SetTensorFromPyArray( + static_cast(value_tensor_tmp.impl().get()), + value, + self->tensor.place(), + false); value_tensor = value_tensor_tmp; } else { @@ -1147,11 +1135,15 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, } else if (self->tensor.dtype() == paddle::experimental::DataType::BOOL) { attrs["bool_values"] = std::vector{value_obj_tmp.cast()}; + } else if (self->tensor.dtype() == + paddle::experimental::DataType::FLOAT16) { + attrs["fp16_values"] = + std::vector{value_obj_tmp.cast()}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "When assign a value to a paddle.Tensor, " "the data type of the paddle.Tensor must be bool, " - "float32, int32 or int64, " + "float32, int32, int64 or float16, " "please check the type of tensor.")); } attrs["shape"] = std::vector{1}; @@ -1169,6 +1161,17 @@ static PyObject* tensor_method__setitem_eager_tensor(TensorObject* self, // Release gil and do tracing py::gil_scoped_release release; // use inplace set_value_ operator + if (value_tensor.initialized() && + (self->tensor.dtype() != value_tensor.dtype())) { + paddle::small_vector, + egr::kSlotSmallVectorSize> + tmps = {{self->tensor}, {value_tensor}}; + auto amp_dtype = egr::GetAmpDestDtype("set_value", tmps); + self->tensor = egr::EagerAmpAutoCast( + self->tensor.name(), self->tensor, amp_dtype, "set_value"); + value_tensor = egr::EagerAmpAutoCast( + value_tensor.name(), value_tensor, 
amp_dtype, "set_value"); + } self->tensor = set_value__dygraph_function( self->tensor, value_tensor, {}, {}, {}, attrs); } @@ -1569,6 +1572,15 @@ static PyObject* tensor_method_to_sparse_csr(TensorObject* self, EAGER_CATCH_AND_THROW_RETURN_NULL } +static PyObject* tensor_method_is_same_shape(TensorObject* self, + PyObject* args, + PyObject* kwargs) { + EAGER_TRY + auto other = CastPyArg2Tensor(PyTuple_GET_ITEM(args, 0), 0); + return ToPyObject(self->tensor.shape() == other.shape()); + EAGER_CATCH_AND_THROW_RETURN_NULL +} + static PyObject* tensor__inplace_version(TensorObject* self, PyObject* args, PyObject* kwargs) { @@ -1964,6 +1976,10 @@ PyMethodDef variable_methods[] = { (PyCFunction)(void (*)(void))tensor_method_is_sparse_csr, METH_VARARGS | METH_KEYWORDS, NULL}, + {"is_same_shape", + (PyCFunction)(void (*)(void))tensor_method_is_same_shape, + METH_VARARGS | METH_KEYWORDS, + NULL}, {"to_sparse_csr", (PyCFunction)(void (*)(void))tensor_method_to_sparse_csr, METH_VARARGS | METH_KEYWORDS, diff --git a/paddle/fluid/pybind/eager_properties.cc b/paddle/fluid/pybind/eager_properties.cc index dfe9e03df5f243..6987950c9e035b 100644 --- a/paddle/fluid/pybind/eager_properties.cc +++ b/paddle/fluid/pybind/eager_properties.cc @@ -184,6 +184,42 @@ PyObject* tensor_properties_get_shape(TensorObject* self, void* closure) { } } + auto desired_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDesiredLayout(); + auto default_layout = + paddle::imperative::LayoutAutoTune::Instance().GetDefaultLayout(); + bool change_dim = + (desired_layout != default_layout && + self->tensor.layout() == desired_layout && value.size() == 4); + VLOG(6) << "eager_properties 'Shape' method, layout autotune " + << " desired_layout: " << desired_layout + << " default_layout: " << default_layout + << " tensor layout: " << self->tensor.layout() + << " tensor's shape size is : " << value.size(); + std::vector dims = value; + if (change_dim && + paddle::framework::DataLayoutToString(desired_layout) == "NCHW") { + // NCHW -> NHWC + VLOG(6) << "layout autotune get Shape from NCHW -> NHWC " << value[0] << " " + << value[1] << " " << value[2] << " " << value[3] << " to " + << dims[0] << " " << dims[2] << " " << dims[3] << " " << dims[1]; + value[0] = dims[0]; + value[1] = dims[2]; + value[2] = dims[3]; + value[3] = dims[1]; + } else if (change_dim && + paddle::framework::DataLayoutToString(desired_layout) == "NHWC") { + // NHWC -> NCHW + VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " << value[0] << " " + << value[1] << " " << value[2] << " " << value[3] << " to " + << dims[0] << " " << dims[3] << " " << dims[1] << " " << dims[2] + << " " << dims[1]; + value[0] = dims[0]; + value[1] = dims[3]; + value[2] = dims[1]; + value[3] = dims[2]; + } + return ToPyObject(value); EAGER_CATCH_AND_THROW_RETURN_NULL } diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc index aeaa0dbff78165..26b3b307ef80b6 100644 --- a/paddle/fluid/pybind/imperative.cc +++ b/paddle/fluid/pybind/imperative.cc @@ -964,11 +964,15 @@ void BindImperative(py::module *m_ptr) { framework::proto::VarType::BOOL) { attrs["bool_values"] = std::vector{value_obj.cast()}; + } else if (self->DataType() == + framework::proto::VarType::FP16) { + attrs["fp16_values"] = + std::vector{value_obj.cast()}; } else { PADDLE_THROW(platform::errors::InvalidArgument( "When assign a value to a paddle.Tensor, " "the data type of the paddle.Tensor must be bool, " - "float32, int32 or int64, " + "float32, int32, int64 or float16, " "please check the 
type of tensor.")); } attrs["shape"] = std::vector{1}; @@ -2044,8 +2048,49 @@ void BindImperative(py::module *m_ptr) { "shape", [](imperative::VarBase &self) { if (self.Var().IsType()) { - return phi::vectorize( + auto value = phi::vectorize( self.Var().Get().dims()); + auto tensor = self.Var().Get(); + auto tmp_value = value; + auto desired_layout = + paddle::imperative::LayoutAutoTune::Instance() + .GetDesiredLayout(); + auto default_layout = + paddle::imperative::LayoutAutoTune::Instance() + .GetDefaultLayout(); + bool change_dim = + (desired_layout != default_layout && + tensor.layout() == desired_layout && value.size() == 4); + VLOG(6) << "'Shape' method, layout autotune," + << " desired_layout: " << desired_layout + << " default_layout: " << default_layout + << " tensor layout: " << tensor.layout() + << " tensor's shape size is : " << value.size(); + + if (change_dim && paddle::framework::DataLayoutToString( + desired_layout) == "NCHW") { + VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " + << value[0] << " " << value[1] << " " << value[2] << " " + << value[3] << " to " << tmp_value[3] << " " + << tmp_value[1] << " " << tmp_value[2] << " " + << tmp_value[1]; + // NCHW -> NHWC + value[1] = tmp_value[2]; + value[2] = tmp_value[3]; + value[3] = tmp_value[1]; + } else if (change_dim && paddle::framework::DataLayoutToString( + desired_layout) == "NHWC") { + VLOG(6) << "layout autotune get Shape from NHWC -> NCHW " + << value[0] << " " << value[1] << " " << value[2] << " " + << value[3] << " to " << tmp_value[0] << " " + << tmp_value[3] << " " << tmp_value[1] << " " + << tmp_value[2]; + // NHWC -> NCHW + value[1] = tmp_value[3]; + value[2] = tmp_value[1]; + value[3] = tmp_value[2]; + } + return value; } else if (self.Var().IsType()) { return phi::vectorize( self.Var().Get().value().dims()); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc old mode 100755 new mode 100644 index ddd75f677e4f07..d7b28aab2301a1 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -35,6 +35,11 @@ #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_pass_builder.h" #include "paddle/fluid/inference/utils/io_utils.h" +#include "paddle/phi/core/compat/convert_utils.h" + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +#include "paddle/phi/core/cuda_stream.h" +#endif #ifdef PADDLE_WITH_ONNXRUNTIME #include "paddle/fluid/inference/api/onnxruntime_predictor.h" @@ -397,6 +402,12 @@ void BindInferenceApi(py::module *m) { new paddle_infer::Predictor(config)); return pred; }); + m->def( + "_get_phi_kernel_name", + [](const std::string &fluid_op_name) { + return phi::TransToPhiKernelName(fluid_op_name); + }, + py::return_value_policy::reference); m->def("copy_tensor", &CopyPaddleInferTensor); m->def("paddle_dtype_size", &paddle::PaddleDtypeSize); m->def("paddle_tensor_to_bytes", &SerializePDTensorToBytes); @@ -542,7 +553,13 @@ void BindPaddlePredictor(py::module *m) { .def("get_input_names", &PaddlePredictor::GetInputNames) .def("get_output_names", &PaddlePredictor::GetOutputNames) .def("zero_copy_run", &PaddlePredictor::ZeroCopyRun) - .def("clone", &PaddlePredictor::Clone) + .def("clone", [](PaddlePredictor &self) { return self.Clone(nullptr); }) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + .def("clone", + [](PaddlePredictor &self, phi::CUDAStream &stream) { + return self.Clone(stream.raw_stream()); + }) +#endif .def("get_serialized_program", 
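Both the eager shape property and the legacy VarBase shape getter above report a layout-autotuned 4-D tensor's dims in the user-visible order by permuting entries 1..3: one branch applies (N, C, H, W) -> (N, H, W, C) and the other the inverse. A standalone sketch of the two permutations (function names follow the in-code comments; the dims values are made up):

#include <array>
#include <cstdint>
#include <cstdio>

using Dims4 = std::array<int64_t, 4>;

// (N, C, H, W) -> (N, H, W, C): what the getter reports in one autotune branch.
Dims4 nchw_to_nhwc(const Dims4& d) { return {d[0], d[2], d[3], d[1]}; }

// (N, H, W, C) -> (N, C, H, W): the inverse permutation used by the other branch.
Dims4 nhwc_to_nchw(const Dims4& d) { return {d[0], d[3], d[1], d[2]}; }

int main() {
  Dims4 nchw = {8, 3, 224, 224};
  Dims4 nhwc = nchw_to_nhwc(nchw);
  Dims4 back = nhwc_to_nchw(nhwc);
  std::printf("NHWC view: %lld %lld %lld %lld\n", (long long)nhwc[0],
              (long long)nhwc[1], (long long)nhwc[2], (long long)nhwc[3]);
  std::printf("round trip ok: %d\n", back == nchw);
  return 0;
}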
&PaddlePredictor::GetSerializedProgram); auto config = py::class_(paddle_predictor, "Config"); @@ -583,7 +600,14 @@ void BindNativePredictor(py::module *m) { .def("get_input_tensor", &NativePaddlePredictor::GetInputTensor) .def("get_output_tensor", &NativePaddlePredictor::GetOutputTensor) .def("zero_copy_run", &NativePaddlePredictor::ZeroCopyRun) - .def("clone", &NativePaddlePredictor::Clone) + .def("clone", + [](NativePaddlePredictor &self) { return self.Clone(nullptr); }) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + .def("clone", + [](NativePaddlePredictor &self, phi::CUDAStream &stream) { + return self.Clone(stream.raw_stream()); + }) +#endif .def("scope", &NativePaddlePredictor::scope, py::return_value_policy::reference); @@ -599,13 +623,6 @@ void BindAnalysisConfig(py::module *m) { .value("Bfloat16", AnalysisConfig::Precision::kBf16) .export_values(); - py::enum_(analysis_config, "Backend") - .value("CPU", AnalysisConfig::Backend::kCPU) - .value("GPU", AnalysisConfig::Backend::kGPU) - .value("NPU", AnalysisConfig::Backend::kNPU) - .value("XPU", AnalysisConfig::Backend::kXPU) - .export_values(); - analysis_config.def(py::init<>()) .def(py::init()) .def(py::init()) @@ -625,7 +642,14 @@ void BindAnalysisConfig(py::module *m) { .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, py::arg("memory_pool_init_size_mb"), - py::arg("device_id") = 0) + py::arg("device_id") = 0, + py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + .def("set_exec_stream", + [](AnalysisConfig &self, phi::CUDAStream &stream) { + self.SetExecStream(stream.raw_stream()); + }) +#endif .def("enable_xpu", &AnalysisConfig::EnableXpu, py::arg("l3_workspace_size") = 16 * 1024 * 1024, @@ -650,6 +674,14 @@ void BindAnalysisConfig(py::module *m) { py::arg("ipu_replica_num") = 1, py::arg("ipu_available_memory_proportion") = 1.0, py::arg("ipu_enable_half_partial") = false) + .def("set_ipu_custom_info", + &AnalysisConfig::SetIpuCustomInfo, + py::arg("ipu_custom_ops_info") = + std::vector>({}), + py::arg("ipu_custom_patterns") = std::map({})) + .def("load_ipu_config", + &AnalysisConfig::LoadIpuConfig, + py::arg("config_path")) .def("disable_gpu", &AnalysisConfig::DisableGpu) .def("enable_onnxruntime", &AnalysisConfig::EnableONNXRuntime) .def("disable_onnxruntime", &AnalysisConfig::DisableONNXRuntime) @@ -874,7 +906,13 @@ void BindAnalysisPredictor(py::module *m) { .def("analysis_argument", &AnalysisPredictor::analysis_argument, py::return_value_policy::reference) - .def("clone", &AnalysisPredictor::Clone) + .def("clone", [](AnalysisPredictor &self) { return self.Clone(nullptr); }) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + .def("clone", + [](AnalysisPredictor &self, phi::CUDAStream &stream) { + return self.Clone(stream.raw_stream()); + }) +#endif .def("scope", &AnalysisPredictor::scope, py::return_value_policy::reference) @@ -901,7 +939,14 @@ void BindPaddleInferPredictor(py::module *m) { #endif self.Run(); }) - .def("clone", &paddle_infer::Predictor::Clone) + .def("clone", + [](paddle_infer::Predictor &self) { return self.Clone(nullptr); }) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + .def("clone", + [](paddle_infer::Predictor &self, phi::CUDAStream &stream) { + return self.Clone(stream.raw_stream()); + }) +#endif .def("try_shrink_memory", &paddle_infer::Predictor::TryShrinkMemory) .def("clear_intermediate_tensor", &paddle_infer::Predictor::ClearIntermediateTensor); diff --git 
a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index ba0f872cb7449f..af080bd0b3431c 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -71,6 +71,12 @@ std::map> op_ins_map = { "FFN1Bias", "FFN2Weight", "FFN2Bias"}}, + {"fused_multi_transformer_int8", + {"X", "LnScale", "LnBias", "QKVW", + "QKVBias", "CacheKV", "TimeStep", "SrcMask", + "OutLinearW", "OutLinearBias", "FFNLnScale", "FFNLnBias", + "FFN1Weight", "FFN1Bias", "FFN2Weight", "FFN2Bias", + "QKVOutScale", "OutLinearOutScale", "FFN1OutScale", "FFN2OutScale"}}, {"fused_bias_dropout_residual_layer_norm", {"X", "Residual", "Bias", "LnScale", "LnBias"}}, {"instance_norm", {"X", "Scale", "Bias"}}, @@ -329,6 +335,7 @@ std::map> op_outs_map = { "Beta2PowOut", "MasterParamOut"}}, {"fused_multi_transformer", {"CacheKVOut", "Out"}}, + {"fused_multi_transformer_int8", {"CacheKVOut", "Out"}}, {"resnet_basic_block", {"Y", "Conv1", "SavedMean1", "SavedInvstd1", "Mean1Out", "Var1Out", "Conv2", "SavedMean2", "SavedInvstd2", "Mean2Out", @@ -433,6 +440,7 @@ std::map> op_passing_outs_map = { {"split", {"Out"}}, {"concat", {"Out"}}, {"fused_multi_transformer", {"CacheKVOut"}}, + {"fused_multi_transformer_int8", {"CacheKVOut"}}, {"group_norm", {"Mean", "Variance"}}, {"resnet_basic_block", {"Mean1Out", "Var1Out", "Mean2Out", "Var2Out", "Mean3Out", "Var3Out"}}, diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index ab725575351ea7..c68898ea6a6f27 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -275,7 +275,8 @@ void BindVarDsec(pybind11::module *m) { .value("RAW", pd::proto::VarType::RAW) .value("STRING", pd::proto::VarType::STRING) .value("STRINGS", pd::proto::VarType::STRINGS) - .value("VOCAB", pd::proto::VarType::VOCAB); + .value("VOCAB", pd::proto::VarType::VOCAB) + .value("SPARSE_COO", pd::proto::VarType::SPARSE_COO); } void BindOpDesc(pybind11::module *m) { diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 4c5fd8a6a39844..67f0d9cc8eb853 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include #include #include // NOLINT // for call_once +#include #include #include #include @@ -154,6 +155,7 @@ limitations under the License. 
*/ #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE +#include "paddle/fluid/operators/custom_device_common_op_registry.h" #include "paddle/phi/capi/capi.h" #endif @@ -203,6 +205,14 @@ PyTypeObject *g_framework_scope_pytype = nullptr; PyTypeObject *g_framework_lodtensorarray_pytype = nullptr; PyTypeObject *g_custom_op_kernel_ctx_pytype = nullptr; +bool IsCompiledWithAVX() { +#ifndef PADDLE_WITH_AVX + return false; +#else + return true; +#endif +} + bool IsCompiledWithCUDA() { #if !defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_HIP) return false; @@ -346,6 +356,52 @@ bool IsCompiledWithDIST() { #endif } +struct iinfo { + int64_t min, max; + int bits; + std::string dtype; + + explicit iinfo(const framework::proto::VarType::Type &type) { + switch (type) { + case framework::proto::VarType::INT16: + min = std::numeric_limits::min(); + max = std::numeric_limits::max(); + bits = 16; + dtype = "int16"; + break; + case framework::proto::VarType::INT32: + min = std::numeric_limits::min(); + max = std::numeric_limits::max(); + bits = 32; + dtype = "int32"; + break; + case framework::proto::VarType::INT64: + min = std::numeric_limits::min(); + max = std::numeric_limits::max(); + bits = 64; + dtype = "int64"; + break; + case framework::proto::VarType::INT8: + min = std::numeric_limits::min(); + max = std::numeric_limits::max(); + bits = 8; + dtype = "int8"; + break; + case framework::proto::VarType::UINT8: + min = std::numeric_limits::min(); + max = std::numeric_limits::max(); + bits = 8; + dtype = "uint8"; + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "the argument of paddle.iinfo can only be paddle.int8, " + "paddle.int16, paddle.int32, paddle.int64, or paddle.uint8")); + break; + } + } +}; + static PyObject *GetPythonAttribute(PyObject *obj, const char *attr_name) { // NOTE(zjl): PyObject_GetAttrString would return nullptr when attr_name // is not inside obj, but it would also set the error flag of Python. @@ -528,12 +584,7 @@ static int GetNCCLVersion() { } #endif -#ifdef PADDLE_WITH_AVX -PYBIND11_MODULE(core_avx, m) { -#else -PYBIND11_MODULE(core_noavx, m) { -#endif - +PYBIND11_MODULE(libpaddle, m) { BindImperative(&m); BindEager(&m); BindEagerStringTensor(&m); @@ -555,6 +606,21 @@ PYBIND11_MODULE(core_noavx, m) { BindException(&m); + py::class_(m, "iinfo") + .def(py::init()) + .def_readonly("min", &iinfo::min) + .def_readonly("max", &iinfo::max) + .def_readonly("bits", &iinfo::bits) + .def_readonly("dtype", &iinfo::dtype) + .def("__repr__", [](const iinfo &a) { + std::ostringstream oss; + oss << "paddle.iinfo(min=" << a.min; + oss << ", max=" << a.max; + oss << ", bits=" << a.bits; + oss << ", dtype=" << a.dtype << ")"; + return oss.str(); + }); + m.def("set_num_threads", &platform::SetNumThreads); m.def("disable_signal_handler", &DisableSignalHandler); @@ -1014,7 +1080,8 @@ All parameter, weight, gradient are variables in Paddle. R"DOC( Delete all sub-scopes of the current scope. )DOC") - .def("_kids", &Scope::kids); + .def("_kids", &Scope::kids) + .def_property("_can_reuesd", &Scope::CanReuesd, &Scope::SetCanReuesd); m.def( "Scope", @@ -1632,9 +1699,17 @@ All parameter, weight, gradient are variables in Paddle. 
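The iinfo struct bound above is a thin wrapper over std::numeric_limits for the five supported integer dtypes, surfaced on the Python side as paddle.iinfo(dtype).min / .max / .bits / .dtype. The same mapping in a standalone sketch (the dtype strings are just labels for the printout):

#include <cstdint>
#include <cstdio>
#include <limits>

// Print the (min, max, bits) triple that iinfo exposes for one integer type.
template <typename T>
void show(const char* dtype) {
  std::printf("%s: min=%lld max=%llu bits=%zu\n", dtype,
              static_cast<long long>(std::numeric_limits<T>::min()),
              static_cast<unsigned long long>(std::numeric_limits<T>::max()),
              sizeof(T) * 8);
}

int main() {
  show<int8_t>("int8");
  show<uint8_t>("uint8");
  show<int16_t>("int16");
  show<int32_t>("int32");
  show<int64_t>("int64");
  return 0;
}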
egr::Controller::Instance().MergeOpMetaInfoMap( framework::LoadOpMetaInfoAndRegisterOp(dso_name)); }); - m.def("init_devices", []() { framework::InitDevices(); }); + m.def("init_devices", []() { + framework::InitDevices(); +#ifdef PADDLE_WITH_CUSTOM_DEVICE + for (auto &dev_type : phi::DeviceManager::GetAllCustomDeviceTypes()) { + paddle::operators::RegisterCustomDeviceCommonKernel(dev_type); + } +#endif + }); m.def("init_default_kernel_signatures", []() { framework::InitDefaultKernelSignatureMap(); }); + m.def("is_compiled_with_avx", IsCompiledWithAVX); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); m.def("is_compiled_with_ascend", IsCompiledWithAscend); m.def("is_compiled_with_rocm", IsCompiledWithROCM); @@ -1843,6 +1918,9 @@ All parameter, weight, gradient are variables in Paddle. if (data_is_lod_tensor(self[i])) { auto &data = PADDLE_GET(LoDTensor, self[i]); res[i] = py::cast(std::move(data)); + } else if (data_is_sparse_coo_tensor(self[i])) { + auto &data = PADDLE_GET(phi::SparseCooTensor, self[i]); + res[i] = py::cast(std::move(data)); } else { auto &data = PADDLE_GET(LoDTensorArray, self[i]); py::list tmp(data.size()); @@ -2428,19 +2506,14 @@ All parameter, weight, gradient are variables in Paddle. return res; }); - m.def("enable_layout_autotune", [] { - return paddle::imperative::LayoutAutoTune::Instance() - .EnableLayoutAutoTune(); - }); + m.def("enable_layout_autotune", + [] { return egr::Controller::Instance().EnableLayoutAutoTune(); }); - m.def("disable_layout_autotune", [] { - return paddle::imperative::LayoutAutoTune::Instance() - .DisableLayoutAutoTune(); - }); + m.def("disable_layout_autotune", + [] { return egr::Controller::Instance().DisableLayoutAutoTune(); }); - m.def("use_layout_autotune", [] { - return paddle::imperative::LayoutAutoTune::Instance().UseLayoutAutoTune(); - }); + m.def("use_layout_autotune", + [] { return egr::Controller::Instance().UseLayoutAutoTune(); }); BindFleetWrapper(&m); BindIO(&m); @@ -2476,7 +2549,7 @@ All parameter, weight, gradient are variables in Paddle. BindCompatible(&m); BindDataset(&m); BindGenerator(&m); -#ifndef PADDLE_ON_INFERENCE +#ifndef PADDLE_NO_PYTHON BindDistributed(&m); #endif #ifdef PADDLE_WITH_ASCEND diff --git a/paddle/fluid/pybind/tensor.cc b/paddle/fluid/pybind/tensor.cc index 8396a970bdd4f8..6cc18bf5e23b2d 100644 --- a/paddle/fluid/pybind/tensor.cc +++ b/paddle/fluid/pybind/tensor.cc @@ -1105,6 +1105,20 @@ void BindTensor(pybind11::module &m) { // NOLINT std::copy(rows.begin(), rows.end(), std::back_inserter(new_rows)); return new_rows; }); + + py::class_(m, "SparseCooTensor") + .def("__init__", + [](phi::SparseCooTensor &instance) { + new (&instance) phi::SparseCooTensor(); + }) + .def("numel", + [](const phi::SparseCooTensor &self) -> int64_t { + return self.numel(); + }) + .def("indices", + [](const phi::SparseCooTensor &self) -> framework::Tensor { + return self.indices(); + }); } } // namespace pybind diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 4a8ef8795e0892..4b01f2b568b0fc 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -35,6 +35,7 @@ limitations under the License. 
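The new SparseCooTensor binding above exposes numel() and indices(); in the COO format the indices matrix has one row per sparse dimension and one column per stored non-zero, and numel (as read here) describes the dense shape rather than the number of stored values. A toy version of that layout, with made-up data, to make the distinction concrete:

#include <cstdint>
#include <cstdio>
#include <vector>

// Toy COO layout: `indices` is [sparse_dim x nnz], `values` holds the nnz
// stored entries, and numel() is the element count of the dense shape.
struct CooTensor {
  std::vector<int64_t> dims;                  // dense shape
  std::vector<std::vector<int64_t>> indices;  // one row per sparse dimension
  std::vector<float> values;                  // nnz stored non-zeros

  int64_t numel() const {
    int64_t n = 1;
    for (int64_t d : dims) n *= d;
    return n;
  }
};

int main() {
  CooTensor t;
  t.dims = {3, 4};
  t.indices = {{0, 2}, {1, 3}};  // non-zeros at (0, 1) and (2, 3)
  t.values = {1.5f, -2.0f};
  std::printf("numel=%lld nnz=%zu\n", (long long)t.numel(), t.values.size());
  return 0;
}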
*/ #include "paddle/fluid/platform/cuda_device_guard.h" #endif #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/profiler/event_tracing.h" diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc index 49fe069217ed7d..6f5db28abf6eee 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.cc @@ -69,10 +69,16 @@ bool ProtoArgumentMappingContext::IsDenseTensorInputs( return true; } +bool ProtoArgumentMappingContext::IsSelectedRowsInputs( + const std::string& name) const { + return false; +} + bool ProtoArgumentMappingContext::IsSelectedRowsInput( const std::string& name) const { return false; } + bool ProtoArgumentMappingContext::IsDenseTensorVectorInput( const std::string& name) const { return false; diff --git a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h index a1c730fd84918c..7c0bd0ff39945c 100644 --- a/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h +++ b/paddle/infrt/dialect/phi/pass/proto_arg_map_context.h @@ -45,6 +45,7 @@ class ProtoArgumentMappingContext : public ::phi::ArgumentMappingContext { bool IsDenseTensorInput(const std::string& name) const override; bool IsDenseTensorInputs(const std::string& name) const override; bool IsSelectedRowsInput(const std::string& name) const override; + bool IsSelectedRowsInputs(const std::string& name) const override; bool IsDenseTensorVectorInput(const std::string& name) const override; bool IsDenseTensorOutput(const std::string& name) const override; diff --git a/paddle/phi/api/include/context_pool.h b/paddle/phi/api/include/context_pool.h index b429252beb7fde..f2e797575fd576 100644 --- a/paddle/phi/api/include/context_pool.h +++ b/paddle/phi/api/include/context_pool.h @@ -59,7 +59,7 @@ struct DefaultDeviceContextType { * * Note: DeviceContextPool is an experimental API and may be removed in the * future. From 2.3, we recommend directly using the C++ API to combine new - * perators. + * operators. 
*/ class PADDLE_API DeviceContextPool { public: diff --git a/paddle/phi/api/lib/CMakeLists.txt b/paddle/phi/api/lib/CMakeLists.txt index 924f6f3526a6b8..c4310b43f29bbc 100644 --- a/paddle/phi/api/lib/CMakeLists.txt +++ b/paddle/phi/api/lib/CMakeLists.txt @@ -21,9 +21,9 @@ set(api_gen_base ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/api_base.py) # forward api file set(api_gen_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/api_gen.py) -set(api_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/api.yaml) +set(api_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/ops.yaml) set(legacy_api_yaml_file - ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_api.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/legacy_ops.yaml) set(api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/api.h) set(api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/api.cc) set(api_header_file_tmp ${api_header_file}.tmp) @@ -55,7 +55,7 @@ set(dygraph_api_source_file_tmp ${dygraph_api_source_file}.tmp) set(sparse_api_gen_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/sparse_api_gen.py) set(sparse_api_yaml_file - ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_api.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_ops.yaml) set(sparse_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/sparse_api.h) set(sparse_api_source_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/lib/sparse_api.cc) @@ -66,7 +66,7 @@ set(sparse_api_source_file_tmp ${sparse_api_source_file}.tmp) set(sparse_bw_api_gen_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py) set(sparse_bw_api_yaml_file - ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_bw_api.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/sparse_backward.yaml) set(sparse_bw_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/backward/sparse_bw_api.h) set(sparse_bw_api_source_file @@ -78,7 +78,7 @@ set(sparse_bw_api_source_file_tmp ${sparse_bw_api_source_file}.tmp) set(strings_api_gen_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/strings_api_gen.py) set(strings_api_yaml_file - ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/strings_api.yaml) + ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/strings_ops.yaml) set(strings_api_header_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/include/strings_api.h) set(strings_api_source_file @@ -97,8 +97,7 @@ set(wrapped_infermeta_source_file # op extra info file set(ops_extra_info_gen_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/generator/ops_extra_info_gen.py) -set(api_compat_yaml_file - ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/api_compat.yaml) +set(op_compat_yaml_file ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/op_compat.yaml) set(ops_extra_info_file ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/ops_extra_info.cc) @@ -119,8 +118,13 @@ endif() set(parsed_api_dir ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml/parsed_apis) set(generated_op_path ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_op.cc) +set(generated_sparse_ops_path + ${CMAKE_SOURCE_DIR}/paddle/fluid/operators/generated_sparse_op.cc) set(generated_argument_mapping_path ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sig.cc) +set(generated_sparse_argument_mapping_path + ${CMAKE_SOURCE_DIR}/paddle/phi/ops/compat/generated_sparse_sig.cc) + message( "parse api yamls: - ${api_yaml_file} @@ -130,17 +134,23 @@ message( execute_process( WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml COMMAND ${CMAKE_COMMAND} -E make_directory ${parsed_api_dir} - COMMAND ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path ./api.yaml - --output_path ./parsed_apis/api.parsed.yaml 
+ COMMAND ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path ./ops.yaml + --output_path ./parsed_apis/ops.parsed.yaml COMMAND ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path - ./legacy_api.yaml --output_path ./parsed_apis/legacy_api.parsed.yaml + ./legacy_ops.yaml --output_path ./parsed_apis/legacy_ops.parsed.yaml COMMAND ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path ./backward.yaml - --output_path ./parsed_apis/backward_api.parsed.yaml --backward + --output_path ./parsed_apis/backward_ops.parsed.yaml --backward COMMAND ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path ./legacy_backward.yaml --output_path - ./parsed_apis/legacy_backward_api.parsed.yaml --backward RESULTS_VARIABLE + ./parsed_apis/legacy_backward_ops.parsed.yaml --backward + COMMAND ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path + ./sparse_ops.yaml --output_path ./parsed_apis/sparse_ops.parsed.yaml + COMMAND + ${PYTHON_EXECUTABLE} generator/parse_api.py --api_yaml_path + ./sparse_backward.yaml --output_path + ./parsed_apis/sparse_backward.parsed.yaml --backward RESULTS_VARIABLE _results) foreach(_result in ${_results}) if(${_result}) @@ -150,38 +160,53 @@ endforeach() # validation of api yamls message("validate api yaml: -- ${parsed_api_dir}/api.parsed.yaml -- ${parsed_api_dir}/backward_api.parsed.yaml") +- ${parsed_api_dir}/ops.parsed.yaml +- ${parsed_api_dir}/backward_ops.parsed.yaml") execute_process( WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml COMMAND ${PYTHON_EXECUTABLE} generator/cross_validate.py --forward_yaml_paths - ./parsed_apis/api.parsed.yaml ./parsed_apis/legacy_api.parsed.yaml - --backward_yaml_paths ./parsed_apis/backward_api.parsed.yaml - ./parsed_apis/legacy_backward_api.parsed.yaml - RESULT_VARIABLE _result) -if(${_result}) - message(FATAL_ERROR "api validation failed, exiting.") -endif() + ./parsed_apis/ops.parsed.yaml ./parsed_apis/legacy_ops.parsed.yaml + --backward_yaml_paths ./parsed_apis/backward_ops.parsed.yaml + ./parsed_apis/legacy_backward_ops.parsed.yaml + COMMAND + ${PYTHON_EXECUTABLE} generator/cross_validate.py --forward_yaml_paths + ./parsed_apis/sparse_ops.parsed.yaml --backward_yaml_paths + ./parsed_apis/sparse_backward.parsed.yaml + RESULT_VARIABLE _results) +foreach(_result in ${_results}) + if(${_result}) + message(FATAL_ERROR "ops validation failed, exiting.") + endif() +endforeach() # code generation for op, op makers, and argument mapping functions message( "create or remove auto-geneated operators: ${generated_op_path}.tmp create or remove auto-geneated argument mappings: ${generated_argument_mapping_path}.tmp" ) + execute_process( WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/paddle/phi/api/yaml COMMAND - ${PYTHON_EXECUTABLE} generator/generate_op.py --api_yaml_path - ./parsed_apis/api.parsed.yaml --backward_api_yaml_path - ./parsed_apis/backward_api.parsed.yaml --api_version_yaml_path - api_version.yaml --api_compat_yaml_path api_compat.yaml --output_op_path + ${PYTHON_EXECUTABLE} generator/generate_op.py --ops_yaml_path + ./parsed_apis/ops.parsed.yaml --backward_yaml_path + ./parsed_apis/backward_ops.parsed.yaml --op_version_yaml_path + op_version.yaml --op_compat_yaml_path op_compat.yaml --output_op_path "${generated_op_path}.tmp" --output_arg_map_path "${generated_argument_mapping_path}.tmp" - RESULT_VARIABLE _result) -if(${_result}) - message(FATAL_ERROR "operator codegen failed, exiting.") -endif() + COMMAND + ${PYTHON_EXECUTABLE} generator/generate_sparse_op.py --ops_yaml_path + 
./parsed_apis/sparse_ops.parsed.yaml --backward_ops_yaml_path + ./parsed_apis/sparse_backward.parsed.yaml --output_op_path + "${generated_sparse_ops_path}.tmp" --output_arg_map_path + "${generated_sparse_argument_mapping_path}.tmp" + RESULT_VARIABLE _results) +foreach(_result in ${_results}) + if(${_result}) + message(FATAL_ERROR "operator codegen failed, exiting.") + endif() +endforeach() if(EXISTS "${generated_op_path}.tmp" AND EXISTS "${generated_op_path}") execute_process(COMMAND ${CMAKE_COMMAND} -E copy_if_different @@ -196,6 +221,25 @@ else() message("remove ${generated_op_path}") endif() +if(EXISTS "${generated_sparse_ops_path}.tmp" AND EXISTS + "${generated_sparse_ops_path}") + execute_process( + COMMAND ${CMAKE_COMMAND} -E copy_if_different + "${generated_sparse_ops_path}.tmp" "${generated_sparse_ops_path}") + message( + "copy if different ${generated_sparse_ops_path}.tmp ${generated_sparse_ops_path}" + ) +elseif(EXISTS "${generated_sparse_ops_path}.tmp") + execute_process( + COMMAND ${CMAKE_COMMAND} -E copy "${generated_sparse_ops_path}.tmp" + "${generated_sparse_ops_path}") + message("copy ${generated_sparse_ops_path}.tmp ${generated_sparse_ops_path}") +else() + execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f + "${generated_sparse_ops_path}") + message("remove ${generated_sparse_ops_path}") +endif() + if(EXISTS "${generated_argument_mapping_path}.tmp" AND EXISTS "${generated_argument_mapping_path}") execute_process( @@ -219,11 +263,34 @@ else() message("remove ${generated_argument_mapping_path}") endif() +if(EXISTS "${generated_sparse_argument_mapping_path}.tmp" + AND EXISTS "${generated_sparse_argument_mapping_path}") + execute_process( + COMMAND + ${CMAKE_COMMAND} -E copy_if_different + "${generated_sparse_argument_mapping_path}.tmp" + "${generated_sparse_argument_mapping_path}") + message( + "copy if different ${generated_sparse_argument_mapping_path}.tmp ${generated_sparse_argument_mapping_path}" + ) +elseif(EXISTS "${generated_sparse_argument_mapping_path}.tmp") + execute_process( + COMMAND + ${CMAKE_COMMAND} -E copy "${generated_sparse_argument_mapping_path}.tmp" + "${generated_sparse_argument_mapping_path}") + message( + "copy ${generated_sparse_argument_mapping_path}.tmp ${generated_sparse_argument_mapping_path}" + ) +else() + execute_process(COMMAND ${CMAKE_COMMAND} -E remove -f + "${generated_sparse_argument_mapping_path}") + message("remove ${generated_sparse_argument_mapping_path}") +endif() + # generate ops extra info execute_process( - COMMAND - ${PYTHON_EXECUTABLE} ${ops_extra_info_gen_file} --api_compat_yaml_path - ${api_compat_yaml_file} --ops_extra_info_path ${ops_extra_info_file}) + COMMAND ${PYTHON_EXECUTABLE} ${ops_extra_info_gen_file} --op_compat_yaml_path + ${op_compat_yaml_file} --ops_extra_info_path ${ops_extra_info_file}) message("generate ${ops_extra_info_file}") # generate forward api @@ -372,12 +439,6 @@ cc_library( SRCS api_custom_impl.cc DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils backward_infermeta phi_data_transform) -cc_library( - sparse_api_custom_impl - SRCS sparse_api_custom_impl.cc - DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils phi_data_transform - tensor_copy) - cc_library( phi_function_api SRCS ${api_source_file} @@ -391,6 +452,7 @@ cc_library( kernel_dispatch api_gen_utils backward_infermeta + sparse_backward_infermeta phi_data_transform phi_function_api api_custom_impl @@ -398,12 +460,12 @@ cc_library( cc_library( sparse_api SRCS ${sparse_api_source_file} - DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils 
sparse_api_custom_impl) + DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils) cc_library( sparse_bw_api SRCS ${sparse_bw_api_source_file} DEPS phi_tensor_raw phi kernel_dispatch api_gen_utils sparse_api - sparse_api_custom_impl) + sparse_backward_infermeta) cc_library( phi_dygraph_api SRCS ${dygraph_api_source_file} @@ -426,6 +488,7 @@ cc_library( api_gen_utils kernel_dispatch infermeta + sparse_infermeta sparse_api strings_api) cc_library( diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index 2313f4c1296323..37fd84e8b1b139 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -34,6 +34,95 @@ namespace experimental { ////////////////// Forward api impls ////////////////////// +Tensor add_n_impl(const std::vector& x) { + Backend kernel_backend = Backend::UNDEFINED; + DataLayout kernel_layout = DataLayout::UNDEFINED; + DataType kernel_data_type = DataType::UNDEFINED; + + if (kernel_backend == Backend::UNDEFINED || + kernel_layout == DataLayout::UNDEFINED || + kernel_data_type == DataType::UNDEFINED) { + auto kernel_key_set = ParseKernelKeyByInputArgs(x); + auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); + if (kernel_backend == Backend::UNDEFINED) { + kernel_backend = kernel_key.backend(); + } + if (kernel_layout == DataLayout::UNDEFINED) { + kernel_layout = kernel_key.layout(); + } + if (kernel_data_type == DataType::UNDEFINED) { + kernel_data_type = kernel_key.dtype(); + } + } + + bool is_sr_kernel = true; + for (auto& input : x) { + if (phi::DenseTensor::classof(input.impl().get())) { + is_sr_kernel = false; + break; + } + } + + const std::string kernel_name = (is_sr_kernel ? "add_n_sr" : "add_n"); + + VLOG(6) << "add_n API kernel key: [" << kernel_backend << ", " + << kernel_layout << ", " << kernel_data_type << "]"; + auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError( + kernel_name, {kernel_backend, kernel_layout, kernel_data_type}); + const auto& kernel = kernel_result.kernel; + VLOG(6) << kernel_name << " kernel: " << kernel; + auto* dev_ctx = GetDeviceContextByBackend( + kernel_result.has_fallback_cpu ? 
Backend::CPU : kernel_backend); + + Tensor api_output; + + if (is_sr_kernel) { + std::vector input_x(x.size()); + for (size_t i = 0; i < input_x.size(); ++i) { + input_x[i] = static_cast(x[i].impl().get()); + } + auto x_meta_vec = MakeMetaTensor(input_x); + std::vector x_metas(x_meta_vec.size()); + for (size_t i = 0; i < x_meta_vec.size(); ++i) { + x_metas[i] = &x_meta_vec[i]; + } + auto kernel_out = SetSelectedRowsKernelOutput(&api_output); + phi::MetaTensor meta_out(kernel_out); + phi::AddNInferMeta(x_metas, &meta_out); + + using kernel_signature = + void (*)(const platform::DeviceContext&, + const std::vector&, + phi::SelectedRows*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + + (*kernel_fn)(*dev_ctx, input_x, kernel_out); + } else { + std::vector input_x(x.size()); + for (size_t i = 0; i < input_x.size(); ++i) { + input_x[i] = x[i].impl().get(); + } + auto x_meta_vec = MakeMetaTensor(input_x); + std::vector x_metas(x_meta_vec.size()); + for (size_t i = 0; i < x_meta_vec.size(); ++i) { + x_metas[i] = &x_meta_vec[i]; + } + auto kernel_out = SetKernelOutput(&api_output); + phi::MetaTensor meta_out(kernel_out); + phi::AddNInferMeta(x_metas, &meta_out); + + using kernel_signature = + void (*)(const platform::DeviceContext&, + const std::vector&, + phi::DenseTensor*); + auto* kernel_fn = kernel.GetVariadicKernelFn(); + + (*kernel_fn)(*dev_ctx, input_x, kernel_out); + } + + return api_output; +} + Tensor copy_to_impl(const Tensor& x, Place place, bool blocking) { Tensor out; copy(x, place, blocking, &out); diff --git a/paddle/phi/api/lib/api_custom_impl.h b/paddle/phi/api/lib/api_custom_impl.h index e7fca7bfbc84d1..ab1d17051499c2 100644 --- a/paddle/phi/api/lib/api_custom_impl.h +++ b/paddle/phi/api/lib/api_custom_impl.h @@ -31,6 +31,8 @@ namespace experimental { ////////////////// Forward api impls ////////////////////// +Tensor add_n_impl(const std::vector& x); + std::tuple batch_norm_impl( const Tensor& x, const Tensor& scale, diff --git a/paddle/phi/api/lib/api_gen_utils.cc b/paddle/phi/api/lib/api_gen_utils.cc index cbcc475a0df5b4..6d72db7fa10d6b 100644 --- a/paddle/phi/api/lib/api_gen_utils.cc +++ b/paddle/phi/api/lib/api_gen_utils.cc @@ -88,12 +88,26 @@ std::shared_ptr TensorToStringTensor(const Tensor& tensor) { return std::dynamic_pointer_cast(tensor.impl()); } +std::shared_ptr TensorToSparseCooTensor( + const Tensor& tensor) { + return std::static_pointer_cast(tensor.impl()); +} /* ----------------- for infer_meta --------------------- */ phi::MetaTensor MakeMetaTensor(const phi::TensorBase& tensor) { return phi::MetaTensor(tensor); } +std::vector MakeMetaTensor( + const std::vector& tensors) { + std::vector meta_tensors; + meta_tensors.reserve(tensors.size()); + for (const auto* t : tensors) { + meta_tensors.emplace_back(*t); + } + return meta_tensors; +} + phi::MetaTensor MakeMetaTensor( const paddle::optional& tensor) { if (tensor) { @@ -112,6 +126,16 @@ std::vector MakeMetaTensor( return meta_tensors; } +std::vector MakeMetaTensor( + const std::vector& tensors) { + std::vector meta_tensors; + meta_tensors.reserve(tensors.size()); + for (const auto* t : tensors) { + meta_tensors.emplace_back(*t); + } + return meta_tensors; +} + std::vector MakeMetaTensor( const std::vector& tensors) { std::vector meta_tensors; @@ -130,6 +154,22 @@ phi::MetaTensor MakeMetaTensor( return phi::MetaTensor(); } +phi::MetaTensor MakeMetaTensor( + const paddle::optional& tensor) { + if (tensor) { + return {phi::MetaTensor(*tensor)}; + } + return phi::MetaTensor(); +} + +phi::MetaTensor 
MakeMetaTensor( + const paddle::optional& tensor) { + if (tensor) { + return {phi::MetaTensor(*tensor)}; + } + return phi::MetaTensor(); +} + std::vector MakeMetaTensor( const paddle::optional>& tensors) { std::vector meta_tensors; @@ -210,6 +250,9 @@ phi::SelectedRows* SetSelectedRowsKernelOutput(Tensor* out) { } phi::TensorBase* SetSparseKernelOutput(Tensor* out, TensorType type) { + if (!out) { + return nullptr; + } if (!out->initialized()) { if (type == TensorType::SPARSE_COO) { auto sparse_tensor = std::make_shared( diff --git a/paddle/phi/api/lib/api_gen_utils.h b/paddle/phi/api/lib/api_gen_utils.h index 2a7f283dabb649..98c6a1e0f72218 100644 --- a/paddle/phi/api/lib/api_gen_utils.h +++ b/paddle/phi/api/lib/api_gen_utils.h @@ -52,6 +52,8 @@ paddle::optional TensorToSelectedRows( std::shared_ptr TensorToStringTensor(const Tensor& tensor); +std::shared_ptr TensorToSparseCooTensor( + const Tensor& tensor); /* ----------------- for infer_meta --------------------- */ phi::MetaTensor MakeMetaTensor(const phi::TensorBase& tensor); @@ -65,12 +67,24 @@ std::vector MakeMetaTensor( std::vector MakeMetaTensor( const std::vector& tensors); +std::vector MakeMetaTensor( + const std::vector& tensors); + phi::MetaTensor MakeMetaTensor( const paddle::optional& tensor); +phi::MetaTensor MakeMetaTensor( + const paddle::optional& tensor); + +phi::MetaTensor MakeMetaTensor( + const paddle::optional& tensor); + std::vector MakeMetaTensor( const paddle::optional>& tensors); +std::vector MakeMetaTensor( + const std::vector& tensors); + /* ------------------ for output ----------------------- */ phi::DenseTensor* SetKernelOutput(Tensor* out); diff --git a/paddle/phi/api/lib/data_transform.cc b/paddle/phi/api/lib/data_transform.cc index 10b01f94662b5f..048a24ff5e312f 100644 --- a/paddle/phi/api/lib/data_transform.cc +++ b/paddle/phi/api/lib/data_transform.cc @@ -40,7 +40,7 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input, const TransformFlag& transform_flag) { // NOTE(dev): The default value of TransformFlag is True, if it is set with // False - // somewhere such as api.yaml or backward.yaml that means we should skip data + // somewhere such as ops.yaml or backward.yaml that means we should skip data // transform. Because "stop_transform_" has highest priority. 
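The add_n_impl shown above picks its kernel by scanning the inputs: it falls through to the selected-rows kernel "add_n_sr" only when no input is a DenseTensor, and otherwise uses the dense "add_n" kernel. A minimal sketch of that selection rule (the Kind enum stands in for the real phi tensor classes):

#include <cstdio>
#include <string>
#include <vector>

enum class Kind { kDenseTensor, kSelectedRows };

// Use the selected-rows kernel only when none of the inputs is a dense tensor.
std::string pick_add_n_kernel(const std::vector<Kind>& inputs) {
  bool is_sr_kernel = true;
  for (Kind k : inputs) {
    if (k == Kind::kDenseTensor) {
      is_sr_kernel = false;
      break;
    }
  }
  return is_sr_kernel ? "add_n_sr" : "add_n";
}

int main() {
  // All selected-rows inputs -> "add_n_sr"; any dense input -> "add_n".
  std::printf("%s\n", pick_add_n_kernel({Kind::kSelectedRows, Kind::kSelectedRows}).c_str());
  std::printf("%s\n", pick_add_n_kernel({Kind::kSelectedRows, Kind::kDenseTensor}).c_str());
  return 0;
}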
if (!transform_flag.need_trans_backend()) { return false; @@ -52,9 +52,9 @@ inline bool NeedTransformPlace(const paddle::platform::Place& input, return ret; } -inline bool NeedTransformLayout(const paddle::platform::Place& place, - const DataLayout& input, +inline bool NeedTransformLayout(const DataLayout& input, const DataLayout& target, + const paddle::platform::Place& place, const TransformFlag& transform_flag) { bool ret = transform_flag.need_trans_layout() && (input != DataLayout::ALL_LAYOUT && @@ -81,7 +81,7 @@ inline phi::DenseTensor TransDataLayout(const phi::DenseTensor& tensor, } template -phi::DenseTensor CastDateType(const Context& dev_ctx, +phi::DenseTensor CastDataType(const Context& dev_ctx, const phi::DenseTensor& tensor, DataType dtype) { switch (tensor.dtype()) { @@ -111,7 +111,7 @@ phi::DenseTensor CastDateType(const Context& dev_ctx, } #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -phi::DenseTensor CastDateType(const phi::GPUContext& dev_ctx, +phi::DenseTensor CastDataType(const phi::GPUContext& dev_ctx, const phi::DenseTensor& tensor, DataType dtype) { switch (tensor.dtype()) { @@ -151,11 +151,11 @@ inline phi::DenseTensor TransDataType(const phi::DenseTensor& tensor, if (platform::is_cpu_place(tensor.place())) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); - return CastDateType(*dev_ctx, tensor, dtype); + return CastDataType(*dev_ctx, tensor, dtype); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else if (platform::is_gpu_place(tensor.place())) { auto* dev_ctx = static_cast(pool.Get(tensor.place())); - return CastDateType(*dev_ctx, tensor, dtype); + return CastDataType(*dev_ctx, tensor, dtype); #endif } else { PADDLE_THROW(phi::errors::Unimplemented( @@ -202,10 +202,11 @@ phi::DenseTensor TransformData(phi::DenseTensor* tensor, bool trans_layout = false; bool trans_dtype = false; - if (NeedTransformLayout(tensor->place(), - tensor->layout(), + if (NeedTransformLayout(tensor->layout(), target_args_def.layout, - transform_flag)) { + tensor->place(), + transform_flag) && + tensor->dims().size() != 1) { out = TransDataLayout(out, target_args_def.layout); trans_layout = true; } @@ -240,9 +241,9 @@ std::shared_ptr PrepareData( dense_tensor.place(), target_args_def.backend, transform_flag) && !NeedTransformDataType( dense_tensor.dtype(), target_args_def.dtype, transform_flag) && - !NeedTransformLayout(dense_tensor.place(), - dense_tensor.layout(), + !NeedTransformLayout(dense_tensor.layout(), target_args_def.layout, + dense_tensor.place(), transform_flag))) { return std::static_pointer_cast(tensor_in); } @@ -277,9 +278,9 @@ std::unique_ptr> PrepareData( tensor_in->place(), target_args_def.backend, transform_flag) && !NeedTransformDataType( tensor_in->dtype(), target_args_def.dtype, transform_flag) && - !NeedTransformLayout(tensor_in->place(), - tensor_in->layout(), + !NeedTransformLayout(tensor_in->layout(), target_args_def.layout, + tensor_in->place(), transform_flag))) { pt_tensors->emplace_back( *std::dynamic_pointer_cast(tensor_in)); @@ -307,7 +308,7 @@ paddle::optional> PrepareData( void TransDataBackend(const phi::DenseTensor* tensor, Backend target_backend, phi::DenseTensor* out) { - if (tensor) { + if (tensor && tensor->initialized()) { *out = TransDataPlace(*tensor, phi::TransToPhiPlace(target_backend)); } } diff --git a/paddle/phi/api/lib/sparse_api_custom_impl.cc b/paddle/phi/api/lib/sparse_api_custom_impl.cc deleted file mode 100644 index 6aaf21a5e7f49f..00000000000000 --- a/paddle/phi/api/lib/sparse_api_custom_impl.cc +++ 
/dev/null @@ -1,202 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/phi/api/lib/sparse_api_custom_impl.h" - -#include - -#include "glog/logging.h" -#include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/core/kernel_registry.h" - -namespace paddle { -namespace experimental { -namespace sparse { - -Tensor to_sparse_coo_impl(const Tensor& x, const int64_t sparse_dim) { - if (x.layout() == phi::DataLayout::SPARSE_COO) { - return x; - } - - // 1. Get kernel signature and kernel - std::string kernel_name = "dense_to_coo"; - if (x.layout() == phi::DataLayout::SPARSE_CSR) { - kernel_name = "csr_to_coo"; - } - - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - - auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError( - kernel_name, kernel_key); - const auto& kernel = kernel_result.kernel; - - VLOG(6) << "add API kernel key: " << kernel_key; - VLOG(6) << "to API kernel: " << kernel; - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = phi::KernelContext(dev_ctx); - - // 3. Auto data transform - if (x.layout() == phi::DataLayout::SPARSE_CSR) { - auto input = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(input.get()); - } else { - auto input = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(input.get()); - kernel_context.EmplaceBackAttr(sparse_dim); - } - - // 4. InferMeta - auto indices_meta = - phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW); - auto elements_meta = phi::DenseTensorMeta(x.dtype(), {1}, x.layout()); - - // 5. Prepare outputs - // create empty SparseCooTensor - phi::DenseTensor non_zero_indices(std::make_shared(), - std::move(indices_meta)); - phi::DenseTensor non_zero_elements(std::make_shared(), - std::move(elements_meta)); - auto coo = std::make_shared( - non_zero_indices, non_zero_elements, x.dims()); - - kernel_context.EmplaceBackOutput(coo.get()); - Tensor out; - out.set_impl(coo); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -Tensor to_sparse_csr_impl(const Tensor& x) { - if (x.layout() == phi::DataLayout::SPARSE_CSR) { - return x; - } - // 1. Get kernel signature and kernel - std::string kernel_name = "dense_to_csr"; - if (x.layout() == phi::DataLayout::SPARSE_COO) { - kernel_name = "coo_to_csr"; - } - - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - - auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError( - kernel_name, kernel_key); - const auto& kernel = kernel_result.kernel; - - VLOG(6) << "add API kernel key: " << kernel_key; - VLOG(6) << "to API kernel: " << kernel; - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = phi::KernelContext(dev_ctx); - - // 3. 
Auto data transform - if (x.layout() == phi::DataLayout::SPARSE_COO) { - auto input = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(input.get()); - } else { - auto input = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(input.get()); - } - - // 4. InferMeta - auto crows_meta = - phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW); - auto cols_meta = - phi::DenseTensorMeta(phi::DataType::INT64, {1}, phi::DataLayout::NCHW); - auto elements_meta = phi::DenseTensorMeta(x.dtype(), {1}, x.layout()); - - // 5. Prepare outputs - // create empty SparseCooTensor - phi::DenseTensor non_zero_crows(std::make_shared(), - std::move(crows_meta)); - phi::DenseTensor non_zero_cols(std::make_shared(), - std::move(cols_meta)); - phi::DenseTensor non_zero_elements(std::make_shared(), - std::move(elements_meta)); - auto csr = std::make_shared( - non_zero_crows, non_zero_cols, non_zero_elements, x.dims()); - - kernel_context.EmplaceBackOutput(csr.get()); - Tensor out; - out.set_impl(csr); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -Tensor to_dense_impl(const Tensor& x) { - if (x.layout() != phi::DataLayout::SPARSE_CSR && - x.layout() != phi::DataLayout::SPARSE_COO) { - return x; - } - - // 1. Get kernel signature and kernel - std::string kernel_name = "coo_to_dense"; - if (x.layout() == phi::DataLayout::SPARSE_CSR) { - kernel_name = "csr_to_dense"; - } - - auto kernel_key_set = ParseKernelKeyByInputArgs(x); - auto kernel_key = kernel_key_set.GetHighestPriorityKernelKey(); - - auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError( - kernel_name, kernel_key); - const auto& kernel = kernel_result.kernel; - - VLOG(6) << "add API kernel key: " << kernel_key; - VLOG(6) << "to API kernel: " << kernel; - - // 2. Get Device Context - auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend()); - auto kernel_context = phi::KernelContext(dev_ctx); - - // 3. Auto data transform - if (x.layout() == phi::DataLayout::SPARSE_COO) { - auto input = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(input.get()); - } else { - auto input = std::dynamic_pointer_cast(x.impl()); - kernel_context.EmplaceBackInput(input.get()); - } - - // 4. InferMeta - auto dense_meta = phi::DenseTensorMeta(x.dtype(), x.dims(), x.layout()); - - // 5. Prepare outputs - // create empty SparseCooTensor - auto dense_out = std::make_shared( - std::make_shared(), std::move(dense_meta)); - - kernel_context.EmplaceBackOutput(dense_out.get()); - Tensor out; - out.set_impl(dense_out); - - // 6. Call kernel - kernel(&kernel_context); - - return out; -} - -} // namespace sparse -} // namespace experimental -} // namespace paddle diff --git a/paddle/phi/api/lib/tensor_method.cc b/paddle/phi/api/lib/tensor_method.cc index 96f9aefbb1f649..312f52fa5e6614 100644 --- a/paddle/phi/api/lib/tensor_method.cc +++ b/paddle/phi/api/lib/tensor_method.cc @@ -24,6 +24,7 @@ limitations under the License. 
*/ #include "paddle/phi/api/lib/kernel_dispatch.h" #include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/infermeta/unary.h" +// clang-format off namespace paddle { namespace experimental { @@ -165,7 +166,11 @@ void Tensor::copy_(const Tensor &src, static_cast(impl_.get())); } else if (kernel_type == KernelType::SPARSE_COO_KERNEL) { SetSparseKernelOutput(this, TensorType::SPARSE_COO); - // TODO(zhangkaihuo) add sparse infer_meta + phi::MetaTensor meta_out(impl_.get()); + phi::UnchangedInferMeta( + MakeMetaTensor( + *(std::static_pointer_cast(src.impl_))), + &meta_out); phi::Copy(*dev_ctx, (*(std::static_pointer_cast(src.impl_))), target_place, @@ -173,7 +178,11 @@ void Tensor::copy_(const Tensor &src, static_cast(impl_.get())); } else if (kernel_type == KernelType::SPARSE_CSR_KERNEL) { SetSparseKernelOutput(this, TensorType::SPARSE_CSR); - // TODO(zhangkaihuo) add sparse infer_meta + phi::MetaTensor meta_out(impl_.get()); + phi::UnchangedInferMeta( + MakeMetaTensor( + *(std::static_pointer_cast(src.impl_))), + &meta_out); phi::Copy(*dev_ctx, (*(std::static_pointer_cast(src.impl_))), target_place, diff --git a/paddle/phi/api/yaml/api_compat.yaml b/paddle/phi/api/yaml/api_compat.yaml deleted file mode 100644 index 2f34993e7ff103..00000000000000 --- a/paddle/phi/api/yaml/api_compat.yaml +++ /dev/null @@ -1,412 +0,0 @@ -- api : abs - backward : abs_grad - extra : - attrs : [bool use_cudnn = false, bool use_mkldnn = false] - -- api : addmm - backward : addmm_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : affine_grid - backward : affine_grid_grad - extra : - attrs : [bool use_cudnn = true] - -- api : angle - backward : angle_grad - extra : - attrs : [bool use_cudnn = false, bool use_mkldnn = false] - -- api : atan2 - inputs : - {x : X1, y : X2} - outputs : - out : Out - -- api : batch_norm - backward : batch_norm_grad - extra : - attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] - -- api : bernoulli - inputs : - x : X - outputs : - out : Out - -- api : bicubic_interp (bicubic_interp_v2) - backward : bicubic_interp_grad (bicubic_interp_v2_grad) - extra : - attrs : [bool use_mkldnn = false] - -- api : bilinear_interp (bilinear_interp_v2) - backward : bilinear_interp_grad (bilinear_interp_v2_grad) - extra : - attrs : [bool use_mkldnn = false] - -- api : cholesky - inputs : - x : X - outputs : - out : Out - -- api : cholesky_solve - inputs : - {x : X, y : Y} - outputs : - out : Out - -- api : clip - backward : clip_grad - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - -- api : concat - backward : concat_grad - extra : - attrs : [bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] - -- api : conv2d - backward : conv2d_grad - extra : - attrs : [bool is_test = false, bool use_cudnn = true, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, - bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, - str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, bool use_addto = false, - bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, - float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, - int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] - -- api : conv2d_fusion - extra : - attrs : [bool is_test = false, bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, - bool 
use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, - str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, bool use_addto = false, - bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, - float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, - int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] - -- api : conv2d_transpose - backward : conv2d_transpose_grad - extra : - attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool force_fp32_output = false, - str mkldnn_data_type = "float32", bool fuse_relu = false, - str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, - int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB()] - -- api : conv3d - backward : conv3d_grad - extra : - attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, str mkldnn_data_type = "float32", bool fuse_relu = false, - str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, - bool use_addto = false, bool fuse_residual_connection = false, bool force_fp32_output = false, - int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] - -- api : conv3d_transpose - backward : conv3d_transpose_grad - extra : - attrs : [bool use_cudnn = true, bool use_mkldnn = false, int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB()] - -- api : cross - inputs : - {x : X, y : Y} - attrs : - axis : dim - outputs : - out : Out - -- api : data_norm - backward : data_norm_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : depthwise_conv2d - backward : depthwise_conv2d_grad - extra : - attrs : [bool is_test = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, - bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, - str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, bool use_addto = false, - bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, - float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, - int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] - -- api : depthwise_conv2d_transpose - backward : depthwise_conv2d_transpose_grad - extra : - attrs : [bool is_test = false, bool use_cudnn = false, bool use_mkldnn = false, bool force_fp32_output = false, - str mkldnn_data_type = "float32", bool fuse_relu = false, - str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, - int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB()] - -- api : diag (diag_v2) - backward : diag_grad (diag_v2_grad) - inputs : - x : X - outputs : - out : Out - -- api : diagonal - inputs : - x : Input - outputs : - out : Out - -- api : digamma - inputs : - x : X - outputs : - out : Out - -- api : dist - inputs : - {x : X, y : Y} - outputs : - out : Out - -- api : dot - inputs : - {x : X, y : Y} - outputs : - out : Out - -- api : dropout - backward : dropout_grad - extra : - attrs : [bool fix_seed = false, int seed = 0] - -- api : dropout_nd - backward : dropout_nd_grad - extra : - attrs : [bool fix_seed = false, int seed = 0] - -- api : erf - inputs : - x : X - outputs : - out : Out - -- api : erfinv - inputs : - x : X - outputs : - out : Out - -- api : fft_c2c - inputs: {x: X} - 
outputs: {out: Out} - -- api : fft_c2r - inputs: {x: X} - outputs: {out: Out} - -- api : fft_r2c - inputs: {x: X} - outputs: {out: Out} - -- api : frobenius_norm - backward : frobenius_norm_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : gelu - backward : gelu_grad - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool use_cudnn = false] - -- api : grid_sampler - backward : grid_sampler_grad - extra : - attrs : [bool use_cudnn = true] - -- api : gru - backward : gru_grad - extra : - attrs : [bool is_test = false] - -- api : inplace_abn - backward : inplace_abn_grad - extra : - attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] - -- api : layer_norm - backward : layer_norm_grad - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] - -- api : lgamma - inputs : - x : X - outputs : - out : Out - -- api : linear_interp (linear_interp_v2) - backward : linear_interp_grad (linear_interp_v2_grad) - extra : - attrs : [bool use_mkldnn = false] - -- api : log_softmax - backward : log_softmax_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : lrn - backward : lrn_grad - extra : - attrs : [bool use_mkldnn = false, bool is_test = false] - -- api : matmul (matmul_v2) - backward : matmul_grad (matmul_v2_grad) - extra : - attrs : [bool use_mkldnn = false, 'int[] fused_reshape_Out = {}', 'int[] fused_transpose_Out = {}', - str mkldnn_data_type = "float32", 'int[] fused_reshape_X = {}', 'int[] fused_reshape_Y = {}', - 'int[] fused_transpose_X = {}', 'int[] fused_transpose_Y = {}',] - -- api : mv - inputs : - {x : X, vec : Vec} - outputs : - out : Out - -- api : nearest_interp (nearest_interp_v2) - backward : nearest_interp_grad (nearest_interp_v2_grad) - extra : - attrs : [bool use_mkldnn = false] - -- api : pad2d - backward : pad2d_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : pad3d - backward : pad3d_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : partial_sum - backward : partial_sum_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : poisson - inputs : - x : X - outputs : - out : Out - -- api : reduce_all - extra : - attrs : [bool use_mkldnn = false] - -- api : reduce_amax - backward : reduce_amax_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : reduce_amin - backward : reduce_amin_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : reduce_any - extra : - attrs : [bool use_mkldnn = false] - -- api : reduce_max - backward : reduce_max_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : reduce_mean - backward : reduce_mean_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : reduce_min - backward : reduce_min_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : reduce_prod - backward : reduce_prod_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : reduce_sum - backward : reduce_sum_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : renorm - backward : renorm_grad - extra : - attrs : [bool use_mkldnn = false, bool use_cudnn = false] - -- api : rnn - backward : rnn_grad - extra : - attrs : [bool is_test = false] - -- api : seed - extra : - attrs : [bool deterministic = false, str rng_name = "", bool force_cpu = false] - -- api : shape - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - -- api : shuffle_channel - backward : shuffle_channel_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : slice - backward : slice_grad - extra : - attrs : [bool 
use_mkldnn = false, str mkldnn_data_type = "float32"] - -- api : softmax - backward : softmax_grad - extra : - attrs : [bool use_cudnn = false, bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] -- api : prelu - backward : prelu_grad - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] - -- api : solve - inputs : - {x : X, y : Y} - outputs : - out : Out - -- api : squeeze (squeeze2) - backward : squeeze_grad (squeeze2_grad) - extra : - attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] - -- api : stack - backward : stack_grad - extra : - attrs : [bool use_mkldnn = false] - -- api : sync_batch_norm - backward : sync_batch_norm_grad - extra : - attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] - -- api : trace - inputs : - x : Input - outputs : - out : Out - -- api : trilinear_interp (trilinear_interp_v2) - backward : trilinear_interp_grad (trilinear_interp_v2_grad) - extra : - attrs : [bool use_mkldnn = false] - -- api : trunc - inputs : - x : X - outputs : - out : Out diff --git a/paddle/phi/api/yaml/backward.yaml b/paddle/phi/api/yaml/backward.yaml index d2ed2533ae03e4..6603c785a0f48a 100644 --- a/paddle/phi/api/yaml/backward.yaml +++ b/paddle/phi/api/yaml/backward.yaml @@ -1,4 +1,4 @@ -- backward_api : atan2_grad +- backward_op : atan2_grad forward : atan2 (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) @@ -8,7 +8,7 @@ kernel : func : atan2_grad -- backward_api : cholesky_grad +- backward_op : cholesky_grad forward : cholesky (Tensor x, bool upper) -> Tensor(out) args : (Tensor out, Tensor out_grad, bool upper) output : Tensor(x_grad) @@ -18,7 +18,7 @@ kernel : func : cholesky_grad -- backward_api : cholesky_solve_grad +- backward_op : cholesky_solve_grad forward : cholesky_solve (Tensor x, Tensor y, bool upper) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, bool upper) output : Tensor(x_grad), Tensor(y_grad) @@ -28,7 +28,7 @@ kernel : func : cholesky_solve_grad -- backward_api : cross_grad +- backward_op : cross_grad forward : cross (Tensor x, Tensor y, int axis = 9) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis) output : Tensor(x_grad), Tensor(y_grad) @@ -39,7 +39,7 @@ func : cross_grad data_type : out_grad -- backward_api : diag_grad +- backward_op : diag_grad forward : diag (Tensor x, int offset, float padding_value) -> Tensor(out) args : (Tensor x, Tensor out_grad, int offset) output : Tensor(x_grad) @@ -51,7 +51,7 @@ data_type : out_grad no_need_buffer : x -- backward_api : diagonal_grad +- backward_op : diagonal_grad forward : diagonal (Tensor x, int offset, int axis1, int axis2) -> Tensor(out) args : (Tensor x, Tensor out_grad, int offset = 0, int axis1 = 0, int axis2 = 1) output : Tensor(x_grad) @@ -63,7 +63,7 @@ data_type : out_grad no_need_buffer : x -- backward_api : digamma_grad +- backward_op : digamma_grad forward : digamma (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -73,7 +73,7 @@ kernel : func : digamma_grad -- backward_api : dist_grad +- backward_op : dist_grad forward : dist (Tensor x, Tensor y, float p) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, float p) output : Tensor(x_grad), Tensor(y_grad) @@ -83,7 +83,7 @@ kernel : func : dist_grad -- backward_api : dot_grad +- backward_op : dot_grad forward : dot (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, 
Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) @@ -94,7 +94,7 @@ func : dot_grad data_type : out_grad -- backward_api : erf_grad +- backward_op : erf_grad forward : erf (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -105,7 +105,7 @@ func : erf_grad data_type : out_grad -- backward_api : erfinv_grad +- backward_op : erfinv_grad forward : erfinv (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) @@ -115,7 +115,7 @@ kernel : func : erfinv_grad -- backward_api : fft_c2c_grad +- backward_op : fft_c2c_grad forward: fft_c2c(Tensor x, int64_t[] axes, str normalization, bool forward) -> Tensor(out) args : (Tensor out_grad, int64_t[] axes, str normalization, bool forward) output: Tensor(x_grad) @@ -125,7 +125,7 @@ kernel : func : fft_c2c_grad -- backward_api : fft_c2r_grad +- backward_op : fft_c2r_grad forward: fft_c2r(Tensor x, int64_t[] axes, str normalization, bool forward, int64_t last_dim_size) -> Tensor(out) args : (Tensor out_grad, int64_t[] axes, str normalization, bool forward, int64_t last_dim_size) output: Tensor(x_grad) @@ -135,7 +135,7 @@ func : fft_c2r_grad data_type: out_grad -- backward_api : fft_r2c_grad +- backward_op : fft_r2c_grad forward: fft_r2c(Tensor x, int64_t[] axes, str normalization, bool forward, bool onesided) -> Tensor(out) args : (Tensor x, Tensor out_grad, int64_t[] axes, str normalization, bool forward, bool onesided) output: Tensor(x_grad) @@ -147,7 +147,13 @@ data_type: out_grad no_need_buffer: x -- backward_api : graph_send_uv_grad +- backward_op : flip_grad + forward : flip (Tensor x, int[] axis) -> Tensor(out) + args : (Tensor out_grad, int[] axis) + output : Tensor(x_grad) + invoke : flip(out_grad, axis) + +- backward_op : graph_send_uv_grad forward : graph_send_uv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") -> Tensor(out) args: (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out_grad, str message_op = "ADD") output : Tensor(x_grad), Tensor(y_grad) @@ -158,7 +164,7 @@ func : graph_send_uv_grad data_type : x -- backward_api : lgamma_grad +- backward_op : lgamma_grad forward : lgamma(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -168,7 +174,7 @@ kernel : func : lgamma_grad -- backward_api : mv_grad +- backward_op : mv_grad forward : mv (Tensor x, Tensor vec) -> Tensor(out) args : (Tensor x, Tensor vec, Tensor out_grad) output : Tensor(x_grad), Tensor(vec_grad) @@ -178,7 +184,7 @@ kernel : func : mv_grad -- backward_api : poisson_grad +- backward_op : poisson_grad forward : poisson (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) @@ -188,7 +194,7 @@ kernel : func : poisson_grad -- backward_api : solve_grad +- backward_op : solve_grad forward : solve (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) @@ -198,7 +204,7 @@ kernel : func : solve_grad -- backward_api : trace_grad +- backward_op : trace_grad forward : trace (Tensor x, int offset, int axis1, int axis2) -> Tensor(out) args : (Tensor x, Tensor out_grad, int offset, int axis1, int axis2) output : Tensor(x_grad) @@ -210,7 +216,7 @@ data_type : out_grad no_need_buffer : x -- backward_api : trunc_grad +- backward_op : trunc_grad forward : trunc (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) diff --git a/paddle/phi/api/yaml/generator/api_base.py b/paddle/phi/api/yaml/generator/api_base.py index 
f76bc688ec25e7..cbee67aaa5c371 100644 --- a/paddle/phi/api/yaml/generator/api_base.py +++ b/paddle/phi/api/yaml/generator/api_base.py @@ -50,7 +50,7 @@ def __init__(self, api_item_yaml): self.inplace_map, self.view_map = {}, {} def get_api_name(self, api_item_yaml): - return api_item_yaml['api'] + return api_item_yaml['op'] def get_api_func_name(self): return self.api @@ -881,7 +881,7 @@ def gen_kernel_code(self, kernel_name, code_indent, inplace_flag=False): def get_condition_code(self, kernel_name): assert self.kernel['dispatch'][kernel_name], \ - f"{self.api} api: the tensor type of inputs and outputs for kernel isn't set, see also 'kernel:func' of 'scale' in api.yaml." + f"{self.api} api: the tensor type of inputs and outputs for kernel isn't set, see also 'kernel:func' of 'scale' in ops.yaml." input_types = self.kernel['dispatch'][kernel_name][0] condition_list = [] for i, in_type in enumerate(input_types): diff --git a/paddle/phi/api/yaml/generator/api_gen.py b/paddle/phi/api/yaml/generator/api_gen.py index dc30472f9447ec..80a9d586ca3e0e 100644 --- a/paddle/phi/api/yaml/generator/api_gen.py +++ b/paddle/phi/api/yaml/generator/api_gen.py @@ -319,7 +319,7 @@ def main(): parser.add_argument('--api_yaml_path', help='path to api yaml file', nargs='+', - default='paddle/phi/api/yaml/api.yaml') + default='paddle/phi/api/yaml/ops.yaml') parser.add_argument('--api_header_path', help='output of generated api header code file', diff --git a/paddle/phi/api/yaml/generator/backward_api_gen.py b/paddle/phi/api/yaml/generator/backward_api_gen.py index f2a42be7373d9f..1a8b9cc4d390b0 100644 --- a/paddle/phi/api/yaml/generator/backward_api_gen.py +++ b/paddle/phi/api/yaml/generator/backward_api_gen.py @@ -28,14 +28,14 @@ def __init__(self, backward_item_yaml): self.no_need_buffer = self.parse_no_need_buffer(backward_item_yaml) def get_api_name(self, api_item_yaml): - return api_item_yaml['backward_api'] + return api_item_yaml['backward_op'] def parse_forward_config(self, forward_config): # api_name (const Tensor& input, ... , int attr, ...) 
-> Tensor(out) result = re.search( - r"(?P[a-z][a-z0-9_]+)\s*(?P\([^\)]+\))\s*->\s*(?P.+)", + r"(?P[a-z][a-z0-9_]+)\s*(?P\([^\)]+\))\s*->\s*(?P.+)", forward_config) - api = result.group('api') + api = result.group('op') _, outputs, _, = self.parse_output(self.api, result.group('outputs')) outputs = [item.split('@')[0] for item in outputs] fw_inputs, fw_attrs = self.parse_input_and_attr(api, diff --git a/paddle/phi/api/yaml/generator/generate_op.py b/paddle/phi/api/yaml/generator/generate_op.py index 24f30323a935b2..4984db8f8b3cda 100644 --- a/paddle/phi/api/yaml/generator/generate_op.py +++ b/paddle/phi/api/yaml/generator/generate_op.py @@ -21,18 +21,32 @@ import yaml from jinja2 import Environment, FileSystemLoader, StrictUndefined -from filters import to_op_attr_type, to_opmaker_name, to_opmaker_name_cstr, to_pascal_case -from tests import is_base_api, is_vec, is_scalar, is_initializer_list, supports_inplace, supports_no_need_buffer +from filters import ( + to_op_attr_type, + to_opmaker_name, + to_opmaker_name_cstr, + to_pascal_case, +) +from tests import ( + is_base_api, + is_vec, + is_scalar, + is_initializer_list, + supports_inplace, + supports_no_need_buffer, +) from filters import to_input_name, cartesian_prod_mapping from parse_utils import to_named_dict file_loader = FileSystemLoader(Path(__file__).parent / "templates") -env = Environment(loader=file_loader, - keep_trailing_newline=True, - trim_blocks=True, - lstrip_blocks=True, - undefined=StrictUndefined, - extensions=['jinja2.ext.do']) +env = Environment( + loader=file_loader, + keep_trailing_newline=True, + trim_blocks=True, + lstrip_blocks=True, + undefined=StrictUndefined, + extensions=['jinja2.ext.do'], +) env.filters["to_op_attr_type"] = to_op_attr_type env.filters["to_opmaker_name"] = to_opmaker_name env.filters["to_pascal_case"] = to_pascal_case @@ -56,7 +70,6 @@ def restruct_io(api): # replace name of op and params for OpMaker def replace_compat_name(api_op_map, forward_api_dict, backward_api_dict): - def get_api_and_op_name(api_item): names = api_item.split('(') if len(names) == 1: @@ -65,7 +78,7 @@ def get_api_and_op_name(api_item): return names[0].strip(), names[1].split(')')[0].strip() for api_args in api_op_map: - api_name, op_name = get_api_and_op_name(api_args['api']) + api_name, op_name = get_api_and_op_name(api_args['op']) if api_name not in forward_api_dict: continue forward_api_item = forward_api_dict[api_name] @@ -76,7 +89,8 @@ def get_api_and_op_name(api_item): forward_api_item['op_name'] = op_name if 'backward' in api_args and has_backward: bw_api_name, bw_op_name = get_api_and_op_name( - api_args['backward'].split(',')[0]) + api_args['backward'].split(',')[0] + ) forward_api_item['backward'] = bw_op_name backward_api_item['op_name'] = bw_op_name @@ -102,8 +116,10 @@ def get_api_and_op_name(api_item): ] if forward_api_item['kernel']['data_type']: forward_api_item['kernel']['data_type']['candidates'] = [ - args_map[param] if param in args_map else param for param in - forward_api_item['kernel']['data_type']['candidates'] + args_map[param] if param in args_map else param + for param in forward_api_item['kernel']['data_type'][ + 'candidates' + ] ] if forward_api_item['kernel']['backend']: forward_api_item['kernel']['backend']['candidates'] = [ @@ -130,21 +146,36 @@ def get_api_and_op_name(api_item): for args_item in backward_api_item['inputs']: if args_item['name'] in args_map: args_item['name'] = args_map[args_item['name']] - elif args_item['name'].endswith( - '_grad') and args_item['name'][:-5] in args_map: 
- args_map[args_item['name']] = args_map[args_item['name'] - [:-5]] + '_grad' + elif ( + args_item['name'].endswith('_grad') + and args_item['name'][:-5] in args_map + ): + args_map[args_item['name']] = ( + args_map[args_item['name'][:-5]] + '_grad' + ) args_item['name'] = args_map[args_item['name']] for args_item in backward_api_item['attrs']: if args_item['name'] in args_map: args_item['name'] = args_map[args_item['name']] for args_item in backward_api_item['outputs']: - if args_item['name'].endswith( - '_grad') and args_item['name'][:-5] in args_map: - args_map[args_item['name']] = args_map[args_item['name'] - [:-5]] + '_grad' + if ( + args_item['name'].endswith('_grad') + and args_item['name'][:-5] in args_map + ): + args_map[args_item['name']] = ( + args_map[args_item['name'][:-5]] + '_grad' + ) args_item['name'] = args_map[args_item['name']] + if 'invoke' in backward_api_item: + backward_api_item['invoke']['args'] = [ + args_map[param.strip()] + if param.strip() in args_map + else param.strip() + for param in backward_api_item['invoke']['args'].split(',') + ] + continue + backward_api_item['infer_meta']['param'] = [ args_map[param] if param in args_map else param for param in backward_api_item['infer_meta']['param'] @@ -155,18 +186,24 @@ def get_api_and_op_name(api_item): ] if backward_api_item['kernel']['data_type']: backward_api_item['kernel']['data_type']['candidates'] = [ - args_map[param] if param in args_map else param for param in - backward_api_item['kernel']['data_type']['candidates'] + args_map[param] if param in args_map else param + for param in backward_api_item['kernel']['data_type'][ + 'candidates' + ] ] if backward_api_item['kernel']['backend']: backward_api_item['kernel']['backend']['candidates'] = [ - args_map[param] if param in args_map else param for param in - backward_api_item['kernel']['backend']['candidates'] + args_map[param] if param in args_map else param + for param in backward_api_item['kernel']['backend'][ + 'candidates' + ] ] if backward_api_item['kernel']['layout']: backward_api_item['kernel']['layout']['candidates'] = [ - args_map[param] if param in args_map else param for param in - backward_api_item['kernel']['layout']['candidates'] + args_map[param] if param in args_map else param + for param in backward_api_item['kernel']['layout'][ + 'candidates' + ] ] if backward_api_item['no_need_buffer']: backward_api_item['no_need_buffer'] = [ @@ -175,9 +212,56 @@ def get_api_and_op_name(api_item): ] -def main(api_yaml_path, backward_yaml_path, api_compat_yaml_path, - api_version_yaml_path, output_op_path, output_arg_map_path): - with open(api_yaml_path, "rt") as f: +def process_invoke_op(forward_api_dict, backward_api_dict): + for bw_api in backward_api_dict.values(): + if 'invoke' in bw_api: + invoke_op = bw_api['invoke']['func'] + args_list = bw_api['invoke']['args'] + args_index = 0 + if invoke_op in forward_api_dict: + reuse_op = forward_api_dict[invoke_op] + bw_api['invoke']['inputs'] = [] + bw_api['invoke']['attrs'] = [] + bw_api['invoke']['outputs'] = [] + for input_item in reuse_op['inputs']: + bw_api['invoke']['inputs'].append( + { + 'name': input_item['name'], + 'value': args_list[args_index], + } + ) + args_index = args_index + 1 + for attr in reuse_op['attrs']: + if args_index < len(args_list): + attr_value = ( + f"this->GetAttr(\"{args_list[args_index]}\")" + if args_list[args_index] in bw_api['attr_dict'] + else args_list[args_index] + ) + bw_api['invoke']['attrs'].append( + {'name': attr['name'], 'value': attr_value} + ) + args_index = 
args_index + 1 + else: + break + for idx, output_item in enumerate(reuse_op['outputs']): + bw_api['invoke']['outputs'].append( + { + 'name': output_item['name'], + 'value': bw_api['outputs'][idx]['name'], + } + ) + + +def main( + ops_yaml_path, + backward_yaml_path, + op_compat_yaml_path, + op_version_yaml_path, + output_op_path, + output_arg_map_path, +): + with open(ops_yaml_path, "rt") as f: apis = yaml.safe_load(f) apis = [restruct_io(api) for api in apis] forward_api_dict = to_named_dict(apis) @@ -187,13 +271,13 @@ def main(api_yaml_path, backward_yaml_path, api_compat_yaml_path, backward_apis = [restruct_io(api) for api in backward_apis] backward_api_dict = to_named_dict(backward_apis) - with open(api_version_yaml_path, "rt") as f: + with open(op_version_yaml_path, "rt") as f: api_versions = yaml.safe_load(f) # add api version info into api for api_version in api_versions: - forward_api_dict[api_version['api']]['version'] = api_version['version'] + forward_api_dict[api_version['op']]['version'] = api_version['version'] - with open(api_compat_yaml_path, "rt") as f: + with open(op_compat_yaml_path, "rt") as f: api_op_map = yaml.safe_load(f) for api in apis: @@ -203,6 +287,9 @@ def main(api_yaml_path, backward_yaml_path, api_compat_yaml_path, replace_compat_name(api_op_map, forward_api_dict, backward_api_dict) + # prepare for invoke case + process_invoke_op(forward_api_dict, backward_api_dict) + # fill backward field for an api if another api claims it as forward for name, backward_api in backward_api_dict.items(): forward_name = backward_api["forward"]["name"] @@ -224,9 +311,9 @@ def main(api_yaml_path, backward_yaml_path, api_compat_yaml_path, op_template = env.get_template('op.c.j2') with open(output_op_path, "wt") as f: - msg = op_template.render(apis=apis, - backward_apis=backward_apis, - api_dict=api_dict) + msg = op_template.render( + apis=apis, backward_apis=backward_apis, api_dict=api_dict + ) f.write(msg) ks_template = env.get_template('ks.c.j2') @@ -237,28 +324,35 @@ def main(api_yaml_path, backward_yaml_path, api_compat_yaml_path, if __name__ == "__main__": parser = argparse.ArgumentParser( - description="Generate operator file from api yaml.") - parser.add_argument('--api_yaml_path', - type=str, - help="parsed api yaml file.") - parser.add_argument('--backward_api_yaml_path', - type=str, - help="parsed backward api yaml file.") - parser.add_argument('--api_compat_yaml_path', - type=str, - help="api args compat yaml file.") - parser.add_argument('--api_version_yaml_path', - type=str, - help="api version yaml file.") - parser.add_argument("--output_op_path", - type=str, - help="path to save generated operators.") + description="Generate operator file from api yaml." + ) + parser.add_argument( + '--ops_yaml_path', type=str, help="parsed ops yaml file." + ) + parser.add_argument( + '--backward_yaml_path', type=str, help="parsed backward ops yaml file." + ) + parser.add_argument( + '--op_compat_yaml_path', type=str, help="ops args compat yaml file." + ) + parser.add_argument( + '--op_version_yaml_path', type=str, help="ops version yaml file." + ) + parser.add_argument( + "--output_op_path", type=str, help="path to save generated operators." 
+ ) parser.add_argument( "--output_arg_map_path", type=str, - help="path to save generated argument mapping functions.") + help="path to save generated argument mapping functions.", + ) args = parser.parse_args() - main(args.api_yaml_path, args.backward_api_yaml_path, - args.api_compat_yaml_path, args.api_version_yaml_path, - args.output_op_path, args.output_arg_map_path) + main( + args.ops_yaml_path, + args.backward_yaml_path, + args.op_compat_yaml_path, + args.op_version_yaml_path, + args.output_op_path, + args.output_arg_map_path, + ) diff --git a/paddle/phi/api/yaml/generator/generate_sparse_op.py b/paddle/phi/api/yaml/generator/generate_sparse_op.py new file mode 100644 index 00000000000000..48ba0d81eca3d6 --- /dev/null +++ b/paddle/phi/api/yaml/generator/generate_sparse_op.py @@ -0,0 +1,168 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import os +from pathlib import Path + +import yaml +from jinja2 import Environment, FileSystemLoader, StrictUndefined + +from filters import ( + to_op_attr_type, + to_opmaker_name, + to_opmaker_name_cstr, + to_pascal_case, +) +from tests import ( + is_base_api, + is_vec, + is_scalar, + is_initializer_list, + supports_inplace, + supports_no_need_buffer, +) +from filters import to_input_name, cartesian_prod_mapping +from parse_utils import to_named_dict +from generate_op import process_invoke_op + +file_loader = FileSystemLoader(Path(__file__).parent / "templates") +env = Environment( + loader=file_loader, + keep_trailing_newline=True, + trim_blocks=True, + lstrip_blocks=True, + undefined=StrictUndefined, + extensions=['jinja2.ext.do'], +) +env.filters["to_op_attr_type"] = to_op_attr_type +env.filters["to_opmaker_name"] = to_opmaker_name +env.filters["to_pascal_case"] = to_pascal_case +env.filters["to_input_name"] = to_input_name +env.filters["to_opmaker_name_cstr"] = to_opmaker_name_cstr +env.filters["cartesian_prod_mapping"] = cartesian_prod_mapping +env.tests["base_api"] = is_base_api +env.tests["vec"] = is_vec +env.tests["scalar"] = is_scalar +env.tests["initializer_list"] = is_initializer_list +env.tests["supports_inplace"] = supports_inplace +env.tests["supports_no_need_buffer"] = supports_no_need_buffer + + +def restruct_io(api): + api["input_dict"] = to_named_dict(api["inputs"]) + api["attr_dict"] = to_named_dict(api["attrs"]) + api["output_dict"] = to_named_dict(api["outputs"]) + return api + + +SPARSE_OP_PREFIX = 'sparse_' + + +def main( + api_yaml_path, backward_yaml_path, output_op_path, output_arg_map_path +): + with open(api_yaml_path, "rt") as f: + apis = yaml.safe_load(f) + apis = [restruct_io(api) for api in apis] + forward_api_dict = to_named_dict(apis) + + with open(backward_yaml_path, "rt") as f: + backward_apis = yaml.safe_load(f) + backward_apis = [restruct_io(api) for api in backward_apis] + backward_api_dict = to_named_dict(backward_apis) + + for api in apis: + api['op_name'] = SPARSE_OP_PREFIX + api['name'] + api['name'] = api['op_name'] + if 
api["backward"] is not None: + api["backward"] = SPARSE_OP_PREFIX + api["backward"] + for bw_api in backward_apis: + bw_api['op_name'] = SPARSE_OP_PREFIX + bw_api['name'] + bw_api['name'] = bw_api['op_name'] + if 'invoke' in bw_api: + bw_api['invoke']['args'] = [ + param.strip() for param in bw_api['invoke']['args'].split(',') + ] + + # prepare for invoke case + process_invoke_op(forward_api_dict, backward_api_dict) + for bw_api in backward_apis: + if 'invoke' in bw_api: + if bw_api['invoke']['func'] in forward_api_dict: + bw_api['invoke']['func'] = ( + SPARSE_OP_PREFIX + bw_api['invoke']['func'] + ) + + # fill backward field for an api if another api claims it as forward + for name, backward_api in backward_api_dict.items(): + forward_name = backward_api["forward"]["name"] + if forward_name in backward_api_dict: + forward_api = backward_api_dict[forward_name] + if forward_api["backward"] is None: + forward_api["backward"] = name + forward_api["backward"] = SPARSE_OP_PREFIX + forward_api["backward"] + + api_dict = {} + api_dict.update(forward_api_dict) + api_dict.update(backward_api_dict) + + if len(apis) == 0 and len(backward_apis) == 0: + if os.path.isfile(output_op_path): + os.remove(output_op_path) + if os.path.isfile(output_arg_map_path): + os.remove(output_arg_map_path) + return + + op_template = env.get_template('sparse_op.c.j2') + with open(output_op_path, "wt") as f: + msg = op_template.render( + apis=apis, backward_apis=backward_apis, api_dict=api_dict + ) + f.write(msg) + + ks_template = env.get_template('sparse_ks.c.j2') + with open(output_arg_map_path, 'wt') as f: + msg = ks_template.render(apis=apis, backward_apis=backward_apis) + f.write(msg) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Generate operator file from api yaml." + ) + parser.add_argument( + '--ops_yaml_path', type=str, help="parsed sparse ops yaml file." + ) + parser.add_argument( + '--backward_ops_yaml_path', + type=str, + help="parsed backward sparse ops yaml file.", + ) + parser.add_argument( + "--output_op_path", type=str, help="path to save generated operators." 
+ ) + parser.add_argument( + "--output_arg_map_path", + type=str, + help="path to save generated argument mapping functions.", + ) + + args = parser.parse_args() + main( + args.ops_yaml_path, + args.backward_ops_yaml_path, + args.output_op_path, + args.output_arg_map_path, + ) diff --git a/paddle/phi/api/yaml/generator/intermediate_api_gen.py b/paddle/phi/api/yaml/generator/intermediate_api_gen.py index 7834e5c230c179..ce615dcb2488cd 100644 --- a/paddle/phi/api/yaml/generator/intermediate_api_gen.py +++ b/paddle/phi/api/yaml/generator/intermediate_api_gen.py @@ -43,7 +43,6 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/sparse_api_custom_impl.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/infermeta/binary.h" #include "paddle/phi/infermeta/multiary.h" @@ -51,6 +50,10 @@ def source_include(header_file_path): #include "paddle/phi/infermeta/unary.h" #include "paddle/phi/infermeta/ternary.h" +#include "paddle/phi/infermeta/sparse/unary.h" +#include "paddle/phi/infermeta/sparse/binary.h" +#include "paddle/phi/infermeta/sparse/multiary.h" + #include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/supplement_tracing.h" """ @@ -135,11 +138,11 @@ def main(): parser.add_argument('--api_yaml_path', nargs='+', help='path to api yaml file', - default='paddle/phi/api/yaml/api.yaml') + default='paddle/phi/api/yaml/ops.yaml') parser.add_argument('--sparse_api_yaml_path', help='path to sparse api yaml file', - default='paddle/phi/api/yaml/sparse_api.yaml') + default='paddle/phi/api/yaml/sparse_ops.yaml') parser.add_argument('--dygraph_api_header_path', help='output of generated dygraph api header code file', diff --git a/paddle/phi/api/yaml/generator/ops_extra_info_gen.py b/paddle/phi/api/yaml/generator/ops_extra_info_gen.py index d7ece0d2a4563d..b862d8bfe0a85b 100644 --- a/paddle/phi/api/yaml/generator/ops_extra_info_gen.py +++ b/paddle/phi/api/yaml/generator/ops_extra_info_gen.py @@ -59,15 +59,15 @@ def map_code_template(attrs_str, attrs_checker_str): def parse_attr(attr_str): result = re.search( - r"(?P[a-z[\]]+)\s+(?P[a-zA-Z0-9_]+)\s*=\s*(?P\S+)", + r"(?P[a-zA-Z0-9_[\]]+)\s+(?P[a-zA-Z0-9_]+)\s*=\s*(?P\S+)", attr_str) return ATTR_TYPE_STRING_MAP[result.group('attr_type')], result.group( 'name'), result.group('default_val') -def generate_extra_info(api_compat_yaml_path, ops_extra_info_path): +def generate_extra_info(op_compat_yaml_path, ops_extra_info_path): compat_apis = [] - with open(api_compat_yaml_path, 'rt') as f: + with open(op_compat_yaml_path, 'rt') as f: compat_apis = yaml.safe_load(f) def get_op_name(api_item): @@ -80,9 +80,9 @@ def get_op_name(api_item): extra_map_str_list = [] extra_checker_str_list = [] - for api_compat_args in compat_apis: - if 'extra' in api_compat_args: - extra_args_map = api_compat_args['extra'] + for op_compat_args in compat_apis: + if 'extra' in op_compat_args: + extra_args_map = op_compat_args['extra'] # TODO(chenweihang): add inputs and outputs if 'attrs' in extra_args_map: attr_map_list = [] @@ -103,13 +103,13 @@ def get_op_name(api_item): api_extra_attr_checkers = ",\n ".join( attr_checker_func_list) extra_map_str_list.append( - f"{{\"{get_op_name(api_compat_args['api'])}\", {{ {api_extra_attr_map} }}}}" + f"{{\"{get_op_name(op_compat_args['op'])}\", {{ {api_extra_attr_map} }}}}" ) extra_checker_str_list.append( - 
f"{{\"{get_op_name(api_compat_args['api'])}\", {{ {api_extra_attr_checkers} }}}}" + f"{{\"{get_op_name(op_compat_args['op'])}\", {{ {api_extra_attr_checkers} }}}}" ) - if 'backward' in api_compat_args: - for bw_item in api_compat_args['backward'].split(','): + if 'backward' in op_compat_args: + for bw_item in op_compat_args['backward'].split(','): bw_op_name = get_op_name(bw_item) extra_map_str_list.append( f"{{\"{bw_op_name}\", {{ {api_extra_attr_map} }}}}") @@ -127,9 +127,9 @@ def get_op_name(api_item): def main(): parser = argparse.ArgumentParser( description='Generate PaddlePaddle Extra Param Info for Op') - parser.add_argument('--api_compat_yaml_path', + parser.add_argument('--op_compat_yaml_path', help='path to api compat yaml file', - default='paddle/phi/api/yaml/api_compat.yaml') + default='paddle/phi/api/yaml/op_compat.yaml') parser.add_argument('--ops_extra_info_path', help='output of generated extra_prama_info code file', @@ -137,10 +137,10 @@ def main(): options = parser.parse_args() - api_compat_yaml_path = options.api_compat_yaml_path + op_compat_yaml_path = options.op_compat_yaml_path ops_extra_info_path = options.ops_extra_info_path - generate_extra_info(api_compat_yaml_path, ops_extra_info_path) + generate_extra_info(op_compat_yaml_path, ops_extra_info_path) if __name__ == '__main__': diff --git a/paddle/phi/api/yaml/generator/parse_api.py b/paddle/phi/api/yaml/generator/parse_api.py index fcaf365951ee6d..91fd44b7093393 100644 --- a/paddle/phi/api/yaml/generator/parse_api.py +++ b/paddle/phi/api/yaml/generator/parse_api.py @@ -27,7 +27,7 @@ def main(api_yaml_path, output_path, backward): apis = [] else: apis = [ - parse_api_entry(api, "backward_api" if backward else "api") + parse_api_entry(api, "backward_op" if backward else "op") for api in apis ] diff --git a/paddle/phi/api/yaml/generator/parse_utils.py b/paddle/phi/api/yaml/generator/parse_utils.py index 11a0b49eeefc3c..0502dba0fdf8a8 100644 --- a/paddle/phi/api/yaml/generator/parse_utils.py +++ b/paddle/phi/api/yaml/generator/parse_utils.py @@ -35,39 +35,42 @@ def parse_arg(api_name: str, s: str) -> Dict[str, str]: 2. typename name = default_value """ typename, rest = [item.strip() for item in s.split(" ", 1)] - assert len( - typename - ) > 0, f"The arg typename should not be empty. Please check the args of {api_name} in yaml." + assert ( + len(typename) > 0 + ), f"The arg typename should not be empty. Please check the args of {api_name} in yaml." - assert rest.count( - "=") <= 1, f"There is more than 1 = in an arg in {api_name}" + assert ( + rest.count("=") <= 1 + ), f"There is more than 1 = in an arg in {api_name}" if rest.count("=") == 1: name, default_value = [item.strip() for item in rest.split("=", 1)] - assert len( - name - ) > 0, f"The arg name should not be empty. Please check the args of {api_name} in yaml." - assert len( - default_value - ) > 0, f"The default value should not be empty. Please check the args of {api_name} in yaml." + assert ( + len(name) > 0 + ), f"The arg name should not be empty. Please check the args of {api_name} in yaml." + assert ( + len(default_value) > 0 + ), f"The default value should not be empty. Please check the args of {api_name} in yaml." return { "typename": typename, "name": name, - "default_value": default_value + "default_value": default_value, } else: name = rest.strip() - assert len( - name - ) > 0, f"The arg name should not be empty. Please check the args of {api_name} in yaml." + assert ( + len(name) > 0 + ), f"The arg name should not be empty. 
Please check the args of {api_name} in yaml." return {"typename": typename, "name": name} -def parse_input_and_attr(api_name: str, - arguments: str) -> Tuple[List, List, Dict, Dict]: +def parse_input_and_attr( + api_name: str, arguments: str +) -> Tuple[List, List, Dict, Dict]: args_str = arguments.strip() - assert args_str.startswith('(') and args_str.endswith(')'), \ - (f"Args declaration should start with '(' and end with ')', " - f"please check the args of {api_name} in yaml.") + assert args_str.startswith('(') and args_str.endswith(')'), ( + f"Args declaration should start with '(' and end with ')', " + f"please check the args of {api_name} in yaml." + ) args_str = args_str[1:-1] args = parse_plain_list(args_str) @@ -81,14 +84,17 @@ def parse_input_and_attr(api_name: str, typename = item["typename"] name = item["name"] if is_input(typename): - assert len(attrs) == 0, \ - (f"The input Tensor should appear before attributes. " + assert len(attrs) == 0, ( + f"The input Tensor should appear before attributes. " f"please check the position of {api_name}:input({name}) " - f"in yaml.") + f"in yaml." + ) inputs.append(item) elif is_attr(typename): if met_attr_with_default_value: - assert "default_value" in item, f"{api_name}: Arguments with default value should not precede those without default value" + assert ( + "default_value" in item + ), f"{api_name}: Arguments with default value should not precede those without default value" elif "default_value" in item: met_attr_with_default_value = True attrs.append(item) @@ -101,7 +107,8 @@ def parse_output(api_name: str, s: str) -> Dict[str, str]: """parse an output, typename or typename(name).""" match = re.search( r"(?P[a-zA-Z0-9_[\]]+)\s*(?P\([a-zA-Z0-9_@]+\))?\s*(?P\{[^\}]+\})?", - s) + s, + ) typename = match.group("out_type") name = match.group("name") size_expr = match.group("expr") @@ -109,13 +116,15 @@ def parse_output(api_name: str, s: str) -> Dict[str, str]: name = name[1:-1] if name is not None else 'out' size_expr = size_expr[1:-1] if size_expr is not None else None - assert is_output(typename), \ - (f"Invalid output type: {typename} in api: {api_name}." - f"Supported types are Tensor and Tensor[]") + assert is_output(typename), ( + f"Invalid output type: {typename} in api: {api_name}." 
+ f"Supported types are Tensor and Tensor[]" + ) if size_expr is not None: - assert is_vec(typename), \ - (f"Invalid output size: output {name} in api: {api_name} is " - f"not a vector but has size expr") + assert is_vec(typename), ( + f"Invalid output size: output {name} in api: {api_name} is " + f"not a vector but has size expr" + ) return {"typename": typename, "name": name, "size": size_expr} else: return {"typename": typename, "name": name} @@ -149,22 +158,24 @@ def parse_plain_list(s: str, sep=",") -> List[str]: return items -def parse_kernel(api_name: str, kernel_config: Dict[str, - Any]) -> Dict[str, Any]: +def parse_kernel( + api_name: str, kernel_config: Dict[str, Any] +) -> Dict[str, Any]: # kernel : # func : [], Kernel functions (example: scale, scale_sr) # param : [], Input params of kernel # backend : str, the names of param to choose the kernel backend, default is None # layout : str, the names of param to choose the kernel layout, default is None # data_type : str, the names of param to choose the kernel data_type, default is None + # dispatch : {}, the key is kernel_func, the value is type of inputs and outputs for kernel (example: {kernel_name : (['dense','sparse_coo']#input,['sparse_coo']#output)}) kernel = { - 'func': None, # up to 2 function names + 'func': [], # up to 2 function names 'param': None, 'backend': None, 'layout': None, - 'data_type': None + 'data_type': None, + 'dispatch': {}, } - kernel['func'] = parse_plain_list(kernel_config['func']) if 'param' in kernel_config: kernel['param'] = kernel_config['param'] @@ -176,6 +187,42 @@ def parse_kernel(api_name: str, kernel_config: Dict[str, if 'data_type' in kernel_config: kernel['data_type'] = parse_candidates(kernel_config["data_type"]) + + kernel_funcs = re.compile(r'([a-zA-Z0-9_]+)\s*({[^}]+})?').findall( + kernel_config['func'] + ) + + def parse_kernel_in_out_type(in_out_str): + if len(in_out_str) == 0: + return None + tmp_in_out_list = in_out_str[1:-1].split('->') + inputs = [item.strip() for item in tmp_in_out_list[0].split(',')] + outputs = [item.strip() for item in tmp_in_out_list[1].split(',')] + + # check the tensor type + for item in inputs: + assert item in [ + 'dense', + 'selected_rows', + 'sparse_coo', + 'sparse_csr', + ], f"{api_name} : Invalid input tensor type ('{item}'), here we only support 'dense', 'selected_rows', 'sparse_coo' and 'sparse_csr'." + for item in outputs: + assert item in [ + 'dense', + 'selected_rows', + 'sparse_coo', + 'sparse_csr', + ], f"{api_name} : Invalid output tensor type ('{item}'), here we only support 'dense', 'selected_rows', 'sparse_coo' and 'sparse_csr'." + + return (inputs, outputs) + + for func_item in kernel_funcs: + kernel['func'].append(func_item[0]) + kernel['dispatch'][func_item[0]] = parse_kernel_in_out_type( + func_item[1] + ) + return kernel @@ -200,19 +247,19 @@ def parse_invoke(api_name: str, invoke_config: str) -> Dict[str, Any]: def extract_type_and_name(records: List[Dict]) -> List[Dict]: """extract type and name from forward call, it is simpler than forward api.""" - extracted = [{ - "name": item["name"], - "typename": item["typename"] - } for item in records] + extracted = [ + {"name": item["name"], "typename": item["typename"]} for item in records + ] return extracted def parse_forward(api_name: str, forward_config: str) -> Dict[str, Any]: # api_name (const Tensor& input, ... , int attr, ...) 
-> Tensor(out) result = re.search( - r"(?P[a-z][a-z0-9_]+)\s*(?P\([^\)]+\))\s*->\s*(?P.+)", - forward_config) - api = result.group("api") + r"(?P[a-z][a-z0-9_]+)\s*(?P\([^\)]+\))\s*->\s*(?P.+)", + forward_config, + ) + api = result.group("op") outputs = parse_outputs(api_name, result.group("outputs")) outputs = extract_type_and_name(outputs) @@ -223,12 +270,12 @@ def parse_forward(api_name: str, forward_config: str) -> Dict[str, Any]: "name": api, "inputs": inputs, "attrs": attrs, - "outputs": outputs + "outputs": outputs, } return forward_cfg -def parse_api_entry(api_entry: Dict[str, Any], name_field="api"): +def parse_api_entry(api_entry: Dict[str, Any], name_field="op"): api_name = api_entry[name_field] inputs, attrs = parse_input_and_attr(api_name, api_entry["args"]) outputs = parse_outputs(api_name, api_entry["output"]) @@ -239,13 +286,19 @@ def parse_api_entry(api_entry: Dict[str, Any], name_field="api"): typename = attr["typename"] default_value = attr["default_value"] if typename == "DataType": - assert "DataType" in default_value, f"invalid DataType default value in {api_name}" + assert ( + "DataType" in default_value + ), f"invalid DataType default value in {api_name}" # remove namespace - default_value = default_value[default_value.find("DataType"):] + default_value = default_value[default_value.find("DataType") :] attr["default_value"] = default_value elif typename == "DataLayout": - assert "DataLayout" in default_value, f"invalid DataLayout default value in {api_name}" - default_value = default_value[default_value.find("DataLayout"):] + assert ( + "DataLayout" in default_value + ), f"invalid DataLayout default value in {api_name}" + default_value = default_value[ + default_value.find("DataLayout") : + ] attr["default_value"] = default_value input_names = [item["name"] for item in inputs] @@ -258,7 +311,9 @@ def parse_api_entry(api_entry: Dict[str, Any], name_field="api"): if "optional" in api_entry: optional_args = parse_plain_list(api_entry["optional"]) for name in optional_args: - assert name in input_names, f"{api_name} has an optional input: '{name}' which is not an input." + assert ( + name in input_names + ), f"{api_name} has an optional input: '{name}' which is not an input." for input in inputs: if input["name"] in optional_args: input["optional"] = True @@ -269,7 +324,9 @@ def parse_api_entry(api_entry: Dict[str, Any], name_field="api"): if "intermediate" in api_entry: intermediate_outs = parse_plain_list(api_entry["intermediate"]) for name in intermediate_outs: - assert name in output_names, f"{api_name} has an intermediate output: '{name}' which is not an output." + assert ( + name in output_names + ), f"{api_name} has an intermediate output: '{name}' which is not an output." for output in outputs: if output["name"] in intermediate_outs: output["intermediate"] = True @@ -280,7 +337,9 @@ def parse_api_entry(api_entry: Dict[str, Any], name_field="api"): if "no_need_buffer" in api_entry: no_buffer_args = parse_plain_list(api_entry["no_need_buffer"]) for name in no_buffer_args: - assert name in input_names, f"{api_name} has an no buffer input: '{name}' which is not an input." + assert ( + name in input_names + ), f"{api_name} has an no buffer input: '{name}' which is not an input." 
for input in inputs: if input["name"] in no_buffer_args: input["no_need_buffer"] = True @@ -294,7 +353,7 @@ def parse_api_entry(api_entry: Dict[str, Any], name_field="api"): "inputs": inputs, "attrs": attrs, "outputs": outputs, - "no_need_buffer": no_buffer_args + "no_need_buffer": no_buffer_args, } # invokes another api? @@ -316,11 +375,13 @@ def parse_api_entry(api_entry: Dict[str, Any], name_field="api"): inplace_pairs = parse_inplace(api_name, api_entry["inplace"]) else: inplace_pairs = None - api.update({ - "infer_meta": infer_meta, - "kernel": kernel, - "inplace": inplace_pairs - }) + api.update( + { + "infer_meta": infer_meta, + "kernel": kernel, + "inplace": inplace_pairs, + } + ) else: # invoke invoke = parse_invoke(api_name, api_entry["invoke"]) @@ -334,13 +395,14 @@ def parse_api_entry(api_entry: Dict[str, Any], name_field="api"): api["backward"] = backward # forward for backward_apis - is_backward_api = name_field == "backward_api" + is_backward_api = name_field == "backward_op" if is_backward_api: if "forward" in api_entry: forward = parse_forward(api_name, api_entry["forward"]) # validate_fb - validate_backward_inputs(api_name, forward["inputs"], - forward["outputs"], inputs) + validate_backward_inputs( + api_name, forward["inputs"], forward["outputs"], inputs + ) validate_backward_attrs(api_name, forward["attrs"], attrs) validate_backward_outputs(api_name, forward["inputs"], outputs) else: @@ -356,23 +418,27 @@ def validate_backward_attrs(api, forward_attrs, backward_attrs): # this is a not-that-clean trick to allow backward api to has more attrs # than the forward api, as long as they all have default value for i in range(-num_exceptional_attrs, 0): - assert "default_value" in backward_attrs[ - i], f"{api} has exceptional attr without default value" + assert ( + "default_value" in backward_attrs[i] + ), f"{api} has exceptional attr without default value" -def validate_backward_inputs(api, forward_inputs, forward_outputs, - backward_inputs): +def validate_backward_inputs( + api, forward_inputs, forward_outputs, backward_inputs +): foward_input_names = [item["name"] for item in forward_inputs] forward_output_names = [item["name"] for item in forward_outputs] backward_input_names = [item["name"] for item in backward_inputs] assert len(backward_input_names) <= len(foward_input_names) + 2 * len( - forward_output_names), f"{api} has too many inputs." + forward_output_names + ), f"{api} has too many inputs." def validate_backward_outputs(api, forward_inputs, backward_outputs): assert len(backward_outputs) <= len( - forward_inputs), f"{api} has too many outputs" + forward_inputs + ), f"{api} has too many outputs" def cross_validate(apis): @@ -391,15 +457,17 @@ def cross_validate(apis): f"Something Wrong here, {name}'s forward api({fw_name}) does not claim {name} as its backward." 
) else: - assert fw_api[ - "backward"] == name, f"{name}: backward and forward name mismatch" + assert ( + fw_api["backward"] == name + ), f"{name}: backward and forward name mismatch" assert len(fw_call["inputs"]) <= len( fw_api["inputs"] ), f"{name}: forward call has more inputs than the api" for (input, input_) in zip(fw_call["inputs"], fw_api["inputs"]): - assert input["typename"] == input_[ - "typename"], f"type mismatch in {name} and {fw_name}" + assert ( + input["typename"] == input_["typename"] + ), f"type mismatch in {name} and {fw_name}" assert len(fw_call["attrs"]) <= len( fw_api["attrs"] @@ -411,13 +479,16 @@ def cross_validate(apis): r"Scalar(\(\w+\))*", attr_["typename"] ), f"type mismatch in {name} and {fw_name}" else: - assert attr["typename"] == attr_[ - "typename"], f"type mismatch in {name} and {fw_name}" + assert ( + attr["typename"] == attr_["typename"] + ), f"type mismatch in {name} and {fw_name}" assert len(fw_call["outputs"]) == len( fw_api["outputs"] ), f"{name}: forward call has more outputs than the api" - for (output, output_) in zip(fw_call["outputs"], - fw_api["outputs"]): - assert output["typename"] == output_[ - "typename"], f"type mismatch in {name} and {fw_name}" + for (output, output_) in zip( + fw_call["outputs"], fw_api["outputs"] + ): + assert ( + output["typename"] == output_["typename"] + ), f"type mismatch in {name} and {fw_name}" diff --git a/paddle/phi/api/yaml/generator/sparse_api_gen.py b/paddle/phi/api/yaml/generator/sparse_api_gen.py index ac98c78f58a3f2..cfd3c698b04fc8 100644 --- a/paddle/phi/api/yaml/generator/sparse_api_gen.py +++ b/paddle/phi/api/yaml/generator/sparse_api_gen.py @@ -18,10 +18,10 @@ import re from api_gen import ForwardAPI +from api_base import PREFIX_TENSOR_NAME class SparseAPI(ForwardAPI): - def __init__(self, api_item_yaml): super(SparseAPI, self).__init__(api_item_yaml) @@ -31,11 +31,13 @@ def gene_api_declaration(self): {super(SparseAPI, self).gene_api_declaration()} """ - def gene_output(self, - out_dtype_list, - out_tensor_type_list=None, - code_indent='', - inplace_flag=False): + def gene_output( + self, + out_dtype_list, + out_tensor_type_list=None, + code_indent='', + inplace_flag=False, + ): kernel_output = [] output_names = [] output_create = "" @@ -43,15 +45,19 @@ def gene_output(self, output_type_map = { 'dense': 'TensorType::DENSE_TENSOR', 'sparse_coo': 'TensorType::SPARSE_COO', - 'sparse_csr': 'TensorType::SPARSE_CSR' + 'sparse_csr': 'TensorType::SPARSE_CSR', } if len(out_dtype_list) == 1: kernel_output.append('kernel_out') output_names.append('kernel_out') - inplace_assign = " = " + self.inplace_map[self.outputs['names'][ - 0]] if inplace_flag and self.inplace_map is not None and self.outputs[ - 'names'][0] in self.inplace_map else "" + inplace_assign = ( + " = " + self.inplace_map[self.outputs['names'][0]] + if inplace_flag + and self.inplace_map is not None + and self.outputs['names'][0] in self.inplace_map + else "" + ) output_create = f""" {return_type} api_output{inplace_assign}; auto* kernel_out = SetSparseKernelOutput(&api_output, {output_type_map[out_dtype_list[0]]});""" @@ -66,8 +72,9 @@ def gene_output(self, for out_name in self.outputs['names']: if out_name in self.inplace_map: - output_create = output_create + self.inplace_map[ - out_name] + ', ' + output_create = ( + output_create + self.inplace_map[out_name] + ', ' + ) else: output_create += 'Tensor(), ' output_create = output_create[:-2] + '};' @@ -75,28 +82,30 @@ def gene_output(self, for i in range(len(out_dtype_list)): 
kernel_output.append(f'kernel_out_{i}') output_names.append(f'kernel_out_{i}') - output_create = output_create + f""" + output_create = ( + output_create + + f""" auto* kernel_out_{i} = SetSparseKernelOutput(&std::get<{i}>(api_output), {output_type_map[out_dtype_list[i]]});""" + ) else: raise ValueError( "{} : Output error: the output should not be empty.".format( - self.api)) + self.api + ) + ) return kernel_output, output_names, output_create def gen_sparse_kernel_context(self, kernel_output_names): input_trans_map = { - 'const Tensor&': - 'const phi::TenseBase&', - 'const std::vector&': - 'const std::vector&', - 'const paddle::optional&': - 'paddle::optional' + 'const Tensor&': 'const phi::TenseBase&', + 'const std::vector&': 'const std::vector&', + 'const paddle::optional&': 'paddle::optional', } out_trans_map = { 'Tensor': 'phi::TenseBase*', - 'std::vector': 'std::vector' + 'std::vector': 'std::vector', } input_names = self.inputs['names'] input_infos = self.inputs['input_info'] @@ -110,11 +119,17 @@ def gen_sparse_kernel_context(self, kernel_output_names): for param in kernel_param: if param in input_names: if param in self.optional_vars: - kernel_context_code = kernel_context_code + f""" + kernel_context_code = ( + kernel_context_code + + f""" kernel_context.EmplaceBackInput({param} ? {param}->impl().get() : nullptr);""" + ) else: - kernel_context_code = kernel_context_code + f""" + kernel_context_code = ( + kernel_context_code + + f""" kernel_context.EmplaceBackInput({param}.impl().get());""" + ) continue if param in attr_names: @@ -127,23 +142,82 @@ def gen_sparse_kernel_context(self, kernel_output_names): param = str(param).lower() else: param + str(param) + ", " - kernel_context_code = kernel_context_code + f""" + kernel_context_code = ( + kernel_context_code + + f""" kernel_context.EmplaceBackAttr({param});""" + ) for out_name in kernel_output_names: - kernel_context_code = kernel_context_code + f""" + kernel_context_code = ( + kernel_context_code + + f""" kernel_context.EmplaceBackOutput({out_name});""" + ) return kernel_context_code + def prepare_input(self): + input_names = self.inputs['names'] + input_types = self.inputs['tensor_type'] + attr_names = self.attrs['names'] + infer_meta = self.infer_meta + + infer_meta_params = ( + infer_meta['param'] + if infer_meta['param'] is not None + else input_names + attr_names + ) + + create_input_var_code = "" + tensor_type_map = { + 'dense': 'phi::DenseTensor', + 'sparse_coo': 'phi::SparseCooTensor', + 'sparse_csr': 'phi::SparseCsrTensor', + } + for param in infer_meta_params: + if param in input_names: + var_name = "auto " + PREFIX_TENSOR_NAME + param + " = " + if self.inputs['input_info'][param] == "const Tensor&": + create_input_var_code = ( + create_input_var_code + var_name + param + ".impl();\n" + ) + elif param in self.optional_vars: + tensor_type = 'phi::DenseTensor' + for name, input_type in zip(input_names, input_types): + if param == name: + tensor_type = tensor_type_map[input_type] + break + optional_var = "paddle::optional<" + tensor_type + ">(" + create_input_var_code = ( + create_input_var_code + + var_name + + param + + " ? 
" + + optional_var + + "*static_cast<" + + tensor_type + + "*>((*" + + param + + ").impl().get())) : " + + optional_var + + "paddle::none);\n" + ) + return f"""{create_input_var_code}""" + def gen_sparse_kernel_code(self, kernel_name, inplace_flag=False): _, kernel_output_names, output_create = self.gene_output( - self.kernel['dispatch'][kernel_name][1], None, '', inplace_flag) + self.kernel['dispatch'][kernel_name][1], None, '', inplace_flag + ) kernel_context_code = self.gen_sparse_kernel_context( - kernel_output_names) - return_code = "" if len( - self.gene_return_code()) == 0 else " " + self.gene_return_code() + kernel_output_names + ) + return_code = ( + "" + if len(self.gene_return_code()) == 0 + else " " + self.gene_return_code() + ) return f""" VLOG(6) << "{self.api} api sparse kernel key: [" << kernel_backend << ", " << kernel_layout << ", "<< kernel_data_type << "]"; auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError( @@ -154,19 +228,23 @@ def gen_sparse_kernel_code(self, kernel_name, inplace_flag=False): auto* dev_ctx = GetDeviceContextByBackend(kernel_result.has_fallback_cpu ? Backend::CPU : kernel_backend); auto kernel_context = phi::KernelContext(dev_ctx); {output_create} +{self.prepare_input()} +{self.gene_infer_meta(kernel_output_names, '')} {kernel_context_code} phi_kernel(&kernel_context); {return_code}""" def get_condition_code(self, kernel_name): - assert self.kernel['dispatch'][kernel_name], \ - f"{self.api} api: the tensor type of inputs and outputs for kernel isn't set, see also 'kernel:func' of 'conv3d' in sparse_api.yaml." + assert self.kernel['dispatch'][ + kernel_name + ], f"{self.api} api: the tensor type of inputs and outputs for kernel isn't set, see also 'kernel:func' of 'conv3d' in sparse_ops.yaml." 
input_types = self.kernel['dispatch'][kernel_name][0] sparse_type_map = { 'sparse_coo': 'DataLayout::SPARSE_COO', - 'sparse_csr': 'DataLayout::SPARSE_CSR' + 'sparse_csr': 'DataLayout::SPARSE_CSR', } condition_list = [] + tensor_type_list = [] for i, in_type in enumerate(input_types): if in_type == "dense": if self.inputs['names'][i] in self.optional_vars: @@ -178,9 +256,17 @@ def get_condition_code(self, kernel_name): f"phi::DenseTensor::classof({self.inputs['names'][i]}.impl().get())" ) else: - condition_list.append( - f"{self.inputs['names'][i]}.layout() == {sparse_type_map[in_type]}" - ) + if in_type == 'sparse_coo': + condition_list.append( + f"{self.inputs['names'][i]}.is_sparse_coo_tensor()" + ) + else: + condition_list.append( + f"{self.inputs['names'][i]}.is_sparse_csr_tensor()" + ) + tensor_type_list.append(in_type) + self.inputs['tensor_type'] = tensor_type_list + return " && ".join(condition_list) def gene_dispatch_code(self, kernel_name, inplace_flag=False): @@ -197,10 +283,11 @@ def gene_base_api_code(self, inplace_flag=False): kernel_dispatch_code = f"{self.gene_kernel_select()}\n" for kernel_name in self.kernel['func']: kernel_dispatch_code += self.gene_dispatch_code( - kernel_name, inplace_flag) + kernel_name, inplace_flag + ) return f""" -PADDLE_API {self.get_return_type()} {api_func_name}({self.get_define_args()}) {{ +PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag)}) {{ {kernel_dispatch_code} PADDLE_THROW(phi::errors::Unimplemented( "The kernel of ({self.api}) for input tensors is unimplemented, please check the type of input tensors.")); @@ -229,23 +316,34 @@ def source_include(header_file_path): #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/data_transform.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/sparse_api_custom_impl.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/ternary.h" +#include "paddle/phi/infermeta/multiary.h" +#include "paddle/utils/none.h" + +#include "paddle/phi/infermeta/sparse/unary.h" +#include "paddle/phi/infermeta/sparse/binary.h" +#include "paddle/phi/infermeta/sparse/multiary.h" """ def api_namespace(): - return (""" + return ( + """ namespace paddle { namespace experimental { namespace sparse { -""", """ +""", + """ } // namespace sparse } // namespace experimental } // namespace paddle -""") +""", + ) def generate_api(api_yaml_path, header_file_path, source_file_path): @@ -281,18 +379,25 @@ def generate_api(api_yaml_path, header_file_path, source_file_path): def main(): parser = argparse.ArgumentParser( - description='Generate PaddlePaddle C++ Sparse API files') - parser.add_argument('--api_yaml_path', - help='path to sparse api yaml file', - default='paddle/phi/api/yaml/sparse_api.yaml') - - parser.add_argument('--api_header_path', - help='output of generated api header code file', - default='paddle/phi/api/include/sparse_api.h') - - parser.add_argument('--api_source_path', - help='output of generated api source code file', - default='paddle/phi/api/lib/sparse_api.cc') + description='Generate PaddlePaddle C++ Sparse API files' + ) + parser.add_argument( + '--api_yaml_path', + help='path to sparse api yaml file', + default='paddle/phi/api/yaml/sparse_ops.yaml', + ) + + parser.add_argument( + '--api_header_path', + help='output of generated api header code file', + default='paddle/phi/api/include/sparse_api.h', + ) + + 
parser.add_argument( + '--api_source_path', + help='output of generated api source code file', + default='paddle/phi/api/lib/sparse_api.cc', + ) options = parser.parse_args() diff --git a/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py index 4692ed64513eae..83569d69510c3b 100644 --- a/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py +++ b/paddle/phi/api/yaml/generator/sparse_bw_api_gen.py @@ -111,8 +111,15 @@ def source_include(header_file_path): #include "paddle/phi/api/include/sparse_api.h" #include "paddle/phi/api/lib/api_gen_utils.h" #include "paddle/phi/api/lib/kernel_dispatch.h" -#include "paddle/phi/api/lib/sparse_api_custom_impl.h" #include "paddle/phi/core/kernel_registry.h" + +#include "paddle/phi/infermeta/unary.h" +#include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/backward.h" + +#include "paddle/phi/infermeta/sparse/unary.h" +#include "paddle/phi/infermeta/sparse/binary.h" +#include "paddle/phi/infermeta/sparse/backward.h" """ @@ -164,7 +171,7 @@ def main(): description='Generate PaddlePaddle C++ Sparse API files') parser.add_argument('--api_yaml_path', help='path to sparse api yaml file', - default='paddle/phi/api/yaml/sparse_bw_api.yaml') + default='paddle/phi/api/yaml/sparse_backward.yaml') parser.add_argument('--api_header_path', help='output of generated api header code file', diff --git a/paddle/phi/api/yaml/generator/strings_api_gen.py b/paddle/phi/api/yaml/generator/strings_api_gen.py index e8df9425144fb5..94c8958035854e 100644 --- a/paddle/phi/api/yaml/generator/strings_api_gen.py +++ b/paddle/phi/api/yaml/generator/strings_api_gen.py @@ -351,7 +351,7 @@ def main(): description='Generate PaddlePaddle C++ Strings API files') parser.add_argument('--api_yaml_path', help='path to sparse api yaml file', - default='paddle/phi/api/yaml/strings_api.yaml') + default='paddle/phi/api/yaml/strings_ops.yaml') parser.add_argument('--api_header_path', help='output of generated api header code file', diff --git a/paddle/phi/api/yaml/generator/templates/op.c.j2 b/paddle/phi/api/yaml/generator/templates/op.c.j2 index 0c2708ce223c7c..4799866f993cb8 100644 --- a/paddle/phi/api/yaml/generator/templates/op.c.j2 +++ b/paddle/phi/api/yaml/generator/templates/op.c.j2 @@ -1,4 +1,4 @@ -{% from "operator_utils.c.j2" import op_maker, backward_op_maker, operator, register_op_with_components, register_op_version %} +{% from "operator_utils.c.j2" import op_maker, backward_op_maker, backward_op_reused_maker, operator, register_op_with_components, register_op_version %} // this file is generated by paddle/phi/api/yaml/generator/generate_op.py, do not edit. 
#include #include "paddle/fluid/framework/infershape_utils.h" @@ -33,6 +33,8 @@ using paddle::framework::GradVarName; {{backward_op_maker(api, api_dict[api["forward"]["name"]])}} {{operator(api)}} + {% else %} +{{backward_op_reused_maker(api, api_dict[api["forward"]["name"]], api["invoke"])}} {% endif %} {% endfor %} } // namespace operators diff --git a/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 b/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 index 3910da99d8ae31..da497e2b3bd00b 100644 --- a/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 +++ b/paddle/phi/api/yaml/generator/templates/operator_utils.c.j2 @@ -81,7 +81,11 @@ AddAttr<{{typename | to_op_attr_type}}>("{{name}}", "({{typename | to_op_attr_ty {% set default_value = attr["default_value"] %} {% set typename = attr["typename"] %} {% if typename == "DataType" %}{# convert back to VarType #} + {% if default_value == "DataType::UNDEFINED" %} +-1 + {%- else %} static_cast<int>(framework::TransToProtoVarType(experimental::{{default_value}})) + {%- endif %} {%- elif typename == "DataLayout" %} {# does DataLayout need any processing?#} static_cast<int>(experimental::{{default_value}}) {%- elif typename == "Place" %}{# construct a Place to get the type #} @@ -94,7 +98,7 @@ static_cast<int>(phi::Place({{"phi::" if not default_value is initializer_list}} {# --------------------------------------- name mapping ---------------------------------------------- #} {% macro name_map(api) %} -KernelSignature {{api["name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) { +KernelSignature {{api["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) { {% set kernel_args = api["kernel"]["param"] %} {{get_input_list(api["inputs"], kernel_args)}}; paddle::small_vector<const char*> attrs; {% for attr in api["attrs"]%} {% filter indent(2)%} {{get_an_attr(attr)}}; {% endfilter %} {% endfor %} {{get_output_list(api["outputs"], kernel_args)}}; @@ -124,12 +128,64 @@ All possible KernelSignatures returned by {{api["name"] | to_pascal_case }}OpArg */ {% endmacro %} +{% macro get_kernel_dispatch(inputs, kernel_config) %}{# inline #} +{%- for kernel_func in kernel_config["func"] %} + {% set input_idx = namespace(idx=0) %} + {% set kernel_in_type_list = kernel_config["dispatch"][kernel_func][0] %} + + if ( {%- for input in inputs %} + {%- if input["name"] in kernel_config["param"] %} + {%- if kernel_in_type_list[input_idx.idx] == "dense" %} +ctx.IsDenseTensorInput("{{input["name"]}}"){{" && " if not loop.last}} + {%- elif kernel_in_type_list[input_idx.idx] == "selected_rows" %} +ctx.IsSelectedRowsInput("{{input["name"]}}"){{" && " if not loop.last}} + {%- elif kernel_in_type_list[input_idx.idx] == "sparse_coo" %} +ctx.IsSparseCooTensorInput("{{input["name"]}}"){{" && " if not loop.last}} + {%- elif kernel_in_type_list[input_idx.idx] == "sparse_csr" %} +ctx.IsSparseCsrTensorInput("{{input["name"]}}"){{" && " if not loop.last}} + {%- endif %} + {% set input_idx.idx = input_idx.idx + 1 %} + {%- endif %} + {%- endfor %}) { + kernel_name = "{{kernel_func}}"; + } +{%- endfor %} +{%- endmacro %} + +{% macro sparse_op_name_map(api) %} +KernelSignature {{api["op_name"] | to_pascal_case }}OpArgumentMapping(const ArgumentMappingContext& ctx) { + {% set kernel_args = api["kernel"]["param"] %} + {{get_input_list(api["inputs"], kernel_args)}}; + paddle::small_vector<const char*> attrs; + {% for attr in api["attrs"]%} + {% filter indent(2)%} + {{get_an_attr(attr)}}; + {% endfilter %} + {% endfor %} + {{get_output_list(api["outputs"], kernel_args)}}; + + const char* kernel_name = "unregistered"; +{{get_kernel_dispatch(api["inputs"], api["kernel"])}} + KernelSignature sig 
(kernel_name, std::move(inputs), std::move(attrs), std::move(outputs)); + return sig; +} + +/* +****************************************************************** +NOTE: The following codes are for 'get_compat_kernel_signature.py' +All possible KernelSignatures returned by {{api["name"] | to_pascal_case }}OpArgumentMapping: + +{{api | cartesian_prod_mapping}} +****************************************************************** +*/ +{% endmacro %} + {% macro register_base_kernel_name(api) %} PD_REGISTER_BASE_KERNEL_NAME({{api["op_name"]}}, {{api["name"]}}); {%- endmacro %} {% macro register_name_map(api) %} -PD_REGISTER_ARG_MAPPING_FN({{api["op_name"]}}, phi::{{api["name"] | to_pascal_case}}OpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN({{api["op_name"]}}, phi::{{api["op_name"] | to_pascal_case}}OpArgumentMapping); {%- endmacro %} {% macro get_input_list(inputs, kernel_args) %}{# inline #} @@ -352,6 +408,48 @@ class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker<T> }; {% endmacro %} +{% macro backward_op_reused_maker(bw_op, forward_op, invoke_op) %} + {% set name = bw_op["op_name"] %} + {% set forward_input_names = bw_op["forward"]["inputs"] | map(attribute="name") | list %} + {% set forward_output_names = bw_op["forward"]["outputs"] | map(attribute="name") | list %} + {% set forward_attr_names = bw_op["forward"]["attrs"] | map(attribute="name") | list %} + {% set forward_input_orig_names = forward_op["inputs"] | map(attribute="name") | list %} + {% set forward_output_orig_names = forward_op["outputs"] | map(attribute="name") | list %} + {% set forward_attr_orig_names = forward_op["attrs"] | map(attribute="name") | list %} +template <typename T> +class {{name | to_pascal_case}}OpMaker : public framework::SingleGradOpMaker<T> { + public: + using framework::SingleGradOpMaker<T>::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr<T> grad_op) const override { + grad_op->SetType("{{invoke_op["func"]}}"); + + {% for input in invoke_op["inputs"] %} + grad_op->SetInput({{input["name"] | to_opmaker_name}}, this->{{extract_input_from_forward( + input["value"], + forward_input_names, + forward_output_names, + forward_input_orig_names, + forward_output_orig_names)}}); + {% endfor %} + + {% for output in invoke_op["outputs"] %} + grad_op->SetOutput({{output["name"] | to_opmaker_name}}, this->{{extract_output_from_forward( + output["value"], + forward_input_names, + forward_output_names, + forward_input_orig_names, + forward_output_orig_names)}}); + {% endfor %} + + {% for attr in invoke_op["attrs"] %} + grad_op->SetAttr("{{attr["name"]}}", {{attr["value"]}}); + {% endfor %} + } +}; +{% endmacro %} + {% macro extract_input_from_forward(name, input_names, output_names, diff --git a/paddle/phi/api/yaml/generator/templates/sparse_ks.c.j2 b/paddle/phi/api/yaml/generator/templates/sparse_ks.c.j2 new file mode 100644 index 00000000000000..1af54ca8660838 --- /dev/null +++ b/paddle/phi/api/yaml/generator/templates/sparse_ks.c.j2 @@ -0,0 +1,24 @@ +{% from "operator_utils.c.j2" import sparse_op_name_map, register_name_map, register_base_kernel_name %} +// this file is generated by paddle/phi/api/yaml/generator/generate_op.py, do not edit.
+#include "paddle/phi/core/compat/op_utils.h" +#include "paddle/utils/small_vector.h" + +namespace phi { + +{% for api in apis %} + {% if api is base_api %} +{{sparse_op_name_map(api)}} + {% endif %} +{% endfor %} +{% for api in backward_apis %} + {% if api is base_api %} +{{sparse_op_name_map(api)}} + {% endif %} +{% endfor %} +} // namespace phi + +{% for api in apis + backward_apis %} + {% if api is base_api %} +{{register_name_map(api)}} + {% endif %} +{% endfor %} diff --git a/paddle/phi/api/yaml/generator/templates/sparse_op.c.j2 b/paddle/phi/api/yaml/generator/templates/sparse_op.c.j2 new file mode 100644 index 00000000000000..15d887e589e70d --- /dev/null +++ b/paddle/phi/api/yaml/generator/templates/sparse_op.c.j2 @@ -0,0 +1,49 @@ +{% from "operator_utils.c.j2" import op_maker, backward_op_maker, backward_op_reused_maker, operator, register_op_with_components, register_op_version %} +// this file is generated by paddle/phi/api/yaml/generator/generate_op.py, do not edit. +#include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/backward.h" +#include "paddle/phi/infermeta/binary.h" +#include "paddle/phi/infermeta/multiary.h" +#include "paddle/phi/infermeta/sparse/backward.h" +#include "paddle/phi/infermeta/sparse/binary.h" +#include "paddle/phi/infermeta/sparse/multiary.h" +#include "paddle/phi/infermeta/sparse/unary.h" +#include "paddle/phi/infermeta/ternary.h" +#include "paddle/phi/infermeta/unary.h" + +namespace paddle { +namespace operators { + +using paddle::framework::GradVarName; + +{% for api in apis %} + {% if api is base_api %} + +{{op_maker(api)}} + +{{operator(api)}} + {% endif %} +{% endfor %} + +{% for api in backward_apis %} + {% if api is base_api %} + +{{backward_op_maker(api, api_dict[api["forward"]["name"]])}} + +{{operator(api)}} + {% else %} +{{backward_op_reused_maker(api, api_dict[api["forward"]["name"]], api["invoke"])}} + {% endif %} +{% endfor %} +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +{% for api in apis + backward_apis %} +{% if api is base_api %} +{{register_op_with_components(api)}} +{% endif %} +{% endfor %} diff --git a/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py b/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py index 0504d3fd108910..e456f6f5728c3b 100644 --- a/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py +++ b/paddle/phi/api/yaml/generator/wrapped_infermeta_gen.py @@ -167,7 +167,7 @@ def main(): parser.add_argument('--api_yaml_path', help='path to api yaml file', nargs='+', - default='paddle/phi/api/yaml/api.yaml') + default='paddle/phi/api/yaml/ops.yaml') parser.add_argument( '--wrapped_infermeta_header_path', help='output of generated wrapped_infermeta header code file', diff --git a/paddle/phi/api/yaml/legacy_backward.yaml b/paddle/phi/api/yaml/legacy_backward.yaml index 27738c49bae713..2e7d240c5f586d 100755 --- a/paddle/phi/api/yaml/legacy_backward.yaml +++ b/paddle/phi/api/yaml/legacy_backward.yaml @@ -1,4 +1,4 @@ -- backward_api : abs_double_grad +- backward_op : abs_double_grad forward : abs_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) args : (Tensor x, Tensor grad_x_grad) output : Tensor(grad_out_grad) @@ -10,7 +10,7 @@ data_transform: skip_transform : grad_x_grad -- backward_api : abs_grad +- backward_op : abs_grad forward : abs (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -23,7 +23,7 @@ 
skip_transform : out_grad backward : abs_double_grad -- backward_api : acos_grad +- backward_op : acos_grad forward : acos (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -34,7 +34,7 @@ func : acos_grad inplace : (out_grad -> x_grad) -- backward_api : acosh_grad +- backward_op : acosh_grad forward : acosh (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -45,7 +45,7 @@ func : acosh_grad inplace : (out_grad -> x_grad) -- backward_api : add_double_grad +- backward_op : add_double_grad forward : add_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) output : Tensor(grad_out_grad) @@ -58,7 +58,7 @@ backward : add_triple_grad inplace : (grad_x_grad -> grad_out_grad) -- backward_api : add_grad +- backward_op : add_grad forward : add (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) output : Tensor(x_grad), Tensor(y_grad) @@ -71,7 +71,7 @@ backward : add_double_grad inplace : (out_grad -> x_grad) -- backward_api : add_triple_grad +- backward_op : add_triple_grad forward : add_double_grad (Tensor y, Tensor grad_out, Tensor grad_grad_x, Tensor grad_grad_y, int axis = -1) -> Tensor(grad_grad_out) args : (Tensor grad_grad_x, Tensor grad_grad_y, Tensor grad_grad_out_grad, int axis = -1) output : Tensor(grad_grad_x_grad), Tensor(grad_grad_y_grad) @@ -82,7 +82,7 @@ func : add_triple_grad inplace : (grad_grad_out_grad -> grad_grad_x_grad) -- backward_api : addmm_grad +- backward_op : addmm_grad forward : addmm (Tensor input, Tensor x, Tensor y, float alpha, float beta) -> Tensor(out) args : (Tensor input, Tensor x, Tensor y, Tensor out_grad, float alpha, float beta) output : Tensor(input_grad), Tensor(x_grad), Tensor(y_grad) @@ -92,7 +92,7 @@ kernel : func : addmm_grad -- backward_api : affine_grid_grad +- backward_op : affine_grid_grad forward : affine_grid (Tensor input, IntArray outputShape, bool use_cudnn=true, bool align_corners=true) -> Tensor(output) args : (Tensor output_grad, IntArray outputShape, bool use_cudnn=true, bool align_corners=true) output : Tensor(input_grad) @@ -104,7 +104,7 @@ param : [output_grad, outputShape, align_corners] use_gpudnn: use_cudnn -- backward_api : amax_grad +- backward_op : amax_grad forward: amax (Tensor x, int64_t[] dims={}, bool keep_dim=false) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false) output : Tensor(x_grad) @@ -114,7 +114,7 @@ kernel : func : amax_grad -- backward_api : amin_grad +- backward_op : amin_grad forward: amin (Tensor x, int64_t[] dims={}, bool keep_dim=false) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int64_t[] dims={}, bool keep_dim=false, bool reduce_all=false) output : Tensor(x_grad) @@ -124,7 +124,7 @@ kernel : func : amin_grad -- backward_api : angle_grad +- backward_op : angle_grad forward : angle (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -136,7 +136,7 @@ data_transform: skip_transform : out_grad -- backward_api : argsort_grad +- backward_op : argsort_grad forward : argsort (Tensor x, int axis, bool descending) -> Tensor(out), Tensor(indices) args : (Tensor indices, Tensor x, Tensor out_grad, int axis, bool descending) output : Tensor(x_grad) @@ -148,19 +148,19 @@ data_type : out_grad no_need_buffer : x -- backward_api : as_complex_grad +- 
backward_op : as_complex_grad forward : as_complex (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) invoke : as_real(out_grad) -- backward_api : as_real_grad +- backward_op : as_real_grad forward : as_real (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) invoke : as_complex(out_grad) -- backward_api : asin_grad +- backward_op : asin_grad forward : asin (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -171,7 +171,7 @@ func : asin_grad inplace : (out_grad -> x_grad) -- backward_api : asinh_grad +- backward_op : asinh_grad forward : asinh (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -182,29 +182,13 @@ func : asinh_grad inplace : (out_grad -> x_grad) -- backward_api : assign_double_grad - forward : assign_grad (Tensor grad_out) -> Tensor(grad_x) - args : (Tensor grad_x_grad) - output : Tensor(grad_out_grad) - infer_meta : - func : UnchangedInferMeta - kernel : - func : assign - backward: assign_triple_grad - inplace : (grad_x_grad -> grad_out_grad) - -- backward_api : assign_grad +- backward_op : assign_grad forward : assign (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - kernel : - func : assign - backward: assign_double_grad - inplace : (out_grad -> x_grad) + invoke : assign(out_grad) -- backward_api : assign_out__grad +- backward_op : assign_out__grad forward : assign_out_ (Tensor x, Tensor output) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) @@ -214,17 +198,7 @@ func : assign inplace : (out_grad -> x_grad) -- backward_api : assign_triple_grad - forward : assign_double_grad (Tensor grad_out) -> Tensor(grad_x) - args : (Tensor grad_x_grad) - output : Tensor(grad_out_grad) - infer_meta : - func : UnchangedInferMeta - kernel : - func : assign - inplace : (grad_x_grad -> grad_out_grad) - -- backward_api : atan_grad +- backward_op : atan_grad forward : atan (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -235,7 +209,7 @@ func : atan_grad inplace : (out_grad -> x_grad) -- backward_api : atanh_grad +- backward_op : atanh_grad forward : atanh (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -246,7 +220,7 @@ func : atanh_grad inplace : (out_grad -> x_grad) -- backward_api : batch_norm_double_grad +- backward_op : batch_norm_double_grad forward : batch_norm_grad (Tensor x, Tensor scale, Tensor bias, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor grad_out, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias) args : (Tensor x, Tensor scale, Tensor out_mean, Tensor out_variance, Tensor saved_mean, Tensor saved_variance, Tensor grad_out, Tensor grad_x_grad, Tensor grad_scale_grad, Tensor grad_bias_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) output : Tensor(x_grad), Tensor(scale_grad), Tensor(grad_out_grad) @@ -259,7 +233,7 @@ optional : out_mean, out_variance inplace : (grad_out -> grad_out_grad) -- backward_api : batch_norm_grad +- backward_op : batch_norm_grad forward : batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool 
use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) @@ -272,7 +246,7 @@ optional : mean_out, variance_out, reserve_space backward : batch_norm_double_grad -- backward_api : bce_loss_grad +- backward_op : bce_loss_grad forward : bce_loss (Tensor input, Tensor label) -> Tensor(out) args : (Tensor input, Tensor label, Tensor out_grad) output : Tensor(input_grad) @@ -283,7 +257,7 @@ func : bce_loss_grad inplace : (out_grad -> input_grad) -- backward_api : bicubic_interp_grad +- backward_op : bicubic_interp_grad forward : bicubic_interp (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) -> Tensor(output) args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, Tensor output_grad, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) output : Tensor(x_grad) @@ -295,7 +269,7 @@ func : bicubic_interp_grad data_type : output_grad -- backward_api : bilinear_interp_grad +- backward_op : bilinear_interp_grad forward : bilinear_interp (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) -> Tensor(output) args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, Tensor output_grad, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) output : Tensor(x_grad) @@ -307,7 +281,7 @@ func : bilinear_interp_grad data_type : output_grad -- backward_api : bilinear_tensor_product_grad +- backward_op : bilinear_tensor_product_grad forward : bilinear_tensor_product (Tensor x, Tensor y, Tensor weight, Tensor bias) -> Tensor(out) args : (Tensor x, Tensor y, Tensor weight, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad), Tensor(weight_grad), Tensor(bias_grad) @@ -316,7 +290,7 @@ kernel : func : bilinear_tensor_product_grad -- backward_api : bmm_grad +- backward_op : bmm_grad forward : bmm (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) @@ -325,7 +299,7 @@ kernel : func : bmm_grad -- backward_api : brelu_grad +- backward_op : brelu_grad forward : brelu (Tensor x, float t_min, float t_max) -> Tensor(out) args : (Tensor x, Tensor out_grad, float t_min, float t_max) output : Tensor(x_grad) @@ -336,7 +310,7 @@ func : brelu_grad inplace : (out_grad -> x_grad) -- backward_api : broadcast_tensors_grad +- backward_op : broadcast_tensors_grad forward : broadcast_tensors (Tensor[] x) -> Tensor[](out) args : (Tensor[] x, Tensor[] out_grad) output : Tensor[](x_grad) @@ -348,19 +322,14 @@ param : [out_grad] no_need_buffer : x -- backward_api : cast_grad +- backward_op : cast_grad forward : cast (Tensor x, DataType out_dtype) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) - infer_meta : - func : 
UnchangedInferMeta - param : [x] - kernel : - func : cast_grad - data_type : out_grad + invoke : cast (out_grad, x.dtype()) no_need_buffer : x -- backward_api : ceil_grad +- backward_op : ceil_grad forward : ceil(Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) @@ -371,7 +340,7 @@ func : ceil_grad inplace : (out_grad -> x_grad) -- backward_api : celu_double_grad +- backward_op : celu_double_grad forward : celu_grad(Tensor x, Tensor grad_out, float alpha) -> Tensor(grad_x) args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, float alpha) output : Tensor(x_grad), Tensor(grad_out_grad) @@ -382,7 +351,7 @@ func : celu_double_grad inplace : (grad_x_grad -> grad_out_grad) -- backward_api : celu_grad +- backward_op : celu_grad forward : celu(Tensor x, float alpha) -> Tensor(out) args : (Tensor x, Tensor out_grad, float alpha) output : Tensor(x_grad) @@ -394,7 +363,7 @@ backward : celu_double_grad inplace : (out_grad -> x_grad) -- backward_api : clip_double_grad +- backward_op : clip_double_grad forward : clip_grad (Tensor x, Tensor grad_out, Scalar min = 0., Scalar max = 0.) -> Tensor(grad_x) args : (Tensor x, Tensor grad_x_grad, Scalar min = 0., Scalar max = 0.) output : Tensor(grad_out_grad) @@ -404,7 +373,7 @@ kernel : func : clip_grad -- backward_api : clip_grad +- backward_op : clip_grad forward : clip (Tensor x, Scalar min, Scalar max) -> Tensor(out) args : (Tensor x, Tensor out_grad, Scalar min = 0., Scalar max = 0.) output : Tensor(x_grad) @@ -416,7 +385,7 @@ backward : clip_double_grad inplace : (out_grad -> x_grad) -- backward_api : complex_grad +- backward_op : complex_grad forward : complex (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) @@ -426,17 +395,13 @@ func : complex_grad data_type : x -- backward_api : concat_double_grad +- backward_op : concat_double_grad forward : concat_grad (Tensor[] x, Tensor grad_out, Scalar axis) -> Tensor[](grad_x) args : (Tensor[] grad_x_grad, Scalar axis = 0) output : Tensor(grad_out_grad) - infer_meta : - func : ConcatInferMeta - param : [grad_x_grad, axis] - kernel : - func : concat + invoke : concat(grad_x_grad, axis) -- backward_api : concat_grad +- backward_op : concat_grad forward : concat (Tensor[] x, Scalar axis) -> Tensor(out) args : (Tensor[] x, Tensor out_grad, Scalar axis = 0) output : Tensor[](x_grad){x.size()} @@ -448,7 +413,7 @@ no_need_buffer : x backward : concat_double_grad -- backward_api : conj_grad +- backward_op : conj_grad forward : conj (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) @@ -458,7 +423,7 @@ kernel : func : conj -- backward_api : conv2d_grad +- backward_op : conv2d_grad forward : conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor(input_grad), Tensor(filter_grad) @@ -470,7 +435,7 @@ use_gpudnn : true backward : conv2d_grad_grad -- backward_api : conv2d_grad_grad +- backward_op : conv2d_grad_grad forward : conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, 
bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter) args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) @@ -482,7 +447,7 @@ use_gpudnn : true optional : grad_input_grad, grad_filter_grad -- backward_api : conv2d_transpose_double_grad +- backward_op : conv2d_transpose_double_grad forward : conv2d_transpose_grad(Tensor x, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(grad_x), Tensor(grad_filter) args : (Tensor x, Tensor filter, Tensor grad_out, Tensor grad_x_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(x_grad), Tensor(filter_grad), Tensor(grad_out_grad) @@ -492,7 +457,7 @@ func : conv2d_transpose_grad_grad use_gpudnn : true -- backward_api : conv2d_transpose_grad +- backward_op : conv2d_transpose_grad forward : conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(x_grad), Tensor(filter_grad) @@ -503,7 +468,7 @@ use_gpudnn : true backward : conv2d_transpose_double_grad -- backward_api : conv3d_grad +- backward_op : conv3d_grad forward : conv3d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(out) args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor(input_grad), Tensor(filter_grad) @@ -515,7 +480,7 @@ use_gpudnn : true backward : conv3d_grad_grad -- backward_api : conv3d_grad_grad +- backward_op : conv3d_grad_grad forward : conv3d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) -> Tensor(grad_input), Tensor(grad_filter) args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) @@ -527,7 +492,7 @@ use_gpudnn : true optional : grad_input_grad, grad_filter_grad -- backward_api : conv3d_transpose_grad +- backward_op : conv3d_transpose_grad forward : conv3d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] 
strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(x_grad), Tensor(filter_grad) @@ -537,7 +502,7 @@ func : conv3d_transpose_grad use_gpudnn : true -- backward_api : cos_grad +- backward_op : cos_grad forward : cos (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -548,7 +513,7 @@ func : cos_grad inplace : (out_grad -> x_grad) -- backward_api : cosh_grad +- backward_op : cosh_grad forward : cosh (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -559,7 +524,7 @@ func : cosh_grad inplace : (out_grad -> x_grad) -- backward_api : crop_tensor_grad +- backward_op : crop_tensor_grad forward : crop_tensor (Tensor x, IntArray shape, IntArray offsets) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray offsets) output : Tensor(x_grad) @@ -569,7 +534,7 @@ func : crop_tensor_grad data_type : x -- backward_api : cross_entropy_with_softmax_grad +- backward_op : cross_entropy_with_softmax_grad forward : cross_entropy_with_softmax (Tensor input, Tensor label, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis) -> Tensor(softmax), Tensor(loss) args : (Tensor label, Tensor softmax, Tensor loss_grad, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis) output : Tensor(input_grad) @@ -580,7 +545,7 @@ data_type : softmax inplace : (softmax -> input_grad) -- backward_api : cumprod_grad +- backward_op : cumprod_grad forward : cumprod (Tensor x, int dim) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int dim) output : Tensor(x_grad) @@ -590,16 +555,13 @@ kernel : func : cumprod_grad -- backward_api : cumsum_grad +- backward_op : cumsum_grad forward : cumsum(Tensor x, Scalar axis, bool flatten, bool exclusive, bool reverse) -> Tensor(out) - infer_meta : - func : UnchangedInferMeta - param : [x] args : (Tensor out_grad, Scalar axis, bool flatten, bool exclusive, bool reverse) output : Tensor(x_grad) invoke : cumsum(out_grad, axis, flatten, exclusive, !reverse) -- backward_api : deformable_conv_grad +- backward_op : deformable_conv_grad forward : deformable_conv(Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) -> Tensor(out) args : (Tensor x, Tensor offset, Tensor filter, Tensor mask, Tensor out_grad, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) output : Tensor(x_grad), Tensor(offset_grad), Tensor(filter_grad), Tensor(mask_grad) @@ -610,7 +572,7 @@ data_type : x optional : mask -- backward_api : depthwise_conv2d_grad +- backward_op : depthwise_conv2d_grad forward : depthwise_conv2d (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(out) args : (Tensor input, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) output : Tensor(input_grad), Tensor(filter_grad) @@ -623,7 +585,7 @@ use_gpudnn : use_gpudnn backward : depthwise_conv2d_grad_grad -- backward_api : depthwise_conv2d_grad_grad +- backward_op : depthwise_conv2d_grad_grad forward : 
depthwise_conv2d_grad (Tensor input, Tensor filter, Tensor grad_out, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool use_gpudnn) -> Tensor(grad_input), Tensor(grad_filter) args : (Tensor input, Tensor filter, Tensor grad_out, Tensor grad_input_grad, Tensor grad_filter_grad, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu) output : Tensor(input_grad), Tensor(filter_grad), Tensor(grad_out_grad) @@ -634,7 +596,7 @@ func : depthwise_conv2d_grad_grad optional : grad_input_grad, grad_filter_grad -- backward_api : depthwise_conv2d_transpose_grad +- backward_op : depthwise_conv2d_transpose_grad forward : depthwise_conv2d_transpose(Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format) -> Tensor(out) args : (Tensor x, Tensor filter, Tensor out_grad, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(x_grad), Tensor(filter_grad) @@ -643,7 +605,7 @@ kernel : func : depthwise_conv2d_transpose_grad -- backward_api : det_grad +- backward_op : det_grad forward : det (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad) output : Tensor(x_grad) @@ -653,7 +615,7 @@ kernel : func : determinant_grad -- backward_api : divide_double_grad +- backward_op : divide_double_grad forward : divide_grad (Tensor x, Tensor y, Tensor out, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor y, Tensor out, Tensor grad_x, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) output : Tensor(y_grad), Tensor(out_grad), Tensor(grad_out_grad) @@ -666,7 +628,7 @@ optional : grad_x_grad, grad_y_grad inplace : (grad_x_grad -> grad_out_grad) -- backward_api : divide_grad +- backward_op : divide_grad forward : divide (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, int axis = -1) output : Tensor(x_grad), Tensor(y_grad) @@ -677,7 +639,7 @@ func : divide_grad backward : divide_double_grad -- backward_api : dropout_grad +- backward_op : dropout_grad forward : dropout (Tensor x, Tensor seed_tensor, Scalar p, bool is_test, str mode, int seed, bool fix_seed) -> Tensor(out), Tensor(mask) args : (Tensor mask, Tensor out_grad, Scalar p, bool is_test, str mode) output : Tensor(x_grad) @@ -687,7 +649,7 @@ kernel : func : dropout_grad -- backward_api : eig_grad +- backward_op : eig_grad forward : eig (Tensor x) -> Tensor(out_w), Tensor(out_v) args : (Tensor out_w, Tensor out_v, Tensor out_w_grad, Tensor out_v_grad) output : Tensor(x_grad) @@ -700,7 +662,7 @@ data_transform: skip_transform : out_w, out_w_grad -- backward_api : eigh_grad +- backward_op : eigh_grad forward : eigh (Tensor x, str uplo) -> Tensor(out_w), Tensor(out_v) args : (Tensor out_w, Tensor out_v, Tensor out_w_grad, Tensor out_v_grad) output : Tensor(x_grad) @@ -713,7 +675,7 @@ data_transform: skip_transform : out_w, out_w_grad -- backward_api : eigvalsh_grad +- backward_op : eigvalsh_grad forward : eigvalsh (Tensor x, str uplo, bool is_test) -> Tensor(eigenvalues), Tensor(eigenvectors) args : (Tensor eigenvectors, Tensor eigenvalues_grad, str uplo, bool is_test) output : Tensor(x_grad) @@ -725,7 +687,7 @@ 
data_transform : skip_transform : eigenvalues_grad -- backward_api : einsum_grad +- backward_op : einsum_grad forward : einsum (Tensor[] x, str equation) -> Tensor(out), Tensor[](inner_cache), Tensor[](x_shape) args : (Tensor[] x_shape, Tensor[] inner_cache, Tensor out_grad, str equation) output : Tensor[](x_grad){x.size()} @@ -735,7 +697,7 @@ kernel : func : einsum_grad -- backward_api : elementwise_pow_grad +- backward_op : elementwise_pow_grad forward : elementwise_pow(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis=-1) output : Tensor(x_grad), Tensor(y_grad) @@ -745,7 +707,7 @@ kernel : func : elementwise_pow_grad -- backward_api : elu_double_grad +- backward_op : elu_double_grad forward : elu_grad (Tensor x, Tensor out, Tensor grad_out, float alpha)-> Tensor(grad_x) args : (Tensor x, Tensor grad_out, Tensor grad_x_grad, float alpha) output : Tensor(x_grad), Tensor(grad_out_grad) @@ -756,7 +718,7 @@ func : elu_double_grad inplace : (grad_x_grad -> grad_out_grad) -- backward_api : elu_grad +- backward_op : elu_grad forward : elu (Tensor x, float alpha) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, float alpha) output : Tensor(x_grad) @@ -768,13 +730,13 @@ backward : elu_double_grad inplace : (out_grad -> x_grad) -- backward_api : embedding_grad +- backward_op : embedding_grad forward : embedding (Tensor x, Tensor weight, int64_t padding_idx=-1, bool sparse=false) -> Tensor(out) args : (Tensor x, Tensor weight, Tensor out_grad, int64_t padding_idx=-1, bool sparse=false) output : Tensor(weight_grad) invoke : embedding_grad_impl(x, weight, out_grad, padding_idx, sparse, weight_grad) -- backward_api : exp_grad +- backward_op : exp_grad forward : exp (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) @@ -785,7 +747,7 @@ func : exp_grad inplace : (out_grad -> x_grad) -- backward_api : expand_as_grad +- backward_op : expand_as_grad forward : expand_as (Tensor x, Tensor y, int[] target_shape) -> Tensor(out) args : (Tensor x, Tensor out_grad, int[] target_shape) output : Tensor(x_grad) @@ -796,16 +758,13 @@ func : expand_as_grad no_need_buffer : x -- backward_api : expand_double_grad +- backward_op : expand_double_grad forward : expand_grad (Tensor x, Tensor grad_out, IntArray shape) -> Tensor(grad_x) args : (Tensor grad_x_grad, IntArray shape) output : Tensor(grad_out_grad) - infer_meta : - func : ExpandInferMeta - kernel : - func : expand + invoke : expand(grad_x_grad, shape) -- backward_api : expand_grad +- backward_op : expand_grad forward : expand (Tensor x, IntArray shape) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray shape) output : Tensor(x_grad) @@ -817,7 +776,7 @@ no_need_buffer : x backward : expand_double_grad -- backward_api : expm1_grad +- backward_op : expm1_grad forward : expm1 (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) @@ -828,7 +787,7 @@ func : expm1_grad inplace : (out_grad -> x_grad) -- backward_api : exponential__grad +- backward_op : exponential__grad forward : exponential_ (Tensor x, float lambda) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) @@ -836,7 +795,7 @@ func : UnchangedInferMeta invoke : zeros_like(out_grad) -- backward_api : fill_diagonal_grad +- backward_op : fill_diagonal_grad forward : fill_diagonal (Tensor x, float value, int offset, bool wrap) -> Tensor(out) args : (Tensor out_grad, float value, int offset, bool wrap) output : Tensor(x_grad) @@ -845,7 +804,7 @@ kernel : func : 
fill_diagonal_grad -- backward_api : fill_diagonal_tensor_grad +- backward_op : fill_diagonal_tensor_grad forward : fill_diagonal_tensor (Tensor x, Tensor y, int64_t offset, int dim1, int dim2) -> Tensor(out) args : (Tensor out_grad, int64_t offset, int dim1, int dim2) output : Tensor(x_grad) @@ -855,7 +814,7 @@ func : fill_diagonal_tensor_grad inplace : (out_grad -> x_grad) -- backward_api : fill_grad +- backward_op : fill_grad forward : fill (Tensor x, Scalar value) -> Tensor(out) args : (Tensor out_grad, Scalar value) output : Tensor(x_grad) @@ -866,7 +825,7 @@ func : fill_grad inplace : (out_grad -> x_grad) -- backward_api : flatten_grad +- backward_op : flatten_grad forward : flatten(Tensor x, int start_axis, int stop_axis) -> Tensor(out), Tensor(xshape) args : (Tensor xshape, Tensor out_grad) output : Tensor(x_grad) @@ -880,17 +839,7 @@ layout: out_grad inplace : (out_grad -> x_grad) -- backward_api : flip_grad - forward : flip (Tensor x, int[] axis) -> Tensor(out) - args : (Tensor out_grad, int[] axis) - output : Tensor(x_grad) - infer_meta : - func : UnchangedInferMeta - param: [out_grad] - kernel : - func : flip - -- backward_api : floor_grad +- backward_op : floor_grad forward : floor(Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) @@ -901,7 +850,7 @@ func : floor_grad inplace : (out_grad -> x_grad) -- backward_api : fmax_grad +- backward_op : fmax_grad forward : fmax(Tensor x, Tensor y, int axis) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis) output : Tensor(x_grad), Tensor(y_grad) @@ -911,7 +860,7 @@ kernel : func : fmax_grad -- backward_api : fmin_grad +- backward_op : fmin_grad forward : fmin(Tensor x, Tensor y, int axis) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis) output : Tensor(x_grad), Tensor(y_grad) @@ -921,7 +870,7 @@ kernel : func : fmin_grad -- backward_api : frame_grad +- backward_op : frame_grad forward : frame(Tensor x, int frame_length, int hop_length, int axis) -> Tensor(out) args : (Tensor x, Tensor out_grad, int frame_length, int hop_length, int axis) output : Tensor(x_grad) @@ -931,7 +880,7 @@ kernel : func : frame_grad -- backward_api : frobenius_norm_grad +- backward_op : frobenius_norm_grad forward : frobenius_norm(Tensor x, int64_t[] axis, bool keep_dim, bool reduce_all) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int64_t[] axis, bool keep_dim, bool reduce_all) output : Tensor(x_grad) @@ -941,7 +890,7 @@ kernel : func : frobenius_norm_grad -- backward_api : gather_grad +- backward_op : gather_grad forward : gather(Tensor x, Tensor index, Scalar axis=0) -> Tensor(out) args : (Tensor x, Tensor index, Tensor out_grad, Scalar axis=0, bool overwrite=false) output : Tensor(x_grad) @@ -953,7 +902,7 @@ func : gather_grad no_need_buffer : x -- backward_api : gather_nd_grad +- backward_op : gather_nd_grad forward : gather_nd (Tensor x, Tensor index) -> Tensor(out) args : (Tensor x, Tensor index, Tensor out_grad) output : Tensor(x_grad) @@ -964,7 +913,7 @@ func : gather_nd_grad no_need_buffer : x -- backward_api : gelu_grad +- backward_op : gelu_grad forward : gelu(Tensor x, bool approximate) -> Tensor(out) args : (Tensor x, Tensor out_grad, bool approximate) output : Tensor(x_grad) @@ -974,7 +923,7 @@ kernel : func : gelu_grad -- backward_api : graph_send_recv_grad +- backward_op : graph_send_recv_grad forward : graph_send_recv (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) -> Tensor(out), Tensor(dst_count) args : (Tensor x, 
Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str reduce_op = "SUM") output : Tensor(x_grad) @@ -986,7 +935,7 @@ data_type : out_grad optional: out, dst_count -- backward_api : graph_send_ue_recv_grad +- backward_op : graph_send_ue_recv_grad forward : graph_send_ue_recv (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) -> Tensor(out), Tensor(dst_count) args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, Tensor out, Tensor dst_count, Tensor out_grad, str message_op, str reduce_op) output : Tensor(x_grad), Tensor(y_grad) @@ -998,7 +947,7 @@ data_type : out_grad optional: out, dst_count -- backward_api : grid_sample_grad +- backward_op : grid_sample_grad forward : grid_sample (Tensor x, Tensor grid, str mode, str padding_mode, bool align_corners) -> Tensor(out) args : (Tensor x, Tensor grid, Tensor out_grad, str mode, str padding_mode, bool align_corners) output : Tensor(x_grad), Tensor(grid_grad) @@ -1009,7 +958,7 @@ func : grid_sample_grad data_type : x -- backward_api : group_norm_grad +- backward_op : group_norm_grad forward : group_norm (Tensor x, Tensor scale, Tensor bias, float epsilon, int groups, str data_layout) -> Tensor(y), Tensor(mean), Tensor(variance) args : (Tensor x, Tensor scale, Tensor bias, Tensor y, Tensor mean, Tensor variance, Tensor y_grad, float epsilon, int groups, str data_layout) output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) @@ -1022,7 +971,7 @@ optional: scale, bias inplace : (y_grad -> x_grad) -- backward_api : gumbel_softmax_grad +- backward_op : gumbel_softmax_grad forward : gumbel_softmax (Tensor x, float temperature, bool hard, int axis) -> Tensor(out) args : (Tensor out, Tensor out_grad, int axis) output : Tensor(x_grad) @@ -1032,7 +981,7 @@ kernel : func : gumbel_softmax_grad -- backward_api : hard_shrink_grad +- backward_op : hard_shrink_grad forward : hard_shrink (Tensor x, float threshold) -> Tensor(out) args : (Tensor x, Tensor out_grad, float threshold) output : Tensor(x_grad) @@ -1043,7 +992,7 @@ func : hard_shrink_grad inplace : (out_grad -> x_grad) -- backward_api : hard_sigmoid_grad +- backward_op : hard_sigmoid_grad forward : hard_sigmoid (Tensor x, float slope, float offset) -> Tensor(out) args : (Tensor out, Tensor out_grad, float slope, float offset) output : Tensor(x_grad) @@ -1054,7 +1003,7 @@ func : hard_sigmoid_grad inplace : (out_grad -> x_grad) -- backward_api : hard_swish_grad +- backward_op : hard_swish_grad forward : hard_swish (Tensor x, float threshold = 6.0, float scale = 6.0, float offset = 3.0) -> Tensor(out) args : (Tensor x, Tensor out_grad, float threshold, float scale, float offset) output : Tensor(x_grad) @@ -1065,7 +1014,7 @@ func : hard_swish_grad inplace : (out_grad -> x_grad) -- backward_api : hierarchical_sigmoid_grad +- backward_op : hierarchical_sigmoid_grad forward : hierarchical_sigmoid (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) -> Tensor(out), Tensor(pre_out), Tensor(w_out) args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, Tensor pre_out, Tensor out_grad, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) output : Tensor(x_grad), Tensor(w_grad), Tensor(bias_grad) @@ -1076,7 +1025,7 @@ kernel : func : hierarchical_sigmoid_grad -- 
backward_api : huber_loss_grad +- backward_op : huber_loss_grad forward : huber_loss (Tensor input, Tensor label, float delta) -> Tensor(out), Tensor(residual) args : (Tensor residual, Tensor out_grad, float delta) output : Tensor(input_grad), Tensor(label_grad) @@ -1086,13 +1035,13 @@ kernel : func : huber_loss_grad -- backward_api : imag_grad +- backward_op : imag_grad forward : imag (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) invoke : imag_grad_impl(out_grad, x_grad) -- backward_api : index_add_grad +- backward_op : index_add_grad forward : index_add(Tensor x, Tensor index, Tensor add_value, int axis) -> Tensor(out) args : (Tensor index, Tensor add_value, Tensor out_grad, int axis) output : Tensor(x_grad), Tensor(add_value_grad) @@ -1103,7 +1052,7 @@ data_type : out_grad inplace : (out_grad -> x_grad) -- backward_api : index_sample_grad +- backward_op : index_sample_grad forward : index_sample (Tensor x, Tensor index) -> Tensor(out) args : (Tensor x, Tensor index, Tensor out_grad) output : Tensor(x_grad) @@ -1115,7 +1064,7 @@ data_type : out_grad no_need_buffer : x -- backward_api : index_select_grad +- backward_op : index_select_grad forward : index_select(Tensor x, Tensor index, int dim) -> Tensor(out) args : (Tensor x, Tensor index, Tensor out_grad, int dim) output : Tensor(x_grad) @@ -1127,7 +1076,7 @@ data_type : x no_need_buffer : x -- backward_api : instance_norm_double_grad +- backward_op : instance_norm_double_grad forward : instance_norm_grad(Tensor x, Tensor fwd_scale, Tensor saved_mean, Tensor saved_variance, Tensor grad_y, float epsilon) -> Tensor(grad_x), Tensor(grad_scale), Tensor(grad_bias) args : (Tensor x, Tensor fwd_scale, Tensor saved_mean, Tensor saved_variance, Tensor grad_y, Tensor grad_x_grad, Tensor grad_scale_grad, Tensor grad_bias_grad, float epsilon) output : Tensor(x_grad), Tensor(fwd_scale_grad), Tensor(grad_y_grad) @@ -1138,7 +1087,7 @@ data_type : x optional : fwd_scale, grad_x_grad, grad_scale_grad, grad_bias_grad -- backward_api : instance_norm_grad +- backward_op : instance_norm_grad forward : instance_norm(Tensor x, Tensor scale, Tensor bias, float epsilon) -> Tensor(y), Tensor(saved_mean), Tensor(saved_variance) args : (Tensor x, Tensor scale, Tensor saved_mean, Tensor saved_variance, Tensor y_grad, float epsilon) output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) @@ -1150,7 +1099,7 @@ optional : scale backward : instance_norm_double_grad -- backward_api : inverse_grad +- backward_op : inverse_grad forward : inverse(Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) @@ -1159,7 +1108,7 @@ kernel : func : inverse_grad -- backward_api : kldiv_loss_grad +- backward_op : kldiv_loss_grad forward : kldiv_loss(Tensor x, Tensor label, str reduction) -> Tensor(out) args : (Tensor x, Tensor label, Tensor out_grad, str reduction) output : Tensor(x_grad) @@ -1170,7 +1119,7 @@ func : kldiv_loss_grad no_need_buffer : x -- backward_api : kron_grad +- backward_op : kron_grad forward : kron (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) @@ -1181,7 +1130,7 @@ func : kron_grad data_type : out_grad -- backward_api : kthvalue_grad +- backward_op : kthvalue_grad forward : kthvalue(Tensor x, int k, int axis, bool keepdim) -> Tensor(out), Tensor(indices) args : (Tensor x, Tensor indices, Tensor out_grad, int k, int axis, bool keepdim) output : Tensor(x_grad) @@ -1191,7 +1140,7 @@ kernel : func : kthvalue_grad -- backward_api 
: label_smooth_grad +- backward_op : label_smooth_grad forward : label_smooth (Tensor label, Tensor prior_dist, float epsilon) -> Tensor(out) args : (Tensor out_grad, float epsilon) output : Tensor(label_grad) @@ -1201,7 +1150,7 @@ kernel : func : label_smooth_grad -- backward_api : layer_norm_grad +- backward_op : layer_norm_grad forward : layer_norm (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis, bool is_test) -> Tensor(out), Tensor(mean), Tensor(variance) args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, Tensor out_grad, float epsilon, int begin_norm_axis, bool is_test) output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) @@ -1214,7 +1163,7 @@ no_need_buffer : bias optional : scale, bias -- backward_api : leaky_relu_double_grad +- backward_op : leaky_relu_double_grad forward : leaky_relu_grad (Tensor x, Tensor grad_out, float alpha) -> Tensor(grad_x) args : (Tensor x, Tensor grad_x_grad, float alpha) output : Tensor(grad_out_grad) @@ -1225,7 +1174,7 @@ func : leaky_relu_double_grad inplace : (grad_x_grad -> grad_out_grad) -- backward_api : leaky_relu_grad +- backward_op : leaky_relu_grad forward : leaky_relu (Tensor x, float alpha) -> Tensor(out) args : (Tensor x, Tensor out_grad, float alpha) output : Tensor(x_grad) @@ -1237,7 +1186,7 @@ backward : leaky_relu_double_grad inplace : (out_grad -> x_grad) -- backward_api : lerp_grad +- backward_op : lerp_grad forward : lerp (Tensor x, Tensor y, Tensor weight) -> Tensor(out) args : (Tensor x, Tensor y, Tensor weight, Tensor out, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) @@ -1247,7 +1196,7 @@ kernel : func : lerp_grad -- backward_api : linear_interp_grad +- backward_op : linear_interp_grad forward : linear_interp (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) -> Tensor(output) args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, Tensor output_grad, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) output : Tensor(x_grad) @@ -1259,7 +1208,7 @@ func : linear_interp_grad data_type : output_grad -- backward_api : log10_grad +- backward_op : log10_grad forward : log10 (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -1270,7 +1219,7 @@ func : log10_grad inplace : (out_grad -> x_grad) -- backward_api : log1p_grad +- backward_op : log1p_grad forward : log1p (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -1281,7 +1230,7 @@ func : log1p_grad inplace : (out_grad -> x_grad) -- backward_api : log2_grad +- backward_op : log2_grad forward : log2 (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -1292,7 +1241,7 @@ func : log2_grad inplace : (out_grad -> x_grad) -- backward_api : log_double_grad +- backward_op : log_double_grad forward : log_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) args : (Tensor x, Tensor grad_out, Tensor grad_x_grad) output : Tensor(x_grad), Tensor(grad_out_grad) @@ -1303,7 +1252,7 @@ func : log_double_grad inplace : (grad_x_grad -> grad_out_grad) -- backward_api : log_grad +- backward_op : log_grad forward : log (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -1315,7 +1264,7 @@ backward : log_double_grad inplace : (out_grad -> x_grad) -- backward_api : 
log_loss_grad +- backward_op : log_loss_grad forward : log_loss (Tensor input, Tensor label, float epsilon) -> Tensor(out) args : (Tensor input, Tensor label, Tensor out_grad, float epsilon) output : Tensor(input_grad) @@ -1325,7 +1274,7 @@ kernel : func : log_loss_grad -- backward_api : log_softmax_grad +- backward_op : log_softmax_grad forward : log_softmax(Tensor x, int axis) -> Tensor(out) args : (Tensor out, Tensor out_grad, int axis) output : Tensor(x_grad) @@ -1335,7 +1284,7 @@ kernel : func : log_softmax_grad -- backward_api : logcumsumexp_grad +- backward_op : logcumsumexp_grad forward : logcumsumexp(Tensor x, int axis, bool flatten, bool exclusive, bool reverse) -> Tensor(out) infer_meta : func : UnchangedInferMeta @@ -1345,7 +1294,7 @@ kernel : func : logcumsumexp_grad -- backward_api : logit_grad +- backward_op : logit_grad forward : logit (Tensor x, float eps = 1e-6f) -> Tensor(out) args : (Tensor x, Tensor out_grad, float eps) output : Tensor(x_grad) @@ -1355,7 +1304,7 @@ kernel : func : logit_grad -- backward_api : logsigmoid_grad +- backward_op : logsigmoid_grad forward : logsigmoid (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -1366,7 +1315,7 @@ func : logsigmoid_grad inplace : (out_grad -> x_grad) -- backward_api : logsumexp_grad +- backward_op : logsumexp_grad forward : logsumexp(Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int64_t[] axis, bool keepdim, bool reduce_all) output : Tensor(x_grad) @@ -1376,7 +1325,7 @@ kernel : func : logsumexp_grad -- backward_api : lu_grad +- backward_op : lu_grad forward : lu (Tensor x, bool pivot) -> Tensor(out), Tensor(pivots), Tensor(infos) args : (Tensor x, Tensor out, Tensor pivots, Tensor out_grad, bool pivot) output : Tensor(x_grad) @@ -1385,7 +1334,7 @@ kernel : func : lu_grad -- backward_api : lu_unpack_grad +- backward_op : lu_unpack_grad forward : lu_unpack (Tensor x, Tensor pivots, bool unpack_ludata, bool unpack_pivots) -> Tensor(pmat), Tensor(l), Tensor(u) args : (Tensor x, Tensor pivots, Tensor l, Tensor u, Tensor pmat, Tensor l_grad, Tensor u_grad, bool unpack_ludata, bool unpack_pivots) output : Tensor(x_grad) @@ -1394,7 +1343,7 @@ kernel : func : lu_unpack_grad -- backward_api : margin_cross_entropy_grad +- backward_op : margin_cross_entropy_grad forward : margin_cross_entropy (Tensor logits, Tensor label, bool return_softmax, int ring_id, int rank, int nranks, float margin1, float margin2, float margin3, float scale) -> Tensor(softmax), Tensor(loss) args : (Tensor logits, Tensor label, Tensor softmax, Tensor loss_grad, bool return_softmax, int ring_id, int rank, int nranks, float margin1, float margin2, float margin3, float scale) output : Tensor(logits_grad) @@ -1405,7 +1354,7 @@ data_type : softmax inplace : (softmax -> logits_grad) -- backward_api : masked_select_grad +- backward_op : masked_select_grad forward : masked_select (Tensor x, Tensor mask) -> Tensor(out) args : (Tensor x, Tensor mask, Tensor out_grad) output : Tensor(x_grad) @@ -1417,7 +1366,7 @@ data_type : x no_need_buffer : x -- backward_api : matmul_double_grad +- backward_op : matmul_double_grad forward : matmul_grad (Tensor x, Tensor y, Tensor grad_out, bool transpose_x=false, bool transpose_y=false) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor x, Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, bool transpose_x=false, bool transpose_y=false) output : Tensor(x_grad), Tensor(y_grad), Tensor(grad_out_grad) @@ -1429,7 
+1378,7 @@ backward : matmul_triple_grad optional : grad_x_grad, grad_y_grad -- backward_api : matmul_grad +- backward_op : matmul_grad forward : matmul (Tensor x, Tensor y, bool transpose_x=false, bool transpose_y=false) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, bool transpose_x=false, bool transpose_y=false) output : Tensor(x_grad), Tensor(y_grad) @@ -1440,7 +1389,7 @@ func : matmul_grad backward : matmul_double_grad -- backward_api : matmul_triple_grad +- backward_op : matmul_triple_grad forward : matmul_double_grad (Tensor x, Tensor y, Tensor fwd_grad_out, Tensor fwd_grad_grad_x, Tensor fwd_grad_grad_y, bool transpose_x=false, bool transpose_y=false) -> Tensor(grad_x), Tensor(grad_y), Tensor(grad_grad_out) args : (Tensor x, Tensor y, Tensor fwd_grad_out, Tensor fwd_grad_grad_x, Tensor fwd_grad_grad_y, Tensor grad_x_grad, Tensor grad_y_grad, Tensor grad_grad_out_grad, bool transpose_x=false, bool transpose_y=false) output : Tensor(x_grad), Tensor(y_grad), Tensor(fwd_grad_out_grad), Tensor(fwd_grad_grad_x_grad), Tensor(fwd_grad_grad_y_grad) @@ -1451,7 +1400,7 @@ func : matmul_triple_grad optional : grad_x_grad, grad_y_grad, grad_grad_out_grad -- backward_api : matrix_power_grad +- backward_op : matrix_power_grad forward : matrix_power (Tensor x, int n) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int n) output : Tensor(x_grad) @@ -1461,7 +1410,7 @@ kernel : func : matrix_power_grad -- backward_api : max_grad +- backward_op : max_grad forward: max (Tensor x, IntArray dims={}, bool keep_dim=false) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, IntArray dims={}, bool keep_dim=false, bool reduce_all=false) output : Tensor(x_grad) @@ -1471,7 +1420,7 @@ kernel : func : max_grad -- backward_api : max_pool2d_with_index_grad +- backward_op : max_pool2d_with_index_grad forward : max_pool2d_with_index(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) -> Tensor(out), Tensor(mask) args : (Tensor x, Tensor mask, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) output : Tensor(x_grad) @@ -1480,7 +1429,7 @@ kernel : func : max_pool2d_with_index_grad -- backward_api : max_pool3d_with_index_grad +- backward_op : max_pool3d_with_index_grad forward : max_pool3d_with_index(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) -> Tensor(out), Tensor(mask) args : (Tensor x, Tensor mask, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) output : Tensor(x_grad) @@ -1489,7 +1438,7 @@ kernel : func : max_pool3d_with_index_grad -- backward_api : maximum_grad +- backward_op : maximum_grad forward : maximum(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis=-1) output : Tensor(x_grad), Tensor(y_grad) @@ -1499,7 +1448,7 @@ kernel : func : maximum_grad -- backward_api : maxout_grad +- backward_op : maxout_grad forward : maxout(Tensor x, int groups, int axis) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int groups, int axis) output : Tensor(x_grad) @@ -1509,7 +1458,7 @@ kernel : func : maxout_grad -- backward_api : mean_all_grad +- backward_op : mean_all_grad forward : mean_all(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -1519,13 +1468,13 @@ kernel : func : mean_all_grad -- backward_api : mean_double_grad +- backward_op : mean_double_grad forward: mean_grad (Tensor x, Tensor 
grad_out, IntArray dims={}, bool keep_dim=false, bool reduce_all = false) -> Tensor(grad_x) args : (Tensor grad_x_grad, IntArray dims={}, bool keep_dim=false) output : Tensor(grad_out_grad) invoke : mean(grad_x_grad, dims, keep_dim) -- backward_api : mean_grad +- backward_op : mean_grad forward: mean (Tensor x, IntArray dims={}, bool keep_dim=false) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray dims={}, bool keep_dim=false, bool reduce_all=false) output : Tensor(x_grad) @@ -1537,7 +1486,7 @@ backward : mean_double_grad no_need_buffer : x -- backward_api : meshgrid_grad +- backward_op : meshgrid_grad forward : meshgrid (Tensor[] inputs) -> Tensor[](outputs) args : (Tensor[] inputs, Tensor[] outputs_grad) output : Tensor[](inputs_grad){inputs.size()} @@ -1546,7 +1495,7 @@ kernel : func : meshgrid_grad -- backward_api : min_grad +- backward_op : min_grad forward: min (Tensor x, IntArray dims={}, bool keep_dim=false) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, IntArray dims={}, bool keep_dim=false, bool reduce_all=false) output : Tensor(x_grad) @@ -1556,7 +1505,7 @@ kernel : func : min_grad -- backward_api : minimum_grad +- backward_op : minimum_grad forward : minimum(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis=-1) output : Tensor(x_grad), Tensor(y_grad) @@ -1566,7 +1515,7 @@ kernel : func : minimum_grad -- backward_api : mish_grad +- backward_op : mish_grad forward : mish (Tensor x, float threshold) -> Tensor(out) args : (Tensor x, Tensor out_grad, float threshold) output : Tensor(x_grad) @@ -1577,7 +1526,7 @@ func : mish_grad inplace : (out_grad -> x_grad) -- backward_api : mode_grad +- backward_op : mode_grad forward : mode(Tensor x, int axis, bool keepdim) -> Tensor(out), Tensor(indices) args : (Tensor x, Tensor indices, Tensor out_grad, int axis, bool keepdim) output : Tensor(x_grad) @@ -1587,7 +1536,7 @@ kernel : func : mode_grad -- backward_api : multi_dot_grad +- backward_op : multi_dot_grad forward : multi_dot (Tensor[] x) -> Tensor(out) args : (Tensor[] x, Tensor out_grad) output : Tensor[](x_grad) {x.size()} @@ -1596,7 +1545,7 @@ kernel : func : multi_dot_grad -- backward_api : multiplex_grad +- backward_op : multiplex_grad forward : multiplex (Tensor[] ins, Tensor ids) -> Tensor(out) args : (Tensor[] ins, Tensor ids, Tensor out_grad) output : Tensor[](ins_grad){ins.size()} @@ -1607,7 +1556,7 @@ func : multiplex_grad param : [ids, out_grad] -- backward_api : multiply_double_grad +- backward_op : multiply_double_grad forward : multiply_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor x, Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) output : Tensor(x_grad), Tensor(y_grad), Tensor(grad_out_grad) @@ -1620,7 +1569,7 @@ backward : multiply_triple_grad inplace : (grad_x_grad -> grad_out_grad) -- backward_api : multiply_grad +- backward_op : multiply_grad forward : multiply (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) output : Tensor(x_grad), Tensor(y_grad) @@ -1631,7 +1580,7 @@ func : multiply_grad backward : multiply_double_grad -- backward_api : multiply_triple_grad +- backward_op : multiply_triple_grad forward : multiply_double_grad (Tensor x, Tensor y, Tensor fwd_grad_out, Tensor fwd_grad_grad_x, Tensor fwd_grad_grad_y, int aixs = -1) -> Tensor(grad_x), Tensor(grad_y), Tensor(grad_grad_out) args : (Tensor x, Tensor y, Tensor fwd_grad_out, Tensor fwd_grad_grad_x, Tensor 
fwd_grad_grad_y, Tensor grad_x_grad, Tensor grad_y_grad, Tensor grad_grad_out_grad, int axis = -1) output : Tensor(x_grad), Tensor(y_grad), Tensor(fwd_grad_out_grad), Tensor(fwd_grad_grad_x_grad), Tensor(fwd_grad_grad_y_grad) @@ -1642,7 +1591,7 @@ func : multiply_triple_grad optional : fwd_grad_grad_x, fwd_grad_grad_y, grad_grad_out_grad -- backward_api : nearest_interp_grad +- backward_op : nearest_interp_grad forward : nearest_interp (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) -> Tensor(output) args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, Tensor output_grad, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) output : Tensor(x_grad) @@ -1654,7 +1603,7 @@ func : nearest_interp_grad data_type : output_grad -- backward_api : nll_loss_grad +- backward_op : nll_loss_grad forward : nll_loss (Tensor input, Tensor label, Tensor weight, int64_t ignore_index, str reduction) -> Tensor(out), Tensor(total_weight) args : (Tensor input, Tensor label, Tensor weight, Tensor total_weight, Tensor out_grad, int64_t ignore_index, str reduction) output : Tensor(input_grad) @@ -1665,7 +1614,7 @@ data_type : input optional : weight -- backward_api : norm_grad +- backward_op : norm_grad forward : norm (Tensor x, int axis, float epsilon, bool is_test) -> Tensor(out), Tensor(norm) args : (Tensor x, Tensor norm, Tensor out_grad, int axis, float epsilon, bool is_test) output : Tensor(x_grad) @@ -1675,7 +1624,7 @@ kernel : func : norm_grad -- backward_api : overlap_add_grad +- backward_op : overlap_add_grad forward : overlap_add(Tensor x, int hop_length, int axis) -> Tensor(out) args : (Tensor x, Tensor out_grad, int hop_length, int axis) output : Tensor(x_grad) @@ -1685,7 +1634,7 @@ func : overlap_add_grad data_type : x -- backward_api : p_norm_grad +- backward_op : p_norm_grad forward : p_norm(Tensor x, float porder, int axis, float epsilon, bool keepdim, bool asvector=false) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, float porder, int axis, float epsilon, bool keepdim, bool asvector) output : Tensor(x_grad) @@ -1695,7 +1644,7 @@ kernel : func : p_norm_grad -- backward_api : pad3d_double_grad +- backward_op : pad3d_double_grad forward : pad3d_grad(Tensor x, Tensor grad_out, IntArray paddings, str mode, float pad_value, str data_format) -> Tensor(grad_x) args : (Tensor grad_x_grad, IntArray paddings, str mode, float pad_value, str data_format) output : Tensor(grad_out_grad) @@ -1704,7 +1653,7 @@ kernel : func : pad3d -- backward_api : pad3d_grad +- backward_op : pad3d_grad forward : pad3d(Tensor x, IntArray paddings, str mode, float pad_value, str data_format) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray paddings, str mode, float pad_value, str data_format) output : Tensor(x_grad) @@ -1716,7 +1665,7 @@ no_need_buffer : x backward : pad3d_double_grad -- backward_api : pad_double_grad +- backward_op : pad_double_grad forward : pad_grad(Tensor x, Tensor grad_out, int[] paddings, Scalar pad_value) -> Tensor(grad_x) args : (Tensor grad_x_grad, int[] paddings, Scalar pad_value) output : Tensor(grad_out_grad) @@ -1725,7 +1674,7 @@ kernel : func : pad -- backward_api : pad_grad +- backward_op : pad_grad forward : pad(Tensor x, int[] paddings, Scalar pad_value) -> Tensor(out) args : (Tensor x, Tensor out_grad, int[] paddings, Scalar pad_value) 
output : Tensor(x_grad) @@ -1738,7 +1687,7 @@ no_need_buffer : x backward : pad_double_grad -- backward_api : pixel_shuffle_grad +- backward_op : pixel_shuffle_grad forward : pixel_shuffle (Tensor x, int upscale_factor, str data_format) -> Tensor(out) args : (Tensor out_grad, int upscale_factor, str data_format) output : Tensor(x_grad) @@ -1747,7 +1696,7 @@ kernel : func : pixel_shuffle_grad -- backward_api : pool2d_double_grad +- backward_op : pool2d_double_grad forward : pool2d_grad(Tensor x, Tensor out, Tensor grad_out, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) -> Tensor(grad_x) args : (Tensor grad_x_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) output : Tensor(grad_out_grad) @@ -1759,7 +1708,7 @@ param : [grad_x_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] use_gpudnn : use_gpudnn -- backward_api : pool2d_grad +- backward_op : pool2d_grad forward : pool2d(Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) output : Tensor(x_grad) @@ -1772,7 +1721,7 @@ use_gpudnn : use_gpudnn backward : pool2d_double_grad -- backward_api : pool3d_grad +- backward_op : pool3d_grad forward : pool3d(Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) output : Tensor(x_grad) @@ -1784,7 +1733,7 @@ param : [x, out, out_grad, kernel_size, strides, paddings, ceil_mode, exclusive, data_format, pooling_type, global_pooling, adaptive, padding_algorithm] use_gpudnn : use_gpudnn -- backward_api : pow_grad +- backward_op : pow_grad forward : pow(Tensor x, Scalar s) -> Tensor(out) args : (Tensor x, Tensor out_grad, Scalar s=-1) output : Tensor(x_grad) @@ -1795,7 +1744,7 @@ func : pow_grad inplace : (out_grad -> x_grad) -- backward_api : prelu_grad +- backward_op : prelu_grad forward : prelu(Tensor x, Tensor alpha, str data_format, str mode) -> Tensor(out) args : (Tensor x, Tensor alpha, Tensor out_grad, str data_format, str mode) output : Tensor(x_grad), Tensor(alpha_grad) @@ -1805,7 +1754,7 @@ kernel : func : prelu_grad -- backward_api : psroi_pool_grad +- backward_op : psroi_pool_grad forward : psroi_pool (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, int output_channels, float spatial_scale) -> Tensor(out) args : (Tensor x, Tensor boxes, Tensor boxes_num, Tensor out_grad, int pooled_height, int pooled_width, int output_channels, float spatial_scale) output : Tensor(x_grad) @@ -1818,7 +1767,7 
@@ optional : boxes_num # output is optional -- backward_api : put_along_axis_grad +- backward_op : put_along_axis_grad forward : put_along_axis (Tensor x, Tensor index, Tensor value, int axis, str reduce) -> Tensor(out) args : (Tensor x, Tensor index, Tensor out_grad, int axis, str reduce) output : Tensor(x_grad), Tensor(value_grad) @@ -1828,7 +1777,7 @@ kernel : func : put_along_axis_grad -- backward_api : qr_grad +- backward_op : qr_grad forward : qr (Tensor x, str mode) -> Tensor(q), Tensor(r) args : (Tensor x, Tensor q, Tensor r, Tensor q_grad, Tensor r_grad, str mode) output : Tensor(x_grad) @@ -1838,13 +1787,13 @@ kernel : func : qr_grad -- backward_api : real_grad +- backward_op : real_grad forward : real (Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) invoke : real_grad_impl(out_grad, x_grad) -- backward_api : reciprocal_grad +- backward_op : reciprocal_grad forward : reciprocal (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) @@ -1855,7 +1804,7 @@ func : reciprocal_grad inplace : (out_grad -> x_grad) -- backward_api : reduce_prod_grad +- backward_op : reduce_prod_grad forward : reduce_prod (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad, IntArray dims, bool keep_dim, bool reduce_all) output : Tensor(x_grad) @@ -1865,7 +1814,7 @@ kernel : func : prod_grad -- backward_api : relu6_grad +- backward_op : relu6_grad forward : relu6 (Tensor x, float threshold) -> Tensor(out) args : (Tensor out, Tensor out_grad, float threshold) output : Tensor(x_grad) @@ -1876,7 +1825,7 @@ func : relu6_grad inplace : (out_grad -> x_grad) -- backward_api : relu_double_grad +- backward_op : relu_double_grad forward : relu_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) args : (Tensor out, Tensor grad_x_grad) output : Tensor(grad_out_grad) @@ -1887,7 +1836,7 @@ func : relu_double_grad inplace : (grad_x_grad -> grad_out_grad) -- backward_api : relu_grad +- backward_op : relu_grad forward : relu (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) @@ -1899,7 +1848,7 @@ backward: relu_double_grad inplace : (out_grad -> x_grad) -- backward_api : renorm_grad +- backward_op : renorm_grad forward : renorm (Tensor x, float p, int axis, float max_norm) -> Tensor(out) args : (Tensor x, Tensor out_grad, float p, int axis, float max_norm) output : Tensor(x_grad) @@ -1909,7 +1858,7 @@ kernel : func : renorm_grad -- backward_api : repeat_interleave_grad +- backward_op : repeat_interleave_grad forward : repeat_interleave(Tensor x, int repeats, int dim) -> Tensor(out) args : (Tensor x, Tensor out_grad, int repeats, int dim) output : Tensor(x_grad) @@ -1919,7 +1868,7 @@ kernel : func : repeat_interleave_grad -- backward_api : repeat_interleave_with_tensor_index_grad +- backward_op : repeat_interleave_with_tensor_index_grad forward : repeat_interleave_with_tensor_index(Tensor x, Tensor repeats, int dim) -> Tensor(out) args : (Tensor x, Tensor repeats, Tensor out_grad, int dim) output : Tensor(x_grad) @@ -1930,7 +1879,7 @@ func : repeat_interleave_with_tensor_index_grad data_type : x -- backward_api : reshape_double_grad +- backward_op : reshape_double_grad forward : reshape_grad (Tensor xshape, Tensor grad_out) -> Tensor(grad_x) args : (Tensor grad_out, Tensor grad_x_grad) output : Tensor(grad_out_grad) @@ -1942,7 +1891,7 @@ no_need_buffer : grad_out inplace : (grad_x_grad -> grad_out_grad) -- backward_api : reshape_grad +- backward_op : reshape_grad 
forward : reshape (Tensor x, IntArray shape) -> Tensor(out), Tensor(xshape) args : (Tensor xshape, Tensor out_grad) output : Tensor(x_grad) @@ -1958,7 +1907,7 @@ backward : reshape_double_grad inplace : (out_grad -> x_grad) -- backward_api : reverse_array_grad +- backward_op : reverse_array_grad forward : reverse_array (Tensor[] x, IntArray axis) -> Tensor[](out) args : (Tensor[] out_grad, IntArray axis) output : Tensor[](x_grad){out_grad.size()} @@ -1967,15 +1916,13 @@ kernel : func : reverse -- backward_api : reverse_grad +- backward_op : reverse_grad forward : reverse (Tensor x, IntArray axis) -> Tensor(out) args : (Tensor out_grad, IntArray axis) output : Tensor(x_grad) - infer_meta : - func : ReverseInferMeta invoke : reverse(out_grad, axis) -- backward_api : roi_align_grad +- backward_op : roi_align_grad forward : roi_align (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned) -> Tensor(out) args : (Tensor x, Tensor boxes, Tensor boxes_num, Tensor out_grad, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned) output : Tensor(x_grad) @@ -1988,7 +1935,7 @@ no_need_buffer : x optional : boxes_num -- backward_api : roi_pool_grad +- backward_op : roi_pool_grad forward : roi_pool (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, float spatial_scale) -> Tensor(out), Tensor(arg_max) args : (Tensor x, Tensor boxes, Tensor boxes_num, Tensor arg_max, Tensor out_grad, int pooled_height, int pooled_width, float spatial_scale) output : Tensor(x_grad) @@ -2000,7 +1947,7 @@ data_type : x optional : boxes_num -- backward_api : roll_grad +- backward_op : roll_grad forward : roll(Tensor x, IntArray shifts, int64_t[] axis) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray shifts, int64_t[] axis) output : Tensor(x_grad) @@ -2012,7 +1959,7 @@ data_type : x no_need_buffer : x -- backward_api : round_grad +- backward_op : round_grad forward : round(Tensor x) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) @@ -2023,7 +1970,7 @@ func : round_grad inplace : (out_grad -> x_grad) -- backward_api : rsqrt_double_grad +- backward_op : rsqrt_double_grad forward : rsqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) args : (Tensor out, Tensor grad_x, Tensor grad_x_grad) output : Tensor(out_grad), Tensor(grad_out_grad) @@ -2034,7 +1981,7 @@ func : rsqrt_double_grad inplace : (grad_x_grad -> grad_out_grad) -- backward_api : rsqrt_grad +- backward_op : rsqrt_grad forward : rsqrt (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) @@ -2046,13 +1993,13 @@ backward : rsqrt_double_grad inplace : (out_grad -> x_grad) -- backward_api : scale_grad +- backward_op : scale_grad forward : scale (Tensor x, Scalar scale, float bias, bool bias_after_scale) -> Tensor(out) args : (Tensor out_grad, Scalar scale=1.0, bool bias_after_scale=true) output : Tensor(x_grad) invoke : scale(out_grad, scale, 0.0, bias_after_scale) -- backward_api : scatter_grad +- backward_op : scatter_grad forward : scatter (Tensor x, Tensor index, Tensor updates, bool overwrite) -> Tensor(out) args : (Tensor index, Tensor updates, Tensor out_grad, bool overwrite) output : Tensor(x_grad), Tensor(updates_grad) @@ -2063,7 +2010,7 @@ func : scatter_grad no_need_buffer : updates -- backward_api : scatter_nd_add_grad +- backward_op : scatter_nd_add_grad forward : scatter_nd_add (Tensor x, Tensor index, Tensor updates) -> Tensor(out) args : (Tensor index, 
Tensor updates, Tensor out_grad) output : Tensor(x_grad), Tensor(updates_grad) @@ -2074,7 +2021,7 @@ func : scatter_nd_add_grad no_need_buffer : updates -- backward_api : segment_pool_grad +- backward_op : segment_pool_grad forward : segment_pool (Tensor x, Tensor segment_ids, str pooltype) -> Tensor(out), Tensor(summed_ids) args : (Tensor x, Tensor segment_ids, Tensor out, Tensor summed_ids, Tensor out_grad, str pooltype) output : Tensor(x_grad) @@ -2086,7 +2033,7 @@ data_type : x optional : summed_ids -- backward_api : selu_grad +- backward_op : selu_grad forward : selu (Tensor x, float scale, float alpha) -> Tensor(out) args : (Tensor out, Tensor out_grad, float scale, float alpha) output : Tensor(x_grad) @@ -2096,7 +2043,7 @@ kernel : func : selu_grad -- backward_api : sigmoid_cross_entropy_with_logits_grad +- backward_op : sigmoid_cross_entropy_with_logits_grad forward : sigmoid_cross_entropy_with_logits (Tensor x, Tensor label, bool normalize, int ignore_index) -> Tensor(out) args : (Tensor x, Tensor label, Tensor out_grad, bool normalize, int ignore_index) output : Tensor(x_grad) @@ -2107,7 +2054,7 @@ func : sigmoid_cross_entropy_with_logits_grad inplace : (out_grad -> x_grad) -- backward_api : sigmoid_double_grad +- backward_op : sigmoid_double_grad forward : sigmoid_grad (Tensor out, Tensor fwd_grad_out) -> Tensor(grad_x) args : (Tensor out, Tensor fwd_grad_out, Tensor grad_x_grad) output : Tensor(out_grad), Tensor(fwd_grad_out_grad) @@ -2119,7 +2066,7 @@ backward : sigmoid_triple_grad inplace : (grad_x_grad -> fwd_grad_out_grad) -- backward_api : sigmoid_grad +- backward_op : sigmoid_grad forward : sigmoid (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) @@ -2131,7 +2078,7 @@ backward : sigmoid_double_grad inplace : (out_grad -> x_grad) -- backward_api : sigmoid_triple_grad +- backward_op : sigmoid_triple_grad forward : sigmoid_double_grad (Tensor out, Tensor fwd_grad_out, Tensor grad_grad_x) -> Tensor(grad_out), Tensor(grad_grad_out) args : (Tensor out, Tensor fwd_grad_out, Tensor grad_grad_x, Tensor grad_out_grad, Tensor grad_grad_out_grad) output : Tensor(out_grad), Tensor(fwd_grad_out_grad), Tensor(grad_grad_x_grad) @@ -2143,7 +2090,13 @@ optional : grad_grad_out_grad inplace : (grad_grad_x -> fwd_grad_out_grad) -- backward_api : silu_grad +- backward_op : sign_grad + forward : sign (Tensor x) -> Tensor(out) + args : (Tensor out_grad) + output : Tensor(x_grad) + invoke : scale(out_grad, 0.0, 0.0, true) + +- backward_op : silu_grad forward : silu (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -2154,7 +2107,7 @@ func : silu_grad inplace : (out_grad -> x_grad) -- backward_api : sin_grad +- backward_op : sin_grad forward : sin (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -2165,7 +2118,7 @@ func : sin_grad inplace : (out_grad -> x_grad) -- backward_api : sinh_grad +- backward_op : sinh_grad forward : sinh (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -2176,17 +2129,13 @@ func : sinh_grad inplace : (out_grad -> x_grad) -- backward_api : slice_double_grad +- backward_op : slice_double_grad forward : slice_grad (Tensor input, Tensor grad_out, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(grad_input) args : (Tensor grad_input_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) output : Tensor(grad_out_grad) - 
infer_meta : - func : UnchangedInferMeta - param : [grad_input_grad] - kernel : - func : slice + invoke : slice(grad_input_grad, axes, starts, ends, infer_flags, decrease_axis) -- backward_api : slice_grad +- backward_op : slice_grad forward : slice (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(out) args : (Tensor input, Tensor out_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) output : Tensor(input_grad) @@ -2198,7 +2147,7 @@ backward : slice_double_grad no_need_buffer : input -- backward_api : slogdet_grad +- backward_op : slogdet_grad forward : slogdet (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out, Tensor out_grad) output : Tensor(x_grad) @@ -2208,7 +2157,7 @@ kernel : func : slogdeterminant_grad -- backward_api : soft_shrink_grad +- backward_op : soft_shrink_grad forward : soft_shrink (Tensor x, float lambda) -> Tensor(out) args : (Tensor x, Tensor out_grad, float lambda) output : Tensor(x_grad) @@ -2219,7 +2168,7 @@ func : soft_shrink_grad inplace : (out_grad -> x_grad) -- backward_api : softmax_grad +- backward_op : softmax_grad forward : softmax (Tensor x, int axis) -> Tensor(out) args : (Tensor out, Tensor out_grad, int axis) output : Tensor(x_grad) @@ -2230,7 +2179,7 @@ func : softmax_grad use_gpudnn : true -- backward_api : softplus_grad +- backward_op : softplus_grad forward : softplus (Tensor x, float beta, float threshold) -> Tensor(out) args : (Tensor x, Tensor out_grad, float beta, float threshold) output : Tensor(x_grad) @@ -2241,7 +2190,7 @@ func : softplus_grad inplace : (out_grad -> x_grad) -- backward_api : softsign_grad +- backward_op : softsign_grad forward : softsign (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -2252,7 +2201,7 @@ func : softsign_grad inplace : (out_grad -> x_grad) -- backward_api : spectral_norm_grad +- backward_op : spectral_norm_grad forward : spectral_norm (Tensor weight, Tensor u, Tensor v, int dim, int power_iters, float eps) -> Tensor(out) args : (Tensor weight, Tensor u, Tensor v, Tensor out_grad, int dim, int power_iters, float eps) output : Tensor(weight_grad) @@ -2262,20 +2211,20 @@ func : spectral_norm_grad data_type : out_grad -- backward_api : split_grad +- backward_op : split_grad forward : split (Tensor x, IntArray num_or_sections, Scalar axis) -> Tensor[](out) args : (Tensor[] out_grad, Scalar axis = -1) output : Tensor(x_grad) invoke : concat( out_grad, axis) -- backward_api : split_with_num_grad +- backward_op : split_with_num_grad forward : split_with_num (Tensor x, int num, Scalar axis) -> Tensor[](out) args : (Tensor[] out_grad, Scalar axis = -1) output : Tensor(x_grad) invoke : concat( out_grad, axis) # TODO(zhangyunfei) The config of double grad and triple grad will be supported in the future. 
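The hunks above and below follow two recurring patterns in legacy_backward.yaml: every entry key is renamed from backward_api to backward_op, and several composite backward entries drop their explicit infer_meta/kernel blocks in favor of an invoke that reuses an existing op (slice_double_grad invokes slice, tile_double_grad invokes tile, split_grad and split_with_num_grad invoke concat, and the newly added sign_grad invokes scale). Below is a minimal before/after sketch of one such entry, reconstructed from the slice_double_grad hunk above; the field values are taken from the diff, but the indentation and line layout are illustrative rather than copied from the repository.

    # before: entry registers its own kernel and infer_meta
    - backward_api : slice_double_grad
      forward : slice_grad (Tensor input, Tensor grad_out, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(grad_input)
      args : (Tensor grad_input_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis)
      output : Tensor(grad_out_grad)
      infer_meta :
        func : UnchangedInferMeta
        param : [grad_input_grad]
      kernel :
        func : slice

    # after: entry is composite and simply invokes the existing forward op
    - backward_op : slice_double_grad
      forward : slice_grad (Tensor input, Tensor grad_out, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) -> Tensor(grad_input)
      args : (Tensor grad_input_grad, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis)
      output : Tensor(grad_out_grad)
      invoke : slice(grad_input_grad, axes, starts, ends, infer_flags, decrease_axis)

The invoke form presumably lets the code generator emit a direct call to the existing op instead of registering a dedicated grad kernel, which is why the infer_meta/kernel pair becomes redundant in these entries.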
-- backward_api : sqrt_double_grad +- backward_op : sqrt_double_grad forward : sqrt_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) args : (Tensor out, Tensor grad_x, Tensor grad_x_grad) output : Tensor(out_grad), Tensor(grad_out_grad) @@ -2286,7 +2235,7 @@ func : sqrt_double_grad inplace : (grad_x_grad -> grad_out_grad) -- backward_api : sqrt_grad +- backward_op : sqrt_grad forward : sqrt (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) @@ -2298,7 +2247,7 @@ backward : sqrt_double_grad inplace : (out_grad -> x_grad) -- backward_api : square_double_grad +- backward_op : square_double_grad forward : square_grad (Tensor x, Tensor grad_out) -> Tensor(grad_x) args : (Tensor x, Tensor grad_out, Tensor grad_x_grad) output : Tensor(x_grad), Tensor(grad_out_grad) @@ -2309,7 +2258,7 @@ func : square_double_grad inplace : (grad_x_grad -> grad_out_grad) -- backward_api : square_grad +- backward_op : square_grad forward : square (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -2321,7 +2270,7 @@ backward : square_double_grad inplace : (out_grad -> x_grad) -- backward_api : squared_l2_norm_grad +- backward_op : squared_l2_norm_grad forward : squared_l2_norm(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -2331,13 +2280,13 @@ kernel : func : squared_l2_norm_grad -- backward_api : squeeze_double_grad +- backward_op : squeeze_double_grad forward : squeeze_grad(Tensor xshape, Tensor grad_out, IntArray axes) -> Tensor(grad_x) args : (Tensor grad_x_grad, IntArray axes) output : Tensor(grad_out_grad) invoke: squeeze(grad_x_grad, axes) -- backward_api : squeeze_grad +- backward_op : squeeze_grad forward : squeeze(Tensor x, IntArray axes) -> Tensor(out), Tensor(xshape) args : (Tensor xshape, Tensor out_grad, IntArray axes) output : Tensor(x_grad) @@ -2349,7 +2298,7 @@ inplace : (out_grad -> x_grad) backward: squeeze_double_grad -- backward_api : stack_grad +- backward_op : stack_grad forward : stack (Tensor[] x, int axis) -> Tensor(out) args : (Tensor[] x, Tensor out_grad, int axis) output : Tensor[](x_grad){x.size()} @@ -2361,7 +2310,7 @@ param : [out_grad, axis] no_need_buffer : x -- backward_api : strided_slice_grad +- backward_op : strided_slice_grad forward : strided_slice (Tensor x, int[] axes, IntArray starts, IntArray ends, IntArray strides) -> Tensor(out) args : (Tensor x, Tensor out_grad, int[] axes, IntArray starts, IntArray ends, IntArray strides) output : Tensor(x_grad) @@ -2372,7 +2321,7 @@ func : strided_slice_grad no_need_buffer : x -- backward_api : subtract_double_grad +- backward_op : subtract_double_grad forward : subtract_grad (Tensor x, Tensor y, Tensor grad_out, int axis = -1) -> Tensor(grad_x), Tensor(grad_y) args : (Tensor y, Tensor grad_out, Tensor grad_x_grad, Tensor grad_y_grad, int axis = -1) output : Tensor(grad_out_grad) @@ -2385,7 +2334,7 @@ no_need_buffer : y, grad_out inplace : (grad_x_grad -> grad_out_grad) -- backward_api : subtract_grad +- backward_op : subtract_grad forward : subtract (Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad, int axis = -1) output : Tensor(x_grad), Tensor(y_grad) @@ -2398,13 +2347,13 @@ backward : subtract_double_grad inplace : (out_grad -> x_grad) -- backward_api : sum_double_grad +- backward_op : sum_double_grad forward : sum_grad (Tensor x, Tensor grad_out, IntArray dims, bool keep_dim, bool reduce_all=false) -> Tensor(grad_x) args : (Tensor grad_x_grad, IntArray dims={}, bool keep_dim=false) 
output : Tensor(grad_out_grad) invoke : sum(grad_x_grad, dims, grad_x_grad.dtype(), keep_dim) -- backward_api : sum_grad +- backward_op : sum_grad forward : sum (Tensor x, IntArray dims={}, DataType out_dtype=DataType::UNDEFINED, bool keep_dim=false) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray dims, bool keep_dim, bool reduce_all=false) output : Tensor(x_grad) @@ -2416,7 +2365,7 @@ no_need_buffer : x backward : sum_double_grad -- backward_api : svd_grad +- backward_op : svd_grad forward : svd (Tensor x, bool full) -> Tensor(u), Tensor(s), Tensor(vh) args : (Tensor x, Tensor u, Tensor vh, Tensor s, Tensor u_grad, Tensor vh_grad, Tensor s_grad, bool full) output : Tensor(x_grad) @@ -2427,7 +2376,7 @@ func : svd_grad optional: u_grad, vh_grad, s_grad -- backward_api : swish_grad +- backward_op : swish_grad forward : swish (Tensor x, float beta=1.0) -> Tensor(out) args : (Tensor x, Tensor out_grad, float bete=1.0) output : Tensor(x_grad) @@ -2438,7 +2387,7 @@ func : swish_grad inplace : (out_grad -> x_grad) -- backward_api : sync_batch_norm_grad +- backward_op : sync_batch_norm_grad forward : sync_batch_norm_ (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) args : (Tensor x, Tensor scale, Tensor bias, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) @@ -2450,7 +2399,7 @@ data_type : out_grad optional : reserve_space -- backward_api : take_along_axis_grad +- backward_op : take_along_axis_grad forward : take_along_axis (Tensor x, Tensor index, int axis) -> Tensor(out) args : (Tensor x, Tensor index, Tensor out_grad, int axis) output : Tensor(x_grad) @@ -2460,7 +2409,7 @@ kernel : func : take_along_axis_grad -- backward_api : tan_grad +- backward_op : tan_grad forward : tan (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -2471,7 +2420,7 @@ func : tan_grad inplace : (out_grad -> x_grad) -- backward_api : tanh_double_grad +- backward_op : tanh_double_grad forward : tanh_grad (Tensor out, Tensor grad_out) -> Tensor(grad_x) args : (Tensor out, Tensor grad_out, Tensor grad_x_grad) output : Tensor(out_grad), Tensor(grad_out_grad) @@ -2483,7 +2432,7 @@ backward : tanh_triple_grad inplace : (grad_x_grad -> grad_out_grad) -- backward_api : tanh_grad +- backward_op : tanh_grad forward : tanh (Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) @@ -2495,7 +2444,7 @@ backward : tanh_double_grad inplace : (out_grad -> x_grad) -- backward_api : tanh_shrink_grad +- backward_op : tanh_shrink_grad forward : tanh_shrink (Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) @@ -2506,7 +2455,7 @@ func : tanh_shrink_grad inplace : (out_grad -> x_grad) -- backward_api : tanh_triple_grad +- backward_op : tanh_triple_grad forward : tanh_double_grad (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward) -> Tensor(grad_out_new), Tensor(grad_out_grad) args : (Tensor out, Tensor grad_out_forward, Tensor grad_x_grad_forward, Tensor grad_out_new_grad, Tensor grad_out_grad_grad) output : Tensor(out_grad), 
Tensor(grad_out_forward_grad), Tensor(grad_x_grad_forward_grad) @@ -2517,7 +2466,7 @@ func : tanh_triple_grad inplace : (grad_x_grad_forward -> grad_out_forward_grad) -- backward_api : temporal_shift_grad +- backward_op : temporal_shift_grad forward : temporal_shift(Tensor x, int seg_num, float shift_ratio, str data_format_str) -> Tensor(out) args : (Tensor out_grad, int seg_num, float shift_ratio, str data_format_str) output : Tensor(x_grad) @@ -2527,7 +2476,7 @@ kernel : func : temporal_shift_grad -- backward_api : thresholded_relu_grad +- backward_op : thresholded_relu_grad forward : thresholded_relu (Tensor x, float threshold) -> Tensor(out) args : (Tensor x, Tensor out_grad, float threshold) output : Tensor(x_grad) @@ -2538,16 +2487,13 @@ func : thresholded_relu_grad inplace : (out_grad -> x_grad) -- backward_api : tile_double_grad +- backward_op : tile_double_grad forward : tile_grad (Tensor x, Tensor grad_out, IntArray repeat_times) -> Tensor(grad_x) args : (Tensor grad_x_grad, IntArray repeat_times) output : Tensor(grad_out_grad) - infer_meta : - func : TileInferMeta - kernel : - func : tile + invoke : tile(grad_x_grad, repeat_times) -- backward_api : tile_grad +- backward_op : tile_grad forward : tile (Tensor x, IntArray repeat_times) -> Tensor(out) args : (Tensor x, Tensor out_grad, IntArray repeat_times) output : Tensor(x_grad) @@ -2559,7 +2505,7 @@ no_need_buffer : x backward : tile_double_grad -- backward_api : top_k_grad +- backward_op : top_k_grad forward : top_k (Tensor x, Scalar k, int axis = -1, bool largest = true, bool sorted = true) -> Tensor(out), Tensor(indices) args : (Tensor x, Tensor indices, Tensor out_grad, Scalar k = -1, int axis = -1, bool largest = true, bool sorted = true) output : Tensor(x_grad) @@ -2569,13 +2515,13 @@ kernel : func : top_k_grad -- backward_api : transpose_double_grad +- backward_op : transpose_double_grad forward : transpose_grad (Tensor grad_out, int[] axis) -> Tensor(grad_x) args : (Tensor grad_x_grad, int[] axis) output : Tensor(grad_out_grad) invoke : transpose(grad_x_grad, axis) -- backward_api : transpose_grad +- backward_op : transpose_grad forward : transpose (Tensor x, int[] axis) -> Tensor(out) args : (Tensor out_grad, int[] axis) output : Tensor(x_grad) @@ -2586,7 +2532,7 @@ func : transpose_grad backward : transpose_double_grad -- backward_api : triangular_solve_grad +- backward_op : triangular_solve_grad forward : triangular_solve (Tensor x, Tensor y, bool upper, bool tranpose, bool unitriangular) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out, Tensor out_grad, bool upper, bool tranpose, bool unitriangular) output : Tensor(x_grad), Tensor(y_grad) @@ -2596,7 +2542,7 @@ kernel : func : triangular_solve_grad -- backward_api : tril_triu_grad +- backward_op : tril_triu_grad forward : tril_triu(Tensor x, int diagonal, bool lower) -> Tensor(out) args : (Tensor out_grad, int diagonal, bool lower) output : Tensor(x_grad) @@ -2606,7 +2552,7 @@ kernel : func : tril_triu_grad -- backward_api : trilinear_interp_grad +- backward_op : trilinear_interp_grad forward : trilinear_interp (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) -> Tensor(output) args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, Tensor output_grad, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) output : Tensor(x_grad) @@ 
-2618,13 +2564,13 @@ func : trilinear_interp_grad data_type : output_grad -- backward_api : unbind_grad +- backward_op : unbind_grad forward : unbind (Tensor input, int axis) -> Tensor[](out) args : (Tensor[] out_grad, int axis) output : Tensor(input_grad) invoke : stack(out_grad, axis) -- backward_api : unfold_grad +- backward_op : unfold_grad forward : unfold (Tensor x, int[] kernel_sizes, int[] strides, int[] paddings, int[] dilations) -> Tensor(out) args : (Tensor x, Tensor out_grad, int[] kernel_sizes, int[] strides, int[] paddings, int[] dilations) output : Tensor(x_grad) @@ -2635,7 +2581,7 @@ func : unfold_grad no_need_buffer : x -- backward_api : uniform_random_inplace_grad +- backward_op : uniform_random_inplace_grad forward : uniform_random_inplace(Tensor x, float min, float max, int seed, int diag_num, int diag_step, float diag_val) -> Tensor(out) args : (Tensor out_grad, float min, float max, int seed, int diag_num, int diag_step, float diag_val) output : Tensor(x_grad) @@ -2645,13 +2591,13 @@ func : uniform_random_inplace_grad inplace : (out_grad -> x_grad) -- backward_api : unsqueeze_double_grad +- backward_op : unsqueeze_double_grad forward : unsqueeze_grad(Tensor xshape, Tensor grad_out, IntArray axes) -> Tensor(grad_x) args : (Tensor grad_x_grad, IntArray axes) output : Tensor(grad_out_grad) invoke : unsqueeze(grad_x_grad, axes) -- backward_api : unsqueeze_grad +- backward_op : unsqueeze_grad forward : unsqueeze(Tensor x, IntArray axes) -> Tensor(out), Tensor(xshape) args : (Tensor xshape, Tensor out_grad, IntArray axes) output : Tensor(x_grad) @@ -2664,7 +2610,7 @@ inplace : (out_grad -> x_grad) backward : unsqueeze_double_grad -- backward_api : unstack_grad +- backward_op : unstack_grad forward : unstack (Tensor x, int axis, int num) -> Tensor[](out) args : (Tensor[] out_grad, int axis) output : Tensor(x_grad) @@ -2674,7 +2620,7 @@ kernel : func : unstack_grad -- backward_api : warpctc_grad +- backward_op : warpctc_grad forward : warpctc (Tensor logits, Tensor label, Tensor logits_length, Tensor labels_length, int blank, bool norm_by_times) -> Tensor(loss), Tensor(warpctcgrad) args : (Tensor logits, Tensor logits_length, Tensor warpctcgrad, Tensor loss_grad, int blank, bool norm_by_times) output : Tensor(logits_grad) @@ -2686,7 +2632,7 @@ optional : logits_length no_need_buffer : logits -- backward_api : where_grad +- backward_op : where_grad forward : where (Tensor condition, Tensor x, Tensor y) -> Tensor(out) args : (Tensor condition, Tensor x, Tensor y, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) @@ -2697,7 +2643,7 @@ func : where_grad no_need_buffer : x, y -- backward_api : yolov3_loss_grad +- backward_op : yolov3_loss_grad forward : yolov3_loss(Tensor x, Tensor gt_box, Tensor gt_label, Tensor gt_score, int[] anchors, int[] anchor_mask, int class_num, float ignore_thresh, int downsample_ratio, bool use_label_smooth=true, float scale_x_y=1.0) -> Tensor(loss), Tensor(objectness_mask), Tensor(gt_match_mask) args : (Tensor x, Tensor gt_box, Tensor gt_label, Tensor gt_score, Tensor objectness_mask, Tensor gt_match_mask, Tensor loss_grad, int[] anchors, int[] anchor_mask, int class_num, float ignore_thresh, int downsample_ratio, bool use_label_smooth=true, float scale_x_y=1.0) output : Tensor(x_grad), Tensor(gt_box_grad), Tensor(gt_label_grad), Tensor(gt_score_grad) @@ -2707,7 +2653,7 @@ func : yolov3_loss_grad optional : gt_score -- backward_api: fold_grad +- backward_op: fold_grad forward: fold (Tensor x, int[] output_sizes, int[] kernel_sizes, int[] 
strides, int[] paddings, int[] dilations) -> Tensor(out) args: (Tensor x, Tensor out_grad, int[] output_sizes, int[] kernel_sizes, int[] strides, int[] paddings, int[] dilations) output: Tensor(x_grad) @@ -2718,7 +2664,7 @@ func: fold_grad no_need_buffer : x -- backward_api: unpool3d_grad +- backward_op: unpool3d_grad forward: unpool3d (Tensor x, Tensor indices, int[] ksize, int[] strides, int[] padding, int[] output_size, str data_format) -> Tensor(out) args: (Tensor x, Tensor indices, Tensor out, Tensor out_grad, int[] ksize, int[] strides, int[] padding, int[] output_size, str data_format) output: Tensor(x_grad) @@ -2729,7 +2675,7 @@ func: unpool3d_grad data_type: x -- backward_api: unpool_grad +- backward_op: unpool_grad forward: unpool (Tensor x, Tensor indices, int[] ksize, int[] strides, int[] padding, IntArray output_size, str data_format) -> Tensor(out) args: (Tensor x, Tensor indices, Tensor out, Tensor out_grad, int[] ksize, int[] strides, int[] padding, IntArray output_size, str data_format) output: Tensor(x_grad) diff --git a/paddle/phi/api/yaml/legacy_api.yaml b/paddle/phi/api/yaml/legacy_ops.yaml similarity index 92% rename from paddle/phi/api/yaml/legacy_api.yaml rename to paddle/phi/api/yaml/legacy_ops.yaml index d48e66485864e5..7159a0350cb830 100755 --- a/paddle/phi/api/yaml/legacy_api.yaml +++ b/paddle/phi/api/yaml/legacy_ops.yaml @@ -1,7 +1,7 @@ # The apis in this file are unstandardized that may caused by a variety of reasons, -# we are trying to fix these apis and will move standardized apis into api.yaml. +# we are trying to fix these apis and will move standardized apis into ops.yaml. -- api : abs +- op : abs args : (Tensor x) output : Tensor infer_meta : @@ -10,7 +10,7 @@ func : abs backward : abs_grad -- api : accuracy +- op : accuracy args : (Tensor x, Tensor indices, Tensor label) output : Tensor(accuracy), Tensor(correct), Tensor(total) infer_meta : @@ -19,7 +19,7 @@ func : accuracy dtype : x -- api : acos +- op : acos args : (Tensor x) output : Tensor infer_meta : @@ -28,7 +28,7 @@ func : acos backward : acos_grad -- api : acosh +- op : acosh args : (Tensor x) output : Tensor infer_meta : @@ -37,7 +37,7 @@ func : acosh backward : acosh_grad -- api : adadelta_ +- op : adadelta_ args : (Tensor param, Tensor grad, Tensor avg_squared_grad, Tensor avg_squared_update, float rho, float epsilon) output : Tensor(param_out), Tensor(moment_out), Tensor(inf_norm_out) infer_meta : @@ -46,7 +46,7 @@ func : adadelta inplace : (param -> param_out), (avg_squared_grad -> moment_out), (avg_squared_update -> inf_norm_out) -- api : adagrad_ +- op : adagrad_ args : (Tensor param, Tensor grad, Tensor moment, Tensor learning_rate, float epsilon) output : Tensor(param_out), Tensor(moment_out) infer_meta : @@ -57,7 +57,7 @@ data_type : param inplace : (param -> param_out), (moment -> moment_out) -- api : adam_ +- op : adam_ args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow) output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs) infer_meta : @@ -69,7 +69,7 @@ optional : master_param, skip_update inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), 
(master_param -> master_param_outs) -- api : adamax_ +- op : adamax_ args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment, Tensor inf_norm, Tensor beta1_pow, float beta1, float beta2, float epsilon) output : Tensor(param_out), Tensor(avg_squared_grad_out), Tensor(avg_squared_update_out) infer_meta : @@ -78,7 +78,7 @@ func : adamax inplace : (param -> param_out), (moment -> avg_squared_grad_out), (inf_norm -> avg_squared_update_out) -- api : adamw_ +- op : adamw_ args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, Scalar beta1, Scalar beta2, Scalar epsilon, float lr_ratio, float coeff, bool with_decay, bool lazy_mode, int64_t min_row_size_to_use_multithread, bool multi_precision, bool use_global_beta_pow) output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs) infer_meta : @@ -89,7 +89,7 @@ optional : master_param, skip_update inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_outs) -- api : add +- op : add args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : @@ -99,16 +99,13 @@ inplace : (x -> out) backward : add_grad -- api : add_n +- op : add_n args : (Tensor[] x) output : Tensor - infer_meta : - func : AddNInferMeta - kernel : - func : add_n + invoke : add_n_impl(x) backward : add_n_grad -- api : addmm +- op : addmm args : (Tensor input, Tensor x, Tensor y, float alpha, float beta) output : Tensor infer_meta : @@ -117,7 +114,7 @@ func : addmm backward : addmm_grad -- api : affine_grid +- op : affine_grid args : (Tensor input, IntArray outputShape, bool use_cudnn=true, bool align_corners=true) output : Tensor infer_meta : @@ -130,7 +127,7 @@ use_gpudnn: use_cudnn backward : affine_grid_grad -- api : all +- op : all args : (Tensor x, int64_t[] dims={}, bool keep_dim=false) output : Tensor(out) infer_meta : @@ -138,7 +135,7 @@ kernel : func : all -- api : allclose +- op : allclose args : (Tensor x, Tensor y, Scalar rtol, Scalar atol, bool equal_nan) output : Tensor(out) infer_meta : @@ -147,7 +144,7 @@ kernel : func : allclose -- api : amax +- op : amax args : (Tensor x, int64_t[] dims={}, bool keep_dim=false) output : Tensor(out) infer_meta : @@ -156,7 +153,7 @@ func : amax backward : amax_grad -- api : amin +- op : amin args : (Tensor x, int64_t[] dims={}, bool keep_dim=false) output : Tensor(out) infer_meta : @@ -165,7 +162,7 @@ func : amin backward : amin_grad -- api : angle +- op : angle args : (Tensor x) output : Tensor infer_meta : @@ -174,7 +171,7 @@ func : angle backward : angle_grad -- api : any +- op : any args : (Tensor x, int64_t[] dims={}, bool keep_dim=false) output : Tensor(out) infer_meta : @@ -182,7 +179,7 @@ kernel : func : any -- api : arange +- op : arange args : (Tensor start, Tensor end, Tensor step, DataType dtype, Place place={}) output : Tensor(out) infer_meta : @@ -196,7 +193,7 @@ data_transform : support_trans_dtype : start, end, step -- api : argmax +- op : argmax args : (Tensor x, Scalar axis, bool keepdims, bool flatten, int dtype) output : Tensor(out) infer_meta : @@ -204,7 +201,7 @@ kernel : func : arg_max -- api : argmin +- op : argmin args : (Tensor x, Scalar axis, bool keepdims, bool flatten, int dtype) output : Tensor(out) infer_meta : @@ -212,7 +209,7 @@ kernel : func : arg_min -- api : argsort +- op : 
argsort args : (Tensor x, int axis=-1, bool descending=false) output : Tensor(out), Tensor(indices) infer_meta : @@ -221,7 +218,7 @@ func : argsort backward : argsort_grad -- api : as_complex +- op : as_complex args : (Tensor x) output : Tensor infer_meta : @@ -230,7 +227,7 @@ func : as_complex backward : as_complex_grad -- api : as_real +- op : as_real args : (Tensor x) output : Tensor infer_meta : @@ -239,7 +236,7 @@ func : as_real backward : as_real_grad -- api : asin +- op : asin args : (Tensor x) output : Tensor infer_meta : @@ -248,7 +245,7 @@ func : asin backward : asin_grad -- api : asinh +- op : asinh args : (Tensor x) output : Tensor infer_meta : @@ -257,7 +254,7 @@ func : asinh backward : asinh_grad -- api : assign +- op : assign args : (Tensor x) output : Tensor infer_meta : @@ -266,7 +263,7 @@ func : assign backward : assign_grad -- api : assign_out_ +- op : assign_out_ args : (Tensor x, Tensor output) output : Tensor(out) infer_meta : @@ -278,7 +275,7 @@ inplace : (output -> out) backward : assign_out__grad -- api : assign_value_ +- op : assign_value_ args : (Tensor output, int[] shape, DataType dtype, Scalar[] values, Place place = {}) output : Tensor(out) inplace: (output -> out) @@ -291,7 +288,7 @@ data_type : dtype backend : place > output -- api : atan +- op : atan args : (Tensor x) output : Tensor infer_meta : @@ -300,7 +297,7 @@ func : atan backward : atan_grad -- api : atanh +- op : atanh args : (Tensor x) output : Tensor infer_meta : @@ -309,7 +306,7 @@ func : atanh backward : atanh_grad -- api : auc +- op : auc args : (Tensor x, Tensor label, Tensor stat_pos, Tensor stat_neg, Tensor ins_tag_weight, str curve, int num_thresholds, int slide_steps) output : Tensor(auc), Tensor(stat_pos_out), Tensor(stat_neg_out) infer_meta : @@ -318,7 +315,7 @@ func : auc optional : ins_tag_weight -- api : average_accumulates_ +- op : average_accumulates_ args : (Tensor param, Tensor in_sum_1, Tensor in_sum_2, Tensor in_sum_3, Tensor in_num_accumulates, Tensor in_old_num_accumulates, Tensor in_num_updates, float average_window, int64_t max_average_window, int64_t min_average_window) output : Tensor(out_sum_1), Tensor(out_sum_2), Tensor(out_sum_3), Tensor(out_num_accumulates), Tensor(out_old_num_accumulates), Tensor(out_num_updates) infer_meta: @@ -328,13 +325,13 @@ data_type : param inplace : (in_sum_1 -> out_sum_1), (in_sum_2 -> out_sum_2), (in_sum_3 -> out_sum_3), (in_num_accumulates -> out_num_accumulates), (in_old_num_accumulates -> out_old_num_accumulates), (in_num_updates -> out_num_updates) -- api : batch_norm +- op : batch_norm args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) invoke : batch_norm_impl(x, scale, bias, mean, variance, momentum, epsilon, data_layout, is_test, use_global_stats, trainable_statistics, fuse_with_relu) backward : batch_norm_grad -- api : bce_loss +- op : bce_loss args : (Tensor input, Tensor label) output : Tensor infer_meta : @@ -343,7 +340,7 @@ func : bce_loss backward : bce_loss_grad -- api : bicubic_interp +- op : bicubic_interp args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) output : Tensor(output) infer_meta : @@ 
-354,7 +351,7 @@ data_type : x backward : bicubic_interp_grad -- api : bilinear_interp +- op : bilinear_interp args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) output : Tensor(output) infer_meta : @@ -365,7 +362,7 @@ data_type : x backward : bilinear_interp_grad -- api : bilinear_tensor_product +- op : bilinear_tensor_product args : (Tensor x, Tensor y, Tensor weight, Tensor bias) output : Tensor infer_meta : @@ -375,7 +372,7 @@ optional : bias backward : bilinear_tensor_product_grad -- api : bitwise_and +- op : bitwise_and args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : @@ -383,7 +380,7 @@ kernel : func : bitwise_and -- api : bitwise_not +- op : bitwise_not args : (Tensor x) output : Tensor(out) infer_meta : @@ -391,7 +388,7 @@ kernel : func : bitwise_not -- api : bitwise_or +- op : bitwise_or args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : @@ -399,7 +396,7 @@ kernel : func : bitwise_or -- api : bitwise_xor +- op : bitwise_xor args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : @@ -407,7 +404,7 @@ kernel : func : bitwise_xor -- api : bmm +- op : bmm args : (Tensor x, Tensor y) output : Tensor infer_meta : @@ -416,7 +413,7 @@ func : bmm backward : bmm_grad -- api : box_coder +- op : box_coder args : (Tensor prior_box, Tensor prior_box_var, Tensor target_box, str code_type, bool box_normalized, int axis, float[] variance) output : Tensor(output_box) infer_meta : @@ -425,7 +422,7 @@ func : box_coder optional : prior_box_var -- api : brelu +- op : brelu args : (Tensor x, float t_min, float t_max) output : Tensor infer_meta : @@ -435,7 +432,7 @@ func : brelu backward : brelu_grad -- api : cast +- op : cast args : (Tensor x, DataType out_dtype) output : Tensor infer_meta : @@ -446,7 +443,7 @@ data_type : x backward : cast_grad -- api : ceil +- op : ceil args : (Tensor x) output : Tensor(out) infer_meta : @@ -456,7 +453,7 @@ inplace : (x -> out) backward : ceil_grad -- api : celu +- op : celu args : (Tensor x, float alpha) output : Tensor(out) infer_meta : @@ -466,7 +463,7 @@ func : celu backward : celu_grad -- api : class_center_sample +- op : class_center_sample args : (Tensor label, int num_classes, int num_samples, int ring_id, int rank, int nranks, bool fix_seed, int seed) output : Tensor(remapped_label), Tensor(sampled_local_class_center) infer_meta : @@ -474,7 +471,7 @@ kernel : func : class_center_sample -- api : clip +- op : clip args : (Tensor x, Scalar(float) min, Scalar(float) max) output : Tensor(out) inplace : (x -> out) @@ -485,7 +482,7 @@ func : clip backward : clip_grad -- api : clip_by_norm +- op : clip_by_norm args : (Tensor x, float max_norm) output : Tensor(out) infer_meta : @@ -493,7 +490,7 @@ kernel : func : clip_by_norm -- api : coalesce_tensor +- op : coalesce_tensor args : (Tensor[] input, DataType dtype, bool copy_data = false, bool set_constant = false, bool persist_output = false, float constant = 0.0, bool use_align = true, int align_size = -1, int size_of_dtype = -1, int64_t[] concated_shapes = {}, int64_t[] concated_ranks = {}) output : Tensor[](output){input.size()}, Tensor(fused_output) infer_meta : @@ -502,7 +499,7 @@ func : coalesce_tensor data_type : dtype -- api : complex +- op : complex args : (Tensor x, Tensor y) output : Tensor infer_meta : @@ -511,7 +508,7 @@ func : complex backward : complex_grad -- api : concat +- op : concat args : (Tensor[] x, Scalar(int64_t) axis) 
output : Tensor infer_meta : @@ -521,7 +518,7 @@ func : concat backward : concat_grad -- api : conj +- op : conj args : (Tensor x) output : Tensor infer_meta : @@ -530,7 +527,7 @@ func : conj backward : conj_grad -- api : conv2d +- op : conv2d args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor infer_meta : @@ -540,7 +537,7 @@ use_gpudnn : true backward : conv2d_grad -- api : conv2d_transpose +- op : conv2d_transpose args : (Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(out) infer_meta : @@ -550,7 +547,7 @@ use_gpudnn : true backward : conv2d_transpose_grad -- api : conv3d +- op : conv3d args : (Tensor input, Tensor filter, int[] strides, int[] paddings, str paddding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search) output : Tensor infer_meta : @@ -560,7 +557,7 @@ use_gpudnn : true backward : conv3d_grad -- api : conv3d_transpose +- op : conv3d_transpose args : (Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, int[] output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(out) infer_meta : @@ -570,12 +567,12 @@ use_gpudnn : true backward : conv3d_transpose_grad -- api : copy_to +- op : copy_to args : (Tensor x, Place place, bool blocking) output : Tensor(out) invoke : copy_to_impl(x, place, blocking) -- api : cos +- op : cos args : (Tensor x) output : Tensor infer_meta : @@ -584,7 +581,7 @@ func : cos backward : cos_grad -- api : cosh +- op : cosh args : (Tensor x) output : Tensor infer_meta : @@ -593,7 +590,7 @@ func : cosh backward : cosh_grad -- api : crop_tensor +- op : crop_tensor args : (Tensor x, IntArray shape, IntArray offsets) output : Tensor(out) infer_meta : @@ -604,7 +601,7 @@ backward : crop_tensor_grad # Part of python API paddle.nn.functional.cross_entropy -- api : cross_entropy_with_softmax +- op : cross_entropy_with_softmax args : (Tensor input, Tensor label, bool soft_label, bool use_softmax, bool numeric_stable_mode, int ignore_index, int axis) output : Tensor(softmax), Tensor(loss) infer_meta : @@ -614,7 +611,7 @@ data_type : input backward : cross_entropy_with_softmax_grad -- api : cumprod +- op : cumprod args : (Tensor x, int dim) output : Tensor(out) infer_meta : @@ -624,7 +621,7 @@ func : cumprod backward : cumprod_grad -- api : cumsum +- op : cumsum args : (Tensor x, Scalar axis, bool flatten, bool exclusive, bool reverse) output : Tensor(out) infer_meta : @@ -633,7 +630,7 @@ func : cumsum backward : cumsum_grad -- api : decode_jpeg +- op : decode_jpeg args : (Tensor x, str mode) output : Tensor(out) infer_meta : @@ -641,7 +638,7 @@ kernel : func : decode_jpeg -- api : deformable_conv +- op : deformable_conv args : (Tensor x, Tensor offset, Tensor filter, Tensor mask, int[] strides, int[] paddings, int[] dilations, int deformable_groups, int groups, int im2col_step) output : Tensor(out) infer_meta : @@ -652,7 +649,7 @@ optional : mask backward : deformable_conv_grad -- api : depthwise_conv2d +- op : depthwise_conv2d args : (Tensor x, Tensor filter, int[] strides, int[] paddings, str padding_algorithm, int groups, int[] dilations, str data_format, bool use_addto, int workspace_size_MB, bool exhaustive_search, bool fuse_relu, bool 
use_gpudnn) output : Tensor(out) infer_meta : @@ -664,7 +661,7 @@ use_gpudnn : use_gpudnn backward : depthwise_conv2d_grad -- api : depthwise_conv2d_transpose +- op : depthwise_conv2d_transpose args : (Tensor x, Tensor filter, int[] strides, int[] paddings, int[] output_padding, IntArray output_size, str padding_algorithm, int groups, int[] dilations, str data_format) output : Tensor(out) infer_meta : @@ -673,7 +670,7 @@ func : depthwise_conv2d_transpose backward : depthwise_conv2d_transpose_grad -- api : det +- op : det args : (Tensor x) output : Tensor infer_meta : @@ -682,7 +679,7 @@ func : determinant backward : det_grad -- api : diag_embed +- op : diag_embed args : (Tensor x, int offset, int dim1, int dim2) output : Tensor(out) infer_meta : @@ -690,7 +687,7 @@ kernel : func : diag_embed -- api : distribute_fpn_proposals +- op : distribute_fpn_proposals args : (Tensor fpn_rois, Tensor rois_num, int min_level, int max_level, int refer_level, int refer_scale, bool pixel_offset) output : Tensor[](multi_fpn_rois){max_level - min_level + 1}, Tensor[](multi_level_rois_num){max_level - min_level + 1}, Tensor(restore_index) infer_meta : @@ -700,7 +697,7 @@ data_type : fpn_rois optional : rois_num -- api : divide +- op : divide args : (Tensor x, Tensor y) output : Tensor infer_meta : @@ -709,7 +706,7 @@ func : divide backward : divide_grad -- api : dropout +- op : dropout args : (Tensor x, Tensor seed_tensor, Scalar p, bool is_test, str mode, int seed, bool fix_seed) output : Tensor(out), Tensor(mask) infer_meta : @@ -720,7 +717,7 @@ optional : seed_tensor backward : dropout_grad -- api : edit_distance +- op : edit_distance args : (Tensor hyps, Tensor refs, Tensor hypslength, Tensor refslength, bool normalized = false) output : Tensor(sequencenum), Tensor(out) infer_meta : @@ -730,7 +727,7 @@ data_type: DataType::FLOAT32 optional : hypslength, refslength -- api : eigh +- op : eigh args : (Tensor x, str uplo) output : Tensor(out_w), Tensor(out_v) infer_meta : @@ -739,7 +736,7 @@ func : eigh backward : eigh_grad -- api : eigvals +- op : eigvals args : (Tensor x) output : Tensor(out) infer_meta : @@ -747,7 +744,7 @@ kernel : func : eigvals -- api : eigvalsh +- op : eigvalsh args : (Tensor x, str uplo, bool is_test) output : Tensor(eigenvalues), Tensor(eigenvectors) infer_meta : @@ -756,7 +753,7 @@ func : eigvalsh backward : eigvalsh_grad -- api : einsum +- op : einsum args : (Tensor[] x, str equation) output : Tensor, Tensor[]{x.size()}, Tensor[]{x.size()} infer_meta : @@ -766,7 +763,7 @@ func : einsum_raw backward : einsum_grad -- api : elementwise_pow +- op : elementwise_pow args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : @@ -775,7 +772,7 @@ func : elementwise_pow backward : elementwise_pow_grad -- api : elu +- op : elu args : (Tensor x, float alpha) output : Tensor(out) infer_meta : @@ -786,13 +783,13 @@ inplace : (x -> out) backward : elu_grad -- api : embedding +- op : embedding args : (Tensor x, Tensor weight, int64_t padding_idx=-1, bool sparse=false) output : Tensor invoke : embedding_impl(x, weight, padding_idx, sparse) backward : embedding_grad -- api : empty +- op : empty args : (IntArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) output: Tensor(out) infer_meta : @@ -804,7 +801,7 @@ data_type : dtype backend : place -- api : empty_like +- op : empty_like args : (Tensor x, DataType dtype = DataType::UNDEFINED, Place place = {}) output: Tensor(out) infer_meta : @@ -816,7 +813,7 @@ data_type : dtype > x backend : place > x -- api : equal +- op : equal 
args : (Tensor x, Tensor y, int axis = -1) output : Tensor(out) infer_meta : @@ -824,7 +821,7 @@ kernel : func : equal -- api : equal_all +- op : equal_all args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : @@ -832,7 +829,7 @@ kernel : func : equal_all -- api : exp +- op : exp args : (Tensor x) output : Tensor(out) infer_meta : @@ -842,7 +839,7 @@ inplace : (x -> out) backward : exp_grad -- api : expand +- op : expand args : (Tensor x, IntArray shape) output : Tensor infer_meta : @@ -851,7 +848,7 @@ func : expand backward : expand_grad -- api : expand_as +- op : expand_as args : (Tensor x, Tensor y, int[] target_shape) output : Tensor infer_meta : @@ -861,7 +858,7 @@ optional : y backward : expand_as_grad -- api : expm1 +- op : expm1 args : (Tensor x) output : Tensor infer_meta : @@ -871,7 +868,7 @@ func : expm1 backward : expm1_grad -- api : exponential_ +- op : exponential_ args : (Tensor x, float lambda) output : Tensor(out) infer_meta : @@ -882,7 +879,7 @@ inplace : (x -> out) backward : exponential__grad -- api : eye +- op : eye args : (Scalar num_rows, Scalar num_columns, DataType dtype=DataType::FLOAT32, Place place={}) output : Tensor(out) infer_meta : @@ -894,7 +891,7 @@ data_type : dtype backend : place -- api : fill +- op : fill args : (Tensor x, Scalar value) output : Tensor(out) infer_meta : @@ -905,7 +902,7 @@ inplace : (x -> out) backward: fill_grad -- api : fill_diagonal +- op : fill_diagonal args : (Tensor x, float value, int offset, bool wrap) output : Tensor(out) infer_meta : @@ -915,7 +912,7 @@ inplace : (x -> out) backward : fill_diagonal_grad -- api : fill_diagonal_tensor +- op : fill_diagonal_tensor args : (Tensor x, Tensor y, int64_t offset, int dim1, int dim2) output : Tensor(out) infer_meta : @@ -925,7 +922,7 @@ inplace : (x -> out) backward : fill_diagonal_tensor_grad -- api : flatten +- op : flatten args : (Tensor x, int start_axis, int stop_axis) output : Tensor(out), Tensor(xshape) infer_meta : @@ -938,16 +935,7 @@ intermediate : xshape backward : flatten_grad -- api : flip - args : (Tensor x, int[] axis) - output : Tensor - infer_meta : - func : FlipInferMeta - kernel : - func : flip - backward : flip_grad - -- api : floor +- op : floor args : (Tensor x) output : Tensor(out) infer_meta : @@ -957,7 +945,7 @@ inplace : (x -> out) backward : floor_grad -- api : floor_divide +- op : floor_divide args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : @@ -965,7 +953,7 @@ kernel : func : floor_divide -- api : fmax +- op : fmax args : (Tensor x, Tensor y, int axis) output : Tensor(out) infer_meta : @@ -975,7 +963,7 @@ func : fmax backward : fmax_grad -- api : fmin +- op : fmin args : (Tensor x, Tensor y, int axis) output : Tensor(out) infer_meta : @@ -985,7 +973,7 @@ func : fmin backward : fmin_grad -- api : frame +- op : frame args : (Tensor x, int frame_length, int hop_length, int axis) output : Tensor(out) infer_meta : @@ -994,7 +982,7 @@ func : frame backward : frame_grad -- api : frobenius_norm +- op : frobenius_norm args : (Tensor x, int64_t[] axis, bool keep_dim, bool reduce_all) output : Tensor(out) infer_meta : @@ -1003,7 +991,7 @@ func : frobenius_norm backward : frobenius_norm_grad -- api : full +- op : full args : (IntArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) output: Tensor(out) infer_meta : @@ -1015,7 +1003,7 @@ data_type : dtype backend : place -- api : full_ +- op : full_ args : (Tensor output, IntArray shape, Scalar value, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) output : 
Tensor(out) inplace : (output -> out) @@ -1028,7 +1016,7 @@ data_type : dtype backend : place -- api : full_batch_size_like +- op : full_batch_size_like args : (Tensor input, int[] shape, DataType dtype, Scalar value, int input_dim_idx, int output_dim_idx, Place place=CPUPlace()) output: Tensor(out) infer_meta : @@ -1040,7 +1028,7 @@ data_type : dtype backend : place -- api : full_like +- op : full_like args : (Tensor x, Scalar value, DataType dtype = DataType::UNDEFINED, Place place = {}) output: Tensor(out) infer_meta : @@ -1054,7 +1042,7 @@ data_transform : skip_transform : x -- api : gather +- op : gather args : (Tensor x, Tensor index, Scalar(int) axis=0) output : Tensor(out) infer_meta : @@ -1064,7 +1052,7 @@ data_type: x backward : gather_grad -- api : gather_nd +- op : gather_nd args : (Tensor x, Tensor index) output : Tensor infer_meta : @@ -1074,7 +1062,7 @@ data_type : x backward : gather_nd_grad -- api : gather_tree +- op : gather_tree args : (Tensor ids, Tensor parents) output : Tensor(out) infer_meta : @@ -1082,7 +1070,7 @@ kernel : func : gather_tree -- api : gaussian_random +- op : gaussian_random args : (IntArray shape, float mean, float std, int seed, DataType dtype, Place place={}) output: Tensor(out) infer_meta : @@ -1094,7 +1082,7 @@ data_type : dtype backend : place -- api : gelu +- op : gelu args : (Tensor x, bool approximate) output : Tensor(out) infer_meta : @@ -1104,7 +1092,7 @@ func : gelu backward : gelu_grad -- api : generate_proposals_v2 +- op : generate_proposals_v2 args : (Tensor scores, Tensor bbox_deltas, Tensor im_shape, Tensor anchors, Tensor variances, int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, float eta, bool pixel_offset=true) output : Tensor(rpn_rois), Tensor(rpn_roi_probs), Tensor(rpn_rois_num) infer_meta : @@ -1112,7 +1100,7 @@ kernel : func : generate_proposals_v2 -- api : graph_send_recv +- op : graph_send_recv args : (Tensor x, Tensor src_index, Tensor dst_index, str reduce_op = "SUM", IntArray out_size = {0}) output : Tensor(out), Tensor(dst_count) infer_meta : @@ -1123,7 +1111,7 @@ intermediate : dst_count backward : graph_send_recv_grad -- api : graph_send_ue_recv +- op : graph_send_ue_recv args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op, str reduce_op, IntArray out_size) output : Tensor(out), Tensor(dst_count) infer_meta : @@ -1134,7 +1122,7 @@ intermediate : dst_count backward : graph_send_ue_recv_grad -- api : greater_equal +- op : greater_equal args : (Tensor x, Tensor y, int axis = -1) output : Tensor(out) infer_meta : @@ -1142,7 +1130,7 @@ kernel : func : greater_equal -- api : greater_than +- op : greater_than args : (Tensor x, Tensor y, int axis = -1) output : Tensor(out) infer_meta : @@ -1150,7 +1138,7 @@ kernel : func : greater_than -- api : grid_sample +- op : grid_sample args : (Tensor x, Tensor grid, str mode, str padding_mode, bool align_corners) output : Tensor(out) infer_meta : @@ -1161,7 +1149,7 @@ data_type : x backward : grid_sample_grad -- api : group_norm +- op : group_norm args : (Tensor x, Tensor scale, Tensor bias, float epsilon, int groups, str data_layout) output : Tensor(y), Tensor(mean), Tensor(variance) infer_meta : @@ -1172,7 +1160,7 @@ intermediate : mean, variance backward : group_norm_grad -- api : gumbel_softmax +- op : gumbel_softmax args : (Tensor x, float temperature, bool hard, int axis) output : Tensor infer_meta : @@ -1181,7 +1169,7 @@ func : gumbel_softmax backward : gumbel_softmax_grad -- api : hard_shrink +- op : hard_shrink args : (Tensor 
x, float threshold) output : Tensor infer_meta : @@ -1191,7 +1179,7 @@ func : hard_shrink backward : hard_shrink_grad -- api : hard_sigmoid +- op : hard_sigmoid args : (Tensor x, float slope, float offset) output : Tensor infer_meta : @@ -1201,7 +1189,7 @@ func : hard_sigmoid backward : hard_sigmoid_grad -- api : hard_swish +- op : hard_swish args : (Tensor x, float threshold = 6.0, float scale = 6.0, float offset = 3.0) output : Tensor infer_meta : @@ -1211,7 +1199,7 @@ func : hard_swish backward : hard_swish_grad -- api : hierarchical_sigmoid +- op : hierarchical_sigmoid args : (Tensor x, Tensor w, Tensor label, Tensor path, Tensor code, Tensor bias, int num_classes, bool remote_prefetch, int trainer_id, int64_t[] height_sections, str[] epmap, str[] table_names, bool is_sparse) output : Tensor(out), Tensor(pre_out), Tensor(w_out) infer_meta : @@ -1222,7 +1210,7 @@ data_type : x backward : hierarchical_sigmoid_grad -- api : histogram +- op : histogram args : (Tensor x, int64_t bins, int min, int max) output : Tensor(out) infer_meta : @@ -1230,7 +1218,7 @@ kernel : func : histogram -- api : huber_loss +- op : huber_loss args : (Tensor input, Tensor label, float delta) output : Tensor(out), Tensor(residual) infer_meta : @@ -1239,7 +1227,7 @@ func : huber_loss backward : huber_loss_grad -- api : imag +- op : imag args : (Tensor x) output : Tensor infer_meta : @@ -1248,7 +1236,7 @@ func : imag backward : imag_grad -- api : increment +- op : increment args : (Tensor x, float value) output : Tensor(out) infer_meta : @@ -1257,7 +1245,7 @@ func : increment inplace : (x -> out) -- api : index_add +- op : index_add args : (Tensor x, Tensor index, Tensor add_value, int axis) output : Tensor(out) infer_meta : @@ -1268,7 +1256,7 @@ inplace : (x -> out) backward : index_add_grad -- api : index_sample +- op : index_sample args : (Tensor x, Tensor index) output : Tensor infer_meta : @@ -1278,7 +1266,7 @@ data_type : x backward : index_sample_grad -- api : index_select +- op : index_select args : (Tensor x, Tensor index, int dim) output : Tensor(out) infer_meta : @@ -1288,7 +1276,7 @@ data_type : x backward : index_select_grad -- api : instance_norm +- op : instance_norm args : (Tensor x, Tensor scale, Tensor bias, float epsilon) output : Tensor(y), Tensor(saved_mean), Tensor(saved_variance) infer_meta : @@ -1300,7 +1288,7 @@ intermediate : saved_mean, saved_variance backward : instance_norm_grad -- api : inverse +- op : inverse args : (Tensor x) output : Tensor(out) infer_meta : @@ -1309,7 +1297,7 @@ func : inverse backward : inverse_grad -- api : is_empty +- op : is_empty args : (Tensor x) output : Tensor(out) infer_meta : @@ -1317,7 +1305,7 @@ kernel : func : is_empty -- api : isclose +- op : isclose args : (Tensor x, Tensor y, Scalar rtol, Scalar atol, bool equal_nan) output : Tensor(out) infer_meta : @@ -1326,7 +1314,7 @@ kernel : func : isclose -- api : isfinite +- op : isfinite args : (Tensor x) output : Tensor(out) infer_meta : @@ -1335,7 +1323,7 @@ func : isfinite {dense -> dense}, infinite_sr {selected_rows -> selected_rows} -- api : isinf +- op : isinf args : (Tensor x) output : Tensor(out) infer_meta : @@ -1344,7 +1332,7 @@ func : isinf {dense -> dense}, isinf_sr {selected_rows -> selected_rows} -- api : isnan +- op : isnan args : (Tensor x) output : Tensor(out) infer_meta : @@ -1353,7 +1341,7 @@ func : isnan {dense -> dense}, isnan_sr {selected_rows -> selected_rows} -- api : kldiv_loss +- op : kldiv_loss args : (Tensor x, Tensor label, str reduction) output : Tensor(out) infer_meta : @@ 
-1363,7 +1351,7 @@ data_type : x backward : kldiv_loss_grad -- api : kron +- op : kron args : (Tensor x, Tensor y) output : Tensor infer_meta : @@ -1372,7 +1360,7 @@ func : kron backward : kron_grad -- api : kthvalue +- op : kthvalue args : (Tensor x, int k, int axis, bool keepdim) output : Tensor(out), Tensor(indices) infer_meta : @@ -1381,7 +1369,7 @@ func : kthvalue backward : kthvalue_grad -- api : label_smooth +- op : label_smooth args : (Tensor label, Tensor prior_dist, float epsilon) output : Tensor infer_meta : @@ -1393,7 +1381,7 @@ optional : prior_dist backward : label_smooth_grad -- api : lamb_ +- op : lamb_ args : (Tensor param, Tensor grad, Tensor learning_rate, Tensor moment1, Tensor moment2, Tensor beta1_pow, Tensor beta2_pow, Tensor master_param, Tensor skip_update, float weight_decay, float beta1, float beta2, float epsilon, bool multi_precision) output : Tensor(param_out), Tensor(moment1_out), Tensor(moment2_out), Tensor(beta1_pow_out), Tensor(beta2_pow_out), Tensor(master_param_outs) infer_meta : @@ -1405,7 +1393,7 @@ optional : master_param, skip_update inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_outs) -- api : layer_norm +- op : layer_norm args : (Tensor x, Tensor scale, Tensor bias, float epsilon, int begin_norm_axis, bool is_test) output : Tensor(out), Tensor(mean), Tensor(variance) infer_meta : @@ -1416,7 +1404,7 @@ backward : layer_norm_grad optional : scale, bias -- api : leaky_relu +- op : leaky_relu args : (Tensor x, float alpha) output : Tensor infer_meta : @@ -1426,7 +1414,7 @@ func : leaky_relu backward : leaky_relu_grad -- api : lerp +- op : lerp args : (Tensor x, Tensor y, Tensor weight) output : Tensor(out) infer_meta : @@ -1436,7 +1424,7 @@ inplace : (x -> out) backward : lerp_grad -- api : less_equal +- op : less_equal args : (Tensor x, Tensor y, int axis = -1) output : Tensor(out) infer_meta : @@ -1444,7 +1432,7 @@ kernel : func : less_equal -- api : less_than +- op : less_than args : (Tensor x, Tensor y, int axis = -1) output : Tensor(out) infer_meta : @@ -1452,7 +1440,7 @@ kernel : func : less_than -- api : linear_interp +- op : linear_interp args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) output : Tensor(output) infer_meta : @@ -1463,16 +1451,19 @@ data_type : x backward : linear_interp_grad -- api : linspace - args : (Tensor start, Tensor stop, Tensor number, DataType dtype) +- op : linspace + args : (Tensor start, Tensor stop, Tensor number, DataType dtype, Place place) output : Tensor(out) infer_meta : func : LinspaceInferMeta + param: [start, stop, number, dtype] kernel : func : linspace + param: [start, stop, number, dtype] data_type : dtype + backend : place -- api : log +- op : log args : (Tensor x) output : Tensor infer_meta : @@ -1481,7 +1472,7 @@ func : log backward: log_grad -- api : log10 +- op : log10 args : (Tensor x) output : Tensor infer_meta : @@ -1490,7 +1481,7 @@ func : log10 backward: log10_grad -- api : log1p +- op : log1p args : (Tensor x) output : Tensor infer_meta : @@ -1499,7 +1490,7 @@ func : log1p backward: log1p_grad -- api : log2 +- op : log2 args : (Tensor x) output : Tensor infer_meta : @@ -1508,7 +1499,7 @@ func : log2 backward: log2_grad -- api : log_loss +- op : log_loss args : (Tensor input, Tensor label, float epsilon) output : Tensor 
infer_meta : @@ -1517,7 +1508,7 @@ func : log_loss backward : log_loss_grad -- api : log_softmax +- op : log_softmax args : (Tensor x, int axis) output : Tensor(out) infer_meta : @@ -1526,7 +1517,7 @@ func : log_softmax backward : log_softmax_grad -- api : logcumsumexp +- op : logcumsumexp args : (Tensor x, int axis, bool flatten, bool exclusive, bool reverse) output : Tensor(out) infer_meta : @@ -1535,7 +1526,7 @@ func : logcumsumexp backward : logcumsumexp_grad -- api : logical_and +- op : logical_and args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : @@ -1543,7 +1534,7 @@ kernel : func : logical_and -- api : logical_not +- op : logical_not args : (Tensor x) output : Tensor(out) infer_meta : @@ -1551,7 +1542,7 @@ kernel : func : logical_not -- api : logical_or +- op : logical_or args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : @@ -1559,7 +1550,7 @@ kernel : func : logical_or -- api : logical_xor +- op : logical_xor args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : @@ -1567,7 +1558,7 @@ kernel : func : logical_xor -- api : logit +- op : logit args : (Tensor x, float eps = 1e-6f) output : Tensor infer_meta : @@ -1577,7 +1568,7 @@ func : logit backward : logit_grad -- api : logsigmoid +- op : logsigmoid args : (Tensor x) output : Tensor infer_meta : @@ -1586,7 +1577,7 @@ func : logsigmoid backward : logsigmoid_grad -- api : logsumexp +- op : logsumexp args : (Tensor x, int64_t[] axis, bool keepdim, bool reduce_all) output : Tensor(out) infer_meta : @@ -1595,7 +1586,7 @@ func : logsumexp backward : logsumexp_grad -- api : lstsq +- op : lstsq args : (Tensor x, Tensor y, Scalar rcond, str driver) output : Tensor(solution), Tensor(residuals), Tensor(rank), Tensor(singular_values) infer_meta : @@ -1604,7 +1595,7 @@ kernel : func : lstsq -- api : lu +- op : lu args : (Tensor x, bool pivot) output : Tensor(out), Tensor(pivots), Tensor(infos) infer_meta : @@ -1613,7 +1604,7 @@ func : lu backward : lu_grad -- api : lu_unpack +- op : lu_unpack args : (Tensor x, Tensor pivots, bool unpack_ludata, bool unpack_pivots) output : Tensor(pmat), Tensor(l), Tensor(u) infer_meta : @@ -1623,7 +1614,7 @@ data_type : x backward : lu_unpack_grad -- api : margin_cross_entropy +- op : margin_cross_entropy args : (Tensor logits, Tensor label, bool return_softmax, int ring_id, int rank, int nranks, float margin1, float margin2, float margin3, float scale) output : Tensor(softmax), Tensor(loss) infer_meta : @@ -1633,7 +1624,7 @@ data_type : logits backward : margin_cross_entropy_grad -- api : masked_select +- op : masked_select args : (Tensor x, Tensor mask) output : Tensor infer_meta : @@ -1643,7 +1634,7 @@ data_type : x backward : masked_select_grad -- api : matmul +- op : matmul args : (Tensor x, Tensor y, bool transpose_x = false, bool transpose_y = false) output : Tensor infer_meta : @@ -1652,7 +1643,7 @@ func : matmul backward : matmul_grad -- api : matrix_nms +- op : matrix_nms args : (Tensor bboxes, Tensor scores, float score_threshold, int nms_top_k, int keep_top_k, float post_threshold=0., bool use_gaussian = false, float gaussian_sigma = 2.0, int background_label = 0, bool normalized = true) output : Tensor(out), Tensor(index), Tensor(roisnum) infer_meta : @@ -1660,7 +1651,7 @@ kernel : func : matrix_nms -- api : matrix_power +- op : matrix_power args : (Tensor x, int n) output : Tensor infer_meta : @@ -1670,7 +1661,7 @@ func : matrix_power backward : matrix_power_grad -- api : matrix_rank +- op : matrix_rank args : (Tensor x, float tol, bool use_default_tol=true, bool 
hermitian=false) output : Tensor(out) infer_meta : @@ -1679,7 +1670,7 @@ kernel : func : matrix_rank -- api : matrix_rank_tol +- op : matrix_rank_tol args : (Tensor x, Tensor atol_tensor, bool use_default_tol=true, bool hermitian=false) output : Tensor(out) infer_meta : @@ -1687,7 +1678,7 @@ kernel : func : matrix_rank_tol -- api : max +- op : max args : (Tensor x, IntArray dims={}, bool keep_dim=false) output : Tensor(out) infer_meta : @@ -1696,7 +1687,7 @@ func : max backward : max_grad -- api : max_pool2d_with_index +- op : max_pool2d_with_index args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) output : Tensor(out), Tensor(mask) infer_meta : @@ -1705,7 +1696,7 @@ func : max_pool2d_with_index backward : max_pool2d_with_index_grad -- api : max_pool3d_with_index +- op : max_pool3d_with_index args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool global_pooling, bool adaptive) output : Tensor(out), Tensor(mask) infer_meta : @@ -1714,7 +1705,7 @@ func : max_pool3d_with_index backward : max_pool3d_with_index_grad -- api : maximum +- op : maximum args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : @@ -1723,7 +1714,7 @@ func : maximum backward : maximum_grad -- api : maxout +- op : maxout args : (Tensor x, int groups, int axis) output : Tensor(out) infer_meta : @@ -1732,7 +1723,7 @@ func : maxout backward : maxout_grad -- api : mean +- op : mean args : (Tensor x, IntArray dims={}, bool keep_dim=false) output : Tensor(out) infer_meta : @@ -1741,7 +1732,7 @@ func : mean backward : mean_grad -- api : mean_all +- op : mean_all args : (Tensor x) output : Tensor infer_meta : @@ -1750,7 +1741,7 @@ func : mean_all backward : mean_all_grad -- api : merged_adam_ +- op : merged_adam_ args : (Tensor[] param, Tensor[] grad, Tensor[] learning_rate, Tensor[] moment1, Tensor[] moment2, Tensor[] beta1_pow, Tensor[] beta2_pow, Tensor[] master_param, Scalar beta1, Scalar beta2, Scalar epsilon, bool multi_precision, bool use_global_beta_pow) output : Tensor[](param_out){param.size()}, Tensor[](moment1_out){param.size()}, Tensor[](moment2_out){param.size()}, Tensor[](beta1_pow_out){param.size()}, Tensor[](beta2_pow_out){param.size()}, Tensor[](master_param_out){param.size()} infer_meta : @@ -1761,7 +1752,7 @@ data_type : param inplace : (param -> param_out), (moment1 -> moment1_out), (moment2 -> moment2_out), (beta1_pow -> beta1_pow_out), (beta2_pow -> beta2_pow_out), (master_param -> master_param_out) -- api : merged_momentum_ +- op : merged_momentum_ args : (Tensor[] param, Tensor[] grad, Tensor[] velocity, Tensor[] learning_rate, Tensor[] master_param, float mu, bool use_nesterov = false, str[] regularization_method = {}, float[] regularization_coeff = {}, bool multi_precision = false, float rescale_grad = 1.0f) output : Tensor[](param_out){param.size()}, Tensor[](velocity_out){param.size()}, Tensor[](master_param_out){param.size()} infer_meta : @@ -1772,7 +1763,7 @@ data_type : param inplace : (param -> param_out), (velocity -> velocity_out), (master_param -> master_param_out) -- api : meshgrid +- op : meshgrid args : (Tensor[] inputs) output : Tensor[]{inputs.size()} infer_meta : @@ -1781,7 +1772,7 @@ func : meshgrid backward : meshgrid_grad -- api : min +- op : min args : (Tensor x, IntArray dims={}, bool keep_dim=false) output : Tensor(out) infer_meta : @@ -1790,7 +1781,7 @@ func : min backward : min_grad -- api : minimum +- op : minimum args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : @@ -1799,7 +1790,7 @@ func : 
minimum backward : minimum_grad -- api : mish +- op : mish args : (Tensor x, float lambda) output : Tensor infer_meta : @@ -1809,7 +1800,7 @@ func : mish backward : mish_grad -- api : mode +- op : mode args : (Tensor x, int axis, bool keepdim) output : Tensor(out), Tensor(indices) infer_meta : @@ -1818,7 +1809,7 @@ func : mode backward : mode_grad -- api : momentum_ +- op : momentum_ args : (Tensor param, Tensor grad, Tensor velocity, Tensor learning_rate, Tensor master_param, float mu, bool use_nesterov = false, str regularization_method = "", float regularization_coeff = 0.0, bool multi_precision = false, float rescale_grad = 1.0f) output : Tensor(param_out), Tensor(velocity_out), Tensor(master_param_out) infer_meta: @@ -1829,7 +1820,7 @@ optional : master_param inplace : (param -> param_out), (velocity -> velocity_out), (master_param -> master_param_out) -- api : multi_dot +- op : multi_dot args : (Tensor[] x) output : Tensor infer_meta : @@ -1838,7 +1829,7 @@ func : multi_dot backward : multi_dot_grad -- api : multiclass_nms3 +- op : multiclass_nms3 args : (Tensor bboxes, Tensor scores, Tensor rois_num, float score_threshold, int nms_top_k, int keep_top_k, float nms_threshold=0.3, bool normalized=true, float nms_eta=1.0, int background_label=0) output : Tensor(out), Tensor(index), Tensor(nms_rois_num) infer_meta : @@ -1847,7 +1838,7 @@ func : multiclass_nms3 optional : rois_num -- api : multinomial +- op : multinomial args : (Tensor x, Scalar num_samples, bool replacement) output : Tensor(out) infer_meta : @@ -1855,7 +1846,7 @@ kernel : func : multinomial -- api : multiplex +- op : multiplex args : (Tensor[] ins, Tensor ids) output : Tensor infer_meta : @@ -1865,7 +1856,7 @@ data_type : ins backward : multiplex_grad -- api : multiply +- op : multiply args : (Tensor x, Tensor y) output : Tensor infer_meta : @@ -1875,7 +1866,7 @@ multiply_sr {selected_rows, dense -> selected_rows} backward : multiply_grad -- api : nearest_interp +- op : nearest_interp args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) output : Tensor(output) infer_meta : @@ -1886,7 +1877,7 @@ data_type : x backward : nearest_interp_grad -- api : nll_loss +- op : nll_loss args : (Tensor input, Tensor label, Tensor weight, int64_t ignore_index, str reduction) output : Tensor(out), Tensor(total_weight) infer_meta : @@ -1897,7 +1888,7 @@ optional : weight backward : nll_loss_grad -- api : nms +- op : nms args : (Tensor x, float threshold) output : Tensor(out) infer_meta : @@ -1906,7 +1897,7 @@ func : nms data_type : x -- api : norm +- op : norm args : (Tensor x, int axis, float epsilon, bool is_test) output : Tensor(out), Tensor(norm) infer_meta : @@ -1915,7 +1906,7 @@ func : norm backward : norm_grad -- api : not_equal +- op : not_equal args : (Tensor x, Tensor y, int axis = -1) output : Tensor(out) infer_meta : @@ -1923,7 +1914,7 @@ kernel : func : not_equal -- api : one_hot +- op : one_hot args : (Tensor x, Scalar(int) num_classes) output : Tensor(out) infer_meta : @@ -1931,17 +1922,17 @@ kernel : func : one_hot -- api : ones +- op : ones args : (IntArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) output : Tensor(out) invoke : full(shape, 1, dtype, place) -- api : ones_like +- op : ones_like args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place={}) output : Tensor(out) invoke : full_like(x, 1, dtype, place) -- api : p_norm +- op : p_norm args : 
(Tensor x, float porder, int axis, float epsilon, bool keepdim, bool asvector=false) output : Tensor(out) infer_meta : @@ -1950,7 +1941,7 @@ func : p_norm backward : p_norm_grad -- api : pad +- op : pad args : (Tensor x, int[] paddings, Scalar pad_value) output : Tensor infer_meta : @@ -1959,7 +1950,7 @@ func : pad backward : pad_grad -- api : pad3d +- op : pad3d args : (Tensor x, IntArray paddings, str mode, float pad_value, str data_format) output : Tensor(out) infer_meta : @@ -1968,7 +1959,7 @@ func : pad3d backward : pad3d_grad -- api : pixel_shuffle +- op : pixel_shuffle args : (Tensor x, int upscale_factor, str data_format) output : Tensor infer_meta : @@ -1977,7 +1968,7 @@ func : pixel_shuffle backward : pixel_shuffle_grad -- api : pool2d +- op : pool2d args : (Tensor x, IntArray kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) output : Tensor(out) infer_meta : @@ -1989,7 +1980,7 @@ use_gpudnn : use_gpudnn backward : pool2d_grad -- api : pool3d +- op : pool3d args : (Tensor x, int[] kernel_size, int[] strides, int[] paddings, bool ceil_mode, bool exclusive, str data_format, str pooling_type, bool global_pooling, bool adaptive, str padding_algorithm, bool use_gpudnn) output : Tensor(out) infer_meta : @@ -2001,7 +1992,7 @@ use_gpudnn : use_gpudnn backward : pool3d_grad -- api : pow +- op : pow args : (Tensor x, Scalar s) output : Tensor(out) infer_meta : @@ -2011,7 +2002,7 @@ func : pow backward : pow_grad -- api : prelu +- op : prelu args : (Tensor x, Tensor alpha, str data_format, str mode) output : Tensor(out) infer_meta : @@ -2020,7 +2011,7 @@ func : prelu backward : prelu_grad -- api : prior_box +- op : prior_box args : (Tensor input, Tensor image, float[] min_sizes, float[] aspect_ratios, float[] variances, float[] max_sizes = {}, bool flip=true, bool clip=true, float step_w=0.0, float step_h=0.0, float offset=0.5, bool min_max_aspect_ratios_order=false) output : Tensor(out), Tensor(var) infer_meta : @@ -2028,7 +2019,7 @@ kernel : func : prior_box -- api : psroi_pool +- op : psroi_pool args : (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, int output_channels, float spatial_scale) output : Tensor infer_meta : @@ -2039,7 +2030,7 @@ optional : boxes_num backward : psroi_pool_grad -- api : put_along_axis +- op : put_along_axis args : (Tensor x, Tensor index, Tensor value, int axis, str reduce) output : Tensor(out) infer_meta : @@ -2051,7 +2042,7 @@ inplace : (x -> out) backward : put_along_axis_grad -- api : qr +- op : qr args : (Tensor x, str mode) output : Tensor(q), Tensor(r) infer_meta : @@ -2060,7 +2051,7 @@ func : qr backward : qr_grad -- api : randint +- op : randint args : (int low, int high, IntArray shape, DataType dtype=DataType::INT64, Place place={}) output : Tensor(out) infer_meta : @@ -2072,7 +2063,7 @@ data_type : dtype backend : place -- api : randperm +- op : randperm args : (int n, DataType dtype, Place place={}) output : Tensor(out) infer_meta : @@ -2084,7 +2075,7 @@ data_type : dtype backend : place -- api : real +- op : real args : (Tensor x) output : Tensor infer_meta : @@ -2093,7 +2084,7 @@ func : real backward : real_grad -- api : reciprocal +- op : reciprocal args : (Tensor x) output : Tensor(out) infer_meta : @@ -2103,7 +2094,7 @@ inplace : (x -> out) backward : reciprocal_grad -- api : reduce_prod +- op : reduce_prod args : (Tensor x, IntArray dims, bool keep_dim, bool reduce_all) output 
: Tensor infer_meta : @@ -2112,7 +2103,7 @@ func : prod_raw backward : reduce_prod_grad -- api : relu +- op : relu args : (Tensor x) output : Tensor(out) infer_meta : @@ -2122,7 +2113,7 @@ inplace : (x -> out) backward : relu_grad -- api : relu6 +- op : relu6 args : (Tensor x, float threshold) output : Tensor infer_meta : @@ -2132,7 +2123,7 @@ func : relu6 backward : relu6_grad -- api : remainder +- op : remainder args : (Tensor x, Tensor y) output : Tensor infer_meta : @@ -2141,7 +2132,7 @@ func : remainder inplace : (x -> out) -- api : renorm +- op : renorm args : (Tensor x, float p, int axis, float max_norm) output : Tensor infer_meta : @@ -2151,7 +2142,7 @@ func : renorm backward : renorm_grad -- api : repeat_interleave +- op : repeat_interleave args : (Tensor x, int repeats, int dim) output : Tensor(out) infer_meta : @@ -2161,7 +2152,7 @@ func : repeat_interleave backward: repeat_interleave_grad -- api : repeat_interleave_with_tensor_index +- op : repeat_interleave_with_tensor_index args : (Tensor x, Tensor repeats, int dim) output : Tensor(out) infer_meta : @@ -2172,7 +2163,7 @@ data_type : x backward: repeat_interleave_with_tensor_index_grad -- api : reshape +- op : reshape args : (Tensor x, IntArray shape) output : Tensor(out), Tensor(xshape) infer_meta : @@ -2184,7 +2175,7 @@ intermediate : xshape backward: reshape_grad -- api : reverse +- op : reverse args : (Tensor x, IntArray axis) output : Tensor infer_meta : @@ -2193,7 +2184,7 @@ func : reverse backward : reverse_grad -- api : rmsprop_ +- op : rmsprop_ args : (Tensor param, Tensor mean_square, Tensor grad, Tensor moment, Tensor learning_rate, Tensor mean_grad, float epsilon, float decay, float momentum, bool centered) output : Tensor(param_out), Tensor(moment_out), Tensor(mean_square_out), Tensor(mean_grad_out) infer_meta : @@ -2204,7 +2195,7 @@ optional : mean_grad inplace : (param -> param_out), (moment -> moment_out), (mean_square -> mean_square_out), (mean_grad -> mean_grad_out) -- api : roi_align +- op : roi_align args : (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, float spatial_scale, int sampling_ratio, bool aligned) output : Tensor infer_meta : @@ -2215,7 +2206,7 @@ optional : boxes_num backward : roi_align_grad -- api : roi_pool +- op : roi_pool args : (Tensor x, Tensor boxes, Tensor boxes_num, int pooled_height, int pooled_width, float spatial_scale) output : Tensor(out), Tensor(arg_max) infer_meta : @@ -2227,7 +2218,7 @@ intermediate : arg_max backward : roi_pool_grad -- api : roll +- op : roll args : (Tensor x, IntArray shifts, int64_t[] axis) output : Tensor(out) infer_meta : @@ -2236,7 +2227,7 @@ func : roll backward : roll_grad -- api : round +- op : round args : (Tensor x) output : Tensor(out) infer_meta : @@ -2246,7 +2237,7 @@ inplace : (x -> out) backward : round_grad -- api : rsqrt +- op : rsqrt args : (Tensor x) output : Tensor(out) infer_meta : @@ -2256,7 +2247,7 @@ inplace : (x -> out) backward : rsqrt_grad -- api : scale +- op : scale args : (Tensor x, Scalar scale, float bias, bool bias_after_scale) output : Tensor(out) infer_meta : @@ -2268,7 +2259,7 @@ inplace : (x -> out) backward : scale_grad -- api : scatter +- op : scatter args : (Tensor x, Tensor index, Tensor updates, bool overwrite) output : Tensor(out) infer_meta : @@ -2279,7 +2270,7 @@ inplace : (x -> out) backward : scatter_grad -- api : scatter_nd_add +- op : scatter_nd_add args : (Tensor x, Tensor index, Tensor updates) output : Tensor infer_meta : @@ -2289,7 +2280,7 @@ func : scatter_nd_add backward : 
scatter_nd_add_grad -- api : searchsorted +- op : searchsorted args : (Tensor sorted_sequence, Tensor value, bool out_int32, bool right) output : Tensor(out) infer_meta : @@ -2298,7 +2289,7 @@ func : searchsorted data_type : sorted_sequence -- api : segment_pool +- op : segment_pool args : (Tensor x, Tensor segment_ids, str pooltype) output : Tensor(out), Tensor(summed_ids) infer_meta : @@ -2308,7 +2299,7 @@ data_type : x backward : segment_pool_grad -- api : selu +- op : selu args : (Tensor x, float scale, float alpha) output : Tensor infer_meta : @@ -2318,7 +2309,7 @@ func : selu backward : selu_grad -- api : sgd_ +- op : sgd_ args : (Tensor param, Tensor learning_rate, Tensor grad, Tensor master_param, bool multi_precision) output : Tensor(param_out), Tensor(master_param_out) infer_meta : @@ -2333,7 +2324,7 @@ optional : master_param inplace : (param -> param_out), (master_param -> master_param_out) -- api : shape +- op : shape args : (Tensor input) output : Tensor(out) infer_meta : @@ -2344,7 +2335,7 @@ data_transform: skip_transform : input -- api : shard_index +- op : shard_index args : (Tensor in, int index_num, int nshards, int shard_id, int ignore_value) output : Tensor(out) infer_meta : @@ -2352,7 +2343,7 @@ kernel : func : shard_index -- api : sigmoid +- op : sigmoid args : (Tensor x) output : Tensor infer_meta : @@ -2361,7 +2352,7 @@ func : sigmoid backward : sigmoid_grad -- api : sigmoid_cross_entropy_with_logits +- op : sigmoid_cross_entropy_with_logits args : (Tensor x, Tensor label, bool normalize, int ignore_index) output : Tensor infer_meta : @@ -2370,15 +2361,16 @@ func : sigmoid_cross_entropy_with_logits backward : sigmoid_cross_entropy_with_logits_grad -- api : sign +- op : sign args : (Tensor x) output : Tensor(out) infer_meta : func : UnchangedInferMeta kernel : func : sign + backward : sign_grad -- api : silu +- op : silu args : (Tensor x) output : Tensor infer_meta : @@ -2387,7 +2379,7 @@ func : silu backward : silu_grad -- api : sin +- op : sin args : (Tensor x) output : Tensor infer_meta : @@ -2396,7 +2388,7 @@ func : sin backward : sin_grad -- api : sinh +- op : sinh args : (Tensor x) output : Tensor infer_meta : @@ -2405,7 +2397,7 @@ func : sinh backward : sinh_grad -- api : size +- op : size args : (Tensor x) output : Tensor(size) infer_meta : @@ -2415,7 +2407,7 @@ data_transform: skip_transform : x -- api : slice +- op : slice args : (Tensor input, int64_t[] axes, IntArray starts, IntArray ends, int64_t[] infer_flags, int64_t[] decrease_axis) output : Tensor infer_meta : @@ -2424,7 +2416,7 @@ func : slice backward : slice_grad -- api : slogdet +- op : slogdet args : (Tensor x) output : Tensor infer_meta : @@ -2433,7 +2425,7 @@ func : slogdeterminant backward : slogdet_grad -- api : soft_shrink +- op : soft_shrink args : (Tensor x, float lambda) output : Tensor infer_meta : @@ -2443,7 +2435,7 @@ func : soft_shrink backward : soft_shrink_grad -- api : softmax +- op : softmax args : (Tensor x, int axis) output : Tensor(out) infer_meta : @@ -2454,7 +2446,7 @@ inplace : (x -> out) backward : softmax_grad -- api : softplus +- op : softplus args : (Tensor x, float beta, float threshold) output : Tensor infer_meta : @@ -2464,7 +2456,7 @@ func : softplus backward : softplus_grad -- api : softsign +- op : softsign args : (Tensor x) output : Tensor infer_meta : @@ -2474,7 +2466,7 @@ func : softsign backward : softsign_grad -- api : spectral_norm +- op : spectral_norm args : (Tensor weight, Tensor u, Tensor v, int dim, int power_iters, float eps) output : Tensor 
infer_meta : @@ -2484,7 +2476,7 @@ data_type : weight backward : spectral_norm_grad -- api : split +- op : split args : (Tensor x, IntArray sections, Scalar(int) axis) output : Tensor[]{sections.size()} infer_meta : @@ -2493,7 +2485,7 @@ func : split backward : split_grad -- api : split_with_num +- op : split_with_num args : (Tensor x, int num, Scalar(int) axis) output : Tensor[]{num} infer_meta : @@ -2502,7 +2494,7 @@ func : split_with_num backward : split_with_num_grad -- api : sqrt +- op : sqrt args : (Tensor x) output : Tensor(out) infer_meta : @@ -2512,7 +2504,7 @@ inplace : (x -> out) backward : sqrt_grad -- api : square +- op : square args : (Tensor x) output : Tensor infer_meta : @@ -2521,7 +2513,7 @@ func : square backward : square_grad -- api : squared_l2_norm +- op : squared_l2_norm args : (Tensor x) output : Tensor infer_meta : @@ -2530,7 +2522,7 @@ func : squared_l2_norm backward : squared_l2_norm_grad -- api : squeeze +- op : squeeze args : (Tensor x, IntArray axes) output : Tensor(out), Tensor(xshape) infer_meta : @@ -2542,7 +2534,7 @@ intermediate : xshape backward : squeeze_grad -- api : stack +- op : stack args : (Tensor[] x, int axis) output : Tensor infer_meta : @@ -2551,7 +2543,7 @@ func : stack backward : stack_grad -- api : strided_slice +- op : strided_slice args : (Tensor x, int[] axes, IntArray starts, IntArray ends, IntArray strides) output : Tensor infer_meta : @@ -2560,7 +2552,7 @@ func : strided_slice backward : strided_slice_grad -- api : subtract +- op : subtract args : (Tensor x, Tensor y) output : Tensor(out) infer_meta : @@ -2570,7 +2562,7 @@ inplace : (x -> out) backward : subtract_grad -- api : sum +- op : sum args : (Tensor x, IntArray dims={}, DataType out_dtype=DataType::UNDEFINED, bool keep_dim=false) output : Tensor(out) infer_meta : @@ -2580,7 +2572,7 @@ data_type : x backward : sum_grad -- api : svd +- op : svd args : (Tensor x, bool full_metrices) output : Tensor(u), Tensor(s), Tensor(vh) infer_meta : @@ -2590,7 +2582,7 @@ backward : svd_grad # The python API paddle.nn.functional.swish has no `bete` argument, it may be removed later -- api : swish +- op : swish args : (Tensor x, float beta=1.0) output : Tensor(out) infer_meta : @@ -2600,7 +2592,7 @@ func : swish backward : swish_grad -- api : sync_batch_norm_ +- op : sync_batch_norm_ args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) infer_meta : @@ -2611,7 +2603,7 @@ backward : sync_batch_norm_grad inplace : (mean -> mean_out), (variance -> variance_out) -- api : take_along_axis +- op : take_along_axis args : (Tensor x, Tensor index, int axis) output : Tensor infer_meta : @@ -2622,7 +2614,7 @@ data_type : x backward : take_along_axis_grad -- api : tan +- op : tan args : (Tensor x) output : Tensor infer_meta : @@ -2631,7 +2623,7 @@ func : tan backward : tan_grad -- api : tanh +- op : tanh args : (Tensor x) output : Tensor(out) infer_meta : @@ -2641,7 +2633,7 @@ inplace : (x -> out) backward : tanh_grad -- api : tanh_shrink +- op : tanh_shrink args : (Tensor x) output : Tensor infer_meta : @@ -2650,7 +2642,7 @@ func : tanh_shrink backward : tanh_shrink_grad -- api : temporal_shift +- op : temporal_shift args : (Tensor x, int seg_num, float shift_ratio, str data_format_str) output : Tensor infer_meta : @@ -2659,7 
+2651,7 @@ func : temporal_shift backward : temporal_shift_grad -- api : thresholded_relu +- op : thresholded_relu args : (Tensor x, float threshold) output : Tensor infer_meta : @@ -2669,7 +2661,7 @@ func : thresholded_relu backward : thresholded_relu_grad -- api : tile +- op : tile args : (Tensor x, IntArray repeat_times) output : Tensor infer_meta : @@ -2678,7 +2670,7 @@ func : tile backward : tile_grad -- api : top_k +- op : top_k args : (Tensor x, Scalar k, int axis = -1, bool largest = true, bool sorted = true) output : Tensor(out), Tensor(indices) infer_meta : @@ -2687,7 +2679,7 @@ func : top_k backward : top_k_grad -- api : transpose +- op : transpose args : (Tensor x, int[] axis) output : Tensor infer_meta : @@ -2696,7 +2688,7 @@ func : transpose backward : transpose_grad -- api : triangular_solve +- op : triangular_solve args : (Tensor x, Tensor y, bool upper, bool transpose, bool unitriangular) output : Tensor infer_meta : @@ -2705,7 +2697,7 @@ func : triangular_solve backward : triangular_solve_grad -- api : tril_indices +- op : tril_indices args : (int rows, int cols, int offset, DataType dtype, Place place={}) output : Tensor(out) infer_meta : @@ -2717,7 +2709,7 @@ data_type : dtype backend : place -- api : tril_triu +- op : tril_triu args : (Tensor x, int diagonal, bool lower) output : Tensor(out) infer_meta : @@ -2726,7 +2718,7 @@ func : tril_triu backward : tril_triu_grad -- api : trilinear_interp +- op : trilinear_interp args : (Tensor x, Tensor out_size, Tensor[] size_tensor, Tensor scale_tensor, str data_layout, int out_d, int out_h, int out_w, float[] scale, str interp_method, bool align_corners, int align_mode) output : Tensor(output) infer_meta : @@ -2737,7 +2729,7 @@ data_type : x backward : trilinear_interp_grad -- api : triu_indices +- op : triu_indices args : (int row, int col, int offset, DataType dtype, Place place={}) output : Tensor(out) infer_meta : @@ -2750,7 +2742,7 @@ backend : place # python API: paddle.nn.initializer.TruncatedNormal -- api : truncated_gaussian_random +- op : truncated_gaussian_random args : (int[] shape, float mean, float std, int seed, DataType dtype=DataType::FLOAT32, Place place={}) output : Tensor(out) infer_meta : @@ -2762,7 +2754,7 @@ backend : place data_type : dtype -- api : unbind +- op : unbind args : (Tensor input, int axis) output : Tensor[] {axis<0 ? 
input.dims()[input.dims().size()+axis]:input.dims()[axis]} infer_meta : @@ -2771,7 +2763,7 @@ func : unbind backward : unbind_grad -- api : unfold +- op : unfold args : (Tensor x, int[] kernel_sizes, int[] strides, int[] paddings, int[] dilations) output : Tensor infer_meta : @@ -2780,7 +2772,7 @@ func : unfold backward : unfold_grad -- api : uniform_random +- op : uniform_random args : (IntArray shape, DataType dtype, Scalar min, Scalar max, int seed, Place place={}) output : Tensor(out) infer_meta : @@ -2793,7 +2785,7 @@ backend : place # The `axis` argument of Python API paddle.unique is not vector -- api : unique +- op : unique args : (Tensor x, bool return_index, bool return_inverse, bool return_counts, int[] axis, DataType dtype=DataType::INT64) output : Tensor(out), Tensor(indices), Tensor(inverse), Tensor(counts) infer_meta : @@ -2802,7 +2794,7 @@ func : unique data_type : x -- api : unique_consecutive +- op : unique_consecutive args : (Tensor x, bool return_inverse, bool return_counts, int[] axis, int dtype) output : Tensor(out), Tensor(index), Tensor(counts) infer_meta : @@ -2811,7 +2803,7 @@ func : unique_consecutive data_type : x -- api : unsqueeze +- op : unsqueeze args : (Tensor x, IntArray axis) output : Tensor(out), Tensor(xshape) infer_meta : @@ -2823,7 +2815,7 @@ intermediate : xshape backward : unsqueeze_grad -- api : unstack +- op : unstack args : (Tensor x, int axis, int num) output : Tensor[]{num} infer_meta : @@ -2832,7 +2824,7 @@ func : unstack backward : unstack_grad -- api : viterbi_decode +- op : viterbi_decode args : (Tensor input, Tensor transition, Tensor length, bool include_bos_eos_tag) output : Tensor(scores), Tensor(path) infer_meta : @@ -2841,7 +2833,7 @@ func : viterbi_decode data_type : input -- api : warpctc +- op : warpctc args : (Tensor logits, Tensor label, Tensor logits_length, Tensor labels_length, int blank, bool norm_by_times) output : Tensor(loss), Tensor(warpctcgrad) infer_meta : @@ -2853,7 +2845,7 @@ intermediate: warpctcgrad backward : warpctc_grad -- api : where +- op : where args : (Tensor condition, Tensor x, Tensor y) output : Tensor infer_meta : @@ -2862,7 +2854,7 @@ func : where backward : where_grad -- api : where_index +- op : where_index args : (Tensor condition) output : Tensor(out) infer_meta : @@ -2870,7 +2862,7 @@ kernel : func : where_index -- api : yolo_box +- op : yolo_box args : (Tensor x, Tensor img_size, int[] anchors, int class_num, float conf_thresh, int downsample_ratio, bool clip_bbox, float scale_x_y=1.0, bool iou_aware=false, float iou_aware_factor=0.5) output : Tensor(boxes), Tensor(scores) infer_meta : @@ -2879,7 +2871,7 @@ func : yolo_box data_type : x -- api : yolov3_loss +- op : yolov3_loss args : (Tensor x, Tensor gt_box, Tensor gt_label, Tensor gt_score, int[] anchors, int[] anchor_mask, int class_num, float ignore_thresh, int downsample_ratio, bool use_label_smooth=true, float scale_x_y=1.0) output : Tensor(loss), Tensor(objectness_mask), Tensor(gt_match_mask) infer_meta : @@ -2890,17 +2882,17 @@ optional : gt_score backward : yolov3_loss_grad -- api : zeros +- op : zeros args : (IntArray shape, DataType dtype=DataType::FLOAT32, Place place=CPUPlace()) output : Tensor(out) invoke : full(shape, 0, dtype, place) -- api : zeros_like +- op : zeros_like args : (Tensor x, DataType dtype=DataType::UNDEFINED, Place place = {}) output : Tensor(out) invoke : full_like(x, 0, dtype, place) -- api: broadcast_tensors +- op: broadcast_tensors args: (Tensor[] x) output: Tensor[]{x.size()} infer_meta: @@ -2909,7 +2901,7 @@ 
func: broadcast_tensors backward: broadcast_tensors_grad -- api: dirichlet +- op: dirichlet args: (Tensor alpha) output: Tensor(out) infer_meta: @@ -2917,7 +2909,7 @@ kernel: func: dirichlet -- api: eig +- op: eig args: (Tensor x) output: Tensor(out_w), Tensor(out_v) infer_meta: @@ -2926,7 +2918,7 @@ func: eig backward: eig_grad -- api: fold +- op: fold args: (Tensor x, int[] output_sizes, int[] kernel_sizes, int[] strides, int[] paddings, int[] dilations) output: Tensor(out) infer_meta: @@ -2935,7 +2927,7 @@ func: fold backward: fold_grad -- api: overlap_add +- op: overlap_add args: (Tensor x, int hop_length, int axis) output: Tensor infer_meta: @@ -2944,7 +2936,7 @@ func: overlap_add backward: overlap_add_grad -- api: uniform_random_inplace +- op: uniform_random_inplace args: (Tensor x, float min, float max, int seed, int diag_num, int diag_step, float diag_val) output: Tensor(out) infer_meta: @@ -2955,7 +2947,7 @@ inplace: (x -> out) backward: uniform_random_inplace_grad -- api: unpool +- op: unpool args: (Tensor x, Tensor indices, int[] ksize, int[] strides, int[] padding, IntArray output_size, str data_format) output: Tensor(out) infer_meta: @@ -2965,7 +2957,7 @@ data_type: x backward: unpool_grad -- api: unpool3d +- op: unpool3d args: (Tensor x, Tensor indices, int[] ksize, int[] strides, int[] padding, int[] output_size, str data_format) output: Tensor(out) infer_meta: diff --git a/paddle/phi/api/yaml/op_compat.yaml b/paddle/phi/api/yaml/op_compat.yaml new file mode 100644 index 00000000000000..ccf3c5852adc00 --- /dev/null +++ b/paddle/phi/api/yaml/op_compat.yaml @@ -0,0 +1,779 @@ +# - op : rnn +# backward : rnn_grad +# extra : +# attrs : [bool is_test = false] + +- op : abs + backward : abs_grad + extra : + attrs : [bool use_cudnn = false, bool use_mkldnn = false] + +- op : acosh + backward : acosh_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : add (elementwise_add) + backward : add_grad (elementwise_add_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : addmm + backward : addmm_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : affine_grid + backward : affine_grid_grad + extra : + attrs : [bool use_cudnn = true] + +- op : angle + backward : angle_grad + extra : + attrs : [bool use_cudnn = false, bool use_mkldnn = false] + +- op : asinh + backward : asinh_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : atan2 + inputs : + {x : X1, y : X2} + outputs : + out : Out + +- op : atanh + backward : atanh_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : batch_norm + backward : batch_norm_grad + extra : + attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] + +- op : bernoulli + inputs : + x : X + outputs : + out : Out + +- op : bicubic_interp (bicubic_interp_v2) + backward : bicubic_interp_grad (bicubic_interp_v2_grad) + extra : + attrs : [bool use_mkldnn = false] + +- op : bilinear_interp (bilinear_interp_v2) + backward : bilinear_interp_grad (bilinear_interp_v2_grad) + extra : + attrs : [bool use_mkldnn = false] + +- op : ceil + backward : ceil_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : cholesky + inputs : + x : X + outputs : + out : Out + +- op : cholesky_solve + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : clip + 
backward : clip_grad + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + +- op : concat + backward : concat_grad + extra : + attrs : [bool use_mkldnn = false, bool use_quantizer = false, str mkldnn_data_type = "float32"] + +- op : conditional_block + backward : conditional_block_grad + extra : + attrs : ['str[] skip_eager_deletion_vars = {}'] + +- op : conv2d + backward : conv2d_grad + extra : + attrs : [bool is_test = false, bool use_cudnn = true, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, bool use_addto = false, + bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, + float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, + int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] + +- op : conv2d_fusion + extra : + attrs : [bool is_test = false, bool use_cudnn = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, bool use_addto = false, + bool fuse_residual_connection = false, float Scale_in = 1.0f, float Scale_out = 1.0f, + float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, + int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] + +- op : conv2d_transpose + backward : conv2d_transpose_grad + extra : + attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, bool force_fp32_output = false, + str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, + int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB()] + +- op : conv3d + backward : conv3d_grad + extra : + attrs : [bool is_test = false, bool use_cudnn = true, bool use_mkldnn = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, + bool use_addto = false, bool fuse_residual_connection = false, bool force_fp32_output = false, + int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] + +- op : conv3d_transpose + backward : conv3d_transpose_grad + extra : + attrs : [bool use_cudnn = true, bool use_mkldnn = false, int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB()] + +- op : cos + backward : cos_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : cosh + backward : cosh_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : cross + inputs : + {x : X, y : Y} + attrs : + axis : dim + outputs : + out : Out + +- op : data_norm + backward : data_norm_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : depthwise_conv2d + backward : depthwise_conv2d_grad + extra : + attrs : [bool is_test = false, bool fuse_relu_before_depthwise_conv = false, bool use_mkldnn = false, + bool use_quantizer = false, str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, bool use_addto = false, + bool fuse_residual_connection = false, 
float Scale_in = 1.0f, float Scale_out = 1.0f, + float Scale_in_eltwise = 1.0f, 'float[] Scale_weights = {1.0f}', bool force_fp32_output = false, + int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB(), bool exhaustive_search = false] + +- op : depthwise_conv2d_transpose + backward : depthwise_conv2d_transpose_grad + extra : + attrs : [bool is_test = false, bool use_cudnn = false, bool use_mkldnn = false, bool force_fp32_output = false, + str mkldnn_data_type = "float32", bool fuse_relu = false, + str fuse_activation = "", float fuse_alpha = 0.0f, float fuse_beta = 0.0f, + int workspace_size_MB = platform::GetDefaultConvWorkspaceSizeLimitMB()] + +- op : dequantize_linear + extra : + attrs : [float moving_rate = 0.9] + +- op : diag (diag_v2) + backward : diag_grad (diag_v2_grad) + inputs : + x : X + outputs : + out : Out + +- op : diagonal + inputs : + x : Input + outputs : + out : Out + +- op : digamma + inputs : + x : X + outputs : + out : Out + +- op : dist + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : distributed_push_sparse + extra : + attrs : ['int[] slots = {}'] + +- op : divide (elementwise_div) + backward : divide_grad (elementwise_div) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : dot + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : dropout + backward : dropout_grad + extra : + attrs : [bool fix_seed = false, int seed = 0] + +- op : dropout_nd + backward : dropout_nd_grad + extra : + attrs : [bool fix_seed = false, int seed = 0] + +- op : elementwise_pow + backward : elementwise_pow_grad + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : elu + backward : elu_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : embedding (lookup_table_v2) + backward : embedding_grad (lookup_table_v2_grad) + extra : + attrs : [bool is_sparse = false, bool is_distributed = false, bool remote_prefetch = false, + int trainer_id = 0, int slot = 0, 'int64_t[] height_sections = {}', 'str[] epmap = {}', + 'str[] table_names = {}'] + +- op : erf + inputs : + x : X + outputs : + out : Out + +- op : erfinv + inputs : + x : X + outputs : + out : Out + +- op : exp + backward : exp_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : expand (expand_v2) + backward : expand_grad (expand_v2_grad) + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + +- op : expm1 + backward : expm1_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : fake_channel_wise_quantize_abs_max + extra : + attrs : [int round_type = 1] + +- op : fake_channel_wise_quantize_dequantize_abs_max + extra : + attrs : [int round_type = 1] + +- op : fake_quantize_abs_max + extra : + attrs : [int round_type = 1] + +- op : fake_quantize_dequantize_abs_max + extra : + attrs : [int round_type = 1] + +- op : fake_quantize_dequantize_moving_average_abs_max + extra : + attrs : [int round_type = 1] + +- op : fake_quantize_moving_average_abs_max + extra : + attrs : [int round_type = 1] + +- op : fake_quantize_range_abs_max + extra : + attrs : [int round_type = 1] + +- op : fft_c2c + inputs: {x: X} + outputs: {out: Out} + +- op : fft_c2r + inputs: 
{x: X} + outputs: {out: Out} + +- op : fft_r2c + inputs: {x: X} + outputs: {out: Out} + +- op : flip + inputs : + x : X + outputs : + out : Out + +- op : floor + backward : floor_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : floor_divide (elementwise_floordiv) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : fmax (elementwise_fmax) + backward : fmax_grad (elementwise_fmax_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : fmin (elementwise_fmin) + backward : fmin_grad (elementwise_fmin_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : frobenius_norm + backward : frobenius_norm_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : full (fill_constant) + extra : + attrs : [bool use_mkldnn = false] + +- op : gather + backward : gather_grad + extra : + attrs : [bool overwrite = true] + +- op : gelu + backward : gelu_grad + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool use_cudnn = false] + +- op : grad_add + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : grid_sampler + backward : grid_sampler_grad + extra : + attrs : [bool use_cudnn = true] + +- op : gru + backward : gru_grad + extra : + attrs : [bool is_test = false] + +- op : hard_swish + backward : hard_swish_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : heaviside (elementwise_heaviside) + backward : heaviside_grad (elementwise_heaviside_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : inplace_abn + backward : inplace_abn_grad + extra : + attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] + +- op : layer_norm + backward : layer_norm_grad + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] + +- op : leaky_relu + backward : leaky_relu_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : lgamma + inputs : + x : X + outputs : + out : Out + +- op : linear_interp (linear_interp_v2) + backward : linear_interp_grad (linear_interp_v2_grad) + extra : + attrs : [bool use_mkldnn = false] + +- op : log + backward : log_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : log10 + backward : log10_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : log1p + backward : log1p_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : log2 + backward : log2_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : log_softmax + backward : log_softmax_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : logsigmoid + backward : 
logsigmoid_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : lrn + backward : lrn_grad + extra : + attrs : [bool use_mkldnn = false, bool is_test = false] + +- op : matmul (matmul_v2) + backward : matmul_grad (matmul_v2_grad) + extra : + attrs : [bool use_mkldnn = false, 'int[] fused_reshape_Out = {}', 'int[] fused_transpose_Out = {}', + str mkldnn_data_type = "float32", 'int[] fused_reshape_X = {}', 'int[] fused_reshape_Y = {}', + 'int[] fused_transpose_X = {}', 'int[] fused_transpose_Y = {}'] + +- op : matmul_with_flatten (mul) + backward : matmul_with_flatten_grad (mul_grad) + extra : + attrs : [bool use_mkldnn = false, float scale_x = 1.0f, 'float[] scale_y = {1.0f}', + float scale_out = 1.0f, bool force_fp32_output = false] + +- op : maximum (elementwise_max) + backward : maximum_grad (elementwise_max_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : maximum (elementwise_min) + backward : maximum_grad (elementwise_min_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : mish + backward : mish_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : multiply (elementwise_mul) + backward : multiply_grad (elementwise_mul_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : mv + inputs : + {x : X, vec : Vec} + outputs : + out : Out + +- op : nce + backward : nce_grad + extra : + attrs : [int trainer_id = 0, 'int64_t[] height_sections = {}', 'str[] epmap = {}', + 'str[] table_names = {}', 'int[] custom_neg_classes = {}'] + +- op : nearest_interp (nearest_interp_v2) + backward : nearest_interp_grad (nearest_interp_v2_grad) + extra : + attrs : [bool use_mkldnn = false] + +- op : pad2d + backward : pad2d_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : pad3d + backward : pad3d_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : partial_sum + backward : partial_sum_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : poisson + inputs : + x : X + outputs : + out : Out + +- op : pool2d + backward : pool2d_grad + extra : + attrs : [bool use_mkldnn = false, bool use_quantizer = false, + str mkldnn_data_type = "float32", bool is_test = false] + +- op : pool3d + backward : pool3d_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : prelu + backward : prelu_grad + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] + +- op : quantize_linear + extra : + attrs : [float moving_rate = 0.9] + +- op : reciprocal + backward : reciprocal_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : reduce_all + extra : + attrs : [bool use_mkldnn = false] + +- op : reduce_amax + backward : reduce_amax_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : reduce_amin + backward : reduce_amin_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : reduce_any + extra : + attrs : [bool use_mkldnn = false] + +- op : reduce_max + backward : reduce_max_grad + extra : + attrs : [bool use_mkldnn = 
false] + +- op : reduce_mean + backward : reduce_mean_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : reduce_min + backward : reduce_min_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : reduce_prod + backward : reduce_prod_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : reduce_sum + backward : reduce_sum_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : relu + backward : relu_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : relu6 + backward : relu6_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : remainder (elementwise_mod) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 1.0f, float Scale_out = 1.0f] + +- op : renorm + backward : renorm_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : round + backward : round_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : rsqrt + backward : rsqrt_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : scale + extra : + attrs : [bool use_mkldnn = false] + +- op : seed + extra : + attrs : [bool deterministic = false, str rng_name = "", bool force_cpu = false] + +- op : sequence_softmax + backward : sequence_softmax_grad + extra : + attrs : [str data_format = "AnyLayout"] + +- op : shape + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + +- op : shuffle_channel + backward : shuffle_channel_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : sigmoid + backward : sigmoid_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : silu + backward : silu_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : sin + backward : sin_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : sinh + backward : sinh_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : slice + backward : slice_grad + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + +- op : softmax + backward : softmax_grad + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32", bool is_test = false] + +- op : softplus + backward : softplus_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false, str fuse_activation_type = "", float fuse_activation_alpha = 0.0f, + float fuse_activation_beta = 0.0f, float fuse_activation_scale = 1.0f] + +- op : softsign + backward : softsign_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : solve + inputs : + {x : X, y : Y} + outputs : + out : Out + +- op : sqrt + backward : sqrt_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : square + backward : square_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : squeeze (squeeze2) + backward : squeeze_grad (squeeze2_grad) + extra : + attrs : [bool use_mkldnn = false, str mkldnn_data_type = "float32"] + +- op : stack + backward : stack_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : subtract (elementwise_sub) + backward : subtract_grad (elementwise_sub_grad) + extra : + attrs : [bool use_mkldnn = false, str x_data_format = "", str y_data_format = "", str mkldnn_data_type = "float32", + bool use_quantizer = false, float Scale_x = 1.0f, float Scale_y = 
1.0f, float Scale_out = 1.0f] + +- op : swish + backward : swish_grad + extra : + attrs : [bool use_mkldnn = false] + +- op : sync_batch_norm + backward : sync_batch_norm_grad + extra : + attrs : [bool use_mkldnn = false, bool fuse_with_relu = false] + +- op : tan + backward : tan_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : tanh + backward : tanh_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : tanh_shrink + backward : tanh_shrink_grad + extra : + attrs : [bool use_mkldnn = false, bool use_cudnn = false] + +- op : trace + inputs : + x : Input + outputs : + out : Out + +- op : transpose (transpose2) + backward : transpose_grad (transpose2_grad) + extra : + attrs : [bool use_mkldnn = false, str data_format = "AnyLayout", bool use_quantizer = false, + str mkldnn_data_type = "float32"] + +- op : trilinear_interp (trilinear_interp_v2) + backward : trilinear_interp_grad (trilinear_interp_v2_grad) + extra : + attrs : [bool use_mkldnn = false] + +- op : trunc + inputs : + x : X + outputs : + out : Out + +- op : while + backward : while_grad + extra : + attrs : ['str[] skip_eager_deletion_vars = {}'] diff --git a/paddle/phi/api/yaml/api_version.yaml b/paddle/phi/api/yaml/op_version.yaml similarity index 65% rename from paddle/phi/api/yaml/api_version.yaml rename to paddle/phi/api/yaml/op_version.yaml index eb2ae4b2c82e9a..3028b927966a20 100644 --- a/paddle/phi/api/yaml/api_version.yaml +++ b/paddle/phi/api/yaml/op_version.yaml @@ -1,4 +1,14 @@ -- api : trace +- op : flip + version : + - checkpoint : Upgrade flip, add new attr [axis] and delete attr [dims] + action : + - add_attr : axis + comment : The added attr 'axis' doesn't set default value + default : paddle::none + - delete_attr : dims + comment : The attr 'dims' is deleted. 
+ +- op : trace version : - checkpoint : Upgrade trace add a new attribute [axis2] action : diff --git a/paddle/phi/api/yaml/api.yaml b/paddle/phi/api/yaml/ops.yaml similarity index 88% rename from paddle/phi/api/yaml/api.yaml rename to paddle/phi/api/yaml/ops.yaml index 2218532fd8c5d5..10e617bd912439 100644 --- a/paddle/phi/api/yaml/api.yaml +++ b/paddle/phi/api/yaml/ops.yaml @@ -1,4 +1,4 @@ -- api : atan2 +- op : atan2 args : (Tensor x, Tensor y) output : Tensor infer_meta : @@ -7,7 +7,7 @@ func : atan2 backward : atan2_grad -- api : bernoulli +- op : bernoulli args : (Tensor x) output : Tensor(out) infer_meta : @@ -15,7 +15,7 @@ kernel : func : bernoulli -- api : cholesky +- op : cholesky args : (Tensor x, bool upper=false) output : Tensor infer_meta : @@ -24,7 +24,7 @@ func : cholesky backward : cholesky_grad -- api : cholesky_solve +- op : cholesky_solve args : (Tensor x, Tensor y, bool upper=false) output : Tensor infer_meta : @@ -33,7 +33,7 @@ func : cholesky_solve backward : cholesky_solve_grad -- api : cross +- op : cross args : (Tensor x, Tensor y, int axis = 9) output : Tensor infer_meta : @@ -43,7 +43,7 @@ data_type : x backward : cross_grad -- api : diag +- op : diag args : (Tensor x, int offset = 0, float padding_value = 0.0) output : Tensor infer_meta : @@ -52,7 +52,7 @@ func : diag backward : diag_grad -- api : diagonal +- op : diagonal args : (Tensor x, int offset = 0, int axis1 = 0, int axis2 = 1) output : Tensor infer_meta : @@ -61,7 +61,7 @@ func : diagonal backward : diagonal_grad -- api : digamma +- op : digamma args : (Tensor x) output : Tensor infer_meta : @@ -70,7 +70,7 @@ func : digamma backward : digamma_grad -- api : dist +- op : dist args : (Tensor x, Tensor y, float p = 2.0) output : Tensor infer_meta : @@ -79,7 +79,7 @@ func : dist backward : dist_grad -- api : dot +- op : dot args : (Tensor x, Tensor y) output : Tensor infer_meta : @@ -89,7 +89,7 @@ data_type : x backward : dot_grad -- api : erf +- op : erf args : (Tensor x) output : Tensor infer_meta : @@ -98,7 +98,7 @@ func : erf backward : erf_grad -- api : erfinv +- op : erfinv args : (Tensor x) output : Tensor(out) infer_meta : @@ -108,7 +108,7 @@ inplace : (x -> out) backward : erfinv_grad -- api : fft_c2c +- op : fft_c2c args : (Tensor x, int64_t[] axes, str normalization, bool forward) output : Tensor infer_meta : @@ -117,7 +117,7 @@ func : fft_c2c backward : fft_c2c_grad -- api : fft_c2r +- op : fft_c2r args : (Tensor x, int64_t[] axes, str normalization, bool forward, int64_t last_dim_size=0L) output : Tensor infer_meta : @@ -126,7 +126,7 @@ func : fft_c2r backward : fft_c2r_grad -- api : fft_r2c +- op : fft_r2c args : (Tensor x, int64_t[] axes, str normalization, bool forward, bool onesided) output : Tensor infer_meta : @@ -135,7 +135,7 @@ func : fft_r2c backward : fft_r2c_grad -- api : graph_send_uv +- op : graph_send_uv args : (Tensor x, Tensor y, Tensor src_index, Tensor dst_index, str message_op = "ADD") output : Tensor(out) infer_meta : @@ -145,7 +145,7 @@ data_type : x backward : graph_send_uv_grad -- api : lgamma +- op : lgamma args : (Tensor x) output : Tensor(out) infer_meta : @@ -154,7 +154,7 @@ func : lgamma backward : lgamma_grad -- api : mv +- op : mv args : (Tensor x, Tensor vec) output : Tensor infer_meta : @@ -163,7 +163,7 @@ func : mv backward : mv_grad -- api : poisson +- op : poisson args : (Tensor x) output : Tensor infer_meta : @@ -172,7 +172,7 @@ func : poisson backward : poisson_grad -- api : solve +- op : solve args : (Tensor x, Tensor y) output : Tensor infer_meta : @@ 
-182,7 +182,7 @@ data_type : x backward : solve_grad -- api : trace +- op : trace args : (Tensor x, int offset = 0, int axis1 = 0, int axis2 = 1) output : Tensor infer_meta : @@ -191,7 +191,7 @@ func : trace backward : trace_grad -- api : trunc +- op : trunc args : (Tensor x) output : Tensor infer_meta : @@ -199,3 +199,12 @@ kernel : func : trunc backward : trunc_grad + +- op : flip + args : (Tensor x, int[] axis) + output : Tensor (out) + infer_meta : + func : FlipInferMeta + kernel : + func : flip + backward : flip_grad diff --git a/paddle/phi/api/yaml/sparse_bw_api.yaml b/paddle/phi/api/yaml/sparse_backward.yaml similarity index 60% rename from paddle/phi/api/yaml/sparse_bw_api.yaml rename to paddle/phi/api/yaml/sparse_backward.yaml index e6242f178e5407..6503dbb46e8576 100644 --- a/paddle/phi/api/yaml/sparse_bw_api.yaml +++ b/paddle/phi/api/yaml/sparse_backward.yaml @@ -1,296 +1,452 @@ -- backward_api : abs_grad - forward : tanh(Tensor x) -> Tensor(out) +- backward_op : abs_grad + forward : abs(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : abs_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, abs_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : acos_grad +- backward_op : acos_grad forward : acos(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : acos_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, acos_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : acosh_grad +- backward_op : acosh_grad forward : acosh(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : acosh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, acosh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : add_grad +- backward_op : add_grad forward : add(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] kernel : func : add_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, - add_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} + add_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr}, + add_coo_dense_grad{sparse_coo, dense, sparse_coo -> sparse_coo, dense} -- backward_api : addmm_grad +- backward_op : addmm_grad forward : addmm(Tensor input, Tensor x, Tensor y, float alpha=1.0, float beta=1.0) -> Tensor(out) args : (Tensor input, Tensor x, Tensor y, Tensor out_grad, float alpha=1.0, float beta=1.0) output : Tensor(input_grad), Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [input, x, y] kernel : func : addmm_csr_dense_grad {dense, sparse_csr, dense, dense -> dense, sparse_csr, dense}, addmm_csr_csr_grad {sparse_csr, sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr, sparse_csr}, addmm_coo_dense_grad {dense, sparse_coo, dense, dense -> dense, sparse_coo, dense}, addmm_coo_coo_grad {sparse_coo, sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo, sparse_coo} -- backward_api : asin_grad +- backward_op : asin_grad forward : asin(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : 
asin_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, asin_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : asinh_grad +- backward_op : asinh_grad forward : asinh(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : asinh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, asinh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : atan_grad +- backward_op : atan_grad forward : atan(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : atan_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, atan_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : atanh_grad +- backward_op : atanh_grad forward : atanh(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : atanh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, atanh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : cast_grad +- backward_op : batch_norm_grad + forward : batch_norm (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) + args : (Tensor x, Tensor scale, Tensor bias, Tensor mean_out, Tensor variance_out, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [x, scale, bias] + kernel : + func : batch_norm_coo_grad {sparse_coo, dense, dense, dense, dense, dense, dense, dense, sparse_coo -> sparse_coo, dense, dense} + data_type : out_grad + optional : mean_out, variance_out, reserve_space + +- backward_op : cast_grad forward : cast(Tensor x, DataType index_dtype, DataType value_dtype) -> Tensor(out) args : (Tensor x, Tensor out_grad, DataType value_dtype) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] kernel : func : cast_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, cast_csr_grad {sparse_csr, sparse_csr -> sparse_csr} data_type : out_grad -- backward_api : conv3d_coo_grad - forward : conv3d_coo (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out), Tensor(rulebook), Tensor(counter) +- backward_op : conv3d_grad + forward : conv3d (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) -> Tensor(out), Tensor(rulebook), Tensor(counter) args : (Tensor x, Tensor kernel, Tensor out, Tensor rulebook, Tensor counter, Tensor out_grad, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) output : Tensor(x_grad), Tensor(kernel_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, kernel] kernel : func : conv3d_coo_grad{sparse_coo, dense, sparse_coo, dense, dense, sparse_coo -> sparse_coo, dense} -- backward_api : divide_grad +- backward_op : divide_grad forward : divide(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out, 
Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] kernel : func : divide_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, divide_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} -- backward_api : divide_scalar_grad +- backward_op : divide_scalar_grad forward : divide_scalar (Tensor x, float scalar) -> Tensor(out) args : (Tensor out_grad, float scalar) output : Tensor(x_grad) invoke : divide_scalar(out_grad, scalar) -- backward_api : expm1_grad +- backward_op : expm1_grad forward : expm1(Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out] kernel : func : expm1_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, expm1_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : leaky_relu_grad +- backward_op : leaky_relu_grad forward : leaky_relu(Tensor x, float alpha) -> Tensor(out) args : (Tensor x, Tensor out_grad, float alpha) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : leaky_relu_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, leaky_relu_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : log1p_grad +- backward_op : log1p_grad forward : log1p(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : log1p_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, log1p_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : masked_matmul_grad +- backward_op : masked_matmul_grad forward : masked_matmul(Tensor x, Tensor y, Tensor mask) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] kernel : func : masked_matmul_csr_grad{dense, dense, sparse_csr -> dense, dense} -- backward_api : matmul_grad +- backward_op : matmul_grad forward : matmul(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] kernel : func : matmul_csr_dense_grad {sparse_csr, dense, dense -> sparse_csr, dense}, matmul_csr_csr_grad {sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr}, matmul_coo_dense_grad {sparse_coo, dense, dense -> sparse_coo, dense}, matmul_coo_coo_grad {sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo} -- backward_api : maxpool_grad +- backward_op : maxpool_grad forward : maxpool(Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) -> Tensor(out), Tensor(rulebook), Tensor(counter) args : (Tensor x, Tensor rulebook, Tensor counter, Tensor out, Tensor out_grad, int[] kernel_sizes) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param: [x] kernel : func : maxpool_coo_grad {sparse_coo, dense, dense, sparse_coo, sparse_coo -> sparse_coo} -- backward_api : multiply_grad +- backward_op : multiply_grad forward : multiply(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] kernel : func : multiply_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, multiply_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} -- 
backward_api : mv_grad +- backward_op : mv_grad forward : mv(Tensor x, Tensor vec) -> Tensor(out) args : (Tensor x, Tensor vec, Tensor out_grad) output : Tensor(x_grad), Tensor(vec_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, vec] kernel : func : mv_coo_grad{sparse_coo, dense, dense -> sparse_coo, dense}, mv_csr_grad{sparse_csr, dense, dense -> sparse_csr, dense} -- backward_api : pow_grad +- backward_op : pow_grad forward : pow(Tensor x, float factor) -> Tensor(out) args : (Tensor x, Tensor out_grad, float factor) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : pow_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, pow_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : relu6_grad +- backward_op : relu6_grad forward : relu6(Tensor x, float threshold) -> Tensor(out) args : (Tensor out, Tensor out_grad, float threshold) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out] kernel : func : relu6_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, relu6_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : relu_grad +- backward_op : relu_grad forward : relu(Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out] kernel : func : relu_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, relu_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : scale_grad +- backward_op : reshape_grad + forward : reshape(Tensor x, IntArray shape) -> Tensor(out) + args : (Tensor x, Tensor out_grad) + output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] + kernel : + func : reshape_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, + reshape_csr_grad {sparse_csr, sparse_csr -> sparse_csr} + +- backward_op : scale_grad forward : scale(Tensor x, float scale, float bias, bool bias_after_scale) -> Tensor(out) args : (Tensor out_grad, float scale) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out_grad] invoke : scale(out_grad, scale, 0.0, true) -- backward_api : sin_grad +- backward_op : sin_grad forward : sin(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : sin_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, sin_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : sinh_grad +- backward_op : sinh_grad forward : sinh(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : sinh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, sinh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : softmax_grad +- backward_op : softmax_grad forward : softmax(Tensor x, int axis=-1) -> Tensor(out) args : (Tensor out, Tensor out_grad, int axis) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out] kernel : func : softmax_csr_grad{sparse_csr, sparse_csr -> sparse_csr} -- backward_api : sparse_coo_tensor_grad - forward : sparse_coo_tensor(Tensor values, Tensor indices, IntArray dense_shape) -> Tensor(out) +- backward_op : sparse_coo_tensor_grad + forward : sparse_coo_tensor(Tensor values, Tensor indices, int64_t[] shape) -> Tensor(out) args : (Tensor indices, Tensor out_grad) output : Tensor(values_grad) + infer_meta : + func : UnchangedInferMeta + param: [out_grad] kernel : func : 
sparse_coo_tensor_grad{dense, sparse_coo -> dense} -- backward_api : sqrt_grad +- backward_op : sqrt_grad forward : sqrt(Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out] kernel : func : sqrt_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, sqrt_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : square_grad +- backward_op : square_grad forward : square(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : square_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, square_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : subtract_grad +- backward_op : subtract_grad forward : subtract(Tensor x, Tensor y) -> Tensor(out) args : (Tensor x, Tensor y, Tensor out_grad) output : Tensor(x_grad), Tensor(y_grad) + infer_meta : + func : GeneralBinaryGradInferMeta + param : [x, y] kernel : func : subtract_coo_coo_grad{sparse_coo, sparse_coo, sparse_coo -> sparse_coo, sparse_coo}, subtract_csr_csr_grad{sparse_csr, sparse_csr, sparse_csr -> sparse_csr, sparse_csr} -- backward_api : tan_grad +- backward_op : sync_batch_norm_grad + forward : sync_batch_norm_(Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) -> Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) + args : (Tensor x, Tensor scale, Tensor bias, Tensor saved_mean, Tensor saved_variance, Tensor reserve_space, Tensor out_grad, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + output : Tensor(x_grad), Tensor(scale_grad), Tensor(bias_grad) + infer_meta : + func : GeneralTernaryGradInferMeta + param : [x, scale, bias] + kernel : + func : sync_batch_norm_coo_grad{sparse_coo, dense, dense, dense, dense, dense, sparse_coo -> sparse_coo, dense, dense} + data_type : out_grad + optional : reserve_space + +- backward_op : tan_grad forward : tan(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : tan_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, tan_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : tanh_grad +- backward_op : tanh_grad forward : tanh(Tensor x) -> Tensor(out) args : (Tensor out, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [out] kernel : func : tanh_coo_grad {sparse_coo, sparse_coo -> sparse_coo}, tanh_csr_grad {sparse_csr, sparse_csr -> sparse_csr} -- backward_api : to_dense_grad +- backward_op : to_dense_grad forward : to_dense(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : coo_to_dense_grad{sparse_coo, dense -> sparse_coo} -- backward_api : to_sparse_coo_grad +- backward_op : to_sparse_coo_grad forward : to_sparse_coo(Tensor x, int64_t sparse_dim) -> Tensor(out) args : (Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta kernel : func : coo_to_dense { sparse_coo -> dense } -- backward_api : values_grad - forward : values_coo(Tensor x) -> Tensor(out) +- backward_op : transpose_grad + forward : transpose(Tensor x, 
int[] perm) -> Tensor(out) + args : (Tensor out_grad, int[] perm) + output : Tensor(x_grad) + infer_meta : + func : TransposeGradInferMeta + param : [out_grad, perm] + kernel : + func : transpose_coo_grad {sparse_coo -> sparse_coo}, + transpose_csr_grad {sparse_csr -> sparse_csr} + +- backward_op : values_grad + forward : values(Tensor x) -> Tensor(out) args : (Tensor x, Tensor out_grad) output : Tensor(x_grad) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : values_coo_grad{sparse_coo, dense-> sparse_coo} -- backward_api: fused_attention_grad - forward : fused_attention_csr(Tensor query, Tensor key, Tensor value, Tensor sparse_mask, Tensor key_padding_mask, Tensor attn_mask) -> Tensor(out), Tensor(softmax) +- backward_op: fused_attention_grad + forward : fused_attention(Tensor query, Tensor key, Tensor value, Tensor sparse_mask, Tensor key_padding_mask, Tensor attn_mask) -> Tensor(out), Tensor(softmax) args: (Tensor query, Tensor key, Tensor value, Tensor softmax, Tensor out_grad) output : Tensor(query_grad), Tensor(key_grad), Tensor(value_grad) + infer_meta : + func : sparse::FusedAttentionGradInferMeta kernel : func : fused_attention_csr_grad{dense, dense, dense, sparse_csr, dense -> dense, dense, dense} layout : softmax diff --git a/paddle/phi/api/yaml/sparse_api.yaml b/paddle/phi/api/yaml/sparse_ops.yaml similarity index 64% rename from paddle/phi/api/yaml/sparse_api.yaml rename to paddle/phi/api/yaml/sparse_ops.yaml index ca40d10b496fab..015e7aef0ff0b0 100644 --- a/paddle/phi/api/yaml/sparse_api.yaml +++ b/paddle/phi/api/yaml/sparse_ops.yaml @@ -1,78 +1,109 @@ -- api : abs +- op : abs args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : abs_coo{sparse_coo -> sparse_coo}, abs_csr{sparse_csr -> sparse_csr} layout : x backward : abs_grad -- api : acos +- op : acos args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : acos_coo{sparse_coo -> sparse_coo}, acos_csr{sparse_csr -> sparse_csr} layout : x backward : acos_grad -- api : acosh +- op : acosh args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : acosh_coo{sparse_coo -> sparse_coo}, acosh_csr{sparse_csr -> sparse_csr} layout : x backward : acosh_grad -- api : add +- op : add args : (Tensor x, Tensor y) output : Tensor(out) + infer_meta : + func : ElementwiseInferMeta kernel : func : add_coo_coo{sparse_coo, sparse_coo -> sparse_coo}, add_csr_csr{sparse_csr, sparse_csr -> sparse_csr} + add_coo_dense{sparse_coo, dense -> sparse_coo}, layout : x backward : add_grad -- api : asin +- op : asin args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : asin_coo{sparse_coo -> sparse_coo}, asin_csr{sparse_csr -> sparse_csr} layout : x backward : asin_grad -- api : asinh +- op : asinh args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : asinh_coo{sparse_coo -> sparse_coo}, asinh_csr{sparse_csr -> sparse_csr} layout : x backward : asinh_grad -- api : atan +- op : atan args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : atan_coo{sparse_coo -> sparse_coo}, atan_csr{sparse_csr -> sparse_csr} layout : x backward : atan_grad -- api : atanh +- op : atanh args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : atanh_coo{sparse_coo -> sparse_coo}, atanh_csr{sparse_csr -> sparse_csr} layout : x backward : atanh_grad -- api : cast 
+- op : batch_norm + args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) + infer_meta : + func : BatchNormInferMeta + kernel : + func : batch_norm_coo {sparse_coo, dense, dense, dense, dense -> sparse_coo, dense, dense, dense, dense, dense} + data_type : x + view : (mean -> mean_out), (variance -> variance_out) + backward : batch_norm_grad + +- op : cast args : (Tensor x, DataType index_dtype=DataType::UNDEFINED, DataType value_dtype=DataType::UNDEFINED) output : Tensor(out) + infer_meta : + func : CastInferMeta + param: [x, value_dtype] kernel : func : cast_coo{sparse_coo -> sparse_coo}, cast_csr{sparse_csr -> sparse_csr} @@ -80,218 +111,276 @@ data_type : x backward : cast_grad -- api : conv3d - args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key) +- op : conv3d + args : (Tensor x, Tensor kernel, int[] paddings, int[] dilations, int[] strides, int groups, bool subm, str key="") output : Tensor(out), Tensor(rulebook), Tensor(counter) + infer_meta : + func : sparse::Conv3dInferMeta kernel : func : conv3d_coo{sparse_coo, dense -> sparse_coo, dense, dense} layout : x intermediate: rulebook, counter - backward : conv3d_coo_grad + backward : conv3d_grad -- api : divide +- op : divide args : (Tensor x, Tensor y) output : Tensor(out) + infer_meta : + func : ElementwiseInferMeta kernel : func : divide_coo_coo{sparse_coo, sparse_coo -> sparse_coo}, divide_csr_csr{sparse_csr, sparse_csr -> sparse_csr} layout : x backward : divide_grad -- api : divide_scalar +- op : divide_scalar args : (Tensor x, float scalar) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : divide_coo_scalar{sparse_coo -> sparse_coo}, divide_csr_scalar{sparse_csr -> sparse_csr} backward : divide_scalar_grad -- api : expm1 +- op : expm1 args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : expm1_coo{sparse_coo -> sparse_coo}, expm1_csr{sparse_csr -> sparse_csr} layout : x backward : expm1_grad -- api : leaky_relu +- op : leaky_relu args : (Tensor x, float alpha) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : leaky_relu_coo{sparse_coo -> sparse_coo}, leaky_relu_csr{sparse_csr -> sparse_csr} layout : x backward : leaky_relu_grad -- api : log1p +- op : log1p args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : log1p_coo{sparse_coo -> sparse_coo}, log1p_csr{sparse_csr -> sparse_csr} layout : x backward : log1p_grad -- api : multiply +- op : multiply args : (Tensor x, Tensor y) output : Tensor(out) + infer_meta : + func : ElementwiseInferMeta kernel : func : multiply_coo_coo{sparse_coo, sparse_coo -> sparse_coo}, multiply_csr_csr{sparse_csr, sparse_csr -> sparse_csr} layout : x backward : multiply_grad -- api : pow +- op : pow args : (Tensor x, float factor) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : pow_coo{sparse_coo -> sparse_coo}, pow_csr{sparse_csr -> sparse_csr} layout : x backward : pow_grad -- api : relu +- op : relu args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : relu_coo{sparse_coo -> sparse_coo}, 
relu_csr{sparse_csr -> sparse_csr} layout : x backward : relu_grad -- api : relu6 +- op : relu6 args : (Tensor x, float threshold) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : relu6_coo{sparse_coo -> sparse_coo}, relu6_csr{sparse_csr -> sparse_csr} layout : x backward : relu6_grad -- api : scale +- op : scale args : (Tensor x, float scale, float bias, bool bias_after_scale) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : scale_coo{sparse_coo -> sparse_coo}, scale_csr{sparse_csr -> sparse_csr} backward : scale_grad -- api : sin +- op : sin args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : sin_coo{sparse_coo -> sparse_coo}, sin_csr{sparse_csr -> sparse_csr} layout : x backward : sin_grad -- api : sinh +- op : sinh args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : sinh_coo{sparse_coo -> sparse_coo}, sinh_csr{sparse_csr -> sparse_csr} layout : x backward : sinh_grad -- api : softmax +- op : softmax args : (Tensor x, int axis=-1) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : softmax_csr{sparse_csr -> sparse_csr} layout : x backward : softmax_grad -- api : sparse_coo_tensor - args : (Tensor values, Tensor indices, IntArray dense_shape) +- op : sparse_coo_tensor + args : (Tensor values, Tensor indices, int64_t[] shape={}) output : Tensor(out) + infer_meta : + func : sparse::SparseCooTensorInferMeta kernel : func : sparse_coo_tensor{dense, dense -> sparse_coo} layout : values data_type : values backward : sparse_coo_tensor_grad -- api : sqrt +- op : sqrt args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : sqrt_coo{sparse_coo -> sparse_coo}, sqrt_csr{sparse_csr -> sparse_csr} layout : x backward : sqrt_grad -- api : square +- op : square args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : square_coo{sparse_coo -> sparse_coo}, square_csr{sparse_csr -> sparse_csr} layout : x backward : square_grad -- api : subtract +- op : subtract args : (Tensor x, Tensor y) output : Tensor(out) + infer_meta : + func : ElementwiseInferMeta kernel : func : subtract_coo_coo{sparse_coo, sparse_coo -> sparse_coo}, subtract_csr_csr{sparse_csr, sparse_csr -> sparse_csr} layout : x backward : subtract_grad -- api : tan +- op : tan args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : tan_coo{sparse_coo -> sparse_coo}, tan_csr{sparse_csr -> sparse_csr} layout : x backward : tan_grad -- api : tanh +- op : tanh args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : tanh_coo{sparse_coo -> sparse_coo}, tanh_csr{sparse_csr -> sparse_csr} layout : x backward : tanh_grad -- api : to_dense +- op : to_dense args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : coo_to_dense {sparse_coo -> dense}, csr_to_dense {sparse_csr -> dense} backward : to_dense_grad -- api : to_sparse_coo +- op : to_sparse_coo args : (Tensor x, int64_t sparse_dim) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [x] kernel : func : dense_to_coo { dense -> sparse_coo }, csr_to_coo { sparse_csr -> sparse_coo} backward : to_sparse_coo_grad -- api : to_sparse_csr +- op : to_sparse_csr args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func : 
dense_to_csr {dense -> sparse_csr}, coo_to_csr {sparse_coo -> sparse_csr} -- api : values +- op : values args : (Tensor x) output : Tensor(out) + infer_meta : + func : sparse::ValuesInferMeta kernel : func : values_coo{sparse_coo -> dense}, values_csr{sparse_csr -> dense} layout : x backward : values_grad -- api: addmm +- op: addmm args : (Tensor input, Tensor x, Tensor y, float alpha=1.0, float beta=1.0) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta + param : [input] kernel : func : addmm_csr_dense {dense, sparse_csr, dense -> dense}, addmm_csr_csr {sparse_csr, sparse_csr, sparse_csr -> sparse_csr}, @@ -300,25 +389,32 @@ layout : x backward: addmm_grad -- api: coalesce +- op: coalesce args : (Tensor x) output : Tensor(out) + infer_meta : + func : UnchangedInferMeta kernel : func: coalesce{sparse_coo -> sparse_coo} layout : x -- api: full_like +- op: full_like args : (Tensor x, Scalar value, DataType dtype=DataType::UNDEFINED) output : Tensor(out) + infer_meta : + func : CreateLikeInferMeta + param : [x, dtype] kernel : func : coo_full_like{sparse_coo -> sparse_coo}, csr_full_like{sparse_csr -> sparse_csr} layout : x data_type : dtype -- api: fused_attention +- op: fused_attention args : (Tensor query, Tensor key, Tensor value, Tensor sparse_mask, Tensor key_padding_mask, Tensor attn_mask) output : Tensor(out), Tensor(softmax) + infer_meta : + func : sparse::FusedAttentionInferMeta kernel : func : fused_attention_csr{dense, dense, dense, sparse_csr, dense, dense -> dense, sparse_csr} layout : sparse_mask @@ -327,17 +423,23 @@ intermediate : softmax backward: fused_attention_grad -- api: masked_matmul +- op: masked_matmul args : (Tensor x, Tensor y, Tensor mask) output : Tensor(out) + infer_meta : + func : MatmulInferMeta + param : [x, y, false, false] kernel : func : masked_matmul_csr{dense, dense, sparse_csr -> sparse_csr} layout : x backward: masked_matmul_grad -- api: matmul +- op: matmul args : (Tensor x, Tensor y) output : Tensor(out) + infer_meta : + func : MatmulInferMeta + param: [x, y, false, false] kernel : func : matmul_csr_dense {sparse_csr, dense -> dense}, matmul_csr_csr {sparse_csr, sparse_csr -> sparse_csr}, @@ -346,20 +448,58 @@ layout : x backward: matmul_grad -- api: maxpool +- op: maxpool args : (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides) output : Tensor(out), Tensor(rulebook), Tensor(counter) + infer_meta : + func : sparse::Pool3dInferMeta kernel : func : maxpool_coo{sparse_coo -> sparse_coo, dense, dense} layout : x intermediate : rulebook, counter backward : maxpool_grad -- api: mv +- op: mv args : (Tensor x, Tensor vec) output : Tensor(out) + infer_meta : + func : MvInferMeta kernel : func : mv_coo{sparse_coo, dense -> dense}, mv_csr{sparse_csr, dense -> dense} layout : x backward: mv_grad + +- op : transpose + args : (Tensor x, int[] perm) + output : Tensor(out) + infer_meta : + func : TransposeInferMeta + param: [ x, perm ] + kernel : + func : transpose_coo{sparse_coo -> sparse_coo}, + transpose_csr{sparse_csr -> sparse_csr} + layout : x + backward : transpose_grad + +- op : sync_batch_norm_ + args : (Tensor x, Tensor scale, Tensor bias, Tensor mean, Tensor variance, float momentum, float epsilon, str data_layout, bool is_test, bool use_global_stats, bool trainable_statistics, bool fuse_with_relu) + output : Tensor(out), Tensor(mean_out), Tensor(variance_out), Tensor(saved_mean), Tensor(saved_variance), Tensor(reserve_space) + infer_meta : + func : BatchNormInferMeta + kernel : + func : 
sync_batch_norm_coo{sparse_coo, dense, dense, dense, dense -> sparse_coo, dense, dense, dense, dense, dense} + data_type : x + backward : sync_batch_norm_grad + inplace : (mean -> mean_out), (variance -> variance_out) + +- op : reshape + args : (Tensor x, IntArray shape) + output : Tensor(out) + infer_meta : + func : ReshapeInferMeta + kernel : + func : reshape_coo{sparse_coo -> sparse_coo}, + reshape_csr{sparse_csr -> sparse_csr} + layout : x + backward : reshape_grad diff --git a/paddle/phi/api/yaml/strings_api.yaml b/paddle/phi/api/yaml/strings_ops.yaml similarity index 93% rename from paddle/phi/api/yaml/strings_api.yaml rename to paddle/phi/api/yaml/strings_ops.yaml index 34dac9221a4a0d..aef16d85d2882e 100644 --- a/paddle/phi/api/yaml/strings_api.yaml +++ b/paddle/phi/api/yaml/strings_ops.yaml @@ -1,4 +1,4 @@ -- api : empty +- op : empty args : (IntArray shape, Place place=CPUPlace()) output : Tensor(out@StringTensor) infer_meta : @@ -9,7 +9,7 @@ param : [shape] backend : place -- api : empty_like +- op : empty_like args : (Tensor x, Place place = {}) output : Tensor(out@StringTensor) infer_meta : @@ -20,7 +20,7 @@ param : [x] backend : place > x -- api : lower +- op : lower args : (Tensor x, bool use_utf8_encoding) output : Tensor(out@StringTensor) infer_meta : @@ -29,7 +29,7 @@ kernel : func : strings_lower -- api : upper +- op : upper args : (Tensor x, bool use_utf8_encoding) output : Tensor(out@StringTensor) infer_meta : diff --git a/paddle/phi/backends/CMakeLists.txt b/paddle/phi/backends/CMakeLists.txt index 9a26aed5f341b5..9bc9573529241e 100644 --- a/paddle/phi/backends/CMakeLists.txt +++ b/paddle/phi/backends/CMakeLists.txt @@ -21,6 +21,7 @@ endif() if(WITH_MKLDNN) list(APPEND BACKENDS_SRCS onednn/onednn_context.cc) + list(APPEND BACKENDS_SRCS onednn/axpy_handler.cc) list(APPEND BACKENDS_DEPS mkldnn) endif() diff --git a/paddle/phi/backends/dynload/cublasLt.h b/paddle/phi/backends/dynload/cublasLt.h index 1e2a20ebdf4407..90492ff4ba69d6 100644 --- a/paddle/phi/backends/dynload/cublasLt.h +++ b/paddle/phi/backends/dynload/cublasLt.h @@ -54,26 +54,28 @@ extern void *cublasLt_dso_handle; // APIs available after CUDA 10.1 // #if CUDA_VERSION >= 10100 -#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ - __macro(cublasLtCreate); \ - __macro(cublasLtDestroy); \ - __macro(cublasLtMatmul); \ - __macro(cublasLtMatmulDescCreate); \ - __macro(cublasLtMatmulDescDestroy); \ - __macro(cublasLtMatmulDescSetAttribute); \ - __macro(cublasLtMatmulDescGetAttribute); \ - __macro(cublasLtMatrixLayoutCreate); \ - __macro(cublasLtMatrixLayoutDestroy); \ - __macro(cublasLtMatrixLayoutSetAttribute); \ - __macro(cublasLtMatrixLayoutGetAttribute); \ - __macro(cublasLtMatmulPreferenceCreate); \ - __macro(cublasLtMatmulPreferenceDestroy); \ - __macro(cublasLtMatmulPreferenceSetAttribute); \ - __macro(cublasLtMatmulAlgoGetHeuristic); \ - __macro(cublasLtMatrixTransform); \ - __macro(cublasLtMatrixTransformDescCreate); \ - __macro(cublasLtMatrixTransformDescDestroy); \ - __macro(cublasLtMatrixTransformDescSetAttribute); +#define CUBLASLT_BLAS_ROUTINE_EACH(__macro) \ + __macro(cublasLtCreate); \ + __macro(cublasLtDestroy); \ + __macro(cublasLtMatmul); \ + __macro(cublasLtMatmulDescCreate); \ + __macro(cublasLtMatmulDescDestroy); \ + __macro(cublasLtMatmulDescSetAttribute); \ + __macro(cublasLtMatmulDescGetAttribute); \ + __macro(cublasLtMatrixLayoutCreate); \ + __macro(cublasLtMatrixLayoutDestroy); \ + __macro(cublasLtMatrixLayoutSetAttribute); \ + __macro(cublasLtMatrixLayoutGetAttribute); \ + 
__macro(cublasLtMatmulPreferenceCreate); \ + __macro(cublasLtMatmulPreferenceDestroy); \ + __macro(cublasLtMatmulPreferenceSetAttribute); \ + __macro(cublasLtMatmulAlgoGetHeuristic); \ + __macro(cublasLtMatrixTransform); \ + __macro(cublasLtMatrixTransformDescCreate); \ + __macro(cublasLtMatrixTransformDescDestroy); \ + __macro(cublasLtMatrixTransformDescSetAttribute); \ + __macro(cublasLtMatmulAlgoInit); \ + __macro(cublasLtMatmulAlgoConfigSetAttribute); CUBLASLT_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLASLT_WRAP) // #endif diff --git a/paddle/phi/backends/dynload/dynamic_loader.cc b/paddle/phi/backends/dynload/dynamic_loader.cc index 36a78695959235..b804e930580db5 100644 --- a/paddle/phi/backends/dynload/dynamic_loader.cc +++ b/paddle/phi/backends/dynload/dynamic_loader.cc @@ -326,7 +326,7 @@ void* GetCublasDsoHandle() { void* GetCublasLtDsoHandle() { // APIs available after CUDA 10.1 -#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10100 +#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10010 return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublasLt.so"); #else std::string warning_msg( diff --git a/paddle/phi/backends/dynload/tensorrt.cc b/paddle/phi/backends/dynload/tensorrt.cc index 45525701020250..2e2319a47cc542 100644 --- a/paddle/phi/backends/dynload/tensorrt.cc +++ b/paddle/phi/backends/dynload/tensorrt.cc @@ -40,21 +40,10 @@ void* GetDsoHandle(const std::string& dso_name) { void* dso_handle = dlopen(dso_name.c_str(), dynload_flags); - if (nullptr == dso_handle) { - auto error_msg = - "You are using Paddle compiled with TensorRT, but TensorRT dynamic " - "library is not found. Ignore this if TensorRT is not needed.\n" - "The TensorRT that Paddle depends on is not configured correctly.\n" - " Suggestions:\n" - " 1. Check if the TensorRT is installed correctly and its version" - " is matched with paddlepaddle you installed.\n" - " 2. 
Configure environment variables as " - "follows:\n" - " - Linux: set LD_LIBRARY_PATH by `export LD_LIBRARY_PATH=...`\n" - " - Windows: set PATH by `set PATH=XXX;%PATH%`\n" - " - Mac: set DYLD_LIBRARY_PATH by `export DYLD_LIBRARY_PATH=...`\n"; - LOG(WARNING) << error_msg; - } + PADDLE_ENFORCE_NOT_NULL(dso_handle, + paddle::platform::errors::NotFound( + "TensorRT is needed, " + "but TensorRT dynamic library is not found.")); return dso_handle; } diff --git a/paddle/phi/backends/gpu/cuda/cuda_helper.h b/paddle/phi/backends/gpu/cuda/cuda_helper.h index 6d33d802b1880a..7463edc5d9ff60 100644 --- a/paddle/phi/backends/gpu/cuda/cuda_helper.h +++ b/paddle/phi/backends/gpu/cuda/cuda_helper.h @@ -62,11 +62,12 @@ namespace gpu { * */ -#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ - int64_t __index__ = \ - static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ - for (index_type i = __index__; __index__ < (num); \ - __index__ += blockDim.x * gridDim.x, i = __index__) +#define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ + int64_t __index__ = \ + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; \ + int64_t __stride__ = static_cast(blockDim.x) * gridDim.x; \ + for (index_type i = __index__; __index__ < (num); \ + __index__ += __stride__, i = __index__) } // namespace gpu } // namespace backends diff --git a/paddle/phi/backends/gpu/gpu_resources.cc b/paddle/phi/backends/gpu/gpu_resources.cc index 02571399143845..4a16480101a703 100644 --- a/paddle/phi/backends/gpu/gpu_resources.cc +++ b/paddle/phi/backends/gpu/gpu_resources.cc @@ -97,6 +97,22 @@ void InitGpuProperties(Place place, (*driver_version / 1000) * 10 + (*driver_version % 100) / 10; auto compile_cuda_version = (CUDA_VERSION / 1000) * 10 + (CUDA_VERSION % 100) / 10; +#if defined(__linux__) + PADDLE_ENFORCE_EQ( + (local_cuda_version / 10 < compile_cuda_version / 10) && + (cudnn_dso_ver / 1000 < CUDNN_VERSION / 1000), + false, + phi::errors::InvalidArgument( + "The installed Paddle is compiled with CUDA%d/cuDNN%d," + "but CUDA/cuDNN version in your machine is CUDA%d/cuDNN%d. " + "which will cause serious incompatible bug. " + "Please recompile or reinstall Paddle with compatible CUDA/cuDNN " + "version.", + compile_cuda_version / 10, + CUDNN_VERSION / 1000, + local_cuda_version / 10, + cudnn_dso_ver / 1000)); +#endif if (local_cuda_version < compile_cuda_version) { LOG_FIRST_N(WARNING, 1) << "WARNING: device: " << static_cast(place.device) diff --git a/paddle/phi/backends/gpu/rocm/rocm_helper.h b/paddle/phi/backends/gpu/rocm/rocm_helper.h index e25dea28e36c10..07fdde5a2f417a 100644 --- a/paddle/phi/backends/gpu/rocm/rocm_helper.h +++ b/paddle/phi/backends/gpu/rocm/rocm_helper.h @@ -65,8 +65,9 @@ namespace gpu { #define CUDA_KERNEL_LOOP_TYPE(i, num, index_type) \ int64_t __index__ = \ static_cast(hipBlockIdx_x) * hipBlockDim_x + hipThreadIdx_x; \ + int64_t __stride__ = static_cast(hipBlockDim_x) * hipGridDim_x; \ for (index_type i = __index__; __index__ < (num); \ - __index__ += hipBlockDim_x * hipGridDim_x, i = __index__) + __index__ += __stride__, i = __index__) } // namespace gpu } // namespace backends diff --git a/paddle/phi/backends/onednn/axpy_handler.cc b/paddle/phi/backends/onednn/axpy_handler.cc new file mode 100644 index 00000000000000..df61948d62215b --- /dev/null +++ b/paddle/phi/backends/onednn/axpy_handler.cc @@ -0,0 +1,133 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/backends/onednn/axpy_handler.h" + +#include +#include +#include +#include + +#include "paddle/phi/backends/onednn/onednn_helper.h" + +namespace phi { +namespace funcs { + +template +class AXPYHandler { + public: + AXPYHandler(const dnnl::engine onednn_engine, int n, float alpha) { + OneDNNContext::tls().log_lib_version(); + auto md = dnnl::memory::desc( + {n}, OneDNNGetDataType(), dnnl::memory::format_tag::x); + src_mem_ = dnnl::memory(md, onednn_engine, DNNL_MEMORY_NONE); + dst_mem_ = dnnl::memory(md, onednn_engine, DNNL_MEMORY_NONE); + dnnl::primitive_attr reorder_attr; + dnnl::post_ops post_operations; + if (alpha != 1.f) { + std::vector scales(1, alpha); + reorder_attr.set_output_scales(0, scales); + } + post_operations.append_sum(1.0f); + + reorder_attr.set_post_ops(post_operations); + reorder_p_ = dnnl::reorder(src_mem_, dst_mem_, reorder_attr); + } + + dnnl::memory &AcquireSrcMemory(const T *x) { + src_mem_.set_data_handle(to_void_cast(x)); + return src_mem_; + } + + dnnl::memory &AcquireDstMemory(T *y) { + dst_mem_.set_data_handle(y); + return dst_mem_; + } + + const dnnl::reorder &AcquireReorder() { return reorder_p_; } + + private: + dnnl::memory src_mem_; + dnnl::memory dst_mem_; + dnnl::reorder reorder_p_; +}; + +template class AXPYHandler; +template class AXPYHandler; + +template +static void naive_axpy(int n, T alpha, const T *x, T *y) { + while (n-- > 0) { + *y += alpha * *x; + ++y; + ++x; + } +} + +template +class OneDNNAXPYHandler::Impl { + public: + Impl(int64_t n, T alpha, const dnnl::engine onednn_engine); + void operator()(const T *x, T *y); + + private: + std::unique_ptr> handler_; + int64_t n_; + T alpha_; +}; + +template +OneDNNAXPYHandler::Impl::Impl(int64_t n, + T alpha, + const dnnl::engine onednn_engine) + : n_{n}, alpha_{alpha} { + handler_ = std::make_unique>( + onednn_engine, n, static_cast(alpha)); +} + +template +void OneDNNAXPYHandler::Impl::operator()(const T *x, T *y) { + if (this->n_ < 100) { + naive_axpy(this->n_, this->alpha_, x, y); + return; + } + + auto &reorder_src_mem_p = handler_->AcquireSrcMemory(x); + auto &reorder_dst_mem_p = handler_->AcquireDstMemory(y); + auto reorder_p = handler_->AcquireReorder(); + auto &astream = OneDNNContext::tls().get_stream(); + reorder_p.execute(astream, reorder_src_mem_p, reorder_dst_mem_p); + astream.wait(); +} + +template +OneDNNAXPYHandler::OneDNNAXPYHandler(int64_t n, + T alpha, + const dnnl::engine onednn_engine) + : pimpl_{new Impl{n, alpha, onednn_engine}, + [](Impl *impl) { delete impl; }} { + VLOG(4) << "[OneDNN] OneDNNAXPYHandler<" << typeid(T).name() << ">, " + << "n: " << n << ", alpha: " << alpha; +} + +template +void OneDNNAXPYHandler::operator()(const T *x, T *y) { + pimpl_->operator()(x, y); +} + +template class OneDNNAXPYHandler; +template class OneDNNAXPYHandler; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/backends/onednn/axpy_handler.h b/paddle/phi/backends/onednn/axpy_handler.h new file mode 100644 index 00000000000000..dd9a8108f59b05 --- /dev/null +++ b/paddle/phi/backends/onednn/axpy_handler.h @@ -0,0 +1,61 @@ +// Copyright (c) 
2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "dnnl.hpp" // NOLINT + +namespace phi { +namespace funcs { +/// +/// @brief Helper class for AXPY execution using oneDNN library. +/// +/// @tparam T Data type. +/// +template +class OneDNNAXPYHandler { + public: + OneDNNAXPYHandler(OneDNNAXPYHandler&) = delete; + OneDNNAXPYHandler(OneDNNAXPYHandler&&) = delete; + OneDNNAXPYHandler& operator=(OneDNNAXPYHandler&) = delete; + OneDNNAXPYHandler& operator=(OneDNNAXPYHandler&&) = delete; + /// + /// @brief Constructor. + /// + /// @param[in] n The number of elements in tensor (assumed 1D + /// tensor) + /// @param[in] alpha The alpha coefficient. + /// @param[in] onednn_engine The oneDNN engine. + /// + OneDNNAXPYHandler(int64_t n, T alpha, dnnl::engine onednn_engine); + /// + /// @brief Executes AXPY. + /// + /// @param[in] x The pointer to input X tensor data. + /// @param[out] y The pointer to output Y tensor data. + /// + void operator()(const T* x, T* y); + + private: + OneDNNAXPYHandler() = delete; + // (arogowie-intel) Private implementation idiom to hide dependency + // on OneDNN headers. + class Impl; + // We need custom deleter, since the compiler is unable to parameterize + // an allocator's default deleter due to incomple type. + std::unique_ptr pimpl_; +}; +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/backends/onednn/onednn_helper.h b/paddle/phi/backends/onednn/onednn_helper.h index aeaecf7491e616..e91e02282ccc0d 100644 --- a/paddle/phi/backends/onednn/onednn_helper.h +++ b/paddle/phi/backends/onednn/onednn_helper.h @@ -96,29 +96,29 @@ inline dnnl::memory::format_tag GetPlainOneDNNFormat(int tensor_rank) { } template -dnnl::memory::data_type oneDNNGetDataType() { +dnnl::memory::data_type OneDNNGetDataType() { return dnnl::memory::data_type::undef; } template <> -inline dnnl::memory::data_type oneDNNGetDataType() { +inline dnnl::memory::data_type OneDNNGetDataType() { return dnnl::memory::data_type::f32; } template <> -inline dnnl::memory::data_type oneDNNGetDataType() { +inline dnnl::memory::data_type OneDNNGetDataType() { return dnnl::memory::data_type::s32; } template <> -inline dnnl::memory::data_type oneDNNGetDataType() { +inline dnnl::memory::data_type OneDNNGetDataType() { return dnnl::memory::data_type::s8; } template <> -inline dnnl::memory::data_type oneDNNGetDataType() { +inline dnnl::memory::data_type OneDNNGetDataType() { return dnnl::memory::data_type::u8; } template <> -inline dnnl::memory::data_type oneDNNGetDataType() { +inline dnnl::memory::data_type OneDNNGetDataType() { return dnnl::memory::data_type::bf16; } diff --git a/paddle/phi/backends/onednn/onednn_reuse.h b/paddle/phi/backends/onednn/onednn_reuse.h index 4a540ec884d935..cd8c076b28503c 100644 --- a/paddle/phi/backends/onednn/onednn_reuse.h +++ b/paddle/phi/backends/onednn/onednn_reuse.h @@ -24,8 +24,11 @@ limitations under the License. 
*/ #include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/onednn/onednn_helper.h" #include "paddle/phi/common/data_type.h" +#include "paddle/phi/common/int_array.h" #include "paddle/phi/common/place.h" +#include "paddle/phi/common/scalar.h" #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/kernels/funcs/axis_utils.h" #include "paddle/phi/kernels/funcs/data_layout_transform.h" namespace phi { @@ -681,6 +684,43 @@ class ActivationOneDNNHandler } }; +template +class SoftmaxOneDNNHandler + : public OneDNNHandlerNoCachingT { + public: + SoftmaxOneDNNHandler(const dnnl::engine onednn_engine, + Place cpu_place, + const DenseTensor* x, + int axis) + : OneDNNHandlerNoCachingT(onednn_engine, + cpu_place) { + const int canonical_axis = funcs::CanonicalAxis(axis, x->dims().size()); + this->AcquireForwardPrimitiveDescriptor( + dnnl::prop_kind::forward_scoring, x->mem_desc(), canonical_axis); + } + + SoftmaxOneDNNHandler(const dnnl::engine onednn_engine, + Place cpu_place, + int axis, + const DenseTensor* out, + const DenseTensor* out_grad) + : OneDNNHandlerNoCachingT(onednn_engine, + cpu_place) { + const int canonical_axis = + funcs::CanonicalAxis(axis, out_grad->dims().size()); + this->AcquireForwardPrimitiveDescriptor( + dnnl::prop_kind::forward_scoring, out->mem_desc(), canonical_axis); + this->AcquireBackwardPrimitiveDescriptor( + out_grad->mem_desc(), out->mem_desc(), canonical_axis); + } +}; + class ReorderOneDNNHandler { public: ReorderOneDNNHandler(std::vector& dims, // NOLINT @@ -785,6 +825,7 @@ class ReorderOneDNNHandler { template class BinaryOneDNNHandler : public OneDNNHandlerNoCachingT { public: + bool use_broadcasting_hack; BinaryOneDNNHandler(const dnnl::algorithm algo, const int axis, const dnnl::engine engine, @@ -795,15 +836,17 @@ class BinaryOneDNNHandler : public OneDNNHandlerNoCachingT { float scale_x, float scale_y, float scale_out, + bool allow_hack, const dnnl::post_ops& post_ops = dnnl::post_ops{}) : OneDNNHandlerNoCachingT(engine, cpu_place) { + use_broadcasting_hack = false; const auto src_x_tz = vectorize(x->dims()); const auto src_y_tz = vectorize(y->dims()); // if output tensor(z) is nullptr then we are computing into oneDNN // managed buffer auto rankdiff = x->dims().size() - y->dims().size(); - const auto dst_tz = (out == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) - : vectorize(out->dims()); + auto dst_tz = (out == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz) + : vectorize(out->dims()); auto src0_md = x->mem_desc(); auto src1_md = y->mem_desc(); @@ -830,13 +873,53 @@ class BinaryOneDNNHandler : public OneDNNHandlerNoCachingT { } src0_md = src0_md.reshape(dims0_ex); } - const auto dst_md = - memory::desc(dst_tz, oneDNNGetDataType(), OneDNNMemoryFormat::any); auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops); + // Workaround for U2++ model which deletes first tensor dimensions to enable + // optimized oneDNNs broadcasting. 
Output tensor is reshaped back afterwards + // at the end of the kernel, after the computation + if (allow_hack && dst_tz.size() == 4 && + src0_md.dims()[2] != src1_md.dims()[2]) { + auto are_strides_plain = [](int64_t* strides, int ndims) { + for (int i = 0; i < ndims - 1; ++i) { + if (strides[i] < strides[i + 1]) { + return false; + } + } + return true; + }; + + auto src0_strides = src0_md.data.format_desc.blocking.strides; + auto src1_strides = src1_md.data.format_desc.blocking.strides; + auto src0_dims = src0_md.dims(); + auto src1_dims = src1_md.dims(); + + bool can_squeeze = src0_dims[0] == src1_dims[0] && + src0_dims[1] == src1_dims[1] && + src0_dims[3] == src1_dims[3]; + + if (can_squeeze && are_strides_plain(src0_strides, 4) && + are_strides_plain(src1_strides, 4)) { + src0_dims[1] *= dst_tz[0]; + src1_dims[1] *= dst_tz[0]; + dst_tz[1] *= dst_tz[0]; + dst_tz.erase(dst_tz.begin()); + src0_md = src0_md.reshape({src0_dims.begin() + 1, src0_dims.end()}); + src1_md = src1_md.reshape({src1_dims.begin() + 1, src1_dims.end()}); + use_broadcasting_hack = true; + } + } + + auto dst_md = + memory::desc(dst_tz, OneDNNGetDataType(), OneDNNMemoryFormat::any); + if (x->numel() < y->numel()) { + if (algo == dnnl::algorithm::binary_sub) { + attributes = CreateAttributes( + algo, -1.0 * scale_x, -1.0 * scale_y, scale_out, post_ops); + } this->AcquireForwardPrimitiveDescriptor( attributes, algo, src1_md, src0_md, dst_md); } else { @@ -902,7 +985,7 @@ class BroadcastDataOneDNNHandler : OneDNNHandlerNoCachingT(engine, cpu_place) { const auto src0_tz = vectorize(out->dims()); const auto src0_md = dnnl::memory::desc( - src0_tz, oneDNNGetDataType(), GetPlainOneDNNFormat(src0_tz.size())); + src0_tz, OneDNNGetDataType(), GetPlainOneDNNFormat(src0_tz.size())); const auto src1_md = x->mem_desc().reshape(extended_x_dims); dnnl::primitive_attr attributes; @@ -937,7 +1020,7 @@ class ReductionOneDNNHandler const dnnl::primitive_attr& attrs = NULL) : OneDNNHandlerNoCachingT(engine, cpu_place) { const auto out_md = memory::desc( - out_tz, oneDNNGetDataType(), dnnl::memory::format_tag::any); + out_tz, OneDNNGetDataType(), dnnl::memory::format_tag::any); if (attrs) this->AcquireForwardPrimitiveDescriptor( @@ -947,5 +1030,124 @@ class ReductionOneDNNHandler algo, x->mem_desc(), out_md, p, eps); } }; + +template +class ClipOneDNNHandler + : public OneDNNHandlerNoCachingT { + public: + ClipOneDNNHandler(const Scalar& min, + const Scalar& max, + const dnnl::engine engine, + Place cpu_place, + const DenseTensor* x) + : OneDNNHandlerNoCachingT(engine, cpu_place) { + float alpha = min.to(); + float beta = max.to(); + + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, + dnnl::algorithm::eltwise_clip_v2, + x->mem_desc(), + alpha, + beta); + } + + ClipOneDNNHandler(const Scalar& min, + const Scalar& max, + const dnnl::engine engine, + Place cpu_place, + const DenseTensor* x, + const DenseTensor* dout) + : OneDNNHandlerNoCachingT(engine, cpu_place) { + float alpha = min.to(); + float beta = max.to(); + + this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training, + dnnl::algorithm::eltwise_clip_v2, + x->mem_desc(), + alpha, + beta); + this->AcquireBackwardPrimitiveDescriptor(dnnl::algorithm::eltwise_clip_v2, + dout->mem_desc(), + x->mem_desc(), + alpha, + beta); + } + std::shared_ptr AcquireBackwardSrcMemory( + const DenseTensor* input) { + const T* input_data = input->data(); + return this->AcquireMemoryFromPrimitive(this->bwd_pd_->src_desc(), + to_void_cast(input_data)); + } +}; 
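The broadcasting workaround added above squeezes the leading dimension of two 4-D operands when they agree on dims 0, 1 and 3 and differ only on dim 2, so oneDNN can broadcast over a 3-D view instead. A minimal, self-contained C++ sketch of that shape-folding check follows; the names are illustrative only (not part of the BinaryOneDNNHandler API), and as a simplification it folds each operand's own leading dimension rather than the destination's.

#include <cstdint>
#include <vector>

// Fold the batch dimension (dim 0) into dim 1 when two 4-D shapes differ
// only in dim 2, mirroring the "can_squeeze" condition used in the handler.
bool TryFoldBatchDim(std::vector<int64_t>* a, std::vector<int64_t>* b) {
  if (a->size() != 4 || b->size() != 4) return false;
  const bool can_squeeze = (*a)[0] == (*b)[0] && (*a)[1] == (*b)[1] &&
                           (*a)[3] == (*b)[3] && (*a)[2] != (*b)[2];
  if (!can_squeeze) return false;
  (*a)[1] *= (*a)[0];
  (*b)[1] *= (*b)[0];
  a->erase(a->begin());
  b->erase(b->begin());
  return true;
}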
+template +class TransposeOneDNNHandler { + public: + TransposeOneDNNHandler(const OneDNNContext& dev_ctx, + std::vector& dims, // NOLINT + std::vector& axis, // NOLINT + dnnl::engine engine) + : dev_ctx_(dev_ctx), + dims_(dims), + axis_(axis), + logical_axis_(dims.size(), 0), + engine_(engine) {} + + std::shared_ptr AcquireSrcMemory(const OneDNNMemoryFormat& fmt, + void* ptr) { + // Make memory descriptor using input format, unless it + // cannot be trusted (nchw) then make up memory fmt manually + for (size_t i = 0; i < this->logical_axis_.size(); ++i) { + this->logical_axis_[i] = i; + } + + auto src_md = fmt != OneDNNMemoryFormat::nchw + ? OneDNNMemDesc(dims_, OneDNNGetDataType(), fmt) + : Axis2MemoryDesc(dims_, logical_axis_); + return std::make_shared(src_md, engine_, ptr); + } + + std::shared_ptr AcquireDstMemory(DenseTensor* output, + Place place) { + auto dst_md = Axis2MemoryDesc(dims_, axis_); + auto dst_data = dev_ctx_.Alloc(output); + return std::make_shared(dst_md, engine_, dst_data); + } + + std::shared_ptr AcquireTranspose( + std::shared_ptr dst_memory_p, + std::shared_ptr src_memory_p) { + return std::make_shared(*(src_memory_p), *(dst_memory_p)); + } + + protected: + dnnl::memory::desc Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT + std::vector& axis // NOLINT + ) { + size_t ndims = axis.size(); + + std::vector strides(ndims); + unsigned int total_stride = 1; + for (int i = ndims - 1; i >= 0; --i) { + strides[axis[i]] = total_stride; + total_stride *= nchw_tz[axis[i]]; + } + dnnl::memory::desc mem_d(nchw_tz, OneDNNGetDataType(), strides); + + return mem_d; + } + + private: + const OneDNNContext& dev_ctx_; + std::vector dims_; + std::vector axis_; + std::vector logical_axis_; + dnnl::engine engine_; +}; } // namespace funcs } // namespace phi diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 6f1774fe8e46ac..b740815305dedf 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -61,7 +61,7 @@ enum class Backend : uint8_t { NUM_BACKENDS, /** - * [ Why we need ALL in baisc kernel key member? ] + * [ Why we need ALL in basic kernel key member? 
] * * For Tensor, ALL represents an illegal Backend, but for Kernel, some * kernels may be device-independent by nature, such as reshape; and when diff --git a/paddle/phi/common/float16.h b/paddle/phi/common/float16.h index 4b0799a1774c17..c401d8c6575e59 100644 --- a/paddle/phi/common/float16.h +++ b/paddle/phi/common/float16.h @@ -14,6 +14,11 @@ #pragma once +#if defined(_M_X64) || defined(__x86_64__) || defined(_M_IX86) || \ + defined(__i386__) +#define __PADDLE_x86__ +#include +#endif #include #include @@ -108,7 +113,7 @@ struct PADDLE_ALIGN(2) float16 { float16_t res = vget_lane_f16(vcvt_f16_f32(tmp), 0); x = *reinterpret_cast(&res); -#elif defined(__F16C__) +#elif defined(__F16C__) and defined(__PADDLE_x86__) x = _cvtss_sh(val, 0); #else diff --git a/paddle/phi/core/compat/arg_map_context.h b/paddle/phi/core/compat/arg_map_context.h index 6cfd18369c973f..d680cda4aeea24 100644 --- a/paddle/phi/core/compat/arg_map_context.h +++ b/paddle/phi/core/compat/arg_map_context.h @@ -108,6 +108,12 @@ class ArgumentMappingContext { virtual bool IsDenseTensorInput(const std::string& name) const = 0; virtual bool IsDenseTensorInputs(const std::string& name) const = 0; virtual bool IsSelectedRowsInput(const std::string& name) const = 0; + + virtual bool IsSparseCooTensorInput(const std::string& name) const = 0; + virtual bool IsSparseCsrTensorInput(const std::string& name) const = 0; + + virtual bool IsSelectedRowsInputs(const std::string& name) const = 0; + // For compatibility with LoDTensorArray virtual bool IsDenseTensorVectorInput(const std::string& name) const = 0; diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h index b578afa7c2b854..10b859fdac2603 100644 --- a/paddle/phi/core/compat/op_utils.h +++ b/paddle/phi/core/compat/op_utils.h @@ -40,7 +40,7 @@ const std::unordered_set standard_kernel_suffixs({ * after 2.0, and can no longer be occupied by the previously abandoned ops. * They are marked here uniformly. 
*/ -const std::unordered_set deprecated_op_names( +static const std::unordered_set deprecated_op_names( {"diag", "flatten", "flatten_grad", diff --git a/paddle/phi/core/kernel_context.h b/paddle/phi/core/kernel_context.h index 7b79138fe78a3b..107a1fe49c98f3 100644 --- a/paddle/phi/core/kernel_context.h +++ b/paddle/phi/core/kernel_context.h @@ -144,6 +144,13 @@ class KernelContext { size_t OutputsSize() const { return outputs_.size(); } size_t AttrsSize() const { return attrs_.size(); } + void ClearInputOutput() { + inputs_.clear(); + input_range_.clear(); + outputs_.clear(); + output_range_.clear(); + } + private: DeviceContext* dev_ctx_; diff --git a/paddle/phi/core/kernel_factory.cc b/paddle/phi/core/kernel_factory.cc index 6e16029ee40b55..a8479f8624ba32 100644 --- a/paddle/phi/core/kernel_factory.cc +++ b/paddle/phi/core/kernel_factory.cc @@ -16,6 +16,11 @@ #include "glog/logging.h" #include "paddle/phi/core/enforce.h" +#if defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP) +#include "paddle/fluid/platform/device/xpu/xpu_op_list.h" +#include "paddle/phi/core/compat/convert_utils.h" +#endif +#include "paddle/phi/core/compat/op_utils.h" DECLARE_bool(enable_api_kernel_fallback); @@ -41,6 +46,17 @@ KernelFactory& KernelFactory::Instance() { return g_op_kernel_factory; } +bool KernelFactory::HasCompatiblePhiKernel(const std::string& op_type) const { + if (deprecated_op_names.find(op_type) == deprecated_op_names.end()) { + if (phi::OpUtilsMap::Instance().Contains(op_type)) { + return true; + } else if (kernels_.find(op_type) != kernels_.end()) { + return true; + } + } + return false; +} + const Kernel& KernelFactory::SelectKernel(const std::string& kernel_name, const KernelKey& kernel_key) const { auto iter = kernels_.find(kernel_name); diff --git a/paddle/phi/core/kernel_factory.h b/paddle/phi/core/kernel_factory.h index 59e91451fff750..ed9280fa475bf5 100644 --- a/paddle/phi/core/kernel_factory.h +++ b/paddle/phi/core/kernel_factory.h @@ -210,7 +210,7 @@ class KernelArgsDef { class Kernel { public: - // for map element contruct + // for map element construct Kernel() = default; explicit Kernel(KernelFn fn, void* variadic_fn) @@ -272,9 +272,7 @@ class KernelFactory { KernelNameMap& kernels() { return kernels_; } - bool HasCompatiblePhiKernel(const std::string& op_type) const { - return kernels_.find(TransToPhiKernelName(op_type)) != kernels_.end(); - } + bool HasCompatiblePhiKernel(const std::string& op_type) const; KernelResult SelectKernelOrThrowError(const std::string& kernel_name, const KernelKey& kernel_key, diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 28c750dd9d9238..7ae01b7c725f0b 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -100,6 +100,24 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); + } else if (arg_type == std::type_index(typeid( + const std::vector&))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid( + const std::vector&))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); + } else if (arg_type == std::type_index(typeid( + const std::vector&))) { + args_def->AppendInput(default_key.backend(), + default_tensor_layout, + default_key.dtype(), + arg_type); } else if (arg_type == std::type_index(typeid(const SelectedRows&))) { args_def->AppendInput(default_key.backend(), 
default_tensor_layout, diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 9237b8eb31a2e1..cdfdefa059cd79 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -270,6 +270,8 @@ struct KernelImpl { PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows); PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); + PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(TensorBase); + PD_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(SelectedRows); PD_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); PD_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_MULTI_INPUT(DenseTensor); diff --git a/paddle/phi/core/meta_tensor.cc b/paddle/phi/core/meta_tensor.cc index 9a008e429dade9..6f45e2a265d726 100644 --- a/paddle/phi/core/meta_tensor.cc +++ b/paddle/phi/core/meta_tensor.cc @@ -39,7 +39,11 @@ int64_t MetaTensor::numel() const { DDim MetaTensor::dims() const { ValidCheck(*this); - return tensor_->dims(); + if (phi::SelectedRows::classof(tensor_)) { + return static_cast(tensor_)->GetCompleteDims(); + } else { + return tensor_->dims(); + } } DataType MetaTensor::dtype() const { @@ -61,8 +65,12 @@ void MetaTensor::set_dims(const DDim& dims) { StringTensorUtils::GetMutableMeta(static_cast(tensor_)) ->dims = dims; } else if (phi::SelectedRows::classof(tensor_)) { - DenseTensorUtils::GetMutableMeta( - static_cast(tensor_)->mutable_value()) + static_cast(tensor_)->set_height(dims[0]); + } else if (phi::SparseCooTensor::classof(tensor_)) { + DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) + ->dims = dims; + } else if (phi::SparseCsrTensor::classof(tensor_)) { + DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) ->dims = dims; } else { PADDLE_THROW(phi::errors::Unimplemented( @@ -81,6 +89,13 @@ void MetaTensor::set_dtype(DataType dtype) { DenseTensorUtils::GetMutableMeta( static_cast(tensor_)->mutable_value()) ->dtype = dtype; + } else if (phi::SparseCooTensor::classof(tensor_)) { + DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) + ->dtype = dtype; + } else if (phi::SparseCsrTensor::classof(tensor_)) { + DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) + ->dtype = dtype; + // No need to set dtype } else { PADDLE_THROW(phi::errors::Unimplemented( "Unsupported settting dtype for `%s`.", tensor_->type_info().name())); @@ -98,6 +113,12 @@ void MetaTensor::set_layout(DataLayout layout) { DenseTensorUtils::GetMutableMeta( static_cast(tensor_)->mutable_value()) ->layout = layout; + } else if (phi::SparseCooTensor::classof(tensor_)) { + DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) + ->layout = layout; + } else if (phi::SparseCsrTensor::classof(tensor_)) { + DenseTensorUtils::GetMutableMeta(static_cast(tensor_)) + ->layout = layout; } else { PADDLE_THROW(phi::errors::Unimplemented( "Unsupported settting layout for `%s`.", tensor_->type_info().name())); @@ -107,6 +128,10 @@ void MetaTensor::set_layout(DataLayout layout) { void MetaTensor::share_lod(const MetaTensor& meta_tensor) { ValidCheck(*this); ValidCheck(meta_tensor); + if (phi::SparseCooTensor::classof(tensor_) || + phi::SparseCsrTensor::classof(tensor_)) { + return; + } if (meta_tensor.lod().size() == 0) { // no need share return; @@ -128,7 +153,9 @@ void MetaTensor::share_lod(const MetaTensor& meta_tensor) { void MetaTensor::share_meta(const MetaTensor& meta_tensor) { ValidCheck(*this); if (phi::DenseTensor::classof(tensor_) || - phi::SelectedRows::classof(tensor_)) { + phi::SelectedRows::classof(tensor_) || 
+ phi::SparseCooTensor::classof(tensor_) || + phi::SparseCsrTensor::classof(tensor_)) { share_dims(meta_tensor); set_dtype(meta_tensor.dtype()); set_layout(meta_tensor.layout()); @@ -139,12 +166,22 @@ void MetaTensor::share_meta(const MetaTensor& meta_tensor) { } } +TensorBase* MetaTensor::tensor() const { return tensor_; } + +bool MetaTensor::is_dense() const { return DenseTensor::classof(tensor_); } +bool MetaTensor::is_selected_rows() const { + return SelectedRows::classof(tensor_); +} + +bool MetaTensor::is_tensor_array() const { return false; } + void MetaTensor::share_dims(const MetaTensor& meta_tensor) { ValidCheck(*this); bool is_dense_tensor = phi::DenseTensor::classof(tensor_); bool is_selected_rows = phi::SelectedRows::classof(tensor_); - if (is_dense_tensor || is_selected_rows) { - set_dims(meta_tensor.dims()); + bool is_sparse_coo = phi::SparseCooTensor::classof(tensor_); + bool is_sparse_csr = phi::SparseCsrTensor::classof(tensor_); + if (is_dense_tensor || is_selected_rows || is_sparse_coo || is_sparse_csr) { if (is_selected_rows) { const auto in_tensor_base = meta_tensor.tensor(); PADDLE_ENFORCE_EQ( @@ -156,6 +193,11 @@ void MetaTensor::share_dims(const MetaTensor& meta_tensor) { auto* selected_rows_in = static_cast(in_tensor_base); selected_rows_out->set_rows(selected_rows_in->rows()); selected_rows_out->set_height(selected_rows_in->height()); + DenseTensorUtils::GetMutableMeta( + static_cast(tensor_)->mutable_value()) + ->dims = selected_rows_in->mutable_value()->dims(); + } else { + set_dims(meta_tensor.dims()); } } else { PADDLE_THROW(phi::errors::Unimplemented( @@ -172,12 +214,14 @@ const LoD& MetaTensor::lod() const { return static_cast(tensor_)->lod(); } else if (phi::SelectedRows::classof(tensor_)) { return static_cast(tensor_)->value().lod(); + } else if (phi::SparseCooTensor::classof(tensor_)) { + return static_cast(tensor_)->non_zero_elements().lod(); + } else if (phi::SparseCsrTensor::classof(tensor_)) { + return static_cast(tensor_)->non_zero_elements().lod(); } else { PADDLE_THROW(phi::errors::Unimplemented("Unsupported getting lod of `%s`.", tensor_->type_info().name())); } } -TensorBase* MetaTensor::tensor() const { return tensor_; } - } // namespace phi diff --git a/paddle/phi/core/meta_tensor.h b/paddle/phi/core/meta_tensor.h index 377d0e9bc4d6d3..1a9dfb0d3c16f0 100644 --- a/paddle/phi/core/meta_tensor.h +++ b/paddle/phi/core/meta_tensor.h @@ -68,6 +68,12 @@ class MetaTensor { virtual bool initialized() const; + virtual bool is_selected_rows() const; + virtual bool is_dense() const; + // TODO(YuanRisheng) This API is for compatible with Fluid + // and it will be deleted in the future. + virtual bool is_tensor_array() const; + virtual operator unspecified_bool_type() const { return tensor_ == nullptr ? 0 : unspecified_bool_true; } diff --git a/paddle/phi/core/selected_rows.h b/paddle/phi/core/selected_rows.h index a71c0471cc431c..c011605809e441 100644 --- a/paddle/phi/core/selected_rows.h +++ b/paddle/phi/core/selected_rows.h @@ -132,10 +132,7 @@ class SelectedRows : public TensorBase, /// \brief Returns the dims of the tensor. /// \return The dims of the tensor. - const DDim& dims() const noexcept override { - return impl_->dims(); - // return phi::make_ddim(dims); - } + const DDim& dims() const noexcept override { return impl_->dims(); } /// \brief Returns the data type of the tensor. /// \return The data type of the tensor. 
diff --git a/paddle/phi/core/sparse_coo_tensor.cc b/paddle/phi/core/sparse_coo_tensor.cc index bf4d601c0b5669..8df031421fee2f 100644 --- a/paddle/phi/core/sparse_coo_tensor.cc +++ b/paddle/phi/core/sparse_coo_tensor.cc @@ -21,34 +21,47 @@ SparseCooTensor::SparseCooTensor() { this->SetMember(non_zero_indices, non_zero_elements, {1}, true); } +SparseCooTensor::SparseCooTensor(SparseCooTensor&& other) { + this->non_zero_elements_ = other.non_zero_elements_; + this->non_zero_indices_ = other.non_zero_indices_; + this->coalesced_ = other.coalesced_; + set_meta(other.meta()); +} + SparseCooTensor::SparseCooTensor(const DenseTensor& non_zero_indices, const DenseTensor& non_zero_elements, const DDim& dims) : non_zero_indices_(non_zero_indices), non_zero_elements_(non_zero_elements), - coalesced_(false), - dims_(dims) {} + coalesced_(false) { + meta_.dims = dims; + meta_.layout = DataLayout::NCHW; + meta_.dtype = non_zero_elements.dtype(); +} SparseCooTensor::SparseCooTensor(DenseTensor&& non_zero_indices, DenseTensor&& non_zero_elements, const DDim& dims) : non_zero_indices_(non_zero_indices), non_zero_elements_(non_zero_elements), - coalesced_(false), - dims_(dims) {} + coalesced_(false) { + meta_.dims = dims; + meta_.layout = DataLayout::NCHW; + meta_.dtype = non_zero_elements.dtype(); +} SparseCooTensor::SparseCooTensor(const SparseCooTensor& other) : non_zero_indices_(other.non_zero_indices_), - non_zero_elements_(other.non_zero_elements_), - dims_(other.dims_) { + non_zero_elements_(other.non_zero_elements_) { this->coalesced_ = other.coalesced_; + set_meta(other.meta()); } SparseCooTensor SparseCooTensor::operator=(const SparseCooTensor& other) { - this->dims_ = other.dims_; - this->non_zero_indices_ = other.non_zero_indices_; this->non_zero_elements_ = other.non_zero_elements_; + this->non_zero_indices_ = other.non_zero_indices_; this->coalesced_ = other.coalesced_; + set_meta(other.meta()); return *this; } @@ -111,8 +124,18 @@ void SparseCooTensor::SetMember(const DenseTensor& non_zero_indices, const bool coalesced) { this->non_zero_indices_ = non_zero_indices; this->non_zero_elements_ = non_zero_elements; - this->dims_ = dims; + this->meta_.dims = dims; + this->coalesced_ = coalesced; +} + +void SparseCooTensor::SetMember(const DenseTensor& non_zero_indices, + const DenseTensor& non_zero_elements, + const SparseTensorMeta& meta, + const bool coalesced) { + this->non_zero_indices_ = non_zero_indices; + this->non_zero_elements_ = non_zero_elements; this->coalesced_ = coalesced; + set_meta(meta); } int32_t SparseCooTensor::sparse_dim() const { @@ -120,7 +143,25 @@ int32_t SparseCooTensor::sparse_dim() const { } int32_t SparseCooTensor::dense_dim() const { - return dims_.size() - sparse_dim(); + return meta_.dims.size() - sparse_dim(); +} + +void SparseCooTensor::set_meta(SparseTensorMeta&& meta) { + PADDLE_ENFORCE(!meta_.valid(), + phi::errors::InvalidArgument( + "Only when the original attribute of Tensor is " + "incomplete, can it be reset.")); + meta_ = std::move(meta); +} + +void SparseCooTensor::set_meta(const SparseTensorMeta& meta) { + PADDLE_ENFORCE( + meta.valid(), + phi::errors::InvalidArgument( + "Input meta is invalid, please check the meta attribute.")); + meta_.dims = meta.dims; + meta_.dtype = meta.dtype; + meta_.layout = meta.layout; } } // namespace phi diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h index ba85a751dc0808..a28229996c8871 100644 --- a/paddle/phi/core/sparse_coo_tensor.h +++ b/paddle/phi/core/sparse_coo_tensor.h @@ -20,6 
+20,8 @@ limitations under the License. */ namespace phi { +class DenseTensorUtils; + /// \brief The SparseCooTensor uses two DenseTensors to represent /// the non zero elements and the indices of non zero elements of /// original DenseTensor. @@ -93,21 +95,19 @@ class SparseCooTensor : public TensorBase, /// \brief Return the number of elements contained in original dense tensor /// \return The number of elements contained in original dense tensor - int64_t numel() const override { return product(dims_); } + int64_t numel() const override { return product(meta_.dims); } /// \brief Returns the dims of the original dense tensor. /// \return The dims of the original dense tensor. - const DDim& dims() const noexcept override { return dims_; } + const DDim& dims() const noexcept override { return meta_.dims; } /// \brief Returns the data type of the tensor. /// \return The data type of the tensor. - DataType dtype() const noexcept override { - return non_zero_elements_.dtype(); - } + DataType dtype() const noexcept override { return meta_.dtype; } /// \brief Returns the data layout of the tensor. /// \return The data layout of the tensor. - DataLayout layout() const noexcept override { return DataLayout::SPARSE_COO; } + DataLayout layout() const noexcept override { return meta_.layout; } /// \brief Returns the data place of the tensor. /// \return The data place of the tensor. @@ -140,6 +140,17 @@ class SparseCooTensor : public TensorBase, const DDim& dims, const bool coalesced = false); + /// \brief set the member of sparse coo tensor. + /// \param non_zero_indices The indices of non zero elements in original dense + /// tensor. + /// \param non_zero_elements The non zero elements of original dense tensor. + /// \param meta The meta of original dense tensor. + /// \param coalesced whether the indices has coalesced. + void SetMember(const DenseTensor& non_zero_indices, + const DenseTensor& non_zero_elements, + const SparseTensorMeta& meta, + const bool coalesced = false); + /// \brief Get a mutable pointer of non_zero_indices_. /// return a mutable pointer of non_zero_indices_. DenseTensor* mutable_indices() { return &non_zero_indices_; } @@ -161,15 +172,22 @@ class SparseCooTensor : public TensorBase, DataType dtype, size_t requested_size = 0) override; - /// \brief set the dims of original dense tensor - void set_dims(const DDim& dims) { this->dims_ = dims; } - /// \brief get the sparse dim int32_t sparse_dim() const; /// \brief get the dnese dim int32_t dense_dim() const; + /// \brief Returns the meta information of the tensor. + /// \return The meta information of the tensor. 
+ const SparseTensorMeta& meta() const noexcept { return meta_; } + + void set_meta(SparseTensorMeta&& meta); + + void set_meta(const SparseTensorMeta& meta); + + void set_dims(const DDim& dims) { meta_.dims = dims; } + /// \brief query table according to key const std::pair* IndicesPairs( const std::string& key) const { @@ -213,6 +231,10 @@ class SparseCooTensor : public TensorBase, } private: + friend class DenseTensorUtils; + + SparseTensorMeta meta_; + // save the indices of non zero elements in original dense tensor DenseTensor non_zero_indices_; // save the non zero elements of original dense tensor @@ -252,7 +274,7 @@ class SparseCooTensor : public TensorBase, [0, 0, 0, 0]] dims_ = (4, 4) non_zero_elements_ = [[0, 1, 0, 0], [0, 0, 4, 0]] - non_zero_indices_ = [0, 2], + non_zero_indices_ = [[0, 2], [1, 2]] */ }; diff --git a/paddle/phi/core/sparse_csr_tensor.cc b/paddle/phi/core/sparse_csr_tensor.cc index 45131f48338547..5c793048ea3060 100644 --- a/paddle/phi/core/sparse_csr_tensor.cc +++ b/paddle/phi/core/sparse_csr_tensor.cc @@ -21,7 +21,6 @@ SparseCsrTensor::SparseCsrTensor() { this->non_zero_crows_ = crows; this->non_zero_cols_ = cols; this->non_zero_elements_ = values; - this->dims_ = phi::make_ddim({1, 1}); } inline void check_shape(const DDim& dims) { @@ -54,27 +53,30 @@ SparseCsrTensor::SparseCsrTensor(const DenseTensor& non_zero_crows, const DDim& dims) : non_zero_crows_(non_zero_crows), non_zero_cols_(non_zero_cols), - non_zero_elements_(non_zero_elements), - dims_(dims) { + non_zero_elements_(non_zero_elements) { if (non_zero_crows.initialized()) { - Check(non_zero_crows_, non_zero_cols_, non_zero_elements_, dims_); + Check(non_zero_crows_, non_zero_cols_, non_zero_elements_, dims); } else { // create a empty tensor check_shape(dims); } + meta_.dims = dims; + meta_.layout = DataLayout::NCHW; + meta_.dtype = non_zero_elements.dtype(); } SparseCsrTensor::SparseCsrTensor(const SparseCsrTensor& other) : non_zero_crows_(other.non_zero_crows_), non_zero_cols_(other.non_zero_cols_), - non_zero_elements_(other.non_zero_elements_), - dims_(other.dims_) {} + non_zero_elements_(other.non_zero_elements_) { + set_meta(other.meta()); +} SparseCsrTensor& SparseCsrTensor::operator=(const SparseCsrTensor& other) { - this->dims_ = other.dims(); - this->non_zero_crows_ = other.crows(); - this->non_zero_cols_ = other.cols(); - this->non_zero_elements_ = other.values(); + this->non_zero_crows_ = other.non_zero_crows(); + this->non_zero_cols_ = other.non_zero_cols(); + this->non_zero_elements_ = other.non_zero_elements(); + set_meta(other.meta()); return *this; } @@ -114,7 +116,35 @@ void SparseCsrTensor::SetMember(const DenseTensor& non_zero_crows, this->non_zero_crows_ = non_zero_crows; this->non_zero_cols_ = non_zero_cols; this->non_zero_elements_ = non_zero_elements; - this->dims_ = dims; + meta_.dims = dims; +} + +void SparseCsrTensor::SetMember(const DenseTensor& non_zero_crows, + const DenseTensor& non_zero_cols, + const DenseTensor& non_zero_elements, + const SparseTensorMeta& meta) { + Check(non_zero_crows, non_zero_cols, non_zero_elements, meta.dims); + this->non_zero_crows_ = non_zero_crows; + this->non_zero_cols_ = non_zero_cols; + this->non_zero_elements_ = non_zero_elements; + set_meta(meta); } +void SparseCsrTensor::set_meta(SparseTensorMeta&& meta) { + PADDLE_ENFORCE(!meta_.valid(), + phi::errors::InvalidArgument( + "Only when the original attribute of Tensor is " + "incomplete, can it be reset.")); + meta_ = std::move(meta); +} + +void SparseCsrTensor::set_meta(const 
SparseTensorMeta& meta) { + PADDLE_ENFORCE( + meta.valid(), + phi::errors::InvalidArgument( + "Input meta is invalid, please check the meta attribute.")); + meta_.dims = meta.dims; + meta_.dtype = meta.dtype; + meta_.layout = meta.layout; +} } // namespace phi diff --git a/paddle/phi/core/sparse_csr_tensor.h b/paddle/phi/core/sparse_csr_tensor.h index ee47e39f97fda4..2acb35915a9c36 100644 --- a/paddle/phi/core/sparse_csr_tensor.h +++ b/paddle/phi/core/sparse_csr_tensor.h @@ -14,14 +14,13 @@ limitations under the License. */ #pragma once -#include "paddle/phi/core/allocator.h" #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/tensor_base.h" #include "paddle/phi/core/tensor_meta.h" namespace phi { -class CompatibleDenseTensorUtils; +class DenseTensorUtils; /// \brief The SparseCsrTensor uses three 1-D DenseTensors to represent /// the row index , column index and non zero elements of the original @@ -100,21 +99,19 @@ class SparseCsrTensor : public TensorBase, /// \brief Return the number of elements contained in original dense tensor /// \return The number of elements contained in original dense tensor - int64_t numel() const override { return product(dims_); } + int64_t numel() const override { return product(meta_.dims); } /// \brief Returns the dims of the original dense tensor. /// \return The dims of the original dense tensor. - const DDim& dims() const noexcept override { return dims_; } + const DDim& dims() const noexcept override { return meta_.dims; } /// \brief Returns the data type of the tensor. /// \return The data type of the tensor. - DataType dtype() const noexcept override { - return non_zero_elements_.dtype(); - } + DataType dtype() const noexcept override { return meta_.dtype; } /// \brief Returns the data layout of the tensor. /// \return The data layout of the tensor. - DataLayout layout() const noexcept override { return DataLayout::SPARSE_CSR; } + DataLayout layout() const noexcept override { return meta_.layout; } /// \brief Returns the data place of the tensor. /// \return The data place of the tensor. @@ -145,6 +142,18 @@ class SparseCsrTensor : public TensorBase, const DenseTensor& non_zero_elements, const DDim& dims); + /// \brief set the member of sparse csr tensor. + /// \param non_zero_crows The compresessed row index of non zero elements in + /// original dense tensor. + /// \param non_zero_cols The column index of non zero elements in original + /// dense tensor. + /// \param non_zero_elements The non zero elements of original dense tensor. + /// \param meta The meta of original dense tensor. + void SetMember(const DenseTensor& non_zero_crows, + const DenseTensor& non_zero_cols, + const DenseTensor& non_zero_elements, + const SparseTensorMeta& meta); + /// \brief Get a mutable pointer of non_zero_crows. /// return a mutable pointer of non_zero_crows. DenseTensor* mutable_crows() { return &non_zero_crows_; } @@ -169,18 +178,28 @@ class SparseCsrTensor : public TensorBase, /// mutable_values() DenseTensor* mutable_non_zero_elements() { return &non_zero_elements_; } + /// \brief Returns the meta information of the tensor. + /// \return The meta information of the tensor. 
+ const SparseTensorMeta& meta() const noexcept { return meta_; } + + void set_meta(SparseTensorMeta&& meta); + + void set_meta(const SparseTensorMeta& meta); + /// \brief set the dims of original dense tensor - void set_dims(const DDim& dims) { this->dims_ = dims; } + void set_dims(const DDim& dims) { meta_.dims = dims; } + + protected: + SparseTensorMeta meta_; private: + friend class DenseTensorUtils; // save the compressed rows information of non zero elements DenseTensor non_zero_crows_; // save the columns information of non zero elements DenseTensor non_zero_cols_; // save the non zero elements DenseTensor non_zero_elements_; - // save the number of non zero elements in each batch - DDim dims_; /* --------------------------- */ /* example: 2-D Tensor */ /* --------------------------- */ @@ -190,7 +209,7 @@ class SparseCsrTensor : public TensorBase, [0, 0, 4, 0], [0, 5, 0, 6]] dims_ = (4, 4) - non_zero_elements_ = [1, 2, 3, 4, 5 ,6] + non_zero_elements_ = [1, 2, 3, 4, 5, 6] non_zero_crows_ = [0, 1, 3, 4, 6] non_zero_cols_ = [1, 0, 3, 2, 1, 3] */ @@ -209,7 +228,7 @@ class SparseCsrTensor : public TensorBase, [0, 0, 4, 0], [0, 5, 0, 0]]] dims_ = (2, 4, 4) - non_zero_elements_ = [1, 2, 3, 4, 5 ,6, 1, 2, 3, 4, 5] + non_zero_elements_ = [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5] non_zero_crows_ = [0, 1, 3, 4, 6, 0, 1, 2, 4, 5] non_zero_cols_ = [1, 0, 3, 2, 1, 3, 1, 0, 3, 2, 1] */ diff --git a/paddle/phi/core/string_tensor.h b/paddle/phi/core/string_tensor.h index 0391099faab71b..80d6b69aa6c663 100644 --- a/paddle/phi/core/string_tensor.h +++ b/paddle/phi/core/string_tensor.h @@ -123,7 +123,7 @@ class StringTensor : public TensorBase, } void* AllocateFrom(Allocator* allocator, DataType dtype, - size_t requested_size = 0); + size_t requested_size = 0) override; dtype::pstring* mutable_data(const phi::Place& place, size_t requested_size = 0); diff --git a/paddle/phi/core/tensor_array.h b/paddle/phi/core/tensor_array.h index ade33099eee312..6d834a9375a26e 100644 --- a/paddle/phi/core/tensor_array.h +++ b/paddle/phi/core/tensor_array.h @@ -83,7 +83,7 @@ class TensorArray : public TensorBase, /// \return Void pointer void* AllocateFrom(Allocator* allocator, DataType dtype, - size_t requested_size = 0); + size_t requested_size = 0) override; bool empty() const { return tensors_.empty(); } diff --git a/paddle/phi/core/tensor_meta.cc b/paddle/phi/core/tensor_meta.cc index 0140ec23937dae..da08802576838a 100644 --- a/paddle/phi/core/tensor_meta.cc +++ b/paddle/phi/core/tensor_meta.cc @@ -48,4 +48,16 @@ bool StringTensorMeta::valid() const noexcept { return valid; } +SparseTensorMeta::SparseTensorMeta(const DDim& dims) : dims(dims) {} + +SparseTensorMeta::SparseTensorMeta(const DDim& dims, const DataLayout& layout) + : dims(dims), layout(layout) {} + +bool SparseTensorMeta::valid() const noexcept { + bool valid{true}; + valid = valid && (layout != DataLayout::UNDEFINED); + valid = valid && (product(dims) >= 0); + return valid; +} + } // namespace phi diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index 18f276f8b62ea5..8969ef16d95bfd 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -99,4 +99,24 @@ inline bool operator==(const StringTensorMeta& lhs, (lhs.offset == rhs.offset); } +struct SparseTensorMeta { + using DataLayout = paddle::experimental::DataLayout; + + SparseTensorMeta() = default; + explicit SparseTensorMeta(const DDim& dims); + explicit SparseTensorMeta(const DDim& dims, const DataLayout& layout); + /// \brief Test whether the metadata is 
valid. Does not throw exceptions. + /// \return Whether the metadata is valid. + bool valid() const noexcept; + + DDim dims; + DataType dtype; + DataLayout layout{DataLayout::NCHW}; +}; + +inline bool operator==(const SparseTensorMeta& lhs, + const SparseTensorMeta& rhs) { + return (lhs.dims == rhs.dims) && (lhs.layout == rhs.layout); +} + } // namespace phi diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index dcd25180e29976..6e87f40ed0ab07 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -296,7 +296,7 @@ void Copy(const Context& dev_ctx, dst_place, blocking, dst->mutable_non_zero_elements()); - dst->set_dims(src.dims()); + dst->set_meta(src.meta()); dst->SetCoalesced(src.coalesced()); } diff --git a/paddle/phi/core/tensor_utils.h b/paddle/phi/core/tensor_utils.h index c478e3e0895763..ceb46e2abecd7a 100644 --- a/paddle/phi/core/tensor_utils.h +++ b/paddle/phi/core/tensor_utils.h @@ -28,6 +28,14 @@ class DenseTensorUtils { return &(tensor->meta_); } + static SparseTensorMeta* GetMutableMeta(SparseCooTensor* tensor) { + return &(tensor->meta_); + } + + static SparseTensorMeta* GetMutableMeta(SparseCsrTensor* tensor) { + return &(tensor->meta_); + } + static const std::shared_ptr& GetHolder( const DenseTensor& tensor) { return tensor.holder_; diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index 9877149dc52bd8..ecb51f2c0c48d5 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -19,8 +19,6 @@ limitations under the License. */ #include "paddle/phi/common/data_type.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/kernels/funcs/eigen/extensions.h" - namespace phi { #define _PhiForEachDataTypeHelper_(callback, cpp_type, data_type) \ diff --git a/paddle/phi/infermeta/CMakeLists.txt b/paddle/phi/infermeta/CMakeLists.txt index 92b64ab4e666ab..b896bb818fa778 100644 --- a/paddle/phi/infermeta/CMakeLists.txt +++ b/paddle/phi/infermeta/CMakeLists.txt @@ -7,3 +7,4 @@ cc_library( SRCS backward.cc DEPS meta_tensor convert_utils) add_subdirectory(strings) +add_subdirectory(sparse) diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index 7f3c91181aa56a..957d942afaafdb 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -2342,6 +2342,28 @@ void SearchsortedInferMeta(const MetaTensor& sorted_sequence, } } +void SoftmaxMaskFuseInferMeta(const MetaTensor& x, + const MetaTensor& mask, + MetaTensor* out) { + auto x_dims = x.dims(); + auto mask_dims = mask.dims(); + + PADDLE_ENFORCE_EQ( + x_dims.size(), + 4, + phi::errors::InvalidArgument("Input x must be in 4D dimension but " + "received the dimension of X is %d", + x_dims.size())); + PADDLE_ENFORCE_EQ( + mask_dims.size(), + 4, + phi::errors::InvalidArgument("Input mask must be in 4D dimension but " + "received the dimension of mask is %d", + mask_dims.size())); + + out->share_meta(x); +} + void SegmentPoolInferMeta(const MetaTensor& x, const MetaTensor& segment_ids, const std::string& pooltype, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index e91470d32b6644..59fedfe2550690 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -358,6 +358,10 @@ void SearchsortedInferMeta(const MetaTensor& sorted_sequence, bool right, MetaTensor* out); +void SoftmaxMaskFuseInferMeta(const MetaTensor& x, + const MetaTensor& mask, + MetaTensor* out); + void SegmentPoolInferMeta(const MetaTensor& x, const MetaTensor& 
segment_ids, const std::string& pooltype, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 56dc40cc7c9a69..375b88493a92ba 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -301,6 +301,10 @@ void AddNInferMeta(const std::vector& x, phi::DDim in_dim({0}); for (size_t i = 0; i < x.size(); ++i) { auto x_dim = x[i]->dims(); + // x_dim.size() == 1 means the real dim of selected rows is [0] + if (x[i]->is_selected_rows() && x_dim.size() == 1) { + continue; + } if (phi::product(x_dim) == 0) { continue; } @@ -355,6 +359,31 @@ void AddNInferMeta(const std::vector& x, out->share_lod(*x[0]); } +// TODO(YuanRisheng) This InferMeta is used in Fluid +// and will be deleted in the future. +void AddNTensorArrayInferMeta(const std::vector& x, + MetaTensor* out, + MetaConfig config) { + int64_t max_length = 0; + bool has_tensor_array = false; + for (auto input : x) { + if (input->is_tensor_array()) { + has_tensor_array = true; + // if input is lod_tensor_array, dims() will return its size (one element) + max_length = + input->dims()[0] > max_length ? input->dims()[0] : max_length; + } + } + + if (has_tensor_array) { + if (out->is_tensor_array()) { + out->set_dims(make_ddim({max_length})); + } + } else { + AddNInferMeta(x, out, config); + } +} + void AucInferMeta(const MetaTensor& input, const MetaTensor& label, const MetaTensor& stat_pos, @@ -609,6 +638,7 @@ void BatchNormInferMeta(const MetaTensor& x, saved_variance->set_dims({C}); } y->share_lod(x); + y->set_dtype(x.dtype()); } void BatchNormInferInferMeta(const MetaTensor& x, @@ -2160,6 +2190,14 @@ void MultiplexInferMeta(const std::vector& ins, phi::errors::PreconditionNotMet( "All the candidate tensors must have the same size.")); } + + PADDLE_ENFORCE_GE( + in_dim[0], + ids_dim[0], + phi::errors::InvalidArgument("The 2nd-dim of input cannot be smaller " + "than batchSize of the index tensor.")); + + in_dim[0] = ids_dim[0]; out->set_dims(in_dim); out->set_dtype(ins[0]->dtype()); } diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index 4e95303f1a0256..8c601182e8fc84 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -123,6 +123,10 @@ void AddNInferMeta(const std::vector& x, MetaTensor* out, MetaConfig config = MetaConfig()); +void AddNTensorArrayInferMeta(const std::vector& x, + MetaTensor* out, + MetaConfig config); + void AucInferMeta(const MetaTensor& input, const MetaTensor& label, const MetaTensor& stat_pos, diff --git a/paddle/phi/infermeta/sparse/CMakeLists.txt b/paddle/phi/infermeta/sparse/CMakeLists.txt new file mode 100644 index 00000000000000..8717ef2cf6fdd2 --- /dev/null +++ b/paddle/phi/infermeta/sparse/CMakeLists.txt @@ -0,0 +1,9 @@ +cc_library( + sparse_infermeta + SRCS unary.cc binary.cc multiary.cc + DEPS convert_utils infermeta_utils) + +cc_library( + sparse_backward_infermeta + SRCS backward.cc + DEPS meta_tensor convert_utils) diff --git a/paddle/phi/infermeta/sparse/backward.cc b/paddle/phi/infermeta/sparse/backward.cc new file mode 100644 index 00000000000000..d09c0e6fb84b3e --- /dev/null +++ b/paddle/phi/infermeta/sparse/backward.cc @@ -0,0 +1,35 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/infermeta/sparse/backward.h" +#include "paddle/phi/infermeta/unary.h" + +#include "paddle/phi/core/infermeta_utils.h" + +namespace phi { +namespace sparse { + +void FusedAttentionGradInferMeta(const MetaTensor& query, + const MetaTensor& key, + const MetaTensor& value, + const MetaTensor& softmax, + const MetaTensor& out_grad, + MetaTensor* query_grad, + MetaTensor* key_grad, + MetaTensor* value_grad) { + // TODO(zhouwei, zhangkaihuo) add correct infer meta +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/infermeta/sparse/backward.h b/paddle/phi/infermeta/sparse/backward.h new file mode 100644 index 00000000000000..e5c797923dfbc5 --- /dev/null +++ b/paddle/phi/infermeta/sparse/backward.h @@ -0,0 +1,33 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/tensor_meta.h" + +namespace phi { +namespace sparse { + +void FusedAttentionGradInferMeta(const MetaTensor& query, + const MetaTensor& key, + const MetaTensor& value, + const MetaTensor& softmax, + const MetaTensor& out_grad, + MetaTensor* query_grad, + MetaTensor* key_grad, + MetaTensor* value_grad); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/infermeta/sparse/binary.cc b/paddle/phi/infermeta/sparse/binary.cc new file mode 100644 index 00000000000000..650ab646639bbb --- /dev/null +++ b/paddle/phi/infermeta/sparse/binary.cc @@ -0,0 +1,147 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/infermeta/sparse/binary.h" + +namespace phi { +namespace sparse { + +inline void GetOutShape(const DDim& x_dims, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + DDim* out_dims) { + PADDLE_ENFORCE_EQ( + x_dims.size(), + 5, + phi::errors::InvalidArgument("the shape of x should be (N, D, H, W, C)")); + PADDLE_ENFORCE_EQ(kernel_sizes.size(), + 5, + phi::errors::InvalidArgument( + "the shape of kernel should be (D, H, W, C, OC)")); + + // infer out shape + (*out_dims)[0] = x_dims[0]; + (*out_dims)[4] = kernel_sizes[4]; + for (int i = 1; i < 4; i++) { + (*out_dims)[i] = (x_dims[i] + 2 * paddings[i - 1] - + dilations[i - 1] * (kernel_sizes[i - 1] - 1) - 1) / + strides[i - 1] + + 1; + } +} + +inline void ResetSubmKernelSizeAndStrides(const DDim& kernel_dims, + std::vector* paddings, + std::vector* strides) { + for (uint64_t i = 0; i < paddings->size(); i++) { + (*paddings)[i] = kernel_dims[i] / 2; + (*strides)[i] = 1; + } +} + +void Conv3dInferMeta(const MetaTensor& x, + const MetaTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + MetaTensor* out, + MetaTensor* rulebook, + MetaTensor* counter) { + const auto& x_dims = x.dims(); + const auto& kernel_dims = kernel.dims(); + DDim out_dims = {1, 1, 1, 1, 1}; + + std::vector kernel_sizes(kernel_dims.size()); + for (int i = 0; i < kernel_dims.size(); i++) { + kernel_sizes[i] = kernel_dims[i]; + } + + std::vector subm_paddings(paddings), subm_strides(strides); + if (subm) { + // the out shape of subm_conv is same as input shape + // reset the padding=kernel_size/2 and strides=1 + ResetSubmKernelSizeAndStrides(kernel.dims(), &subm_paddings, &subm_strides); + } + + GetOutShape( + x_dims, kernel_sizes, subm_paddings, dilations, subm_strides, &out_dims); + + out->set_dtype(x.dtype()); + out->set_dims(out_dims); + out->set_layout(x.layout()); + + rulebook->set_dtype(DataType::INT32); + rulebook->set_layout(DataLayout::NCHW); + rulebook->set_dims({1}); + + counter->set_dtype(DataType::INT32); + counter->set_layout(DataLayout::NCHW); + counter->set_dims({1}); +} + +inline const std::vector PoolResetKernel( + const std::vector& kernel_sizes, + const int in_channels, + const int out_channels) { + std::vector res(kernel_sizes); + res.resize(5); + res[3] = in_channels; + res[4] = out_channels; + return res; +} + +void Pool3dInferMeta(const MetaTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + MetaTensor* out, + MetaTensor* rulebook, + MetaTensor* counter) { + const auto& x_dims = x.dims(); + DDim out_dims = {1, 1, 1, 1, 1}; + + const std::vector& real_kernel_sizes = + PoolResetKernel(kernel_sizes, x_dims[4], x_dims[4]); + GetOutShape( + x_dims, real_kernel_sizes, paddings, dilations, strides, &out_dims); + out->set_dtype(x.dtype()); + out->set_dims(out_dims); + out->set_layout(x.layout()); + + rulebook->set_dtype(DataType::INT32); + rulebook->set_layout(DataLayout::NCHW); + rulebook->set_dims({1}); + + counter->set_dtype(DataType::INT32); + counter->set_layout(DataLayout::NCHW); + counter->set_dims({1}); +} + +void SparseCooTensorInferMeta(const MetaTensor& values, + const MetaTensor& indices, + const std::vector& shape, + MetaTensor* out) { + out->set_dims(phi::make_ddim(shape)); + out->set_dtype(values.dtype()); + 
out->set_layout(values.layout()); +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/infermeta/sparse/binary.h b/paddle/phi/infermeta/sparse/binary.h new file mode 100644 index 00000000000000..a2c3e6fe5705c5 --- /dev/null +++ b/paddle/phi/infermeta/sparse/binary.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/tensor_meta.h" + +namespace phi { +namespace sparse { + +void Conv3dInferMeta(const MetaTensor& x, + const MetaTensor& kernel, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + const int groups, + const bool subm, + const std::string& key, + MetaTensor* out, + MetaTensor* rulebook, + MetaTensor* counter); + +void Pool3dInferMeta(const MetaTensor& x, + const std::vector& kernel_sizes, + const std::vector& paddings, + const std::vector& dilations, + const std::vector& strides, + MetaTensor* out, + MetaTensor* rulebook, + MetaTensor* counter); + +void SparseCooTensorInferMeta(const MetaTensor& values, + const MetaTensor& indices, + const std::vector& shape, + MetaTensor* out); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/infermeta/sparse/multiary.cc b/paddle/phi/infermeta/sparse/multiary.cc new file mode 100644 index 00000000000000..fc940239d40549 --- /dev/null +++ b/paddle/phi/infermeta/sparse/multiary.cc @@ -0,0 +1,32 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/infermeta/sparse/multiary.h" + +namespace phi { +namespace sparse { + +void FusedAttentionInferMeta(const MetaTensor& query, + const MetaTensor& key, + const MetaTensor& value, + const MetaTensor& sparse_mask, + const MetaTensor& key_padding_mask, + const MetaTensor& attn_mask, + MetaTensor* out, + MetaTensor* softmax) { + // TODO(zhouwei,zhangkaihuo) add correct infer meta +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/infermeta/sparse/multiary.h b/paddle/phi/infermeta/sparse/multiary.h new file mode 100644 index 00000000000000..20070e2cd9d63b --- /dev/null +++ b/paddle/phi/infermeta/sparse/multiary.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/meta_tensor.h" + +namespace phi { +namespace sparse { + +void FusedAttentionInferMeta(const MetaTensor& query, + const MetaTensor& key, + const MetaTensor& value, + const MetaTensor& sparse_mask, + const MetaTensor& key_padding_mask, + const MetaTensor& attn_mask, + MetaTensor* out, + MetaTensor* softmax); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/infermeta/sparse/unary.cc b/paddle/phi/infermeta/sparse/unary.cc new file mode 100644 index 00000000000000..f80f18bbba857a --- /dev/null +++ b/paddle/phi/infermeta/sparse/unary.cc @@ -0,0 +1,40 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/infermeta/sparse/unary.h" + +#include "paddle/phi/core/infermeta_utils.h" + +namespace phi { +namespace sparse { + +void IndicesInferMeta(const MetaTensor& x, MetaTensor* out) { + // TODO(zhangkaihuo) Currently, we cannot get sparse_dim from tensor. + // correct shape is: shape[0] = x.sparse_dim() + // In the 3D point cloud model: + // the input x is 5-D tensor, non_zero_elements is 1-D tensor + out->set_dims({x.dims().size() - 1, -1}); + out->set_dtype(DataType::INT32); + out->set_layout(DataLayout::NCHW); +} + +void ValuesInferMeta(const MetaTensor& x, MetaTensor* out) { + const auto& x_dims = x.dims(); + out->set_dims({-1, x_dims[x_dims.size() - 1]}); + out->set_dtype(x.dtype()); + out->set_layout(x.layout()); +} + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/infermeta/sparse/unary.h b/paddle/phi/infermeta/sparse/unary.h new file mode 100644 index 00000000000000..880e90b7ae697f --- /dev/null +++ b/paddle/phi/infermeta/sparse/unary.h @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/meta_tensor.h" +#include "paddle/phi/core/tensor_meta.h" + +namespace phi { +namespace sparse { + +void IndicesInferMeta(const MetaTensor& x, MetaTensor* out); + +void ValuesInferMeta(const MetaTensor& x, MetaTensor* out); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index d07df3d36aa9eb..b68d4ab0a17b1b 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -2035,8 +2035,8 @@ void NMSInferMeta(const MetaTensor& x, float threshold, MetaTensor* out) { "whose shape must be [N, 4] " "N is the number of boxes " "in last dimension in format [x1, x2, y1, y2]. ")); - auto num_boxes = boxes_dim[0]; - out->set_dims(phi::make_ddim({num_boxes})); + out->set_dims(phi::make_ddim({-1})); + out->set_dtype(DataType::INT64); } void NormInferMeta(const MetaTensor& x, @@ -3184,11 +3184,11 @@ void FillSplitOutDims(const MetaTensor& x, (*out)[i]->set_dtype(x.dtype()); (*out)[i]->set_dims(out_dims[i]); (*out)[i]->set_layout(x.layout()); + (*out)[i]->share_lod(x); } else { (*out)[i]->set_dtype(x.dtype()); (*out)[i]->set_dims(out_dims[i]); (*out)[i]->set_layout(x.layout()); - (*out)[i]->share_lod(x); } } } @@ -3205,21 +3205,25 @@ void SplitInferMeta(const MetaTensor& x, // fill out dims with -1 if ((sections.FromTensor() && !config.is_runtime) || axis_value == -1 || (axis_value >= 0 && x.dims().at(axis_value) <= 0)) { - std::vector out_dims( - sections_data.size(), - phi::make_ddim(std::vector(x.dims().size(), -1))); - + std::vector out_dims; + if ((sections.FromTensor() && !config.is_runtime) || axis_value == -1) { + out_dims = std::vector( + sections_data.size(), + phi::make_ddim(std::vector(x.dims().size(), -1))); + } else { + out_dims = std::vector(sections_data.size(), x.dims()); + } for (size_t i = 0; i < sections_data.size(); ++i) { if (axis_value != 0) { // Only pass LoD when not spliting along the first dim. out[i]->set_dtype(x.dtype()); out[i]->set_dims(out_dims[i]); out[i]->set_layout(x.layout()); + out[i]->share_lod(x); } else { out[i]->set_dtype(x.dtype()); out[i]->set_dims(out_dims[i]); out[i]->set_layout(x.layout()); - out[i]->share_lod(x); } } } else { @@ -3293,20 +3297,24 @@ void SplitWithNumInferMeta(const MetaTensor& x, int axis_value = GetSplitAxisValue(x, axis, config); // fill out dims with -1 if (axis_value == -1 || (axis_value >= 0 && x.dims().at(axis_value) <= 0)) { - std::vector out_dims( - num, phi::make_ddim(std::vector(x.dims().size(), -1))); - + std::vector out_dims; + if (axis_value == -1) { + out_dims = std::vector( + num, phi::make_ddim(std::vector(x.dims().size(), -1))); + } else { + out_dims = std::vector(num, x.dims()); + } for (int i = 0; i < num; ++i) { if (axis_value != 0) { // Only pass LoD when not spliting along the first dim. out[i]->set_dtype(x.dtype()); out[i]->set_dims(out_dims[i]); out[i]->set_layout(x.layout()); + out[i]->share_lod(x); } else { out[i]->set_dtype(x.dtype()); out[i]->set_dims(out_dims[i]); out[i]->set_layout(x.layout()); - out[i]->share_lod(x); } } } else { @@ -3537,7 +3545,7 @@ void StridedSliceInferMeta(const MetaTensor& x, /* Why not use SumRawInferMeta directly? 
Because we need make InferMetaFunction's args follow the design of - api.yaml + ops.yaml */ void SumInferMeta(const MetaTensor& x, const IntArray& axis, diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt index 275b9ef031bb4f..7cbd218543d651 100644 --- a/paddle/phi/kernels/CMakeLists.txt +++ b/paddle/phi/kernels/CMakeLists.txt @@ -9,6 +9,9 @@ file( ) file(APPEND ${kernel_declare_file} "#include \"paddle/phi/core/kernel_registry.h\"\n\n") +set(kernel_declare_file_prune + ${PADDLE_BINARY_DIR}/paddle/phi/kernels/declarations.h.prune + CACHE INTERNAL "declarations.h file") # phi functors and functions called by kernels add_subdirectory(funcs) @@ -45,7 +48,8 @@ set(COMMON_KERNEL_DEPS selected_rows_functor) # remove this dep after removing fluid deps on tensor creation set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} phi_api_utils) -set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta infermeta_utils) +set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} infermeta infermeta_utils + sparse_infermeta) set(COMMON_KERNEL_DEPS ${COMMON_KERNEL_DEPS} switch_autotune) set(COMMON_KERNEL_DEPS @@ -95,8 +99,8 @@ file( "kps/*.cu" "selected_rows/gpu/*.cu" "sparse/gpu/*.cu" - "strings/*.cu" - "strings/gpu/*.cu") + "strings/gpu/*.cu" + "fusion/gpu/*.cu") if(WITH_MKLDNN) file( @@ -110,7 +114,9 @@ if(WITH_MKLDNN) "sparse/cpu/*.cc" "strings/*.cc" "strings/cpu/*.cc" - "onednn/*.cc") + "onednn/*.cc" + "fusion/*.cc" + "fusion/cpu/*.cc") else() file( GLOB @@ -122,10 +128,12 @@ else() "sparse/*.cc" "sparse/cpu/*.cc" "strings/*.cc" - "strings/cpu/*.cc") + "strings/cpu/*.cc" + "fusion/*.cc" + "fusion/cpu/*.cc") endif() -file(GLOB kernel_xpu "xpu/*.cc" "selected_rows/xpu/*.cc") +file(GLOB kernel_xpu "xpu/*.cc" "selected_rows/xpu/*.cc" "fusion/xpu/*.cc") add_library(phi_cpu ${kernel_cc}) kernel_declare("${kernel_cc}") @@ -166,3 +174,7 @@ if(WITH_XPU) endif() set_property(GLOBAL PROPERTY PHI_KERNELS ${ADD_PHI_KERNELS}) + +if(NOT "${KERNEL_LIST}" STREQUAL "") + prune_declaration_h() +endif() diff --git a/paddle/phi/kernels/add_n_kernel.h b/paddle/phi/kernels/add_n_kernel.h index c35dc2270aad0d..13d974a5877923 100644 --- a/paddle/phi/kernels/add_n_kernel.h +++ b/paddle/phi/kernels/add_n_kernel.h @@ -15,12 +15,20 @@ #pragma once #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/tensor_array.h" namespace phi { +// Note(YuanRisheng): std::vector shouldn't be widely used in +// PHI. Here, we use it to be compatible with Fluid. template void AddNKernel(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, DenseTensor* out); +template +void AddNArrayKernel(const Context& dev_ctx, + const std::vector& x, + TensorArray* out); + } // namespace phi diff --git a/paddle/phi/kernels/autotune/cache.h b/paddle/phi/kernels/autotune/cache.h index aacebc66570cb8..dc639e9f21ecfa 100644 --- a/paddle/phi/kernels/autotune/cache.h +++ b/paddle/phi/kernels/autotune/cache.h @@ -56,12 +56,14 @@ struct hash> { namespace phi { namespace autotune { -struct DnnNode { - DnnNode() {} - explicit DnnNode(int64_t a, size_t size) : algo(a), workspace_size(size) {} +struct ConvAutoTuneResult { + ConvAutoTuneResult() {} + ConvAutoTuneResult(int64_t a, size_t size, bool search) + : algo(a), workspace_size(size), exhaustive_search(search) {} int64_t algo; size_t workspace_size = 0; + bool exhaustive_search = false; }; template @@ -73,40 +75,41 @@ size_t GetKey(Args&&... 
args) { struct ConvCacheKey { ConvCacheKey() {} - explicit ConvCacheKey(const std::vector& x_dims, - const std::vector& w_dims, - const std::vector& strides, - const std::vector& paddings, - const std::vector& dilations, - phi::DataType dtype, - int groups, - int64_t data_layout) - : x_dims_(x_dims), - w_dims_(w_dims), - strides_(strides), - paddings_(paddings), - dilations_(dilations), - dtype_(dtype), - groups_(groups), - data_layout_(data_layout) {} + ConvCacheKey(const std::vector& arg_x_dims, + const std::vector& arg_w_dims, + const std::vector& arg_strides, + const std::vector& arg_paddings, + const std::vector& arg_dilations, + phi::DataType arg_dtype, + int arg_groups, + int64_t arg_data_layout) + : x_dims(arg_x_dims), + w_dims(arg_w_dims), + strides(arg_strides), + paddings(arg_paddings), + dilations(arg_dilations), + dtype(arg_dtype), + groups(arg_groups), + data_layout(arg_data_layout) {} size_t hash_value() const { - return GetKey(x_dims_, - w_dims_, - strides_, - paddings_, - dilations_, - static_cast(dtype_), - groups_, - data_layout_); + return GetKey(x_dims, + w_dims, + strides, + paddings, + dilations, + static_cast(dtype), + groups, + data_layout); } - std::vector x_dims_; - std::vector w_dims_; - std::vector strides_; - std::vector paddings_; - std::vector dilations_; - phi::DataType dtype_; - int groups_; - int64_t data_layout_; + + std::vector x_dims; + std::vector w_dims; + std::vector strides; + std::vector paddings; + std::vector dilations; + phi::DataType dtype; + int groups; + int64_t data_layout; }; struct ConvCacheKeyHash { @@ -118,14 +121,14 @@ struct ConvCacheKeyHash { struct ConvCacheKeyEqual { size_t operator()(const ConvCacheKey& first, const ConvCacheKey& second) const { - if (first.x_dims_ != second.x_dims_) return false; - if (first.w_dims_ != second.w_dims_) return false; - if (first.strides_ != second.strides_) return false; - if (first.paddings_ != second.paddings_) return false; - if (first.dilations_ != second.dilations_) return false; - if (first.dtype_ != second.dtype_) return false; - if (first.groups_ != second.groups_) return false; - if (first.data_layout_ != second.data_layout_) return false; + if (first.x_dims != second.x_dims) return false; + if (first.w_dims != second.w_dims) return false; + if (first.strides != second.strides) return false; + if (first.paddings != second.paddings) return false; + if (first.dilations != second.dilations) return false; + if (first.dtype != second.dtype) return false; + if (first.groups != second.groups) return false; + if (first.data_layout != second.data_layout) return false; return true; } @@ -135,7 +138,7 @@ class CudnnAlgorithmsCacheMap { public: CudnnAlgorithmsCacheMap() : cache_mutex_(new std::mutex()) { hash_.clear(); } - DnnNode Get(const ConvCacheKey& key) { + ConvAutoTuneResult Get(const ConvCacheKey& key) { std::lock_guard lock(*cache_mutex_); PADDLE_ENFORCE_NE( hash_.find(key), @@ -163,7 +166,7 @@ class CudnnAlgorithmsCacheMap { cache_misses_ = 0; } - void Set(const ConvCacheKey& key, DnnNode algo) { + void Set(const ConvCacheKey& key, ConvAutoTuneResult algo) { std::lock_guard lock(*cache_mutex_); if (hash_.size() > static_cast(FLAGS_search_cache_max_number)) { hash_.clear(); @@ -188,7 +191,10 @@ class CudnnAlgorithmsCacheMap { int64_t Size() const { return hash_.size(); } private: - std::unordered_map + std::unordered_map hash_; std::shared_ptr cache_mutex_; @@ -289,19 +295,8 @@ class AutoTuneCache { return auto_tune_map_[static_cast(algo_type)]; } - CudnnAlgorithmsCacheMap& GetConvForward() { 
- return cudnn_auto_tune_map_[static_cast( - AlgorithmType::kConvForward)]; - } - - CudnnAlgorithmsCacheMap& GetConvBackwardData() { - return cudnn_auto_tune_map_[static_cast( - AlgorithmType::kConvBackwardData)]; - } - - CudnnAlgorithmsCacheMap& GetConvBackwardFilter() { - return cudnn_auto_tune_map_[static_cast( - AlgorithmType::kConvBackwardFilter)]; + CudnnAlgorithmsCacheMap& GetConv(const AlgorithmType& algo_type) { + return cudnn_auto_tune_map_[static_cast(algo_type)]; } AlgorithmsCacheMap& GetTranspose() { return Get(AlgorithmType::kTranspose); } diff --git a/paddle/phi/kernels/autotune/cache_test.cc b/paddle/phi/kernels/autotune/cache_test.cc index 29affd45f0f5c7..18454ad3e19977 100644 --- a/paddle/phi/kernels/autotune/cache_test.cc +++ b/paddle/phi/kernels/autotune/cache_test.cc @@ -25,7 +25,8 @@ enum ConvAlgos { GEMMKernel = 0, CuDNNKernel_1 = 1, CuDNNKernel_2 = 2 }; TEST(AlgosCache, AlgosCache) { auto autotune_cache = phi::autotune::AutoTuneCache::Instance(); - auto& cache = autotune_cache.GetConvForward(); + auto& cache = + autotune_cache.GetConv(phi::autotune::AlgorithmType::kConvForward); std::vector x_shape = {4, 224, 224, 3}; std::vector w_shape = {32, 3, 3, 3}; @@ -37,7 +38,8 @@ TEST(AlgosCache, AlgosCache) { phi::autotune::ConvCacheKey key( x_shape, w_shape, paddings, strides, dilations, dtype, 0, 0); EXPECT_EQ(cache.Find(key), false); - phi::autotune::DnnNode node(static_cast(ConvAlgos::GEMMKernel), 0); + phi::autotune::ConvAutoTuneResult node( + static_cast(ConvAlgos::GEMMKernel), 0, false); cache.Set(key, node); EXPECT_EQ(cache.Size(), 1); EXPECT_EQ(cache.Find(key), true); @@ -48,8 +50,8 @@ TEST(AlgosCache, AlgosCache) { phi::autotune::ConvCacheKey key1( x_shape, w_shape, paddings, strides, dilations, dtype, 0, 1); EXPECT_EQ(cache.Find(key1), false); - phi::autotune::DnnNode node1(static_cast(ConvAlgos::CuDNNKernel_1), - 0); + phi::autotune::ConvAutoTuneResult node1( + static_cast(ConvAlgos::CuDNNKernel_1), 0, false); cache.Set(key1, node1); EXPECT_EQ(cache.Size(), 2); EXPECT_EQ(cache.CacheHits(), 1); diff --git a/paddle/phi/kernels/cpu/add_n_kernel.cc b/paddle/phi/kernels/cpu/add_n_kernel.cc index d658b55758e5d2..42532161053c93 100644 --- a/paddle/phi/kernels/cpu/add_n_kernel.cc +++ b/paddle/phi/kernels/cpu/add_n_kernel.cc @@ -12,24 +12,21 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/add_n_kernel.h" - -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/impl/add_n_kernel_impl.h" namespace phi { template void AddNKernel(const Context& dev_ctx, - const std::vector& x, + const std::vector& x, DenseTensor* out) { size_t in_num = x.size(); - bool in_place = out == x[0]; - auto* out_ptr = dev_ctx.template Alloc(out); - if (in_num >= 1 && x[0]->initialized()) { - if (x[0]->numel() > 0) { - in_place = (x[0]->data() == out_ptr); + dev_ctx.template Alloc(out); + + bool in_place = false; + if (x.size() > 0 && x[0]->initialized() && DenseTensor::classof(x[0])) { + if ((static_cast(x[0]))->Holder() == out->Holder()) { + in_place = true; } } @@ -37,9 +34,11 @@ void AddNKernel(const Context& dev_ctx, auto& place = *dev_ctx.eigen_device(); int start = in_place ? 
1 : 0; if (!in_place) { - if ((in_num >= 2) && x[0]->initialized() && x[1]->initialized()) { - auto& in_0 = *x[0]; - auto& in_1 = *x[1]; + if ((in_num >= 2) && DenseTensor::classof(x[0]) && + DenseTensor::classof(x[1]) && x[0]->initialized() && + x[1]->initialized()) { + auto& in_0 = *(static_cast(x[0])); + auto& in_1 = *(static_cast(x[1])); if (in_0.numel() && in_1.numel()) { auto in_0_e = EigenVector::Flatten(in_0); auto in_1_e = EigenVector::Flatten(in_1); @@ -49,20 +48,33 @@ void AddNKernel(const Context& dev_ctx, } if (start != 2) { VLOG(10) << "Fill with constant = 0 in sum kernel."; - funcs::SetConstant constant_functor; + phi::funcs::SetConstant constant_functor; constant_functor(dev_ctx, out, static_cast(0)); } } + paddle::operators::math::SelectedRowsAddToTensor functor; // If in_place, just skip the first tensor for (size_t i = start; i < in_num; i++) { - auto& in_t = *x[i]; - if (!in_t.initialized() || in_t.numel() == 0) { - continue; + if (DenseTensor::classof(x[i])) { + auto& in_t = *(static_cast(x[i])); + if (!in_t.initialized() || in_t.numel() == 0) { + continue; + } + auto in = EigenVector::Flatten(in_t); + result.device(place) = result + in; + } else if (SelectedRows::classof(x[i])) { + auto& in_t = *(static_cast(x[i])); + functor(dev_ctx, in_t, out); + } else { + PADDLE_THROW(phi::errors::InvalidArgument( + "Expected type of Input(X) of %d-th must be Tensor, " + "SelectedRows. But got " + "unsupport type: %s.", + x[i]->type_info().name())); } - auto in = EigenVector::Flatten(in_t); - result.device(place) = result + in; } + VLOG(10) << "end add_n kernel"; } } // namespace phi @@ -76,3 +88,13 @@ PD_REGISTER_KERNEL(add_n, int, phi::dtype::bfloat16, int64_t) {} + +PD_REGISTER_KERNEL(add_n_array, + CPU, + ALL_LAYOUT, + phi::AddNArrayKernel, + float, + double, + int, + phi::dtype::bfloat16, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/fill_grad_kernel.cc b/paddle/phi/kernels/cpu/fill_grad_kernel.cc index ee676773762ca5..07448c85a57d60 100644 --- a/paddle/phi/kernels/cpu/fill_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/fill_grad_kernel.cc @@ -26,4 +26,5 @@ PD_REGISTER_KERNEL(fill_grad, int64_t, int, paddle::platform::float16, + paddle::platform::bfloat16, bool) {} diff --git a/paddle/phi/kernels/cpu/fill_kernel.cc b/paddle/phi/kernels/cpu/fill_kernel.cc index ee8dac7f6770c4..adca39e6ab95d3 100644 --- a/paddle/phi/kernels/cpu/fill_kernel.cc +++ b/paddle/phi/kernels/cpu/fill_kernel.cc @@ -26,4 +26,5 @@ PD_REGISTER_KERNEL(fill, int64_t, int, paddle::platform::float16, + paddle::platform::bfloat16, bool) {} diff --git a/paddle/phi/kernels/cpu/gather_tree_kernel.cc b/paddle/phi/kernels/cpu/gather_tree_kernel.cc index 6f3cac6c4aa103..250ee1b1e8a2e5 100644 --- a/paddle/phi/kernels/cpu/gather_tree_kernel.cc +++ b/paddle/phi/kernels/cpu/gather_tree_kernel.cc @@ -14,6 +14,7 @@ #include "paddle/phi/kernels/gather_tree_kernel.h" +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -49,6 +50,15 @@ void GatherTreeKernel(const Context &dev_ctx, out_data[idx] = ids_data[idx]; auto parent = parents_data[idx]; for (int step = max_length - 2; step >= 0; step--) { + PADDLE_ENFORCE_LT( + parent, + beam_size, + phi::errors::InvalidArgument( + "The parents must be less than beam size, but recieved" + "parents %d is greater than or equal to beam size %d. 
", + parent, + beam_size)); + idx = step * batch_size * beam_size + batch * beam_size; out_data[idx + beam] = ids_data[idx + parent]; parent = parents_data[idx + parent]; diff --git a/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc index 7638ca3aa7ee63..d9a6df2794f591 100644 --- a/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc +++ b/paddle/phi/kernels/cpu/gumbel_softmax_kernel.cc @@ -119,3 +119,10 @@ struct OneHotGenerator { PD_REGISTER_KERNEL( gumbel_softmax, CPU, ALL_LAYOUT, phi::GumbelSoftmaxKernel, float, double) {} + +PD_REGISTER_KERNEL(gumbel_softmax_infer, + CPU, + ALL_LAYOUT, + phi::GumbelSoftmaxInferKernel, + float, + double) {} diff --git a/paddle/phi/kernels/cpu/multiplex_kernel.cc b/paddle/phi/kernels/cpu/multiplex_kernel.cc index 2d9f4c51a981ed..4e60448c6c5369 100644 --- a/paddle/phi/kernels/cpu/multiplex_kernel.cc +++ b/paddle/phi/kernels/cpu/multiplex_kernel.cc @@ -37,7 +37,7 @@ void MultiplexKernel(const Context& ctx, auto rows = ins[0]->dims()[0]; auto cols = ins[0]->numel() / rows; auto index = ids.data(); - for (auto i = 0; i < rows; i++) { + for (auto i = 0; i < ids.dims()[0]; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE( k, 0, errors::PreconditionNotMet("index must be nonnegative.")); diff --git a/paddle/phi/kernels/cpu/nms_kernel.cc b/paddle/phi/kernels/cpu/nms_kernel.cc index 7e656b14f1fc53..4b56f6bb951050 100644 --- a/paddle/phi/kernels/cpu/nms_kernel.cc +++ b/paddle/phi/kernels/cpu/nms_kernel.cc @@ -16,16 +16,17 @@ #include "paddle/phi/backends/cpu/cpu_context.h" #include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/kernels/funcs/diagonal.h" #include "paddle/phi/kernels/funcs/eigen/common.h" namespace phi { template -static void NMS(const T* boxes_data, - int64_t* output_data, - float threshold, - int64_t num_boxes) { +static int64_t NMS(const T* boxes_data, + int64_t* output_data, + float threshold, + int64_t num_boxes) { auto num_masks = CeilDivide(num_boxes, 64); std::vector masks(num_masks, 0); @@ -54,9 +55,13 @@ static void NMS(const T* boxes_data, output_data[output_data_idx++] = i; } + int64_t num_keep_boxes = output_data_idx; + for (; output_data_idx < num_boxes; ++output_data_idx) { output_data[output_data_idx] = 0; } + + return num_keep_boxes; } template @@ -64,8 +69,15 @@ void NMSKernel(const Context& dev_ctx, const DenseTensor& boxes, float threshold, DenseTensor* output) { - auto output_data = dev_ctx.template Alloc(output); - NMS(boxes.data(), output_data, threshold, boxes.dims()[0]); + int64_t num_boxes = boxes.dims()[0]; + DenseTensor output_tmp; + output_tmp.Resize(phi::make_ddim({num_boxes})); + auto output_tmp_data = dev_ctx.template Alloc(&output_tmp); + + int64_t num_keep_boxes = + NMS(boxes.data(), output_tmp_data, threshold, num_boxes); + auto slice_out = output_tmp.Slice(0, num_keep_boxes); + phi::Copy(dev_ctx, slice_out, dev_ctx.GetPlace(), false, output); } } // namespace phi diff --git a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc index a297843b0c7cdd..573065cbc66157 100644 --- a/paddle/phi/kernels/cpu/put_along_axis_kernel.cc +++ b/paddle/phi/kernels/cpu/put_along_axis_kernel.cc @@ -67,7 +67,7 @@ void PutAlongAxisKernel(const Context& dev_ctx, PADDLE_THROW(errors::InvalidArgument( "can not support reduce: '%s' for scatter kernel, only " "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " - "defalut reduce " + "default reduce " "op is 'assign' ", reduce)); return; 
diff --git a/paddle/phi/kernels/cpu/scale_kernel.cc b/paddle/phi/kernels/cpu/scale_kernel.cc index 421aae270ee591..358d89197edb2e 100644 --- a/paddle/phi/kernels/cpu/scale_kernel.cc +++ b/paddle/phi/kernels/cpu/scale_kernel.cc @@ -58,6 +58,7 @@ PD_REGISTER_KERNEL(scale, float, double, phi::dtype::bfloat16, + phi::dtype::float16, uint8_t, int8_t, int16_t, diff --git a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc index 44df36bb9fd873..dad7628dcf30a7 100644 --- a/paddle/phi/kernels/cpu/set_value_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/set_value_grad_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/set_value_grad_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/set_value_grad_kernel_impl.h" @@ -26,4 +27,7 @@ PD_REGISTER_KERNEL(set_value_grad, double, int, int64_t, - bool) {} + bool, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/cpu/set_value_kernel.cc b/paddle/phi/kernels/cpu/set_value_kernel.cc index dcf278cd94e651..4b0c0415e48349 100644 --- a/paddle/phi/kernels/cpu/set_value_kernel.cc +++ b/paddle/phi/kernels/cpu/set_value_kernel.cc @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/set_value_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/set_value_kernel_impl.h" @@ -26,7 +27,10 @@ PD_REGISTER_KERNEL(set_value, double, int, int64_t, - bool) {} + bool, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} PD_REGISTER_KERNEL(set_value_with_tensor, CPU, ALL_LAYOUT, @@ -35,4 +39,7 @@ PD_REGISTER_KERNEL(set_value_with_tensor, double, int, int64_t, - bool) {} + bool, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/empty_kernel.cc b/paddle/phi/kernels/empty_kernel.cc index 2c969cc43d2f1a..01b07c438a5270 100644 --- a/paddle/phi/kernels/empty_kernel.cc +++ b/paddle/phi/kernels/empty_kernel.cc @@ -88,6 +88,7 @@ PD_REGISTER_KERNEL(empty, int64_t, bool, phi::dtype::float16, + phi::dtype::bfloat16, phi::dtype::complex, phi::dtype::complex) {} diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h index 51420c5ecb6dcb..2af106ca38c48c 100644 --- a/paddle/phi/kernels/funcs/activation_functor.h +++ b/paddle/phi/kernels/funcs/activation_functor.h @@ -2169,12 +2169,14 @@ struct CudaSeluFunctor : public BaseActivationFunctor { } __device__ __forceinline__ T operator()(const T x) const { - T res = x; - if (res <= zero) { + using MT = + typename std::conditional<(sizeof(T) > sizeof(float)), T, float>::type; + MT res = static_cast(x); + if (x <= zero) { res = alpha * expf(res) - alpha; } res *= scale; - return res; + return static_cast(res); } private: diff --git a/paddle/phi/kernels/funcs/broadcast_function.h b/paddle/phi/kernels/funcs/broadcast_function.h index 40dfb76586189e..7d9efa46b7a5d1 100644 --- a/paddle/phi/kernels/funcs/broadcast_function.h +++ b/paddle/phi/kernels/funcs/broadcast_function.h @@ -468,6 +468,397 @@ void LaunchBroadcastKernel( func); } +#ifndef PADDLE_WITH_XPU_KP +HOSTDEVICE static int64_t ConvertSrcIdxToDstIdx( + int64_t src_idx, + const phi::Array &src_strides, + const phi::Array &dst_strides, + int rank) { + int64_t dst_idx = 0; + int64_t old_src_idx = src_idx; + for (int k = 0; k < rank; 
++k) { + auto local_idx = src_idx / src_strides[k + 1]; + src_idx -= local_idx * src_strides[k + 1]; + + if (dst_strides[k] != dst_strides[k + 1]) { + dst_idx += local_idx * dst_strides[k + 1]; + } + } + return dst_idx; +} + +template +HOSTDEVICE static void ReadVecDataWithInt64Index( + const T *in, + int64_t idx, + bool need_broadcast, + const phi::Array &src_strides, + const phi::Array &dst_strides, + int rank, + int n, + phi::AlignedVector *out) { + if (IsBoundary) { + for (int i = 0; i < n; ++i) { + (*out)[i] = + in[ConvertSrcIdxToDstIdx(idx + i, src_strides, dst_strides, rank)]; + } + } else { + if (!need_broadcast) { + phi::Load(in + idx, out); + } else { +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + (*out)[i] = + in[ConvertSrcIdxToDstIdx(idx + i, src_strides, dst_strides, rank)]; + } + } + } +} + +template +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i); +}; + +template +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i) { + return static_cast(functor()); + } +}; + +template +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i) { + return static_cast(functor(ins_vec[0][i])); + } +}; + +template +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i) { + return static_cast(functor(ins_vec[0][i], ins_vec[1][i])); + } +}; + +template +struct ApplyFunctorWithInt64IndexHelper { + HOSTDEVICE static OutT Run(const phi::AlignedVector *ins_vec, + Functor functor, + int i) { + return static_cast( + functor(ins_vec[0][i], ins_vec[1][i], ins_vec[2][i])); + } +}; + +template +struct MaxWithOne { + static constexpr auto kValue = (N >= 1 ? 
N : 1); +}; + +template +__global__ void BroadcastKernelWithInt64Index( + phi::Array::kValue> ins, + OutT *out, + phi::Array, + MaxWithOne::kValue> ins_strides, + phi::Array out_strides, + phi::Array::kValue> need_broadcasts, + int rank, + Functor functor) { + int64_t numel = out_strides[0]; + int64_t idx = + (static_cast(blockIdx.x) * blockDim.x + threadIdx.x) * VecSize; + int64_t stride = static_cast(blockDim.x) * gridDim.x * VecSize; + int64_t limit = numel - VecSize; + + phi::Array, MaxWithOne::kValue> + ins_vec; + phi::AlignedVector out_vec; + for (; idx <= limit; idx += stride) { +#pragma unroll + for (int i = 0; i < NumIns; ++i) { + ReadVecDataWithInt64Index(ins[i], + idx, + need_broadcasts[i], + out_strides, + ins_strides[i], + rank, + VecSize, + &ins_vec[i]); + } + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + out_vec[i] = ApplyFunctorWithInt64IndexHelper::Run(ins_vec.Get(), + functor, + i); + } + + phi::Store(out_vec, out + idx); + } + + if (idx < numel) { + int remain = numel - idx; // remain is always less than VecSize, therefore + // `int` is enough here +#pragma unroll + for (int i = 0; i < NumIns; ++i) { + ReadVecDataWithInt64Index(ins[i], + idx, + need_broadcasts[i], + out_strides, + ins_strides[i], + rank, + remain, + &ins_vec[i]); + } + + for (int i = 0; i < remain; ++i) { + out[idx + i] = + ApplyFunctorWithInt64IndexHelper::Run(ins_vec.Get(), + functor, + i); + } + } +} + +template +struct LaunchBroadcastKernelWithInt64IndexHelper { + static void Run(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor functor) { + PADDLE_THROW(phi::errors::PermissionDenied( + "Unreachable code branch. This may be a bug.")); + } +}; + +template +struct LaunchBroadcastKernelWithInt64IndexHelper { + static void Run(const KPDevice &ctx, + const std::vector &ins, + std::vector *outs, + int axis, + Functor functor) { + phi::Array::kValue> ins_ptrs; + for (int i = 0; i < Arity; ++i) { + ins_ptrs[i] = ins[i]->data(); + } + auto *out_tensor = (*outs)[0]; + auto *out_ptr = ctx.Alloc(out_tensor); + + phi::Array, + MaxWithOne::kValue> + ins_expand_dims; + phi::Array broadcast_out_dims; + int rank; + if (Arity == 1) { + rank = ins[0]->dims().size(); + for (int i = 0; i < rank; ++i) { + broadcast_out_dims[i] = ins[0]->dims()[i]; + } + ins_expand_dims[0] = broadcast_out_dims; + } else if (Arity >= 2) { + CalculateBroadcastDims(ins[0]->dims().Get(), + ins[1]->dims().Get(), + ins[0]->dims().size(), + ins[1]->dims().size(), + axis, + ins_expand_dims[0].GetMutable(), + ins_expand_dims[1].GetMutable(), + broadcast_out_dims.GetMutable(), + &rank); + for (int i = 2; i < Arity; ++i) { + auto tmp_dims = broadcast_out_dims; + phi::Array tmp_expand_dims; + int tmp_rank; + PADDLE_ENFORCE_GE(rank, + ins[i]->dims().size(), + phi::errors::InvalidArgument( + "Unsupported reverse broadcast when the input " + "tensor number is larger than 2.")); + CalculateBroadcastDims(tmp_dims.Get(), + ins[i]->dims().Get(), + rank, + ins[i]->dims().size(), + axis, + tmp_expand_dims.GetMutable(), + ins_expand_dims[i].GetMutable(), + broadcast_out_dims.GetMutable(), + &tmp_rank); + PADDLE_ENFORCE_EQ(rank, + tmp_rank, + phi::errors::InvalidArgument( + "Wrong broadcast algorithm. 
This may be a bug.")); + } + } + + phi::Array, + MaxWithOne::kValue> + ins_strides; + phi::Array::kValue> need_broadcasts; + phi::Array out_strides; + const auto &out_dims = out_tensor->dims(); + if (rank <= out_dims.size()) { + out_strides = ShapeToStride(out_dims.Get(), rank); + } else { + out_strides = ShapeToStride(broadcast_out_dims.Get(), rank); + } + + for (int i = 0; i < Arity; ++i) { + ins_strides[i] = ShapeToStride(ins_expand_dims[i].Get(), rank); + need_broadcasts[i] = + !IsSameShape(out_strides.Get(), ins_strides[i].Get(), rank + 1); + } + + int64_t numel = out_strides[0]; + auto gpu_config = + phi::backends::gpu::GetGpuLaunchConfig1D(ctx, numel, VecSize); + + BroadcastKernelWithInt64Index + <<>>(ins_ptrs, + out_ptr, + ins_strides, + out_strides, + need_broadcasts, + rank, + functor); + } + + private: + static void CalculateBroadcastDims(const int64_t *x_dims, + const int64_t *y_dims, + int nx, + int ny, + int axis, + int64_t *x_out_dims, + int64_t *y_out_dims, + int64_t *broadcast_out_dims, + int *length) { + PADDLE_ENFORCE_GE( + axis, 0, phi::errors::InvalidArgument("Invalid axis value: %d", axis)); + if (nx == ny) { + *length = nx; + for (int i = 0; i < nx; ++i) { + if (x_dims[i] != y_dims[i]) { + PADDLE_ENFORCE_EQ( + x_dims[i] == 1 || y_dims[i] == 1, + true, + phi::errors::InvalidArgument("Cannot broadcast input shape where " + "x_dims[%d] = %d, y_dims[%d] = %d.", + i, + x_dims[i], + i, + y_dims[i])); + } + broadcast_out_dims[i] = std::max(x_dims[i], y_dims[i]); + x_out_dims[i] = x_dims[i]; + y_out_dims[i] = y_dims[i]; + } + } else if (nx > ny) { + *length = nx; + for (int i = nx - axis; i < ny; ++i) { + PADDLE_ENFORCE_EQ( + y_dims[i], + 1, + phi::errors::InvalidArgument( + "The trailing Y.shape[%d] should be 1 but got %d.", + i, + y_dims[i])); + } + + for (int i = 0; i < nx; ++i) { + if (i >= axis && i - axis < ny) { + if (x_dims[i] != y_dims[i - axis]) { + PADDLE_ENFORCE_EQ(x_dims[i] == 1 || y_dims[i - axis] == 1, + true, + phi::errors::InvalidArgument( + "Cannot broadcast input shape where " + "x_dims[%d] = %d, y_dims[%d] = %d.", + i, + x_dims[i], + i - axis, + y_dims[i - axis])); + } + broadcast_out_dims[i] = std::max(x_dims[i], y_dims[i - axis]); + x_out_dims[i] = x_dims[i]; + y_out_dims[i] = y_dims[i - axis]; + } else { + broadcast_out_dims[i] = x_dims[i]; + x_out_dims[i] = x_dims[i]; + y_out_dims[i] = 1; + } + } + } else { + CalculateBroadcastDims(y_dims, + x_dims, + ny, + nx, + axis, + y_out_dims, + x_out_dims, + broadcast_out_dims, + length); + } + } + + static bool IsSameShape(const int64_t *x, const int64_t *y, int rank) { + for (int i = 0; i < rank; ++i) { + if (x[i] != y[i]) return false; + } + return true; + } + + static phi::Array ShapeToStride( + const int64_t *arr, int rank) { + phi::Array strides; + strides[rank] = 1; + for (int i = rank - 1; i >= 0; --i) { + strides[i] = strides[i + 1] * arr[i]; + } + return strides; + } +}; +#endif + template size(), NumOuts)); +#ifndef PADDLE_WITH_XPU_KP + constexpr bool kEnabledInt64IndexKernel = (NumOuts == 1 && kArity <= 3); + bool use_int64_index_kernel = + kEnabledInt64IndexKernel && + (*outs)[0]->numel() >= std::numeric_limits::max(); + if (use_int64_index_kernel) { + int vec_size = GetVecsize(ins, outs); + switch (vec_size) { + case VecSizeL: { + LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, + ins, + outs, + axis, + func); + break; + } + case VecSizeM: { + LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, + ins, + outs, + axis, + func); + break; + } + case VecSizeS: { + 
LaunchBroadcastKernelWithInt64IndexHelper::Run(ctx, + ins, + outs, + axis, + func); + break; + } + default: { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported vectorized size: %d!", vec_size)); + break; + } + } + return; + } +#endif + // mergedim and get vec_size const auto merge_dims = DimensionsTransform(ins, (*outs)[0]->dims(), axis); phi::Array configs; diff --git a/paddle/phi/kernels/funcs/data_layout_transform.cc b/paddle/phi/kernels/funcs/data_layout_transform.cc index 9b09d897eff88b..9d2d0bf3b5c889 100644 --- a/paddle/phi/kernels/funcs/data_layout_transform.cc +++ b/paddle/phi/kernels/funcs/data_layout_transform.cc @@ -83,7 +83,9 @@ void innerTransDataLayoutFromOneDNN(DataLayout in_layout, out->set_mem_desc(out_mem_desc); out->Resize(in.dims()); - if ((in.mem_desc() != out->mem_desc()) || always_copy) { + // Note(0x45f): Using initialized() to support slice Tensors + // with shapes like [0, 0, 0]. + if (in.initialized() && ((in.mem_desc() != out->mem_desc()) || always_copy)) { void* in_data = GetDataFromTensor(in, in_type); ReorderOneDNNHandler handler(in_tz, in.dtype(), in_type, cpu_engine); diff --git a/paddle/phi/kernels/funcs/eigen/broadcast.cu b/paddle/phi/kernels/funcs/eigen/broadcast.cu index 0b749f5c009a5d..0c5a3408872c47 100644 --- a/paddle/phi/kernels/funcs/eigen/broadcast.cu +++ b/paddle/phi/kernels/funcs/eigen/broadcast.cu @@ -84,6 +84,7 @@ INSTANTIATION(EigenBroadcast, int); INSTANTIATION(EigenBroadcast, int64_t); INSTANTIATION(EigenBroadcastGrad, bool); INSTANTIATION(EigenBroadcastGrad, float); +INSTANTIATION(EigenBroadcastGrad, dtype::bfloat16); INSTANTIATION(EigenBroadcastGrad, dtype::float16); INSTANTIATION(EigenBroadcastGrad, double); INSTANTIATION(EigenBroadcastGrad, dtype::complex); diff --git a/paddle/phi/kernels/funcs/eigen/eigen_function.h b/paddle/phi/kernels/funcs/eigen/eigen_function.h index 1e81256e79e143..42f46b814e1682 100644 --- a/paddle/phi/kernels/funcs/eigen/eigen_function.h +++ b/paddle/phi/kernels/funcs/eigen/eigen_function.h @@ -18,6 +18,8 @@ limitations under the License. */ #ifndef NOMINMAX #define NOMINMAX #endif + +#include "paddle/phi/kernels/funcs/eigen/extensions.h" #include "unsupported/Eigen/CXX11/Tensor" namespace phi { diff --git a/paddle/phi/kernels/funcs/eigen/scale.cc b/paddle/phi/kernels/funcs/eigen/scale.cc index 341bf52f547fed..7e2d463a9fab13 100644 --- a/paddle/phi/kernels/funcs/eigen/scale.cc +++ b/paddle/phi/kernels/funcs/eigen/scale.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/complex.h" +#include "paddle/phi/common/float16.h" #include "paddle/phi/kernels/funcs/eigen/eigen_function.h" namespace phi { @@ -41,6 +42,7 @@ struct EigenScale { template struct EigenScale; template struct EigenScale; template struct EigenScale; +template struct EigenScale; template struct EigenScale; template struct EigenScale; template struct EigenScale; diff --git a/paddle/phi/kernels/funcs/elementwise_base.h b/paddle/phi/kernels/funcs/elementwise_base.h old mode 100755 new mode 100644 index 2573a0e44c90ca..100d2dcd612cef --- a/paddle/phi/kernels/funcs/elementwise_base.h +++ b/paddle/phi/kernels/funcs/elementwise_base.h @@ -760,8 +760,10 @@ __global__ void VectorizedElementwiseKernel( kps::IndexType main_offset, int read_lens, Functor func) { - kps::IndexType data_offset = BLOCK_ID_X * BLOCK_NUM_X * read_lens; - kps::IndexType stride = BLOCK_NUM_X * GRID_NUM_X * read_lens; + kps::IndexType data_offset = + static_cast(BLOCK_ID_X) * BLOCK_NUM_X * read_lens; + kps::IndexType stride = + static_cast(BLOCK_NUM_X) * GRID_NUM_X * read_lens; for (; data_offset < main_offset; data_offset += stride) { VectorizedElementwiseKernelImpl(std::trunc(a / b)); + return static_cast(a / b); } }; @@ -580,7 +580,7 @@ struct InverseFloorDivideFunctor { #ifndef PADDLE_WITH_XPU_KP PADDLE_ENFORCE(a != 0, DIV_ERROR_INFO); #endif - return static_cast(std::trunc(b / a)); + return static_cast(b / a); } }; diff --git a/paddle/phi/kernels/funcs/fc_functor.cu b/paddle/phi/kernels/funcs/fc_functor.cu index 6015266dde9e7a..6fa9b640f12652 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cu +++ b/paddle/phi/kernels/funcs/fc_functor.cu @@ -149,7 +149,7 @@ __global__ void bias_relu_v2(const int num, #if __CUDA_ARCH__ >= 800 packed_val = __hmax2(__half2(0, 0), packed_val); #elif __CUDA_ARCH__ >= 530 - packed_val = __hmul2(__hgt2(__half2(0, 0), packed_val), packed_val); + packed_val = __hmul2(__hgt2(packed_val, __half2(0, 0)), packed_val); #else packed_val.x = static_cast(static_cast(packed_val.x) > 0) * static_cast(packed_val.x); @@ -292,19 +292,16 @@ void FCFunctor::operator()(const DeviceContext& context, errors::PermissionDenied( "Weight padding in fc can not be used in GPU scope.")); auto blas = phi::funcs::GetBlas(context); - blas.GEMM(false, - false, + blas.GEMM(CblasNoTrans, + CblasNoTrans, M, N, K, static_cast(1.0), X, - K, W, - N, static_cast(0.0), - Y, - N); + Y); if (B == NULL) { return; } diff --git a/paddle/phi/kernels/funcs/math_function.h b/paddle/phi/kernels/funcs/math_function.h index ee4ac7482f2115..d894ef2b41d82e 100644 --- a/paddle/phi/kernels/funcs/math_function.h +++ b/paddle/phi/kernels/funcs/math_function.h @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor_util.h" diff --git a/paddle/phi/kernels/funcs/pooling.cu b/paddle/phi/kernels/funcs/pooling.cu index 6e4fc414afd4a3..425448a3823a3d 100644 --- a/paddle/phi/kernels/funcs/pooling.cu +++ b/paddle/phi/kernels/funcs/pooling.cu @@ -388,7 +388,6 @@ void Pool2dDirectCUDAFunctor::operator()( const int stride_width = strides[1]; const int padding_height = paddings[0]; const int padding_width = paddings[1]; - int nthreads = batch_size * output_channels * output_height * output_width; int thread_num = 1024; #ifdef WITH_NV_JETSON diff --git a/paddle/phi/kernels/funcs/slice_utils.h b/paddle/phi/kernels/funcs/slice_utils.h index bfe024f45a098b..ed403c75dbdc8a 100644 --- a/paddle/phi/kernels/funcs/slice_utils.h +++ b/paddle/phi/kernels/funcs/slice_utils.h @@ -117,6 +117,10 @@ inline phi::DDim GetSliceDims(const phi::DDim in_dims, continue; } + if (in_dims[axis] == -1) { + continue; + } + T start = starts[i]; T end = ends[i]; T step = steps == nullptr ? 1 : (*steps)[i]; diff --git a/paddle/phi/kernels/funcs/sparse/scatter.cu.h b/paddle/phi/kernels/funcs/sparse/scatter.cu.h index 6c293b2394443a..f27174d5818186 100644 --- a/paddle/phi/kernels/funcs/sparse/scatter.cu.h +++ b/paddle/phi/kernels/funcs/sparse/scatter.cu.h @@ -79,7 +79,6 @@ __global__ void ScatterKernelV2(const T* input, const int* index_groups, const int non_zero_num, const int kernel_size, - const int max_voxel, const int channels, const int buffer_counts, T* out) { @@ -97,11 +96,10 @@ __global__ void ScatterKernelV2(const T* input, &sums); for (int it = 0; it < buffer_counts; it++) { int len = index_counts[indices_i + it * non_zero_num]; - const int group_offset = it * max_voxel * kernel_size * non_zero_num; + const int group_offset = it * kernel_size * non_zero_num; for (int j = 0; j < len; j++) { const int out_feature_i = - index_groups[indices_i * max_voxel * kernel_size + j + - group_offset]; + index_groups[indices_i * kernel_size + j + group_offset]; LoadT vec_in; phi::Load( input + out_feature_i * channels + channels_i * VecSize, &vec_in); @@ -123,7 +121,6 @@ void ScatterV2(const GPUContext& dev_ctx, const int* index_groups, const int non_zero_num, const int kernel_size, - const int max_voxel, const int channels, const int buffer_counts, T* output) { @@ -139,7 +136,6 @@ void ScatterV2(const GPUContext& dev_ctx, index_groups, non_zero_num, kernel_size, - max_voxel, channels, buffer_counts, output); @@ -154,7 +150,6 @@ void ScatterV2(const GPUContext& dev_ctx, index_groups, non_zero_num, kernel_size, - max_voxel, channels, buffer_counts, output); diff --git a/paddle/phi/kernels/fusion/README.md b/paddle/phi/kernels/fusion/README.md new file mode 100644 index 00000000000000..2080a37dd0fd59 --- /dev/null +++ b/paddle/phi/kernels/fusion/README.md @@ -0,0 +1,13 @@ +# What's difference for fusion kernel? + +1. We don't recommend to implement Python API for fusion kernel + + - We don't recommend to implement Python API for fusion kernel, because it contains many inputs or outputs arguments generally, it is difficult to use and understand as an Python API, we recommend to call fusion kernel by pass optimization in dy2static mode or static mode. + - We also don't recommend to reuse fusion kernel in other kernel implementation, but recommended that the fusion kernel be implemented by reusing other kernels. + +2. 
We don't require a fusion kernel to have implementations for all devices + + - A fusion kernel is generally used to accelerate a combined operation on one particular device; requiring an implementation for every device would be costly. + - We don't recommend implementing a pseudo kernel that just throws an exception; if a device is not required, the kernel can simply be left unimplemented. + +3. A fusion kernel must be placed in the `phi/fusion` namespace (see the sketch below) diff --git a/paddle/phi/kernels/fusion/fused_softmax_mask_grad_kernel.h b/paddle/phi/kernels/fusion/fused_softmax_mask_grad_kernel.h new file mode 100644 index 00000000000000..391c614801f232 --- /dev/null +++ b/paddle/phi/kernels/fusion/fused_softmax_mask_grad_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SoftmaxMaskFuseGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/fusion/fused_softmax_mask_kernel.h b/paddle/phi/kernels/fusion/fused_softmax_mask_kernel.h new file mode 100644 index 00000000000000..dd08373f428889 --- /dev/null +++ b/paddle/phi/kernels/fusion/fused_softmax_mask_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void SoftmaxMaskFuseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& mask, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_grad_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_grad_kernel.cu new file mode 100644 index 00000000000000..ab731f8f239901 --- /dev/null +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_grad_kernel.cu @@ -0,0 +1,201 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
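Editor's note: the sketch below is a minimal, hypothetical illustration (not part of the patch) of the convention described in the fusion README above and followed by the new files in this patch: the kernel is defined inside the phi::fusion namespace and registered with PD_REGISTER_KERNEL. The kernel name fused_example, the FusedExampleKernel function, and its trivial body are placeholders.

#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"

namespace phi {
namespace fusion {

// A real fusion kernel fuses several operations, ideally by reusing existing
// phi kernels; this placeholder body only allocates the output.
template <typename T, typename Context>
void FusedExampleKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        DenseTensor* out) {
  dev_ctx.template Alloc<T>(out);
}

}  // namespace fusion
}  // namespace phi

// Registration mirrors the pattern used by the fused_softmax_mask kernels below.
PD_REGISTER_KERNEL(fused_example,
                   GPU,
                   ALL_LAYOUT,
                   phi::fusion::FusedExampleKernel,
                   float,
                   phi::dtype::float16) {}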
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/kernels/fusion/fused_softmax_mask_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h" + +namespace phi { +namespace fusion { + +template +__global__ void SoftmaxMaskFuseGradGPUKernel(const T* grad_input, + T* grad_output, + const T* softmax_rst, + int batch_count, + int key_seq_len) { + constexpr int next_pow2 = 1 << pow2_index; + constexpr int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; + constexpr int kLocalIterations = std::max(next_pow2 / warp_size, 4); + constexpr int kLocalBatchSize = (next_pow2 <= 128) ? 2 : 1; + constexpr int kOneLoadingCounts = 4; + + int data_first_idx = + (blockDim.y * blockIdx.x + threadIdx.y) * kLocalBatchSize; + + // batch_count might not be a multiple of kLocalBatchSize. Check how + // many batches have to computed within this WARP. + int local_batches = batch_count - data_first_idx; + if (local_batches > kLocalBatchSize) local_batches = kLocalBatchSize; + + // might be many batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + // the first element to process by the current thread + int offset = data_first_idx * key_seq_len + kOneLoadingCounts * local_idx; + grad_input += offset; + grad_output += offset; + softmax_rst += offset; + + // using float for all inter compute + float grad_input_reg[kLocalBatchSize][kLocalIterations]{0.0f}; + float softmax_rst_reg[kLocalBatchSize][kLocalIterations]{0.0f}; + T temp_grad_input[kOneLoadingCounts]; + T temp_softmax_rst[kOneLoadingCounts]; + +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { + int batch_data = (i >= local_batches) ? 
0 : key_seq_len; + +#pragma unroll + for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { + int data_index = kOneLoadingCounts * local_idx + ii * WARP_SIZE; + if (data_index < batch_data) { + load_data(temp_grad_input, + grad_input + i * key_seq_len + ii * warp_size); + load_data(temp_softmax_rst, + softmax_rst + i * key_seq_len + ii * warp_size); + +#pragma unroll + for (int counter = 0; counter < kOneLoadingCounts; ++counter) { + softmax_rst_reg[i][ii + counter] = + static_cast(temp_softmax_rst[counter]); + } +#pragma unroll + for (int counter = 0; counter < kOneLoadingCounts; ++counter) { + grad_input_reg[i][ii + counter] = + static_cast(temp_grad_input[counter]) * + softmax_rst_reg[i][ii + counter]; + } + } + } + } + + float samples_sum[kLocalBatchSize]; +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { + samples_sum[i] = grad_input_reg[i][0]; +#pragma unroll + for (int ii = 1; ii < kLocalIterations; ++ii) { + samples_sum[i] += grad_input_reg[i][ii]; + } + } + warp_reduce(samples_sum); + +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { + if (i >= local_batches) break; +#pragma unroll + for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { + int data_index = kOneLoadingCounts * local_idx + ii * warp_size; + if (data_index < key_seq_len) { + // compute gradients + T samples_out[kOneLoadingCounts]; +#pragma unroll + for (int counter = 0; counter < kOneLoadingCounts; ++counter) { + samples_out[counter] = + grad_input_reg[i][ii + counter] - + softmax_rst_reg[i][ii + counter] * samples_sum[i]; + } + load_data(grad_output + i * key_seq_len + ii * warp_size, samples_out); + } + } + } +} + +template +void SoftmaxMaskFuseGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + DenseTensor* x_grad) { + auto* grad_x_data = dev_ctx.template Alloc(x_grad); + auto* grad_y_data = out_grad.data(); + auto* softmax_rst_data = out.data(); + + auto y_dim = out_grad.dims(); + auto batches = y_dim[0]; + auto attn_heads = y_dim[1]; + auto query_seq_len = y_dim[2]; + auto key_seq_len = y_dim[3]; + + auto stream = dev_ctx.stream(); + + int pow2_index = get_pow2(key_seq_len); + const int next_pow2 = 1 << pow2_index; + int batch_count = batches * attn_heads * query_seq_len; + int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; + int batches_per_warp = (next_pow2 <= 128) ? 
2 : 1; + // use 128 threads per block to maximum gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + int blocks = batch_count / batches_per_block; + dim3 threads(warp_size, warps_per_block, 1); + + // launch the kernel based on the pow2_index + switch (pow2_index) { + case 5: // 32 + SoftmaxMaskFuseGradGPUKernel<<>>( + grad_y_data, grad_x_data, softmax_rst_data, batch_count, key_seq_len); + break; + case 6: // 64 + SoftmaxMaskFuseGradGPUKernel<<>>( + grad_y_data, grad_x_data, softmax_rst_data, batch_count, key_seq_len); + break; + case 7: // 128 + SoftmaxMaskFuseGradGPUKernel<<>>( + grad_y_data, grad_x_data, softmax_rst_data, batch_count, key_seq_len); + break; + case 8: // 256 + SoftmaxMaskFuseGradGPUKernel<<>>( + grad_y_data, grad_x_data, softmax_rst_data, batch_count, key_seq_len); + break; + case 9: // 512 + SoftmaxMaskFuseGradGPUKernel<<>>( + grad_y_data, grad_x_data, softmax_rst_data, batch_count, key_seq_len); + break; + case 10: // 1024 + SoftmaxMaskFuseGradGPUKernel<<>>( + grad_y_data, grad_x_data, softmax_rst_data, batch_count, key_seq_len); + break; + case 11: // 2048 + SoftmaxMaskFuseGradGPUKernel<<>>( + grad_y_data, grad_x_data, softmax_rst_data, batch_count, key_seq_len); + break; + case 12: // 4096 + SoftmaxMaskFuseGradGPUKernel<<>>( + grad_y_data, grad_x_data, softmax_rst_data, batch_count, key_seq_len); + break; + case 13: // 8192 + SoftmaxMaskFuseGradGPUKernel<<>>( + grad_y_data, grad_x_data, softmax_rst_data, batch_count, key_seq_len); + break; + default: + break; + } +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_softmax_mask_grad, + GPU, + ALL_LAYOUT, + phi::fusion::SoftmaxMaskFuseGradKernel, + float, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_kernel.cu b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_kernel.cu new file mode 100644 index 00000000000000..e86b4841e926a8 --- /dev/null +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_kernel.cu @@ -0,0 +1,280 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h" + +namespace phi { +namespace fusion { + +// T == fp16 +template +__global__ void SoftmaxMaskFuseGPUKernel(const T* x_data, + const T* mask_data, + T* y_data, + int batch_count, + int key_seq_len) { + // the forward gpu kernel + constexpr int next_pow2 = 1 << pow2_index; + constexpr int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; + constexpr int kLocalIterations = std::max(next_pow2 / warp_size, 4); + constexpr int kLocalBatchSize = (next_pow2 <= 128) ? 
2 : 1; + constexpr int kOneLoadingCounts = 4; + + int data_first_idx = + (blockDim.y * + (blockIdx.x + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z)) + + threadIdx.y) * + kLocalBatchSize; + + int mask_fist_idx = + (blockDim.y * (blockIdx.x + gridDim.x * blockIdx.z) + threadIdx.y) * + kLocalBatchSize; + + // batch_count might not be a multiple of kLocalBatchSize. Check how + // many batches have to computed within this WARP. + int local_batches = batch_count - data_first_idx; + if (local_batches > kLocalBatchSize) local_batches = kLocalBatchSize; + + // might be many batches per warp. compute the index within the batch + int local_idx = threadIdx.x; + + int x_offset = data_first_idx * key_seq_len + kOneLoadingCounts * local_idx; + int mask_offset = mask_fist_idx * key_seq_len + kOneLoadingCounts * local_idx; + x_data += x_offset; + mask_data += mask_offset; + y_data += x_offset; + + // using float for all inter compute + float data[kLocalBatchSize][kLocalIterations]; + T temp_data[kOneLoadingCounts]; + T temp_mask[kOneLoadingCounts]; + +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { + int batch_data = (i >= local_batches) ? 0 : key_seq_len; + +#pragma unroll + for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { + int data_index = kOneLoadingCounts * local_idx + ii * warp_size; + + if (data_index < batch_data) { + int itr_idx = i * key_seq_len + ii * warp_size; + + // efficiently load data from global memory + load_data(temp_data, x_data + itr_idx); + load_data(temp_mask, mask_data + itr_idx); + +#pragma unroll + for (int counter = 0; counter < kOneLoadingCounts; ++counter) { + data[i][ii + counter] = static_cast(temp_data[counter]) + + static_cast(temp_mask[counter]); + } + } else { +#pragma unroll + for (int counter = 0; counter < kOneLoadingCounts; ++counter) { + data[i][ii + counter] = -std::numeric_limits::infinity(); + } + } + } + } + + // compute max_value + // max value for each batch for current warp + float samples_max_value[kLocalBatchSize]; +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { + samples_max_value[i] = data[i][0]; +#pragma unroll + for (int ii = 1; ii < kLocalIterations; ++ii) { + samples_max_value[i] = (samples_max_value[i] > data[i][ii]) + ? 
samples_max_value[i] + : data[i][ii]; + } + } + // max value for each batch for all warp + warp_reduce(samples_max_value); + + // compute the sum for each batch for current warp + float samples_sum[kLocalBatchSize]{0.0f}; +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { +#pragma unroll + for (int ii = 0; ii < kLocalIterations; ++ii) { + data[i][ii] = std::exp((data[i][ii] - samples_max_value[i])); + samples_sum[i] += data[i][ii]; + } + } + // samples_sum for each batch for all warp + warp_reduce(samples_sum); + + // load the result from device back to host + T samples_out[kOneLoadingCounts]; +#pragma unroll + for (int i = 0; i < kLocalBatchSize; ++i) { + if (i >= local_batches) break; +#pragma unroll + for (int ii = 0; ii < kLocalIterations; ii += kOneLoadingCounts) { + int idx = kOneLoadingCounts * local_idx + ii * warp_size; + if (idx < key_seq_len) { +#pragma unroll + for (int counter = 0; counter < kOneLoadingCounts; ++counter) { + samples_out[counter] = data[i][ii + counter] / samples_sum[i]; + } + load_data(y_data + i * key_seq_len + ii * warp_size, samples_out); + } else { + break; + } + } + } +} + +// T only supports fp16 +// leave as template only for future update +template +void SoftmaxMaskFuseKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& mask, + DenseTensor* out) { + auto* x_data = x.data(); + auto* mask_data = mask.data(); + auto* y_data = dev_ctx.template Alloc(out); + + auto x_dim = x.dims(); + auto mask_dim = mask.dims(); + auto batches = x_dim[0]; + auto attn_heads = x_dim[1]; + auto query_seq_len = x_dim[2]; + auto key_seq_len = x_dim[3]; + + PADDLE_ENFORCE_GT(query_seq_len, + 1, + phi::errors::InvalidArgument( + "Input x's second last dim must be large than 1 but " + "received the second last dimension of x is %d", + query_seq_len)); + + PADDLE_ENFORCE_EQ(key_seq_len >= 32 && key_seq_len < 8192, + true, + phi::errors::InvalidArgument( + "Input x's last dim must be between [32, 8192) " + "received the last dimension of x is %d", + key_seq_len)); + + PADDLE_ENFORCE_EQ(mask_dim[1], + 1, + phi::errors::InvalidArgument( + "Input mask's second dim must be 1 " + "received the second dimension of mask is %d", + mask_dim[1])); + + // dim of x and mask must be equal + for (size_t idx = 0; idx < 4; ++idx) { + if (idx == 1) continue; + PADDLE_ENFORCE_EQ( + x_dim[idx], + mask_dim[idx], + phi::errors::InvalidArgument( + "Input x's %dth dim should be equal with input mask's %dth dim " + "but " + "received the %dth dimension of x and mask are not equal " + "the %dth dim of x is %d, while the %dth dim of mask is %d.", + idx, + idx, + idx, + idx, + x_dim[idx], + idx, + mask_dim[idx])); + } + + auto stream = dev_ctx.stream(); + + int pow2_index = get_pow2(key_seq_len); + const int next_pow2 = 1 << pow2_index; + int batch_count = batches * attn_heads * query_seq_len; + int warp_size = (next_pow2 < WARP_SIZE) ? next_pow2 : WARP_SIZE; + int batches_per_warp = (next_pow2 <= 128) ? 2 : 1; + // use 128 threads per block to maximum gpu utilization + constexpr int threads_per_block = 128; + + int warps_per_block = (threads_per_block / warp_size); + int batches_per_block = warps_per_block * batches_per_warp; + PADDLE_ENFORCE_EQ( + query_seq_len % batches_per_block, + 0, + phi::errors::InvalidArgument( + "The query seq len (third dim of input X) must can divide the " + "number of batches per block. 
The query seq len is %d, while " + "the number of batches per block is %d.", + query_seq_len, + batches_per_block)); + dim3 blocks(query_seq_len / batches_per_block, attn_heads, batches); + dim3 threads(warp_size, warps_per_block, 1); + + // launch the kernel based on the pow2_index + switch (pow2_index) { + case 5: // 32 + SoftmaxMaskFuseGPUKernel<<>>( + x_data, mask_data, y_data, batch_count, key_seq_len); + break; + case 6: // 64 + SoftmaxMaskFuseGPUKernel<<>>( + x_data, mask_data, y_data, batch_count, key_seq_len); + break; + case 7: // 128 + SoftmaxMaskFuseGPUKernel<<>>( + x_data, mask_data, y_data, batch_count, key_seq_len); + break; + case 8: // 256 + SoftmaxMaskFuseGPUKernel<<>>( + x_data, mask_data, y_data, batch_count, key_seq_len); + break; + case 9: // 512 + SoftmaxMaskFuseGPUKernel<<>>( + x_data, mask_data, y_data, batch_count, key_seq_len); + break; + case 10: // 1024 + SoftmaxMaskFuseGPUKernel<<>>( + x_data, mask_data, y_data, batch_count, key_seq_len); + break; + case 11: // 2048 + SoftmaxMaskFuseGPUKernel<<>>( + x_data, mask_data, y_data, batch_count, key_seq_len); + break; + case 12: // 4096 + SoftmaxMaskFuseGPUKernel<<>>( + x_data, mask_data, y_data, batch_count, key_seq_len); + break; + case 13: // 8192 + SoftmaxMaskFuseGPUKernel<<>>( + x_data, mask_data, y_data, batch_count, key_seq_len); + break; + default: + break; + } +} + +} // namespace fusion +} // namespace phi + +PD_REGISTER_KERNEL(fused_softmax_mask, + GPU, + ALL_LAYOUT, + phi::fusion::SoftmaxMaskFuseKernel, + float, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h new file mode 100644 index 00000000000000..2847a4df8391e0 --- /dev/null +++ b/paddle/phi/kernels/fusion/gpu/fused_softmax_mask_utils.h @@ -0,0 +1,95 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#ifdef PADDLE_WITH_CUDA +#include +#include +#endif +#ifdef PADDLE_WITH_HIP +#include +#include +#endif + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +#ifdef PADDLE_WITH_HIP +#define WARP_SIZE 64 +#else +#define WARP_SIZE 32 +#endif + +#define MASK 0xffffffff + +namespace phi { +namespace fusion { + +__device__ __inline__ void load_data(dtype::float16* dst, + const dtype::float16* src) { + *(reinterpret_cast(dst)) = *(reinterpret_cast(src)); +} + +__device__ __inline__ void load_data(float* dst, const float* src) { + *(reinterpret_cast(dst)) = *(reinterpret_cast(src)); +} + +inline int get_pow2(int value) { + // get next pow2 index + int pow2_index = 0; + while ((1 << pow2_index) < value) { + ++pow2_index; + } + return pow2_index; +} + +template +struct AddOP { + __device__ __forceinline__ T operator()(T a, T b) const { return a + b; } +}; + +template +struct MaxOP { + __device__ __forceinline__ T operator()(T a, T b) const { + return a < b ? 
b : a; + } +}; + +template +__device__ __forceinline__ T +warp_shfl_xor(T value, int laneMask, int width, unsigned int mask = MASK) { +#if CUDA_VERSION >= 9000 + return __shfl_xor_sync(mask, value, laneMask, width); +#else + return __shfl_xor(value, laneMask, width); +#endif +} + +template class ReduceOp> +__device__ __forceinline__ void warp_reduce(T* sum) { + ReduceOp r; +#pragma unroll + for (int offset = width / 2; offset > 0; offset /= 2) { +#pragma unroll + for (int i = 0; i < batch; ++i) { + T b = warp_shfl_xor(sum[i], offset, width); + sum[i] = r(sum[i], b); + } + } +} + +} // namespace fusion +} // namespace phi + +#endif diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu index 53f727ec51a39b..b947c70cb89d49 100644 --- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu @@ -449,4 +449,5 @@ PD_REGISTER_KERNEL(pow_grad, double, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index 0e9e754a997066..e57332c40756af 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -265,5 +265,12 @@ PD_REGISTER_KERNEL(pow, double, int, int64_t, - phi::dtype::float16) {} -PD_REGISTER_KERNEL(selu, GPU, ALL_LAYOUT, phi::SeluKernel, float, double) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} +PD_REGISTER_KERNEL(selu, + GPU, + ALL_LAYOUT, + phi::SeluKernel, + float, + double, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/adam_kernel.cu b/paddle/phi/kernels/gpu/adam_kernel.cu index b20e8610fefaf2..b85e1dcdb82592 100644 --- a/paddle/phi/kernels/gpu/adam_kernel.cu +++ b/paddle/phi/kernels/gpu/adam_kernel.cu @@ -373,7 +373,8 @@ PD_REGISTER_KERNEL(adam, phi::AdamDenseKernel, float, double, - phi::dtype::float16) { + phi::dtype::float16, + phi::dtype::bfloat16) { // Skip beta1_pow, beta2_pow, skip_update data transform kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); @@ -386,7 +387,8 @@ PD_REGISTER_KERNEL(merged_adam, phi::MergedAdamKernel, float, double, - phi::dtype::float16) { + phi::dtype::float16, + phi::dtype::bfloat16) { // Skip beta1_pow, beta2_pow data transform kernel->InputAt(5).SetBackend(phi::Backend::ALL_BACKEND); kernel->InputAt(6).SetBackend(phi::Backend::ALL_BACKEND); diff --git a/paddle/phi/kernels/gpu/add_n_kernel.cu b/paddle/phi/kernels/gpu/add_n_kernel.cu index 981ec24dac6b19..f32ba597f5b68b 100644 --- a/paddle/phi/kernels/gpu/add_n_kernel.cu +++ b/paddle/phi/kernels/gpu/add_n_kernel.cu @@ -14,16 +14,27 @@ #include "paddle/phi/kernels/add_n_kernel.h" +#include "paddle/phi/kernels/impl/add_n_kernel_impl.h" + #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/funcs/math_function.h" namespace phi { #define CEIL_DIV(x, y) (((x) + (y)-1) / (y)) +template +__global__ void Sum2CUDAKernel(const T *in_0, + const T *in_1, + T *out, + int64_t N) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + while (id < N) { + out[id] = in_0[id] + in_1[id]; + id += blockDim.x * gridDim.x; + } +} + template __global__ void SumArrayCUDAKernel( T **in, T *out, int64_t N, size_t in_size, bool read_dst) { @@ -41,9 +52,26 @@ __global__ void SumArrayCUDAKernel( } 
} +template +__global__ void SumSelectedRowsCUDAKernel(T **sr_in_out, + int64_t N, + size_t rows) { + int id = blockIdx.x * blockDim.x + threadIdx.x; + while (id < N) { + for (int i = 0; i < 2 * rows; i += 2) { + const T *tmp = sr_in_out[i]; + T *tmp_out = sr_in_out[i + 1]; + if (tmp && tmp_out) { + tmp_out[id] += tmp[id]; + } + } + id += blockDim.x * gridDim.x; + } +} + template void AddNKernel(const Context &dev_ctx, - const std::vector &x, + const std::vector &x, DenseTensor *out) { const size_t in_num = x.size(); @@ -66,36 +94,38 @@ void AddNKernel(const Context &dev_ctx, grids = dim3(CEIL_DIV(length, tile_size), 1, 1); blocks = dim3(tile_size, 1, 1); }; + auto *out_ptr = dev_ctx.template Alloc(out); + bool in_place = false; + if (x.size() > 0 && x[0]->initialized() && DenseTensor::classof(x[0])) { + if ((static_cast(x[0]))->data() == out->data()) { + in_place = true; + } + } - bool in_place = x[0] == out; - - if (!in_place) { - auto *out_ptr = dev_ctx.template Alloc(out); - if (in_num >= 1) { - auto &in_0_tensor = *x[0]; - if (in_0_tensor.numel() > 0) { - in_place = (in_0_tensor.data() == out_ptr); - } + if (!in_place && in_num >= 1 && DenseTensor::classof(x[0])) { + auto &in_0_tensor = *(static_cast(x[0])); + if (in_0_tensor.numel() > 0) { + in_place = (in_0_tensor.data() == out_ptr); } } // Sum of two tensors - if (in_num == 2) { - auto &in_0 = *x[0]; - auto &in_1 = *x[1]; + if (in_num == 2 && DenseTensor::classof(x[0]) && DenseTensor::classof(x[1])) { + auto &in_0 = *(static_cast(x[0])); + auto &in_1 = *(static_cast(x[1])); int64_t length_0 = in_0.numel(); int64_t length_1 = in_1.numel(); - if (length_0 && length_1 && in_0.initialized() && in_1.initialized()) { + if (length_0 && length_1 && in_0.IsInitialized() && in_1.IsInitialized()) { auto result = EigenVector::Flatten(*out); auto &place = *dev_ctx.eigen_device(); auto in_0_e = EigenVector::Flatten(in_0); auto in_1_e = EigenVector::Flatten(in_1); result.device(place) = in_0_e + in_1_e; - } else if (length_0 && in_0.initialized()) { + } else if (length_0 && in_0.IsInitialized()) { auto result = EigenVector::Flatten(*out); auto &place = *dev_ctx.eigen_device(); result.device(place) = EigenVector::Flatten(in_0); - } else if (length_1 && in_1.initialized()) { + } else if (length_1 && in_1.IsInitialized()) { auto result = EigenVector::Flatten(*out); auto &place = *dev_ctx.eigen_device(); result.device(place) = EigenVector::Flatten(in_1); @@ -105,27 +135,90 @@ void AddNKernel(const Context &dev_ctx, int start = in_place ? 1 : 0; if (!in_place) { - funcs::SetConstant constant_functor; + phi::funcs::SetConstant constant_functor; constant_functor(dev_ctx, out, static_cast(0)); } std::vector in_data; + std::vector selectrow_index; int64_t lod_length = 0; bool dst_write = false; for (int i = start; i < in_num; ++i) { - auto &in_i = *x[i]; - lod_length = in_i.numel(); - if (lod_length && in_i.initialized()) { - in_data.emplace_back(in_i.data()); + if (DenseTensor::classof(x[i])) { + auto &in_i = *(static_cast(x[i])); + lod_length = in_i.numel(); + if (lod_length && in_i.IsInitialized()) { + in_data.emplace_back(in_i.data()); + } + } else if (SelectedRows::classof(x[i])) { + selectrow_index.push_back(i); } } + // compute select rows separately. 
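Editor's note: a CPU reference sketch (not part of the patch; the function name and container types are illustrative) of what the SelectedRows branch below computes. Each row of a sparse input's value tensor is accumulated into the dense output row named by its rows() index, which is exactly the pairing that SumSelectedRowsCUDAKernel performs on the GPU.

#include <cstdint>
#include <vector>

// out is a dense buffer of shape [height x row_numel]; value holds rows.size()
// source rows of width row_numel; rows[i] is the destination row in out for
// source row i.
template <typename T>
void AddSelectedRowsRef(const std::vector<T>& value,
                        const std::vector<int64_t>& rows,
                        int64_t row_numel,
                        std::vector<T>* out) {
  for (size_t i = 0; i < rows.size(); ++i) {
    for (int64_t j = 0; j < row_numel; ++j) {
      (*out)[rows[i] * row_numel + j] += value[i * row_numel + j];
    }
  }
}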
+ if (!selectrow_index.empty()) { + std::vector sr_in_out_data; + size_t rows = 0; + int64_t length = 0; + for (auto index : selectrow_index) { + auto &sr = *(static_cast(x[index])); + auto &sr_value = sr.value(); + auto &sr_rows = sr.rows(); + + auto row_numel = sr_value.numel() / sr_rows.size(); + auto out_dims = out->dims(); + + PADDLE_ENFORCE_EQ(sr.height(), + out_dims[0], + errors::InvalidArgument( + "The table height of input must be same as output, " + "but received input height is %d" + ", output height is %d", + sr.height(), + out_dims[0])); + PADDLE_ENFORCE_EQ(row_numel, + out->numel() / sr.height(), + errors::InvalidArgument( + "The table width of input must be same as output, " + "but received input width is %d" + ", output width is %d", + row_numel, + out->numel() / sr.height())); + + auto *sr_data = sr_value.data(); + auto *sr_out_data = out->data(); + rows += sr_rows.size(); + length = row_numel; + + for (size_t i = 0; i < sr_rows.size(); ++i) { + sr_in_out_data.emplace_back(&sr_data[i * row_numel]); + sr_in_out_data.emplace_back(&sr_out_data[sr_rows[i] * row_numel]); + } + } + if (!sr_in_out_data.empty()) { + auto tmp_sr_in_out_array = paddle::memory::Alloc( + dev_ctx.GetPlace(), sr_in_out_data.size() * sizeof(T *)); + + paddle::memory::Copy(dev_ctx.GetPlace(), + tmp_sr_in_out_array->ptr(), + phi::CPUPlace(), + reinterpret_cast(sr_in_out_data.data()), + sr_in_out_data.size() * sizeof(T *), + dev_ctx.stream()); + + T **sr_in_out_array_data = + reinterpret_cast(tmp_sr_in_out_array->ptr()); + + ComputeKernelParameter(length); + SumSelectedRowsCUDAKernel + <<>>(sr_in_out_array_data, length, rows); + dst_write = true; + } + } // if indata not null, merge into one kernel call. if (!in_data.empty()) { - auto tmp_in_array = paddle::memory::Alloc( - dev_ctx.GetPlace(), - in_data.size() * sizeof(T *), - phi::Stream(reinterpret_cast(dev_ctx.stream()))); + auto tmp_in_array = + paddle::memory::Alloc(dev_ctx.GetPlace(), in_data.size() * sizeof(T *)); paddle::memory::Copy(dev_ctx.GetPlace(), tmp_in_array->ptr(), @@ -153,6 +246,17 @@ PD_REGISTER_KERNEL(add_n, float, double, int, - int64_t, phi::dtype::bfloat16, - phi::dtype::float16) {} + phi::dtype::float16, + int64_t) {} + +PD_REGISTER_KERNEL(add_n_array, + GPU, + ALL_LAYOUT, + phi::AddNArrayKernel, + float, + double, + int, + phi::dtype::bfloat16, + phi::dtype::float16, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/clip_grad_kernel.cu b/paddle/phi/kernels/gpu/clip_grad_kernel.cu index 4566e8468ec164..60d311a2555a0d 100644 --- a/paddle/phi/kernels/gpu/clip_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/clip_grad_kernel.cu @@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(clip_grad, double, int, int64_t, + phi::dtype::bfloat16, phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/clip_kernel.cu b/paddle/phi/kernels/gpu/clip_kernel.cu index 9e0050db7fdbf1..e8d519a5d3a2b9 100644 --- a/paddle/phi/kernels/gpu/clip_kernel.cu +++ b/paddle/phi/kernels/gpu/clip_kernel.cu @@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(clip, double, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu index 1a4559d5cd6b58..76201a1077edbb 100644 --- a/paddle/phi/kernels/gpu/cross_entropy_kernel.cu +++ b/paddle/phi/kernels/gpu/cross_entropy_kernel.cu @@ -170,7 +170,7 @@ __global__ void CrossEntropySoftLabel(T* loss, /* Hard label cross entropy. 
*/ -template +template __global__ void CrossEntropyHardLabel(T* loss, const T* softmax, const LabelT* labels, @@ -185,21 +185,17 @@ __global__ void CrossEntropyHardLabel(T* loss, // thread ids compute loss[ids] using softmax[idx] if (ids < n * d) { auto lbl = static_cast(labels[ids]); - if (lbl < 0) { // label is negative + PADDLE_ENFORCE(lbl >= 0 && lbl < dim || lbl == ignore_idx, + "The value of label expected >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + dim, + ignore_idx, + lbl); + if (lbl == ignore_idx) { loss[ids] = static_cast(0.0); - } else { // label is positive of zero + } else { int64_t idx = idx_n * dim * d + lbl * d + idx_d; - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (lbl == ignore_idx) { - loss[ids] = static_cast(0.0); - } else { - loss[ids] = -Log(softmax[idx]); - } - } else { - // IgnoreIndex is false - loss[ids] = -Log(softmax[idx]); - } + loss[ids] = -Log(softmax[idx]); } } } @@ -209,7 +205,7 @@ __global__ void CrossEntropyHardLabel(T* loss, Input: log softmax Output: loss and exp(input) */ -template +template __global__ void CrossEntropyExpHardLabel(T* loss, T* softmax, const LabelT* labels, @@ -225,23 +221,17 @@ __global__ void CrossEntropyExpHardLabel(T* loss, if (idx < n * dim * d) { auto lbl = static_cast(labels[ids]); - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (idx_dim == lbl) { - if (lbl == ignore_idx) { - loss[ids] = static_cast(0.0); - } else { - loss[ids] = -softmax[idx]; - } - } + PADDLE_ENFORCE(lbl >= 0 && lbl < dim || lbl == ignore_idx, + "The value of label expected >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + dim, + ignore_idx, + lbl); + if (lbl == ignore_idx) { + loss[ids] = static_cast(0.0); } else { - // IgnoreIndex is false - if (lbl >= 0 && lbl < dim) { - if (lbl == idx_dim) { - loss[ids] = -softmax[idx]; - } - } else { - loss[ids] = static_cast(0.0); + if (lbl == idx_dim) { + loss[ids] = -softmax[idx]; } } softmax[idx] = Exp(softmax[idx]); @@ -290,7 +280,7 @@ __device__ __forceinline__ AccT ThreadReduce(const T* input, return val; } -template +template __device__ __forceinline__ void ComputeLoss(T* loss, const T loss_value, const int label_id, @@ -300,14 +290,8 @@ __device__ __forceinline__ void ComputeLoss(T* loss, const int offset, const int ignore_index) { int loss_id = vec_size * tid + offset; - if (IgnoreIndex) { - if (label_value == loss_id) { - if (label_value == ignore_index) { - loss[label_id] = static_cast(0.0f); - } else { - loss[label_id] = loss_value; - } - } + if (label_value == ignore_index) { + loss[label_id] = static_cast(0.0f); } else { if (label_value == loss_id) { loss[label_id] = loss_value; @@ -315,11 +299,7 @@ __device__ __forceinline__ void ComputeLoss(T* loss, } } -template +template __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( T* loss, T* softmax, @@ -333,7 +313,13 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( int tid = threadIdx.x; int label_id = blockIdx.x; auto label_value = static_cast(label[label_id]); - const bool label_valid = label_value >= 0 && label_value < size; + PADDLE_ENFORCE( + label_value >= 0 && label_value < size || label_value == ignore_index, + "The value of label expected >= 0 and < %d, or == %d, " + "but got %ld. 
Please check label value.", + size, + ignore_index, + label_value); int loss_id_offset = 0; if (offset > 0) { @@ -345,16 +331,14 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( AccT log_softmax = func(static_cast(logits[tid])); softmax[tid] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - 1, - loss_id_offset, - ignore_index); - } + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + 1, + loss_id_offset, + ignore_index); } size -= blockDim.x; logits += blockDim.x; @@ -380,16 +364,14 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( outs[i] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - VecSize, - loss_id_offset + i, - ignore_index); - } + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + VecSize, + loss_id_offset + i, + ignore_index); } // write @@ -403,29 +385,18 @@ __device__ __forceinline__ void VectorizedSoftmaxForwardImpl( softmax[tid] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - 1, - loss_id_offset, - ignore_index); - } - } - - // invalid label, write once - if (!label_valid && threadIdx.x == 0) { - loss[label_id] = static_cast(0.0f); + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + 1, + loss_id_offset, + ignore_index); } } -template +template __device__ __forceinline__ void ScalarSoftmaxForwardImpl( T* loss, T* softmax, @@ -438,7 +409,13 @@ __device__ __forceinline__ void ScalarSoftmaxForwardImpl( int remain = size % (VecSize * blockDim.x); int label_id = blockIdx.x; auto label_value = static_cast(label[label_id]); - const bool label_valid = label_value >= 0 && label_value < size; + PADDLE_ENFORCE( + label_value >= 0 && label_value < size || label_value == ignore_index, + "The value of label expected >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + size, + ignore_index, + label_value); // main part for (; tid < (size - remain); tid += VecSize * blockDim.x) { @@ -453,16 +430,14 @@ __device__ __forceinline__ void ScalarSoftmaxForwardImpl( AccT log_softmax = func(static_cast(ins[i])); softmax[tid + i * blockDim.x] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - VecSize, - i, - ignore_index); - } + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + VecSize, + i, + ignore_index); } } @@ -471,29 +446,18 @@ __device__ __forceinline__ void ScalarSoftmaxForwardImpl( AccT log_softmax = func(static_cast(logits[tid])); softmax[tid] = static_cast(std::exp(log_softmax)); // loss - if (label_valid) { - ComputeLoss(loss, - static_cast(-log_softmax), - label_id, - label_value, - tid, - 1, - 0, - ignore_index); - } - } - - // invalid label, write once - if (!label_valid && threadIdx.x == 0) { - loss[label_id] = static_cast(0.0f); + ComputeLoss(loss, + static_cast(-log_softmax), + label_id, + label_value, + tid, + 1, + 0, + ignore_index); } } -template +template __global__ void VectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, @@ -533,17 +497,16 @@ __global__ void VectorizedSoftmaxForward(T* loss, // 3. 
softmax phi::LogSoftmaxForwardFunctor func(max, sum); if (input_offset == output_offset) { - VectorizedSoftmaxForwardImpl( - loss, - softmax, - logits, - label, - mid_dim, - input_offset, - func, - ignore_index); + VectorizedSoftmaxForwardImpl(loss, + softmax, + logits, + label, + mid_dim, + input_offset, + func, + ignore_index); } else { - ScalarSoftmaxForwardImpl( + ScalarSoftmaxForwardImpl( loss, softmax, logits, label, mid_dim, func, ignore_index); } } @@ -556,8 +519,8 @@ The computation includes - Compute: sum of - sum_{j}{ label_{i,j} * (src_{i,j} - maxvalue_{i} - log(sum[i]))} One warp (32 threads) is used to compute 1 or 2 batch (kBatchSize). -For reduction max (sum), firstly compute max (sum) to one warp, then use shuffle -api to compute max (sum) in one warp. +For reduction max (sum), firstly compute max (sum) to one warp, then use +shuffle api to compute max (sum) in one warp. */ template __global__ void WarpSoftmaxForwardSoftLabel(T* loss, @@ -876,8 +839,7 @@ template + SoftmaxMode mode> __global__ void WarpSoftmaxForward(T* loss, T* softmax, const T* src, @@ -1029,23 +991,21 @@ __global__ void WarpSoftmaxForward(T* loss, // label int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize; auto lbl = static_cast(label[first_batch + i]); - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (lbl == loss_idx) { - if (lbl != ignore_index) { - loss[first_batch + i] = -logsoftmax; - } else { - loss[first_batch + i] = static_cast(0.0); - } - } + if (lbl == ignore_index) { + loss[first_batch + i] = static_cast(0.0); } else { - // IgnoreIndex is false if (lbl >= 0 && lbl < element_count) { if (lbl == loss_idx) { loss[first_batch + i] = -logsoftmax; } } else { - loss[first_batch + i] = static_cast(0.0); + PADDLE_ENFORCE( + false, + "The value of label expected >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + element_count, + ignore_index, + lbl); } } } else { // softmax @@ -1072,19 +1032,21 @@ __global__ void WarpSoftmaxForward(T* loss, // label int loss_idx = (threadIdx.x + it * kWarpSize) * kVSize + s; auto lbl = static_cast(label[first_batch + i]); - if (IgnoreIndex == true) { - // IgnoreIndex is true - if (lbl == loss_idx && lbl != ignore_index) { - loss[first_batch + i] = -logsoftmax; - } + if (lbl == ignore_index) { + loss[first_batch + i] = static_cast(0.0); } else { - // IgnoreIndex is false if (lbl >= 0 && lbl < element_count) { if (lbl == loss_idx) { loss[first_batch + i] = -logsoftmax; } } else { - loss[first_batch + i] = static_cast(0.0); + PADDLE_ENFORCE( + false, + "The value of label expected >= 0 and < %d, or == %d, " + "but got %ld. Please check label value.", + element_count, + ignore_index, + lbl); } } } else { // softmax @@ -1101,23 +1063,23 @@ __global__ void WarpSoftmaxForward(T* loss, } } -#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, LabelT, VecT, AccT) \ - case Log2Elements: \ - WarpSoftmaxForward \ - <<>>(loss, \ - softmax, \ - src, \ - label, \ - batch_size, \ - stride, \ - element_count, \ - ignore_index); \ +#define SOFTMAX_WARP_FORWARD_CASE(Log2Elements, LabelT, VecT, AccT) \ + case Log2Elements: \ + WarpSoftmaxForward \ + <<>>(loss, \ + softmax, \ + src, \ + label, \ + batch_size, \ + stride, \ + element_count, \ + ignore_index); \ break; /* Wrapper of softmax with cross entropy forward hard label. 
*/ -template +template void SwitchWarpSoftmaxForward(T* loss, T* softmax, const T* src, @@ -1156,7 +1118,7 @@ void SwitchWarpSoftmaxForward(T* loss, } } -template +template void LaunchVectorizedSoftmaxForward(T* loss, T* softmax, const T* logits, @@ -1180,7 +1142,7 @@ void LaunchVectorizedSoftmaxForward(T* loss, block_size = std::max(block_size, kps::details::kWarpSize); dim3 grids(high_dim); dim3 blocks(block_size); - VectorizedSoftmaxForward + VectorizedSoftmaxForward <<>>( loss, softmax, logits, label, high_dim, mid_dim, ignore_index); } @@ -1191,7 +1153,7 @@ void LaunchVectorizedSoftmaxForward(T* loss, - LaunchVectorizedSoftmaxForward for large size when axis == -1 - cudnn function for axis != -1 */ -template +template static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, int rank, int axis, @@ -1208,24 +1170,24 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, if (D == 1) { if (dim <= max_dim) { // small size const SoftmaxMode mode = SoftmaxMode::kCrossEntropy; - SwitchWarpSoftmaxForward(loss_data, - softmax_data, - logits_data, - labels_data, - N, - dim, - dim, - ignore_index, - stream); + SwitchWarpSoftmaxForward(loss_data, + softmax_data, + logits_data, + labels_data, + N, + dim, + dim, + ignore_index, + stream); } else { // large size - LaunchVectorizedSoftmaxForward(loss_data, - softmax_data, - logits_data, - labels_data, - N, - dim, - ignore_index, - stream); + LaunchVectorizedSoftmaxForward(loss_data, + softmax_data, + logits_data, + labels_data, + N, + dim, + ignore_index, + stream); } } else { ScopedTensorDescriptor desc; @@ -1269,9 +1231,8 @@ static void SoftmaxWithCrossEntropyHardLabel(const GPUContext& dev_ctx, int threads = 128; int blocks = (N * dim * D + threads - 1) / threads; // compute cross entropy, input is log softmax - CrossEntropyExpHardLabel - <<>>( - loss_data, softmax_data, labels_data, N, dim, D, ignore_index); + CrossEntropyExpHardLabel<<>>( + loss_data, softmax_data, labels_data, N, dim, D, ignore_index); } } @@ -1367,25 +1328,14 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx, auto* labels_data = labels.data(); int threads = 128; int blocks = (n * d / axis_dim + threads - 1) / threads; - if (ignore_index >= 0 && ignore_index < axis_dim) { - CrossEntropyHardLabel - <<>>(loss_data, - logits_data, - labels_data, - n, - axis_dim, - d / axis_dim, - ignore_index); - } else { - CrossEntropyHardLabel - <<>>(loss_data, - logits_data, - labels_data, - n, - axis_dim, - d / axis_dim, - ignore_index); - } + CrossEntropyHardLabel + <<>>(loss_data, + logits_data, + labels_data, + n, + axis_dim, + d / axis_dim, + ignore_index); } // cause of input is softmax @@ -1450,31 +1400,17 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx, } else { auto* logits_data = logits.data(); auto* labels_data = label.data(); - if (ignore_index >= 0 && ignore_index < axis_dim) { - SoftmaxWithCrossEntropyHardLabel(dev_ctx, - rank, - axis_v, - logits_data, - labels_data, - loss_data, - softmax_data, - n, - axis_dim, - d / axis_dim, - ignore_index); - } else { - SoftmaxWithCrossEntropyHardLabel(dev_ctx, - rank, - axis_v, - logits_data, - labels_data, - loss_data, - softmax_data, - n, - axis_dim, - d / axis_dim, - ignore_index); - } + SoftmaxWithCrossEntropyHardLabel(dev_ctx, + rank, + axis_v, + logits_data, + labels_data, + loss_data, + softmax_data, + n, + axis_dim, + d / axis_dim, + ignore_index); } } } diff --git a/paddle/phi/kernels/gpu/depthwise_conv.h b/paddle/phi/kernels/gpu/depthwise_conv.h index 
eae7b77519911a..3b5aa7e61e7862 100644 --- a/paddle/phi/kernels/gpu/depthwise_conv.h +++ b/paddle/phi/kernels/gpu/depthwise_conv.h @@ -87,43 +87,36 @@ class DepthwiseConvFilterGradFunctor { const DataLayout data_layout = DataLayout::kNCHW); }; +#define FINAL_MASK 0xffffffff +#define HALF_WARP 16 +#define WARP_SIZE 32 + template -static __forceinline__ __device__ T WarpReduceSum(T val, int warp_size) { - typedef cub::WarpReduce WarpReduce; - typename WarpReduce::TempStorage temp_storage; - val = WarpReduce(temp_storage).Sum(val, warp_size); +__forceinline__ __device__ T WarpReduceSum(T val, unsigned lane_mask) { + for (int mask = HALF_WARP; mask > 0; mask >>= 1) + val += platform::CudaShuffleDownSync(lane_mask, val, mask); return val; } template -__forceinline__ __device__ T BlockReduceSum(T val) { - static __shared__ T shared[32]; - int thread_id = threadIdx.x + threadIdx.y * blockDim.x + - threadIdx.z * blockDim.x * blockDim.y; - int warp_size = min(blockDim.x * blockDim.y * blockDim.z, warpSize); - int lane = thread_id % warp_size; - int wid = thread_id / warp_size; - - val = WarpReduceSum(val, warp_size); // Each warp performs partial reduction - - if (lane == 0) shared[wid] = val; // Write reduced value to shared memory - __syncthreads(); // Wait for all partial reductions - - // read from shared memory only if that warp existed - int block_size = blockDim.x * blockDim.y * blockDim.z; - if (thread_id < (block_size - 1) / warp_size + 1) { - val = shared[lane]; - } else { - val = static_cast(0); - } +__forceinline__ __device__ T BlockReduceSum(T val, unsigned mask = FINAL_MASK) { + static __shared__ T shared[WARP_SIZE]; + int tid = threadIdx.y * blockDim.x + threadIdx.x; + int lane = tid & 0x1f; + int wid = tid >> 5; + + val = WarpReduceSum(val, mask); - if (wid == 0) { - val = WarpReduceSum(val, warp_size); // Final reduce within first warp - } __syncthreads(); - if (thread_id != 0) { - val = static_cast(0); - } + if (lane == 0) shared[wid] = val; + + __syncthreads(); + + // align block_span to WARP_SIZE + int block_span = (blockDim.x * blockDim.y + WARP_SIZE - 1) >> 5; + val = (lane < block_span) ? shared[lane] : static_cast(0.0f); + val = WarpReduceSum(val, mask); + return val; } @@ -139,55 +132,53 @@ __forceinline__ __device__ T BlockReduceSum(T val) { // A Cuda kernel to compute the depthwise convolution forward pass // in NCHW format. -template +template __device__ __inline__ void KernelDepthwiseConvNCHW( ARG_DEFINE_KernelDepthwiseConv) { + const int fw_size = c_filter != -1 ? c_filter : filter_width; + const int fh_size = c_filter != -1 ? 
c_filter : filter_height; int idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx >= (output_channels * batch_size * output_height * output_width)) return; - const int w_out = idx % output_width; - const int h_out = (idx / output_width) % output_height; - const int c_out = (idx / output_width / output_height) % output_channels; - const int batch = idx / output_width / output_height / output_channels; + int tmp_1 = idx / output_width; + const int w_out = idx - tmp_1 * output_width; + int tmp_2 = tmp_1 / output_height; + const int h_out = tmp_1 - tmp_2 * output_height; + tmp_1 = tmp_2; + tmp_2 = tmp_1 / output_channels; + const int c_out = tmp_1 - tmp_2 * output_channels; + const int batch = tmp_2; const int c_in = c_out / filter_multiplier; - const T* weight = filter_data + c_out * filter_height * filter_width; T value(0); - const int h_in_start = -padding_height + h_out * stride_height; - const int w_in_start = -padding_width + w_out * stride_width; - const int h_in_end = h_in_start + filter_height * dilate_height; - const int w_in_end = w_in_start + filter_width * dilate_width; int in_offset = ((batch * input_channels + c_in) * input_height) * input_width; - - const int h_end = h_in_end < input_height ? h_in_end : input_height; - const int w_end = w_in_end < input_width ? w_in_end : input_width; - const int h_start = h_in_start > 0 ? h_in_start : 0; - const int w_start = w_in_start > 0 ? w_in_start : 0; - int weight_offset = 0; + int weight_offset = c_out * filter_height * filter_width; + int h_in_start = -padding_height + h_out * stride_height; + int w_in_start = -padding_width + w_out * stride_width; #pragma unroll - for (int h_in = h_in_start; h_in < h_in_end; h_in += dilate_height) { + for (int fh = 0, h_in = h_in_start; fh < fh_size; + fh++, h_in += dilate_height) { #pragma unroll - for (int w_in = w_in_start; w_in < w_in_end; w_in += dilate_width) { - if (h_in >= h_start && h_in < h_end && w_in >= w_start && w_in < w_end) { + for (int fw = 0, w_in = w_in_start; fw < fw_size; + fw++, w_in += dilate_width) { + if (h_in >= 0 && h_in < input_height && w_in >= 0 && w_in < input_width) { int offset = in_offset + h_in * input_width + w_in; T in_data = input_data[offset]; if (fuse_relu_before_conv) { - value += weight[weight_offset] * T(max(0.0f, double(in_data))); + value += filter_data[weight_offset] * + static_cast(max(0.0f, static_cast(in_data))); } else { - value += weight[weight_offset] * in_data; + value += filter_data[weight_offset] * in_data; } } weight_offset++; } } - int index = batch * output_channels * output_height * output_width + - c_out * output_height * output_width + h_out * output_width + - w_out; - output_data[index] = value; + output_data[idx] = value; } // A Cuda kernel to compute the depthwise convolution forward pass @@ -228,7 +219,8 @@ __device__ __inline__ void KernelDepthwiseConvNHWC( T in_data = input_data[offset]; const T* weight = filter_data + weight_offset * output_channels + c_out; if (fuse_relu_before_conv) { - value += weight[0] * T(max(0.0f, double(in_data))); + value += weight[0] * + static_cast(max(0.0f, static_cast(in_data))); } else { value += weight[0] * in_data; } @@ -281,7 +273,8 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNCHW( int offset = in_offset + h_in * input_width + w_in; if (fuse_relu_before_conv) { value += r_weight[h_f * c_filter + w_f] * - T(max(0.0f, double(input_data[offset]))); + static_cast( + max(0.0f, static_cast(input_data[offset]))); } else { value += r_weight[h_f * c_filter + w_f] * input_data[offset]; } @@ -337,7 
+330,8 @@ __device__ __inline__ void KernelDepthwiseConvCFilterNHWC( in_offset + (h_in * input_width + w_in) * input_channels + c_in; if (fuse_relu_before_conv) { value += r_weight[h_f * c_filter + w_f] * - T(max(0.0, double(input_data[offset]))); + static_cast( + max(0.0, static_cast(input_data[offset]))); } else { value += r_weight[h_f * c_filter + w_f] * input_data[offset]; } @@ -367,25 +361,26 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { } if (c_filter == -1) { if (data_layout != DataLayout::kNHWC) { - KernelDepthwiseConvNCHW(input_data, - filter_data, - batch_size, - output_channels, - output_height, - output_width, - input_channels, - input_height, - input_width, - final_filter_multiplier, - filter_height, - filter_width, - h_stride, - w_stride, - padding_height, - padding_width, - dilate_height, - dilate_width, - output_data); + KernelDepthwiseConvNCHW( + input_data, + filter_data, + batch_size, + output_channels, + output_height, + output_width, + input_channels, + input_height, + input_width, + final_filter_multiplier, + filter_height, + filter_width, + h_stride, + w_stride, + padding_height, + padding_width, + dilate_height, + dilate_width, + output_data); } else { KernelDepthwiseConvNHWC(input_data, filter_data, @@ -467,60 +462,62 @@ __global__ void KernelDepthwiseConvSp(ARG_DEFINE_KernelDepthwiseConv) { const int dilate_height, const int dilate_width, \ T *const input_grad_data -template +template __device__ __inline__ void KernelDepthwiseConvInputGradNCHW( ARG_DEFINE_KernelDepthwiseConvInputGrad) { - const int batch = blockIdx.y; - const int c_in = blockIdx.x; - for (int w_in = threadIdx.x; w_in < input_width; w_in += blockDim.x) { - for (int h_in = threadIdx.y; h_in < input_height; h_in += blockDim.y) { - const int c_out_start = c_in * filter_multiplier; - int h_out_start = - h_in - (filter_height - 1) * dilate_height + padding_height; - int h_out_end = h_in + padding_height; - int w_out_start = - w_in - (filter_width - 1) * dilate_width + padding_width; - int w_out_end = w_in + padding_width; + const int fw_size = c_filter != -1 ? c_filter : filter_width; + const int fh_size = c_filter != -1 ? 
c_filter : filter_height; + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= batch_size * input_channels * input_height * input_width) { + return; + } + if (fuse_relu_before_conv) { + if (input_data[idx] <= static_cast(0.0f)) { + input_grad_data[idx] = 0; + return; + } + } - T value(0); - int index = - ((batch * gridDim.x + c_in) * input_height + h_in) * input_width + - w_in; + int tmp_1 = idx / input_width; + const int w_in = idx - tmp_1 * input_width; + int tmp_2 = tmp_1 / input_height; + const int h_in = tmp_1 - tmp_2 * input_height; + tmp_1 = tmp_2; + tmp_2 = tmp_1 / input_channels; + const int c_in = tmp_1 - tmp_2 * input_channels; + const int batch = tmp_2; - if (fuse_relu_before_conv) { - if (input_data[index] <= T(0)) { - input_grad_data[index] = 0; - continue; - } - } + T value(0); + for (int c_mul = 0; c_mul < filter_multiplier; ++c_mul) { + int c_out = c_in * filter_multiplier + c_mul; + int filter_offset = c_out * filter_height * filter_width; - for (int c_out = c_out_start; c_out < c_out_start + filter_multiplier; - c_out++) { - int filter_offset = (c_out + 1) * filter_height * filter_width; - for (int h_out = h_out_start; h_out <= h_out_end; - h_out += dilate_height) { - for (int w_out = w_out_start; w_out <= w_out_end; - w_out += dilate_width) { - filter_offset--; - int s_h_out = h_out / stride_height; - int s_w_out = w_out / stride_width; - if (h_out % stride_height == 0 && w_out % stride_width == 0 && - s_h_out >= 0 && s_h_out < output_height && s_w_out >= 0 && - s_w_out < output_width) { - int output_grad_offset = - ((batch * output_channels + c_out) * output_height + - s_h_out) * - output_width + - s_w_out; - value += output_grad_data[output_grad_offset] * - filter_data[filter_offset]; - } +#pragma unroll + for (int fh = 0; fh < fh_size; ++fh) { +#pragma unroll + for (int fw = 0; fw < fw_size; ++fw) { + int h_out = h_in + padding_height - fh * dilate_height; + int w_out = w_in + padding_width - fw * dilate_width; + if ((h_out - h_out / stride_height * stride_height == 0) && + (w_out - w_out / stride_width * stride_width == 0)) { + h_out /= stride_height; + w_out /= stride_width; + + if (h_out >= 0 && h_out < output_height && w_out >= 0 && + w_out < output_width) { + int output_grad_offset = + ((batch * output_channels + c_out) * output_height + h_out) * + output_width + + w_out; + value += output_grad_data[output_grad_offset] * + filter_data[filter_offset]; } } + filter_offset++; } - input_grad_data[index] = value; } } + input_grad_data[idx] = value; } template @@ -733,7 +730,7 @@ __global__ void KernelDepthwiseConvInputGradSp( if (c_filter_multiplier == 0 || c_filter == -1) { if (data_layout != DataLayout::kNHWC) { - KernelDepthwiseConvInputGradNCHW( + KernelDepthwiseConvInputGradNCHW( input_data, output_grad_data, filter_data, @@ -854,44 +851,81 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNCHW( const int dilate_height, const int dilate_width, T* filter_grad_data) { - T s(0); - int gbid = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; - - for (int image_w = threadIdx.x; image_w < output_width; - image_w += blockDim.x) { - for (int bid = 0; bid < num; bid++) { - for (int image_h = threadIdx.y; image_h < output_height; - image_h += blockDim.y) { - int kernel_id = blockIdx.z; - int kernel_h = blockIdx.y * dilate_height - padding_height; - int kernel_w = blockIdx.x * dilate_width - padding_width; - - int image_hk = image_h * stride_height + kernel_h; - int image_wk = image_w * stride_width + kernel_w; - if (image_hk < 0 || 
image_hk >= input_height) continue; - if (image_wk < 0 || image_wk >= input_width) continue; -#define gaid(N, C, H, W) \ - ((((N)*gridDim.z + (C)) * output_height + (H)) * output_width + (W)) - int input_id = ((bid * (gridDim.z / filter_multiplier) + - kernel_id / filter_multiplier) * - input_height + - image_hk) * - input_width + - image_wk; + T f_grad(0); + const bool loop_batch = output_height * output_width >= WARP_SIZE; + + int kw_id = blockIdx.x; + int kh_id = blockIdx.y; + int oc_id = blockIdx.z; + int ic_id = oc_id / filter_multiplier; + int idx = ((blockIdx.z * gridDim.y) + blockIdx.y) * gridDim.x + blockIdx.x; + + const int ohw = output_height * output_width; + const int onhw = num * ohw; + const int h_offset = kh_id * dilate_height - padding_height; + const int w_offset = kw_id * dilate_width - padding_width; + + if (loop_batch) { + for (int og_w = threadIdx.x; og_w < output_width; og_w += blockDim.x) { + for (int bid = 0; bid < num; ++bid) { + for (int og_h = threadIdx.y; og_h < output_height; og_h += blockDim.y) { + int i_h = og_h * stride_height + h_offset; + int i_w = og_w * stride_width + w_offset; + + if (i_w >= 0 && i_w < input_width && i_h >= 0 && i_h < input_height) { + int input_offset = + ((bid * input_channels + ic_id) * input_height + i_h) * + input_width + + i_w; + int output_grad_offset = + ((bid * output_channels + oc_id) * output_height + og_h) * + output_width + + og_w; + if (fuse_relu_before_conv) { + f_grad += + output_grad_data[output_grad_offset] * + static_cast( + max(0.0f, static_cast(input_data[input_offset]))); + } else { + f_grad += output_grad_data[output_grad_offset] * + input_data[input_offset]; + } + } + } + } + } + } else { + for (int id = threadIdx.x; id < onhw; id += blockDim.x) { + int bid = id / ohw; + int og_hw = id - bid * ohw; + int og_h = og_hw / output_width; + int og_w = og_hw - og_h * output_width; + + int i_h = og_h * stride_height + h_offset; + int i_w = og_w * stride_width + w_offset; + + if (i_w >= 0 && i_w < input_width && i_h >= 0 && i_h < input_height) { + int input_offset = + ((bid * input_channels + ic_id) * input_height + i_h) * + input_width + + i_w; + int output_grad_offset = (bid * output_channels + oc_id) * ohw + og_hw; if (fuse_relu_before_conv) { - s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * - T(max(0.0f, double(input_data[input_id]))); + f_grad += output_grad_data[output_grad_offset] * + static_cast(max( + 0.0f, static_cast(input_data[input_offset]))); } else { - s += output_grad_data[gaid(bid, kernel_id, image_h, image_w)] * - input_data[input_id]; + f_grad += + output_grad_data[output_grad_offset] * input_data[input_offset]; } -#undef gaid } } } - T val = BlockReduceSum(s); - platform::CudaAtomicAdd(&filter_grad_data[gbid], val); + T val = BlockReduceSum(f_grad); + if (threadIdx.x == 0 && threadIdx.y == 0) { + filter_grad_data[idx] = val; + } } template @@ -941,7 +975,8 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradNHWC( kernel_id / filter_multiplier; if (fuse_relu_before_conv) { s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] * - T(max(0.0f, double(input_data[input_id]))); + static_cast( + max(0.0f, static_cast(input_data[input_id]))); } else { s += output_grad_data[gaid(bid, image_h, image_w, kernel_id)] * input_data[input_id]; @@ -1013,7 +1048,8 @@ __device__ __inline__ void KernelDepthwiseConvFilterGradCFilterNHWC( T s(0); if (fuse_relu_before_conv) { s = output_grad_data[output_id] * - T(max(0.0f, double(input_data[input_id]))); + static_cast( + max(0.0f, 
static_cast(input_data[input_id]))); } else { s = output_grad_data[output_id] * input_data[input_id]; } @@ -1242,8 +1278,7 @@ class DepthwiseConvFunctor { batch_size); } int filter_multiplier = output_channels / input_channels; - int nums_output = - batch_size * output_channels * output_height * output_width; + int nums_output = output->numel(); #ifdef __HIPCC__ int block_size = 256; #else @@ -1416,6 +1451,13 @@ class DepthwiseConvInputGradFunctor { batch_size); } int filter_multiplier = output_channels / input_channels; + int nums_input = input_grad->numel(); +#ifdef __HIPCC__ + int block_size = 256; +#else + int block_size = 512; +#endif + int grid_size = (nums_input + block_size - 1) / block_size; #define check_case(c_filter_multiplier, c_stride, c_filter) \ if (c_filter_multiplier == 0 || \ @@ -1424,6 +1466,11 @@ class DepthwiseConvInputGradFunctor { (ksize_height == ksize_width && ksize_height == c_filter || \ c_filter == -1)) { \ if (data_layout != DataLayout::kNHWC) { \ + if (c_filter == -1) { \ + threads.x = block_size; \ + grid.x = grid_size; \ + threads.y = threads.z = grid.y = grid.z = 1; \ + } \ KernelDepthwiseConvInputGradSp +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" namespace phi { @@ -35,6 +36,12 @@ __global__ void GatherTree(const T *ids_data, out_data[idx] = ids_data[idx]; auto parent = parents_data[idx]; for (int step = max_length - 2; step >= 0; step--) { + PADDLE_ENFORCE((parent < beam_size), + "The parents must be less than beam size, but recieved" + "parents %ld is greater than or equal to beam size %ld. ", + parent, + beam_size); + idx = step * batch_size * beam_size + batch * beam_size; out_data[idx + beam] = ids_data[idx + parent]; parent = parents_data[idx + parent]; diff --git a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu index 1f33d5c901f297..b1ffa921f912b7 100644 --- a/paddle/phi/kernels/gpu/gelu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/gelu_grad_kernel.cu @@ -99,4 +99,5 @@ PD_REGISTER_KERNEL(gelu_grad, phi::GeluGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/gelu_kernel.cu b/paddle/phi/kernels/gpu/gelu_kernel.cu index 509a5ccf4d177f..e0792c387d7510 100644 --- a/paddle/phi/kernels/gpu/gelu_kernel.cu +++ b/paddle/phi/kernels/gpu/gelu_kernel.cu @@ -93,4 +93,5 @@ PD_REGISTER_KERNEL(gelu, phi::GeluKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu index 6632d3f8b2ec9b..3ea1dbc8e19c20 100644 --- a/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu +++ b/paddle/phi/kernels/gpu/graph_sample_neighbors_kernel.cu @@ -58,7 +58,7 @@ struct MaxFunctor { } }; -template +template __global__ void SampleKernel(const uint64_t rand_seed, int k, const int64_t num_nodes, @@ -71,8 +71,7 @@ __global__ void SampleKernel(const uint64_t rand_seed, T* output_eids, int* output_ptr, bool return_eids) { - assert(blockDim.x == WARP_SIZE); - assert(blockDim.y == BLOCK_WARPS); + assert(blockDim.x == CTA_SIZE); int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y; const int64_t last_row = @@ -80,13 +79,13 @@ __global__ void SampleKernel(const uint64_t rand_seed, #ifdef PADDLE_WITH_HIP hiprandState rng; hiprand_init(rand_seed * gridDim.x + blockIdx.x, - threadIdx.y * WARP_SIZE + threadIdx.x, + threadIdx.y * CTA_SIZE + threadIdx.x, 0, &rng); 
#else - curandState rng; + curandStatePhilox4_32_10_t rng; curand_init(rand_seed * gridDim.x + blockIdx.x, - threadIdx.y * WARP_SIZE + threadIdx.x, + threadIdx.y * CTA_SIZE + threadIdx.x, 0, &rng); #endif @@ -94,7 +93,7 @@ __global__ void SampleKernel(const uint64_t rand_seed, while (out_row < last_row) { T node = nodes[out_row]; if (node > len_col_ptr - 1) { - out_row += BLOCK_WARPS; + out_row += BLOCK_CTAS; continue; } T in_row_start = col_ptr[node]; @@ -102,21 +101,21 @@ __global__ void SampleKernel(const uint64_t rand_seed, int out_row_start = output_ptr[out_row]; if (deg <= k) { - for (int idx = threadIdx.x; idx < deg; idx += WARP_SIZE) { + for (int idx = threadIdx.x; idx < deg; idx += CTA_SIZE) { output[out_row_start + idx] = row[in_row_start + idx]; if (return_eids) { output_eids[out_row_start + idx] = eids[in_row_start + idx]; } } } else { - for (int idx = threadIdx.x; idx < k; idx += WARP_SIZE) { + for (int idx = threadIdx.x; idx < k; idx += CTA_SIZE) { output[out_row_start + idx] = idx; } #ifdef PADDLE_WITH_CUDA - __syncwarp(); + __syncthreads(); #endif - for (int idx = k + threadIdx.x; idx < deg; idx += WARP_SIZE) { + for (int idx = k + threadIdx.x; idx < deg; idx += CTA_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); #else @@ -129,10 +128,10 @@ __global__ void SampleKernel(const uint64_t rand_seed, } } #ifdef PADDLE_WITH_CUDA - __syncwarp(); + __syncthreads(); #endif - for (int idx = threadIdx.x; idx < k; idx += WARP_SIZE) { + for (int idx = threadIdx.x; idx < k; idx += CTA_SIZE) { T perm_idx = output[out_row_start + idx] + in_row_start; output[out_row_start + idx] = row[perm_idx]; if (return_eids) { @@ -141,7 +140,7 @@ __global__ void SampleKernel(const uint64_t rand_seed, } } - out_row += BLOCK_WARPS; + out_row += BLOCK_CTAS; } } @@ -181,12 +180,12 @@ void SampleNeighbors(const Context& dev_ctx, thrust::exclusive_scan( output_count, output_count + bs, output_ptr.begin(), 0); - constexpr int WARP_SIZE = 32; - constexpr int BLOCK_WARPS = 128 / WARP_SIZE; - constexpr int TILE_SIZE = BLOCK_WARPS * 16; - const dim3 block(WARP_SIZE, BLOCK_WARPS); + constexpr int CTA_SIZE = 128; + constexpr int BLOCK_CTAS = 128 / CTA_SIZE; + constexpr int TILE_SIZE = BLOCK_CTAS; + const dim3 block(CTA_SIZE, BLOCK_CTAS); const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE); - SampleKernel + SampleKernel <<>>( 0, sample_size, @@ -202,7 +201,7 @@ void SampleNeighbors(const Context& dev_ctx, return_eids); } -template +template __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, int k, const int64_t num_rows, @@ -210,8 +209,7 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, const T* in_rows, T* src, const T* dst_count) { - assert(blockDim.x == WARP_SIZE); - assert(blockDim.y == BLOCK_WARPS); + assert(blockDim.x == CTA_SIZE); int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y; const int64_t last_row = @@ -221,7 +219,7 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, hiprand_init( rand_seed * gridDim.x + blockIdx.x, threadIdx.y + threadIdx.x, 0, &rng); #else - curandState rng; + curandStatePhilox4_32_10_t rng; curand_init( rand_seed * gridDim.x + blockIdx.x, threadIdx.y + threadIdx.x, 0, &rng); #endif @@ -229,7 +227,7 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, while (out_row < last_row) { const T row = in_rows[out_row]; if (row > len_col_ptr - 1) { - out_row += BLOCK_WARPS; + out_row += BLOCK_CTAS; continue; } const T in_row_start = dst_count[row]; @@ -241,7 +239,7 @@ __global__ void 
FisherYatesSampleKernel(const uint64_t rand_seed, } else { split = deg - k; } - for (int idx = split + threadIdx.x; idx <= deg - 1; idx += WARP_SIZE) { + for (int idx = split + threadIdx.x; idx <= deg - 1; idx += CTA_SIZE) { #ifdef PADDLE_WITH_HIP const int num = hiprand(&rng) % (idx + 1); #else @@ -254,14 +252,14 @@ __global__ void FisherYatesSampleKernel(const uint64_t rand_seed, src[in_row_start + idx]))); } #ifdef PADDLE_WITH_CUDA - __syncwarp(); + __syncthreads(); #endif } - out_row += BLOCK_WARPS; + out_row += BLOCK_CTAS; } } -template +template __global__ void GatherEdge(int k, int64_t num_rows, const T* in_rows, @@ -273,8 +271,7 @@ __global__ void GatherEdge(int k, int* output_ptr, T* perm_data, bool return_eids) { - assert(blockDim.x == WARP_SIZE); - assert(blockDim.y == BLOCK_WARPS); + assert(blockDim.x == CTA_SIZE); int64_t out_row = blockIdx.x * TILE_SIZE + threadIdx.y; const int64_t last_row = @@ -287,7 +284,7 @@ __global__ void GatherEdge(int k, const T out_row_start = output_ptr[out_row]; if (deg <= k) { - for (int idx = threadIdx.x; idx < deg; idx += WARP_SIZE) { + for (int idx = threadIdx.x; idx < deg; idx += CTA_SIZE) { outputs[out_row_start + idx] = src[in_row_start + idx]; if (return_eids) { output_eids[out_row_start + idx] = eids[in_row_start + idx]; @@ -304,7 +301,7 @@ __global__ void GatherEdge(int k, end = deg; } - for (int idx = begin + threadIdx.x; idx < end; idx += WARP_SIZE) { + for (int idx = begin + threadIdx.x; idx < end; idx += CTA_SIZE) { outputs[out_row_start + idx - begin] = src[perm_data[in_row_start + idx]]; if (return_eids) { @@ -313,7 +310,7 @@ __global__ void GatherEdge(int k, } } } - out_row += BLOCK_WARPS; + out_row += BLOCK_CTAS; } } @@ -337,13 +334,13 @@ void FisherYatesSampleNeighbors(const Context& dev_ctx, thrust::exclusive_scan( output_count, output_count + bs, output_ptr.begin(), 0); - constexpr int WARP_SIZE = 32; - constexpr int BLOCK_WARPS = 128 / WARP_SIZE; - constexpr int TILE_SIZE = BLOCK_WARPS * 16; - const dim3 block(WARP_SIZE, BLOCK_WARPS); + constexpr int CTA_SIZE = 128; + constexpr int BLOCK_CTAS = 128 / CTA_SIZE; + constexpr int TILE_SIZE = BLOCK_CTAS; + const dim3 block(CTA_SIZE, BLOCK_CTAS); const dim3 grid((bs + TILE_SIZE - 1) / TILE_SIZE); - FisherYatesSampleKernel + FisherYatesSampleKernel <<>>(0, sample_size, bs, @@ -352,7 +349,7 @@ void FisherYatesSampleNeighbors(const Context& dev_ctx, perm_data, col_ptr); - GatherEdge + GatherEdge <<>>( sample_size, bs, diff --git a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu index c33fbfbd51f475..30194be846db0d 100644 --- a/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/group_norm_grad_kernel.cu @@ -71,14 +71,14 @@ __global__ void GroupNormBackwardGetMeanAndVar(const T* x, if (flags & kHasScale) { #if CUDA_VERSION >= 11070 - platform::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data); + paddle::platform::CudaAtomicAdd(&(d_scale[ccid]), d_scale_data); #else CudaAtomicAddWithWarp(&(d_scale[ccid]), d_scale_data); #endif } if (flags & kHasBias) { #if CUDA_VERSION >= 11070 - platform::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data); + paddle::platform::CudaAtomicAdd(&(d_bias[ccid]), d_bias_data); #else CudaAtomicAddWithWarp(&(d_bias[ccid]), d_bias_data); #endif diff --git a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu index 33bf0eba380e44..c9ee74f0ddf34a 100644 --- a/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu +++ 
b/paddle/phi/kernels/gpu/gumbel_softmax_kernel.cu @@ -170,3 +170,10 @@ struct GumbleNoiseGenerator { PD_REGISTER_KERNEL( gumbel_softmax, GPU, ALL_LAYOUT, phi::GumbelSoftmaxKernel, float, double) {} + +PD_REGISTER_KERNEL(gumbel_softmax_infer, + GPU, + ALL_LAYOUT, + phi::GumbelSoftmaxInferKernel, + float, + double) {} diff --git a/paddle/phi/kernels/gpu/multiplex_kernel.cu b/paddle/phi/kernels/gpu/multiplex_kernel.cu index 743448a4686668..2a86827bcf4752 100644 --- a/paddle/phi/kernels/gpu/multiplex_kernel.cu +++ b/paddle/phi/kernels/gpu/multiplex_kernel.cu @@ -41,7 +41,7 @@ void MultiplexKernel(const Context& ctx, paddle::framework::TensorCopySync(ids, phi::CPUPlace(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.stream(); - for (auto i = 0; i < rows; i++) { + for (auto i = 0; i < ids.dims()[0]; i++) { int32_t k = index[i]; PADDLE_ENFORCE_GE( k, 0, errors::PreconditionNotMet("index must be nonnegative.")); diff --git a/paddle/phi/kernels/gpu/nms_kernel.cu b/paddle/phi/kernels/gpu/nms_kernel.cu index 490753f1313655..dcc6d6e2b45f00 100644 --- a/paddle/phi/kernels/gpu/nms_kernel.cu +++ b/paddle/phi/kernels/gpu/nms_kernel.cu @@ -59,7 +59,6 @@ void NMSKernel(const Context& dev_ctx, const DenseTensor& boxes, float threshold, DenseTensor* output) { - auto* output_data = dev_ctx.template Alloc(output); const int64_t num_boxes = boxes.dims()[0]; const auto blocks_per_line = CeilDivide(num_boxes, threadsPerBlock); dim3 block(threadsPerBlock); @@ -93,11 +92,13 @@ void NMSKernel(const Context& dev_ctx, } } } + output->Resize(phi::make_ddim({last_box_num})); + auto* output_data = dev_ctx.template Alloc(output); paddle::memory::Copy(dev_ctx.GetPlace(), output_data, phi::CPUPlace(), output_host, - sizeof(int64_t) * num_boxes, + sizeof(int64_t) * last_box_num, dev_ctx.stream()); } } // namespace phi diff --git a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu index e9f820a318482c..fb7f1a2325790c 100644 --- a/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_grad_kernel.cu @@ -509,4 +509,5 @@ PD_REGISTER_KERNEL(pad3d_grad, phi::Pad3dGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/pad3d_kernel.cu b/paddle/phi/kernels/gpu/pad3d_kernel.cu index d1b1d70667673e..fa85c650bc8542 100644 --- a/paddle/phi/kernels/gpu/pad3d_kernel.cu +++ b/paddle/phi/kernels/gpu/pad3d_kernel.cu @@ -583,6 +583,7 @@ PD_REGISTER_KERNEL(pad3d, ALL_LAYOUT, phi::Pad3dKernel, phi::dtype::float16, + phi::dtype::bfloat16, float, double, int, diff --git a/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu b/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu index 1414fb9df0b41c..5c88bbbf425325 100644 --- a/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/pixel_shuffle_grad_kernel.cu @@ -23,4 +23,6 @@ PD_REGISTER_KERNEL(pixel_shuffle_grad, ALL_LAYOUT, phi::PixelShuffleGradKernel, float, - double) {} + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu b/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu index e43d6f961236af..09eb0485a297fa 100644 --- a/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu +++ b/paddle/phi/kernels/gpu/pixel_shuffle_kernel.cu @@ -18,5 +18,11 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/pixel_shuffle_kernel_impl.h" -PD_REGISTER_KERNEL( - pixel_shuffle, GPU, ALL_LAYOUT, phi::PixelShuffleKernel, float, double) {} 
+PD_REGISTER_KERNEL(pixel_shuffle, + GPU, + ALL_LAYOUT, + phi::PixelShuffleKernel, + float, + double, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu index b4fde608b1e788..648c0fa627b253 100644 --- a/paddle/phi/kernels/gpu/put_along_axis_kernel.cu +++ b/paddle/phi/kernels/gpu/put_along_axis_kernel.cu @@ -68,7 +68,7 @@ void PutAlongAxisKernel(const Context& dev_ctx, PADDLE_THROW(errors::InvalidArgument( "can not support reduce: '%s' for scatter kernel, only " "support reduce op: 'add', 'assign', 'mul' and 'multiply', the " - "defalut reduce op is 'assign' ", + "default reduce op is 'assign' ", reduce)); return; } diff --git a/paddle/phi/kernels/gpu/selu_grad_kernel.cu b/paddle/phi/kernels/gpu/selu_grad_kernel.cu index 0ed299413c1726..c715831ffc7ffc 100644 --- a/paddle/phi/kernels/gpu/selu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/selu_grad_kernel.cu @@ -18,5 +18,10 @@ #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/selu_grad_kernel_impl.h" -PD_REGISTER_KERNEL( - selu_grad, GPU, ALL_LAYOUT, phi::SeluGradKernel, float, double) {} +PD_REGISTER_KERNEL(selu_grad, + GPU, + ALL_LAYOUT, + phi::SeluGradKernel, + float, + double, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu index 7eed96699e7208..77e140cab14ce6 100644 --- a/paddle/phi/kernels/gpu/set_value_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/set_value_grad_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/set_value_grad_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/set_value_grad_kernel_impl.h" @@ -26,4 +27,7 @@ PD_REGISTER_KERNEL(set_value_grad, double, int, int64_t, - bool) {} + bool, + phi::dtype::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/set_value_kernel.cu b/paddle/phi/kernels/gpu/set_value_kernel.cu index f788da010b6827..1a268c2f6b089b 100644 --- a/paddle/phi/kernels/gpu/set_value_kernel.cu +++ b/paddle/phi/kernels/gpu/set_value_kernel.cu @@ -15,6 +15,7 @@ #include "paddle/phi/kernels/set_value_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/complex.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/set_value_kernel_impl.h" @@ -26,7 +27,10 @@ PD_REGISTER_KERNEL(set_value, double, int, int64_t, - bool) {} + bool, + paddle::platform::float16, + phi::dtype::complex, + phi::dtype::complex) {} PD_REGISTER_KERNEL(set_value_with_tensor, GPU, ALL_LAYOUT, @@ -35,4 +39,7 @@ PD_REGISTER_KERNEL(set_value_with_tensor, double, int, int64_t, - bool) {} + bool, + paddle::platform::float16, + phi::dtype::complex, + phi::dtype::complex) {} diff --git a/paddle/phi/kernels/gpu/shard_index_kernel.cu b/paddle/phi/kernels/gpu/shard_index_kernel.cu index d2497f56a0c76a..96fd3911c0d45c 100644 --- a/paddle/phi/kernels/gpu/shard_index_kernel.cu +++ b/paddle/phi/kernels/gpu/shard_index_kernel.cu @@ -33,7 +33,15 @@ __global__ void ShardIndexInner(const T* in_data, int shard_size = (index_num + nshards - 1) / nshards; int idx = blockIdx.x * blockDim.x + threadIdx.x; if (idx < numel) { - assert(in_data[idx] >= 0 && in_data[idx] < index_num); + PADDLE_ENFORCE(in_data[idx] >= 0, + "The input_index for Op(shard_index) must be " + "greater or equal to 0, but the value given is 
%d.", + in_data[idx]); + PADDLE_ENFORCE(in_data[idx] < index_num, + "The input_index for Op(shard_index) must be less " + "than index_num (%d), but the value given is %d.", + index_num, + in_data[idx]); if (in_data[idx] / shard_size == shard_id) { out_data[idx] = in_data[idx] % shard_size; } else { diff --git a/paddle/phi/kernels/gpu/tile_grad_kernel.cu b/paddle/phi/kernels/gpu/tile_grad_kernel.cu index c092609e623d3f..d1e356df401a88 100644 --- a/paddle/phi/kernels/gpu/tile_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/tile_grad_kernel.cu @@ -27,4 +27,5 @@ PD_REGISTER_KERNEL(tile_grad, double, int, int64_t, - phi::dtype::float16) {} + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu index 23232970e191e6..458239814b65e1 100644 --- a/paddle/phi/kernels/gpu/uniform_random_kernel.cu +++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu @@ -74,8 +74,12 @@ void UniformRandomRawKernel(const Context& dev_ctx, funcs::distribution_and_transform(dev_ctx, out, dist, trans); } else { // Use OP seed - auto func = UniformGenerator( - min.to(), max.to(), seed, diag_num, diag_step, diag_val); + auto func = UniformGenerator(static_cast(min.to()), + static_cast(max.to()), + seed, + diag_num, + diag_step, + static_cast(diag_val)); IndexKernel>(dev_ctx, out, func); } } @@ -87,4 +91,5 @@ PD_REGISTER_KERNEL(uniform_random_raw, ALL_LAYOUT, phi::UniformRandomRawKernel, float, - double) {} + double, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu index 224651326d7626..31227e59433ea8 100644 --- a/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu +++ b/paddle/phi/kernels/gpu/viterbi_decode_kernel.cu @@ -92,7 +92,7 @@ struct BinaryOperation { std::vector outs{output}; paddle::operators:: LaunchElementwiseCudaKernel( - dev_ctx, ins, &outs, -1, BinaryFunctor()); + dev_ctx, ins, &outs, 0, BinaryFunctor()); } }; diff --git a/paddle/phi/kernels/gpu/where_grad_kernel.cu b/paddle/phi/kernels/gpu/where_grad_kernel.cu index 709dddcb82c7e7..4c411bfb9cd5a3 100644 --- a/paddle/phi/kernels/gpu/where_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/where_grad_kernel.cu @@ -25,10 +25,10 @@ __global__ void WhereGradCUDAKernel( int idx = blockDim.x * blockIdx.x + threadIdx.x; for (; idx < N; idx += blockDim.x * gridDim.x) { if (dx != nullptr) { - dx[idx] = cond[idx] ? dout[idx] : 0.; + dx[idx] = cond[idx] ? dout[idx] : static_cast(0.); } if (dy != nullptr) { - dy[idx] = cond[idx] ? 0. : dout[idx]; + dy[idx] = cond[idx] ? static_cast(0.) 
: dout[idx]; } } } @@ -61,6 +61,8 @@ PD_REGISTER_KERNEL(where_grad, GPU, ALL_LAYOUT, phi::WhereGradKernel, + phi::dtype::float16, + phi::dtype::bfloat16, float, double, int, diff --git a/paddle/phi/kernels/gpu/where_kernel.cu b/paddle/phi/kernels/gpu/where_kernel.cu index 441be02b99efa2..09a974fbc23400 100644 --- a/paddle/phi/kernels/gpu/where_kernel.cu +++ b/paddle/phi/kernels/gpu/where_kernel.cu @@ -45,5 +45,13 @@ void WhereKernel(const Context& ctx, } // namespace phi -PD_REGISTER_KERNEL( - where, GPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {} +PD_REGISTER_KERNEL(where, + GPU, + ALL_LAYOUT, + phi::WhereKernel, + float, + double, + int, + int64_t, + phi::dtype::float16, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu index fb9580427e1f45..e61f58450b34f2 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_grad_kernel.cu @@ -336,7 +336,7 @@ void ConvCudnnGradGradKernel( #else using search1 = paddle::operators::SearchAlgorithm; - fwd_result1 = search1::Find(args1, exhaustive_search, false, ctx); + fwd_result1 = search1::Find(ctx, args1, exhaustive_search, false); workspace_size = search1::GetWorkspaceSize(args1, fwd_result1.algo); #endif } @@ -364,7 +364,7 @@ void ConvCudnnGradGradKernel( #else using search2 = paddle::operators::SearchAlgorithm; - fwd_result2 = search2::Find(args2, exhaustive_search, false, ctx); + fwd_result2 = search2::Find(ctx, args2, exhaustive_search, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, fwd_result2.algo)); #endif @@ -394,7 +394,7 @@ void ConvCudnnGradGradKernel( using search3 = paddle::operators::SearchAlgorithm; filter_result = - search3::Find(args3, exhaustive_search, deterministic, ctx); + search3::Find(ctx, args3, exhaustive_search, deterministic); workspace_size = std::max( workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); #endif @@ -424,7 +424,7 @@ void ConvCudnnGradGradKernel( using search4 = paddle::operators::SearchAlgorithm; data_result = - search4::Find(args4, exhaustive_search, deterministic, ctx); + search4::Find(ctx, args4, exhaustive_search, deterministic); workspace_size = std::max( workspace_size, search4::GetWorkspaceSize(args4, data_result.algo)); #endif diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu index bc7a8b4f378401..2d61ec6e62c9ca 100644 --- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu @@ -373,7 +373,7 @@ void ConvCudnnGradKernel(const Context& ctx, #else using search1 = paddle::operators::SearchAlgorithm; - bwd_result = search1::Find(args1, exhaustive_search, deterministic, ctx); + bwd_result = search1::Find(ctx, args1, exhaustive_search, deterministic); workspace_size_d = std::max(workspace_size_d, bwd_result.workspace_size); #endif } @@ -402,7 +402,7 @@ void ConvCudnnGradKernel(const Context& ctx, using search2 = paddle::operators::SearchAlgorithm; filter_result = - search2::Find(args2, exhaustive_search, deterministic, ctx); + search2::Find(ctx, args2, exhaustive_search, deterministic); VLOG(3) << "filter algo: " << filter_result.algo << ", time " << filter_result.time; workspace_size_w = std::max(workspace_size_w, filter_result.workspace_size); diff --git a/paddle/phi/kernels/gpudnn/conv_kernel.cu b/paddle/phi/kernels/gpudnn/conv_kernel.cu index aa591a34a4399c..7a6e8d8148fa15 100644 --- 
a/paddle/phi/kernels/gpudnn/conv_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_kernel.cu @@ -25,7 +25,6 @@ #endif #include "paddle/fluid/platform/cudnn_workspace_helper.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/profiler.h" #include "paddle/phi/common/bfloat16.h" #include "paddle/phi/common/float16.h" @@ -56,8 +55,7 @@ void ConvCudnnKernel(const Context& ctx, bool exhaustive_search = FLAGS_cudnn_exhaustive_search || exhaustive_search_t; bool deterministic = FLAGS_cudnn_deterministic; - auto exhaustive_deterministic = exhaustive_search && deterministic; - PADDLE_ENFORCE_EQ(exhaustive_deterministic, + PADDLE_ENFORCE_EQ(exhaustive_search && deterministic, false, phi::errors::InvalidArgument( "Cann't set exhaustive_search True and " @@ -315,7 +313,7 @@ void ConvCudnnKernel(const Context& ctx, paddle::operators::SearchResult fwd_result; using search = paddle::operators::SearchAlgorithm; - fwd_result = search::Find(args, exhaustive_search, deterministic, ctx); + fwd_result = search::Find(ctx, args, exhaustive_search, deterministic); workspace_size = fwd_result.workspace_size; #endif diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu index 3acb1604f4a610..d05bd58e33080a 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_grad_kernel.cu @@ -230,7 +230,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, #else using search1 = paddle::operators::SearchAlgorithm; - fwd_result = search1::Find(args1, false, deterministic, ctx); + fwd_result = search1::Find(ctx, args1, false, deterministic, false); workspace_size = std::max( workspace_size, search1::GetWorkspaceSize(args1, fwd_result.algo)); #endif @@ -257,7 +257,7 @@ void ConvTransposeGradRawGPUDNNKernel(const Context& ctx, #else using search2 = paddle::operators::SearchAlgorithm; - filter_result = search2::Find(args2, false, deterministic, ctx); + filter_result = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, filter_result.algo)); #endif @@ -710,7 +710,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( #else using search1 = paddle::operators::SearchAlgorithm; - bwd_result1 = search1::Find(args1, false, deterministic, ctx); + bwd_result1 = search1::Find(ctx, args1, false, deterministic, false); workspace_size = search1::GetWorkspaceSize(args1, bwd_result1.algo); #endif @@ -734,7 +734,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( #else using search2 = paddle::operators::SearchAlgorithm; - bwd_result2 = search2::Find(args2, false, deterministic, ctx); + bwd_result2 = search2::Find(ctx, args2, false, deterministic, false); workspace_size = std::max( workspace_size, search2::GetWorkspaceSize(args2, bwd_result2.algo)); #endif @@ -761,7 +761,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( #else using search3 = paddle::operators::SearchAlgorithm; - filter_result = search3::Find(args3, false, deterministic, ctx); + filter_result = search3::Find(ctx, args3, false, deterministic, false); workspace_size = std::max( workspace_size, search3::GetWorkspaceSize(args3, filter_result.algo)); #endif @@ -789,7 +789,7 @@ void Conv2dTransposeDoubleGradGPUDNNKernel( #else using search4 = paddle::operators::SearchAlgorithm; - fwd_result = search4::Find(args4, false, deterministic, ctx); + fwd_result = search4::Find(ctx, args4, false, deterministic, false); workspace_size = std::max( workspace_size, 
search4::GetWorkspaceSize(args4, fwd_result.algo)); #endif diff --git a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu index 6fc1e2eff13520..84332f0ccb892a 100644 --- a/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu +++ b/paddle/phi/kernels/gpudnn/conv_transpose_kernel.cu @@ -230,7 +230,7 @@ void ConvTransposeRawGPUDNNKernel(const Context& ctx, paddle::operators::SearchResult bwd_result; using search = paddle::operators::SearchAlgorithm; - bwd_result = search::Find(args, false, deterministic, ctx); + bwd_result = search::Find(ctx, args, false, deterministic, false); workspace_size = std::max(workspace_size, search::GetWorkspaceSize(args, bwd_result.algo)); #endif diff --git a/paddle/phi/kernels/gumbel_softmax_kernel.h b/paddle/phi/kernels/gumbel_softmax_kernel.h index 46edb9750dd348..4ba1e56142d9bd 100644 --- a/paddle/phi/kernels/gumbel_softmax_kernel.h +++ b/paddle/phi/kernels/gumbel_softmax_kernel.h @@ -25,4 +25,12 @@ void GumbelSoftmaxKernel(const Context& dev_ctx, int axis, DenseTensor* out); +template +void GumbelSoftmaxInferKernel(const Context& dev_ctx, + const DenseTensor& x, + float temperature, + bool hard, + int axis, + DenseTensor* out); + } // namespace phi diff --git a/paddle/phi/kernels/impl/add_n_kernel_impl.h b/paddle/phi/kernels/impl/add_n_kernel_impl.h new file mode 100644 index 00000000000000..654ef5efbd9455 --- /dev/null +++ b/paddle/phi/kernels/impl/add_n_kernel_impl.h @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/kernels/add_n_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace phi { + +template +void AddNArrayKernel(const Context& dev_ctx, + const std::vector& x, + TensorArray* out) { + for (auto& ele : *out) { + dev_ctx.template Alloc(&ele); + } + bool in_place = true; + if (x.size() > 0 && x[0]->size() == out->size()) { + for (size_t i = 0; i < out->size(); i++) { + if (x[0]->at(i).IsInitialized() && + out->at(i).data() != x[0]->at(i).data()) { + in_place = false; + break; + } + } + } else { + in_place = false; + } + for (size_t i = in_place ? 
1 : 0; i < x.size(); ++i) { + auto* in_array = x.at(i); + + for (size_t j = 0; j < in_array->size(); ++j) { + if (in_array->at(j).IsInitialized() && (in_array->at(j).numel() != 0)) { + if (j >= out->size()) { + out->resize(j + 1); + } + if (!out->at(j).IsInitialized() || (out->at(j).numel() == 0)) { + Copy(dev_ctx, + in_array->at(j), + in_array->at(j).place(), + false, + &out->at(j)); + out->at(j).set_lod(in_array->at(j).lod()); + } else { + PADDLE_ENFORCE_EQ( + out->at(j).lod(), + in_array->at(j).lod(), + phi::errors::InvalidArgument( + "The lod message between inputs[%d] and" + " outputs[%d] must be same, but now is not same.", + j, + j)); + auto in = EigenVector::Flatten(in_array->at(j)); + auto result = EigenVector::Flatten(out->at(j)); + result.device(*dev_ctx.eigen_device()) = result + in; + } + } + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/impl/einsum_impl.h b/paddle/phi/kernels/impl/einsum_impl.h index 80529c8b669aac..dafb967ae8ed5c 100644 --- a/paddle/phi/kernels/impl/einsum_impl.h +++ b/paddle/phi/kernels/impl/einsum_impl.h @@ -241,7 +241,7 @@ inline static void InferLabelShape(const std::vector& op_labels, } else if (labelshape->is_default(c) || (*labelshape)[c] == -1) { (*labelshape)[c] = op_dim[dim_ptr]; dim_ptr++; - } else { + } else if (op_dim[dim_ptr] != -1) { PADDLE_ENFORCE_EQ( (*labelshape)[c], op_dim[dim_ptr], diff --git a/paddle/phi/kernels/impl/fold_grad_kernel_impl.h b/paddle/phi/kernels/impl/fold_grad_kernel_impl.h index b9320eab85046f..5b5cf5280285db 100644 --- a/paddle/phi/kernels/impl/fold_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/fold_grad_kernel_impl.h @@ -54,11 +54,8 @@ void FoldGradKernel(const Context& ctx, DDim out_shape = make_ddim({n_output_plane, output_sizes[0], output_sizes[1]}); - DDim input_matrix_shape = make_ddim({x_dims[0], - kernel_sizes[0], - kernel_sizes[1], - output_height, - output_width}); + DDim input_matrix_shape = make_ddim( + {1, kernel_sizes[0], kernel_sizes[1], output_height, output_width}); paddle::operators::math:: Im2ColFunctor diff --git a/paddle/phi/kernels/impl/fold_kernel_impl.h b/paddle/phi/kernels/impl/fold_kernel_impl.h index 415beca7bd9283..72a86fb8604149 100644 --- a/paddle/phi/kernels/impl/fold_kernel_impl.h +++ b/paddle/phi/kernels/impl/fold_kernel_impl.h @@ -56,11 +56,8 @@ void FoldKernel(const Context& ctx, DDim output_shape = make_ddim({n_output_plane, output_sizes[0], output_sizes[1]}); - DDim input_matrix_shape = make_ddim({x_dims[0], - kernel_sizes[0], - kernel_sizes[1], - output_height, - output_width}); + DDim input_matrix_shape = make_ddim( + {1, kernel_sizes[0], kernel_sizes[1], output_height, output_width}); phi::funcs::SetConstant set_zero; set_zero(ctx, out, static_cast(0)); @@ -68,6 +65,7 @@ void FoldKernel(const Context& ctx, for (int i = 0; i < batch_size; i++) { DenseTensor out_batch = out->Slice(i, i + 1).Resize(output_shape); // im size=3 + DenseTensor in_batch = x.Slice(i, i + 1).Resize(input_matrix_shape); // col size=5 col2im(ctx, in_batch, dilations, strides, paddings, &out_batch); diff --git a/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h b/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h index 655634e319924d..ed800e70f5a36d 100644 --- a/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h +++ b/paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h @@ -43,12 +43,13 @@ template struct OneHotGenerator; template -void GumbelSoftmaxKernel(const Context& ctx, - const DenseTensor& x, - float temperature, - bool hard, - int axis, - DenseTensor* out) { +void 
GumbelSoftmaxKernelHelper(const Context& ctx, + const DenseTensor& x, + float temperature, + bool hard, + int axis, + DenseTensor* out, + bool is_test) { const int rank = x.dims().size(); axis = funcs::CanonicalAxis(axis, rank); int axis_dim = x.dims()[axis]; @@ -80,18 +81,39 @@ void GumbelSoftmaxKernel(const Context& ctx, size_to_axis, size_from_axis, temperature); - -#ifdef PADDLE_ON_INFERENCE - paddle::operators::math::SoftmaxFunctor()( - ctx, axis_dim, &x_noise_2d, &out_2d); -#else - paddle::operators::math::SoftmaxFunctor()( - ctx, axis_dim, &x_noise_2d, &out_2d); -#endif + if (is_test) { + paddle::operators::math::SoftmaxFunctor()( + ctx, axis_dim, &x_noise_2d, &out_2d); + } else { + paddle::operators::math::SoftmaxFunctor()( + ctx, axis_dim, &x_noise_2d, &out_2d); + } if (hard) { OneHotGenerator::Transform(ctx, x, out, axis); } } +template +void GumbelSoftmaxKernel(const Context& ctx, + const DenseTensor& x, + float temperature, + bool hard, + int axis, + DenseTensor* out) { + GumbelSoftmaxKernelHelper( + ctx, x, temperature, hard, axis, out, false); +} + +template +void GumbelSoftmaxInferKernel(const Context& ctx, + const DenseTensor& x, + float temperature, + bool hard, + int axis, + DenseTensor* out) { + GumbelSoftmaxKernelHelper( + ctx, x, temperature, hard, axis, out, true); +} + } // namespace phi diff --git a/paddle/phi/kernels/impl/selu_kernel_impl.h b/paddle/phi/kernels/impl/selu_kernel_impl.h index 288f7bb9b793e6..0725b141252bcb 100644 --- a/paddle/phi/kernels/impl/selu_kernel_impl.h +++ b/paddle/phi/kernels/impl/selu_kernel_impl.h @@ -57,14 +57,17 @@ struct SeluGradFunctor { dx_data_ptr_(dx_data_ptr) {} HOSTDEVICE void operator()(size_t idx) const { - T y_ele = y_data_ptr_[idx]; - T dy_ele = dy_data_ptr_[idx]; + using MT = + typename std::conditional<(sizeof(T) > sizeof(float)), T, float>::type; - float tmp = scale_; + auto y_ele = static_cast(y_data_ptr_[idx]); + auto dy_ele = static_cast(dy_data_ptr_[idx]); + + auto tmp = static_cast(scale_); if (y_ele <= 0) { - tmp = y_ele + la_; + tmp = y_ele + static_cast(la_); } - dx_data_ptr_[idx] = dy_ele * tmp; + dx_data_ptr_[idx] = static_cast(dy_ele * tmp); } const T* y_data_ptr_; const T* dy_data_ptr_; diff --git a/paddle/phi/kernels/kps/reduce_sum_kernel.cu b/paddle/phi/kernels/kps/reduce_sum_kernel.cu index f3d3246854f336..c5a30a6a634a8e 100644 --- a/paddle/phi/kernels/kps/reduce_sum_kernel.cu +++ b/paddle/phi/kernels/kps/reduce_sum_kernel.cu @@ -13,11 +13,64 @@ // limitations under the License. #include "paddle/phi/kernels/reduce_sum_kernel.h" +#include +#include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/gpu/reduce.h" +#ifndef PADDLE_WITH_XPU_KP +#include "paddle/phi/kernels/funcs/eigen/common.h" +#endif namespace phi { +#ifndef PADDLE_WITH_XPU_KP +template +void ReduceSumEigen(const KPDevice& dev_ctx, + const DenseTensor& x, + bool reduce_all, + const std::vector& dims, + DataType out_dtype, + DenseTensor* out, + std::vector* reduce_dims) { + // Resize Input Tensor + auto new_x = x; + int added_dims = EigenDimSize - x.dims().size(); + std::array new_x_dim; + new_x_dim.fill(1); + for (int i = 0; i < x.dims().size(); i++) { + new_x_dim[i + added_dims] = x.dims().at(i); + } + new_x.Resize(phi::DDim(new_x_dim.data(), new_x_dim.size())); + auto eigen_x_tensor = EigenTensor::From(new_x); + + // Create Out Tensor + dev_ctx.Alloc(out); + auto origin_out_dims = out->dims(); + constexpr int kReduceOutRank = ReduceAll ? 
1 : EigenDimSize - ReducedDimSize; + // Resize Out Tensor + std::array new_out_dim; + new_out_dim.fill(1); + for (int i = 0; i < out->dims().size(); i++) { + new_out_dim[i + added_dims] = out->dims().at(i); + } + out->Resize(phi::DDim(new_out_dim.data(), new_out_dim.size())); + + auto eigen_out_tensor = EigenTensor::From(*out); + for (int i = 0; i < ReducedDimSize; i++) { + (*reduce_dims)[i] += added_dims; + } + auto eigen_reduce_dim = + EigenDim::From(phi::make_ddim(*reduce_dims)); + // Caculate + eigen_out_tensor.device(*dev_ctx.eigen_device()) = + eigen_x_tensor.sum(eigen_reduce_dim); + out->Resize(origin_out_dims); +} +#endif + template void SumRawKernel(const Context& dev_ctx, const DenseTensor& x, @@ -29,10 +82,65 @@ void SumRawKernel(const Context& dev_ctx, if (out_dtype == DataType::UNDEFINED && out->dtype() != x.dtype()) { out_dtype = out->dtype(); } - phi::Reduce( - dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out); -} + if (x.numel() > std::numeric_limits::max()) { +#ifndef PADDLE_WITH_XPU_KP + if (out_dtype != phi::DataType::UNDEFINED && out_dtype != x.dtype()) { + PADDLE_THROW(phi::errors::Fatal( + "If Input.numel() > INT32_MAX, reduce_sum kernel uses EigenTensor " + "sum for reduce_sum function. As a result, input dtype should be " + "the same as out dtype")); + } + + std::vector reduce_dims = phi::funcs::details::GetReduceDim( + dims.GetData(), x.dims().size(), reduce_all); + +#define CALL_EIGEN_REDUCE_SUM_KERNEL(reduce_rank) \ + case reduce_rank: { \ + if (reduce_all) { \ + ReduceSumEigen(dev_ctx, \ + x, \ + reduce_all, \ + dims.GetData(), \ + out_dtype, \ + out, \ + &reduce_dims); \ + } else { \ + ReduceSumEigen(dev_ctx, \ + x, \ + reduce_all, \ + dims.GetData(), \ + out_dtype, \ + out, \ + &reduce_dims); \ + } \ + break; \ + } + switch (reduce_dims.size()) { + CALL_EIGEN_REDUCE_SUM_KERNEL(1); + CALL_EIGEN_REDUCE_SUM_KERNEL(2); + CALL_EIGEN_REDUCE_SUM_KERNEL(3); + CALL_EIGEN_REDUCE_SUM_KERNEL(4); + CALL_EIGEN_REDUCE_SUM_KERNEL(5); + default: + PADDLE_THROW(phi::errors::Fatal( + "If Input.numel() > INT32_MAX, reduce_sum kernel uses EigenTensor " + "sum for reduce_sum function. As a result, its dim should be <= " + "5.")); + break; + } +#undef CALL_EIGEN_REDUCE_SUM_KERNEL +#else + PADDLE_THROW(phi::errors::Fatal( + "If Input.numel() > INT32_MAX, reduce_sum kernel uses EigenTensor " + "sum for reduce_sum function. Such case is only supported on GPU " + "now.")); +#endif + } else { + phi::Reduce( + dev_ctx, x, reduce_all, dims.GetData(), keep_dim, out_dtype, out); + } +} } // namespace phi #ifdef PADDLE_WITH_XPU_KP diff --git a/paddle/phi/kernels/onednn/activation_grad_kernel.cc b/paddle/phi/kernels/onednn/activation_grad_kernel.cc index 4ad073cb00d62f..cc7f71ff3646d7 100644 --- a/paddle/phi/kernels/onednn/activation_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/activation_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/activation_grad_kernel.h" +#include "paddle/phi/kernels/gelu_grad_kernel.h" #include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" @@ -23,16 +24,6 @@ namespace phi { -#define DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPX(name, functor_class) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& x, \ - const DenseTensor& dout, \ - DenseTensor* dx) { \ - functor_class functor; \ - functor(dev_ctx, x, dout, 0, 0, dx); \ - } - #define DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX( \ name, functor_class, attr) \ template \ @@ -55,18 +46,6 @@ namespace phi { functor(dev_ctx, out, dout, 0, 0, dx); \ } -#define DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPOUT( \ - name, functor_class, attr) \ - template \ - void name##GradKernel(const Context& dev_ctx, \ - const DenseTensor& out, \ - const DenseTensor& dout, \ - float attr, \ - DenseTensor* dx) { \ - functor_class functor; \ - functor(dev_ctx, out, dout, attr, 0, dx); \ - } - template void eltwise_grad(const OneDNNContext& dev_ctx, const DenseTensor& x, @@ -158,12 +137,14 @@ using AbsOneDNNGradFunctor = OneDNNActivationGradFunc; template -using ReluOneDNNGradFunctor = - OneDNNActivationGradFunc; +using EluOneDNNGradUseOutFunctor = OneDNNActivationGradUseOutFunc< + T, + dnnl::algorithm::eltwise_elu_use_dst_for_bwd>; template -using SwishOneDNNGradFunctor = - OneDNNActivationGradFunc; +using ExpOneDNNGradUseOutFunctor = OneDNNActivationGradUseOutFunc< + T, + dnnl::algorithm::eltwise_exp_use_dst_for_bwd>; template using HardSwishOneDNNGradFunctor = @@ -174,14 +155,26 @@ using MishOneDNNGradFunctor = OneDNNActivationGradFunc; template -using SigmoidOneDNNGradUseOutFunctor = OneDNNActivationGradUseOutFunc< +using GeluTanhOneDNNGradFunctor = + OneDNNActivationGradFunc; + +template +using GeluErfOneDNNGradFunctor = + OneDNNActivationGradFunc; + +template +using ReluOneDNNGradFunctor = + OneDNNActivationGradFunc; + +template +using Relu6OneDNNGradUseOutFunctor = OneDNNActivationGradUseOutFunc< T, - dnnl::algorithm::eltwise_logistic_use_dst_for_bwd>; + dnnl::algorithm::eltwise_clip_v2_use_dst_for_bwd>; template -using TanhOneDNNGradUseOutFunctor = OneDNNActivationGradUseOutFunc< +using SigmoidOneDNNGradUseOutFunctor = OneDNNActivationGradUseOutFunc< T, - dnnl::algorithm::eltwise_tanh_use_dst_for_bwd>; + dnnl::algorithm::eltwise_logistic_use_dst_for_bwd>; template using SqrtOneDNNGradUseOutFunctor = OneDNNActivationGradUseOutFunc< @@ -189,22 +182,21 @@ using SqrtOneDNNGradUseOutFunctor = OneDNNActivationGradUseOutFunc< dnnl::algorithm::eltwise_sqrt_use_dst_for_bwd>; template -using EluOneDNNGradUseOutFunctor = OneDNNActivationGradUseOutFunc< - T, - dnnl::algorithm::eltwise_elu_use_dst_for_bwd>; +using SwishOneDNNGradFunctor = + OneDNNActivationGradFunc; template -using ExpOneDNNGradUseOutFunctor = OneDNNActivationGradUseOutFunc< +using TanhOneDNNGradUseOutFunctor = OneDNNActivationGradUseOutFunc< T, - dnnl::algorithm::eltwise_exp_use_dst_for_bwd>; + dnnl::algorithm::eltwise_tanh_use_dst_for_bwd>; -DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhOneDNNGradUseOutFunctor); -DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, SqrtOneDNNGradUseOutFunctor); -DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, - SigmoidOneDNNGradUseOutFunctor); -DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, ExpOneDNNGradUseOutFunctor); DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Abs, AbsOneDNNGradFunctor); +DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Exp, 
ExpOneDNNGradUseOutFunctor); DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluOneDNNGradFunctor); +DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Sigmoid, + SigmoidOneDNNGradUseOutFunctor); +DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Sqrt, SqrtOneDNNGradUseOutFunctor); +DEFINE_ONEDNN_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhOneDNNGradUseOutFunctor); DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, ReluOneDNNGradFunctor, @@ -215,6 +207,33 @@ DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Mish, DEFINE_ONEDNN_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(Swish, SwishOneDNNGradFunctor, beta); + +template +void EluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out, + const DenseTensor& dout, + float alpha, + DenseTensor* dx) { + EluOneDNNGradUseOutFunctor functor; + functor(dev_ctx, out, dout, alpha, 0, dx); +} + +template +void GeluGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + bool approximate, + DenseTensor* x_grad) { + if (approximate) { + GeluTanhOneDNNGradFunctor functor; + functor(dev_ctx, x, out_grad, 0, 0, x_grad); + } else { + GeluErfOneDNNGradFunctor functor; + functor(dev_ctx, x, out_grad, 0, 0, x_grad); + } +} + template void HardSwishGradKernel(const Context& dev_ctx, const DenseTensor& x, @@ -224,18 +243,17 @@ void HardSwishGradKernel(const Context& dev_ctx, float offset, DenseTensor* dx) { HardSwishOneDNNGradFunctor functor; - functor(dev_ctx, x, dout, threshold, 0, dx); + functor(dev_ctx, x, dout, 0, 0, dx); } template -void EluGradKernel(const Context& dev_ctx, - const DenseTensor& x, - const DenseTensor& out, - const DenseTensor& dout, - float alpha, - DenseTensor* dx) { - EluOneDNNGradUseOutFunctor functor; - functor(dev_ctx, out, dout, alpha, 0, dx); +void Relu6GradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& dout, + float threshold, + DenseTensor* dx) { + Relu6OneDNNGradUseOutFunctor functor; + functor(dev_ctx, out, dout, 0, threshold, dx); } } // namespace phi @@ -254,9 +272,11 @@ PD_REGISTER_KERNEL(relu_grad, PD_REGISTER_ACTIVATION_GRAD_KERNEL(abs_grad, AbsGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(elu_grad, EluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(exp_grad, ExpGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(gelu_grad, GeluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_swish_grad, HardSwishGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(leaky_relu_grad, LeakyReluGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(mish_grad, MishGradKernel) +PD_REGISTER_ACTIVATION_GRAD_KERNEL(relu6_grad, Relu6GradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_grad, SigmoidGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sqrt_grad, SqrtGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(swish_grad, SwishGradKernel) diff --git a/paddle/phi/kernels/onednn/activation_kernel.cc b/paddle/phi/kernels/onednn/activation_kernel.cc index 40f2d8fd4c49e6..36ba1be724ccf1 100644 --- a/paddle/phi/kernels/onednn/activation_kernel.cc +++ b/paddle/phi/kernels/onednn/activation_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/activation_kernel.h" +#include "paddle/phi/kernels/gelu_grad_kernel.h" #include "paddle/phi/backends/onednn/onednn_context.h" #include "paddle/phi/backends/onednn/onednn_reuse.h" @@ -91,16 +92,18 @@ template using AbsOneDNNFunctor = OneDNNActivationFunc; template -using ReluOneDNNFunctor = - OneDNNActivationFunc; +using EluOneDNNFunctor = OneDNNActivationFunc; template -using Relu6OneDNNFunctor = - OneDNNActivationFunc; +using ExpOneDNNFunctor = OneDNNActivationFunc; template -using SwishOneDNNFunctor = - OneDNNActivationFunc; +using GeluTanhOneDNNFunctor = + OneDNNActivationFunc; + +template +using GeluErfOneDNNFunctor = + OneDNNActivationFunc; template using HardSwishOneDNNFunctor = @@ -111,41 +114,46 @@ using MishOneDNNFunctor = OneDNNActivationFunc; template -using SigmoidOneDNNFunctor = - OneDNNActivationFunc; +using ReluOneDNNFunctor = + OneDNNActivationFunc; template -using TanhOneDNNFunctor = - OneDNNActivationFunc; +using Relu6OneDNNFunctor = + OneDNNActivationFunc; template -using SqrtOneDNNFunctor = - OneDNNActivationFunc; +using RoundOneDNNFunctor = + OneDNNActivationFunc; template -using EluOneDNNFunctor = OneDNNActivationFunc; +using SigmoidOneDNNFunctor = + OneDNNActivationFunc; template -using ExpOneDNNFunctor = OneDNNActivationFunc; +using SqrtOneDNNFunctor = + OneDNNActivationFunc; template -using RoundOneDNNFunctor = - OneDNNActivationFunc; +using SwishOneDNNFunctor = + OneDNNActivationFunc; + +template +using TanhOneDNNFunctor = + OneDNNActivationFunc; DEFINE_ONEDNN_ACTIVATION_KERNEL(Abs, AbsOneDNNFunctor) -DEFINE_ONEDNN_ACTIVATION_KERNEL(Relu, ReluOneDNNFunctor) -DEFINE_ONEDNN_ACTIVATION_KERNEL(Tanh, TanhOneDNNFunctor) DEFINE_ONEDNN_ACTIVATION_KERNEL(Exp, ExpOneDNNFunctor) -DEFINE_ONEDNN_ACTIVATION_KERNEL(Sqrt, SqrtOneDNNFunctor) +DEFINE_ONEDNN_ACTIVATION_KERNEL(Relu, ReluOneDNNFunctor) DEFINE_ONEDNN_ACTIVATION_KERNEL(Sigmoid, SigmoidOneDNNFunctor) +DEFINE_ONEDNN_ACTIVATION_KERNEL(Sqrt, SqrtOneDNNFunctor) +DEFINE_ONEDNN_ACTIVATION_KERNEL(Tanh, TanhOneDNNFunctor) // round eltwise primitive doesn't support BF16, nor does it support grad DEFINE_ONEDNN_ACTIVATION_KERNEL(Round, RoundOneDNNFunctor) +DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Elu, EluOneDNNFunctor, alpha) DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, ReluOneDNNFunctor, alpha) DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Mish, MishOneDNNFunctor, threshold) -DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Elu, EluOneDNNFunctor, alpha) -DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Relu6, Relu6OneDNNFunctor, threshold) DEFINE_ONEDNN_ACT_KERNEL_WITH_ONE_ATTRS(Swish, SwishOneDNNFunctor, beta) template @@ -159,6 +167,29 @@ void HardSwishKernel(const Context& dev_ctx, functor(dev_ctx, x, threshold, 0, out); } +template +void GeluKernel(const Context& dev_ctx, + const DenseTensor& x, + bool approximate, + DenseTensor* out) { + if (approximate) { + GeluTanhOneDNNFunctor functor; + functor(dev_ctx, x, 0, 0, out); + } else { + GeluErfOneDNNFunctor functor; + functor(dev_ctx, x, 0, 0, out); + } +} + +template +void Relu6Kernel(const Context& dev_ctx, + const DenseTensor& x, + float threshold, + DenseTensor* out) { + Relu6OneDNNFunctor functor; + functor(dev_ctx, x, 0, threshold, out); +} + } // namespace phi PD_REGISTER_KERNEL(round, OneDNN, ALL_LAYOUT, phi::RoundKernel, float) {} @@ -170,6 +201,7 @@ PD_REGISTER_KERNEL(round, OneDNN, ALL_LAYOUT, phi::RoundKernel, float) {} PD_REGISTER_ACTIVATION_KERNEL(abs, AbsKernel) PD_REGISTER_ACTIVATION_KERNEL(elu, EluKernel) PD_REGISTER_ACTIVATION_KERNEL(exp, ExpKernel) 
+PD_REGISTER_ACTIVATION_KERNEL(gelu, GeluKernel) PD_REGISTER_ACTIVATION_KERNEL(hard_swish, HardSwishKernel) PD_REGISTER_ACTIVATION_KERNEL(leaky_relu, LeakyReluKernel) PD_REGISTER_ACTIVATION_KERNEL(mish, MishKernel) diff --git a/paddle/phi/kernels/onednn/cast_kernel.cc b/paddle/phi/kernels/onednn/cast_kernel.cc new file mode 100644 index 00000000000000..166db43db665dc --- /dev/null +++ b/paddle/phi/kernels/onednn/cast_kernel.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/cast_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void CastKernel(const Context& dev_ctx, + const DenseTensor& x, + DataType out_dtype, + DenseTensor* out) { + DataType in_dtype = x.dtype(); + + dnnl::memory::data_type in_dnnl_dtype = funcs::ToOneDNNDataType(in_dtype); + dnnl::memory::data_type out_dnnl_dtype = funcs::ToOneDNNDataType(out_dtype); + + auto x_tz = phi::vectorize(x.dims()); + + funcs::ReorderOneDNNHandler reorder_handler(x_tz, + in_dtype, + in_dnnl_dtype, + out_dtype, + out_dnnl_dtype, + dev_ctx.GetEngine()); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x.mem_desc(), funcs::to_void_cast(x.data())); + auto reorder_dst_memory_p = + reorder_handler.AcquireDstMemory(out, x.mem_desc(), dev_ctx.GetPlace()); + auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p, + reorder_src_memory_p); + + auto& astream = OneDNNContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + out->set_layout(DataLayout::ONEDNN); + out->set_mem_desc(reorder_dst_memory_p->get_desc()); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + cast, OneDNN, ALL_LAYOUT, phi::CastKernel, float, phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/clip_grad_kernel.cc b/paddle/phi/kernels/onednn/clip_grad_kernel.cc new file mode 100644 index 00000000000000..aded64616b1245 --- /dev/null +++ b/paddle/phi/kernels/onednn/clip_grad_kernel.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/clip_grad_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void ClipGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const Scalar& min, + const Scalar& max, + DenseTensor* x_grad) { + const auto& onednn_engine = dev_ctx.GetEngine(); + + funcs::ClipOneDNNHandler handler( + min, max, onednn_engine, dev_ctx.GetPlace(), &x, &out_grad); + + auto src_memory_p = handler.AcquireBackwardSrcMemory(&x); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(&out_grad); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(x_grad); + auto activation_backward_p = handler.AcquireBackwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + activation_backward_p->execute(astream, + {{DNNL_ARG_SRC, *src_memory_p}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, + {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); + astream.wait(); + + x_grad->set_mem_desc(diff_dst_memory_p->get_desc()); +} +} // namespace phi + +PD_REGISTER_KERNEL(clip_grad, + OneDNN, + ALL_LAYOUT, + phi::ClipGradKernel, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/clip_kernel.cc b/paddle/phi/kernels/onednn/clip_kernel.cc new file mode 100644 index 00000000000000..7538dd9708a936 --- /dev/null +++ b/paddle/phi/kernels/onednn/clip_kernel.cc @@ -0,0 +1,46 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/clip_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void ClipKernel(const Context& dev_ctx, + const DenseTensor& x, + const Scalar& min, + const Scalar& max, + DenseTensor* out) { + const auto& onednn_engine = dev_ctx.GetEngine(); + + funcs::ClipOneDNNHandler handler( + min, max, onednn_engine, dev_ctx.GetPlace(), &x); + + auto src_memory_p = handler.AcquireSrcMemory(&x); + auto dst_memory_p = handler.AcquireDstMemory(out); + auto activation_p = handler.AcquireForwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + activation_p->execute( + astream, {{DNNL_ARG_FROM, *src_memory_p}, {DNNL_ARG_TO, *dst_memory_p}}); + astream.wait(); + + out->set_mem_desc(dst_memory_p->get_desc()); +} +} // namespace phi + +PD_REGISTER_KERNEL( + clip, OneDNN, ALL_LAYOUT, phi::ClipKernel, float, phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/concat_grad_kernel.cc b/paddle/phi/kernels/onednn/concat_grad_kernel.cc new file mode 100644 index 00000000000000..be962a96acaf71 --- /dev/null +++ b/paddle/phi/kernels/onednn/concat_grad_kernel.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/concat_grad_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/concat_funcs.h" + +namespace phi { + +template +void ConcatGradKernel(const Context& dev_ctx, + const std::vector& x, + const DenseTensor& out_grad, + const Scalar& axis_scalar, + std::vector x_grad) { + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto& astream = OneDNNContext::tls().get_stream(); + + for (size_t i = 0; i < x_grad.size(); ++i) { + if (x_grad[i] != nullptr) { + x_grad[i]->set_lod(x[i]->lod()); + } + } + + int axis = axis_scalar.to(); + + auto out_grad_vec_dims = vectorize(out_grad.dims()); + + axis = funcs::ComputeAxis(axis, out_grad_vec_dims.size()); + + std::vector offset(out_grad_vec_dims.size(), 0); + + dnnl::memory::data_type out_grad_type = + funcs::ToOneDNNDataType(out_grad.dtype()); + funcs::ReorderOneDNNHandler reorder_handler( + out_grad_vec_dims, out_grad.dtype(), out_grad_type, onednn_engine); + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + out_grad.mem_desc(), funcs::to_void_cast(out_grad.data())); + + for (size_t i = 0; i < x_grad.size(); ++i) { + if (x_grad[i]->numel() != 0UL) { + auto x_grad_vec_dims = vectorize(x_grad[i]->dims()); + auto slice_mem_p = reorder_handler.AcquireSubmemory( + x_grad_vec_dims, offset, reorder_src_memory_p); + + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + x_grad[i], + x_grad_vec_dims, + funcs::GetPlainOneDNNFormat(x_grad_vec_dims.size()), + dev_ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); + + reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); + + offset[axis] += x_grad[i]->dims()[axis]; + + x_grad[i]->set_mem_desc(reorder_dst_memory_p->get_desc()); + } + } + astream.wait(); +} +} // namespace phi + +PD_REGISTER_KERNEL(concat_grad, + OneDNN, + ALL_LAYOUT, + phi::ConcatGradKernel, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/concat_kernel.cc b/paddle/phi/kernels/onednn/concat_kernel.cc new file mode 100644 index 00000000000000..7f92371f4b3b7c --- /dev/null +++ b/paddle/phi/kernels/onednn/concat_kernel.cc @@ -0,0 +1,164 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
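ConcatGradKernel in concat_grad_kernel.cc above slices out_grad back into the per-input gradients by taking a sub-memory view at a running offset along the concat axis and reordering that view into each x_grad. A small standalone sketch of the same slice-via-submemory-plus-reorder idea, with made-up 2-D shapes and buffers:

// Sketch only: copy rows [offset, offset + rows) of an (R x C) fp32 buffer
// into dst using a sub-memory view of the source and a reorder.
// dst must already hold rows * C floats.
#include <vector>
#include "dnnl.hpp"

void CopyRowSlice(const std::vector<float>& src, int64_t R, int64_t C,
                  int64_t offset, int64_t rows, std::vector<float>& dst) {
  dnnl::engine eng(dnnl::engine::kind::cpu, 0);
  dnnl::stream strm(eng);

  auto src_md = dnnl::memory::desc({R, C}, dnnl::memory::data_type::f32,
                                   dnnl::memory::format_tag::ab);
  // View covering only the requested rows, starting at the given offset.
  auto slice_md = src_md.submemory_desc({rows, C}, {offset, 0});
  auto dst_md = dnnl::memory::desc({rows, C}, dnnl::memory::data_type::f32,
                                   dnnl::memory::format_tag::ab);

  dnnl::memory slice_mem(slice_md, eng, const_cast<float*>(src.data()));
  dnnl::memory dst_mem(dst_md, eng, dst.data());

  dnnl::reorder(slice_mem, dst_mem).execute(strm, slice_mem, dst_mem);
  strm.wait();
}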
+ +#include "paddle/phi/kernels/concat_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/concat_funcs.h" + +namespace phi { +using memory = dnnl::memory; + +namespace funcs { + +template +class ConcatOneDNNHandler : public OneDNNHandlerNoCachingT { + public: + ConcatOneDNNHandler(Place cpu_place, + int concat_axis, + const dnnl::engine onednn_engine, + const std::vector& inputs, + DenseTensor* output) + : OneDNNHandlerNoCachingT(onednn_engine, cpu_place) { + const int rank = inputs[0]->dims().size(); + + PADDLE_ENFORCE_EQ( + concat_axis >= -rank && concat_axis < rank, + true, + errors::InvalidArgument( + "The axis is expected to be in range of [%d, %d), but got %d", + -rank, + rank, + concat_axis)); + + if (concat_axis < 0) { + concat_axis = concat_axis + rank; + } + + memory::data_type dt = ToOneDNNDataType(inputs[0]->dtype()); + std::vector srcs_md; + srcs_md.reserve(inputs.size()); + + // Create memory descriptors for each of inputs + for (size_t i = 0; i < inputs.size(); ++i) { + srcs_md.push_back(inputs[i]->mem_desc()); + } + + auto dst_dims = vectorize(output->dims()); + + memory::desc dst_md = memory::desc(dst_dims, dt, OneDNNMemoryFormat::any); + + this->AcquireForwardPrimitiveDescriptor(dst_md, concat_axis, srcs_md); + } + + // (jczaja) concat oneDNN prim is not having .desc attribute so + // we cannot use base AcquireForwardPrimitiveDescriptor + void AcquireForwardPrimitiveDescriptor( + const memory::desc& dst_md, + const int concat_axis, + const std::vector& srcs_md) { + this->fwd_pd_.reset(new dnnl::concat::primitive_desc( + dst_md, concat_axis, srcs_md, this->engine_)); + } + + std::shared_ptr AcquireSrcMemory(const DenseTensor& input, + int i) { + const T* input_data = input.data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src_desc(i), + to_void_cast(input_data)); + } +}; +} // namespace funcs + +static void EnforceLayouts(const std::vector inputs) { + for (auto* input : inputs) { + PADDLE_ENFORCE_EQ( + input->layout(), + DataLayout::ONEDNN, + errors::InvalidArgument("Wrong layout set for Input tensor")); + } +} + +// From a multi-input, gather only nonempty inputs +static const std::vector ReduceMultiInput( + const std::vector& inputs) { + std::vector reduced(inputs.size()); + auto end_it = std::copy_if( + inputs.begin(), inputs.end(), reduced.begin(), [](const DenseTensor* t) { + return t->numel() > 0; + }); + reduced.resize(std::distance(reduced.begin(), end_it)); + return reduced; +} + +template +void ConcatKernel(const Context& dev_ctx, + const std::vector& x, + const Scalar& axis, + DenseTensor* out) { + const auto& onednn_engine = dev_ctx.GetEngine(); + // If any of the multiple inputs of concat has an input size of 0, the + // actual size of the multi_input will change + auto multi_input = ReduceMultiInput(x); + EnforceLayouts(multi_input); + + auto out_dims_vec = vectorize(out->dims()); + if (std::any_of(out_dims_vec.begin(), out_dims_vec.end(), [](int64_t i) { + return i < 0; + })) { + std::vector x_dims; + x_dims.reserve(x.size()); + for (size_t i = 0; i < x.size(); ++i) { + x_dims.push_back(x[i]->dims()); + } + + DDim out_dims = + funcs::ComputeAndCheckShape(true, x_dims, axis.to()); + out->Resize(out_dims); + } + + funcs::ConcatOneDNNHandler handler( + dev_ctx.GetPlace(), axis.to(), onednn_engine, multi_input, out); + + std::vector> srcs; + srcs.reserve(multi_input.size()); + + auto dst_mem = handler.AcquireDstMemory(out); + auto concat_p = 
handler.AcquireForwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + std::unordered_map args; + for (size_t i = 0; i < multi_input.size(); ++i) { + srcs.push_back(handler.AcquireSrcMemory(*(multi_input[i]), i)); + args.insert({DNNL_ARG_MULTIPLE_SRC + i, *(srcs.at(i))}); + } + args.insert({DNNL_ARG_DST, *dst_mem}); + + concat_p->execute(astream, args); + astream.wait(); + + out->set_mem_desc(dst_mem->get_desc()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(concat, + OneDNN, + ONEDNN, + phi::ConcatKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} diff --git a/paddle/phi/kernels/onednn/expand_grad_kernel.cc b/paddle/phi/kernels/onednn/expand_grad_kernel.cc new file mode 100644 index 00000000000000..dd8afdd84677ad --- /dev/null +++ b/paddle/phi/kernels/onednn/expand_grad_kernel.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/expand_grad_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void ExpandGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& out_grad, + const IntArray& shape, + DenseTensor* in_grad) { + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto in_grad_vec_dims = vectorize(in_grad->dims()); + auto out_grad_vec_dims = vectorize(out_grad.dims()); + + if (in_grad_vec_dims.size() != out_grad_vec_dims.size()) { + in_grad_vec_dims.insert(in_grad_vec_dims.begin(), + out_grad_vec_dims.size() - in_grad_vec_dims.size(), + 1); + } + + auto& astream = OneDNNContext::tls().get_stream(); + if (out_grad_vec_dims == in_grad_vec_dims) { + dnnl::memory::data_type out_grad_type = + funcs::ToOneDNNDataType(out_grad.dtype()); + funcs::ReorderOneDNNHandler reorder_handler( + out_grad_vec_dims, out_grad.dtype(), out_grad_type, onednn_engine); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + out_grad.mem_desc(), funcs::to_void_cast(out_grad.data())); + + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + in_grad, + funcs::GetPlainOneDNNFormat(in_grad_vec_dims.size()), + dev_ctx.GetPlace()); + + auto reorder_p = reorder_handler.AcquireReorder(reorder_src_memory_p, + reorder_dst_memory_p); + + reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p); + astream.wait(); + + in_grad->set_mem_desc(reorder_dst_memory_p->get_desc()); + } else { + funcs::ReductionOneDNNHandler handler(dnnl::algorithm::reduction_sum, + 0.0f, + 0.0f, + onednn_engine, + dev_ctx.GetPlace(), + &out_grad, + in_grad, + in_grad_vec_dims); + + auto src_memory_p = handler.AcquireSrcMemory(&out_grad); + auto dst_memory_p = handler.AcquireDstMemory(in_grad); + + std::unordered_map reduction_args = { + {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; + + auto reduction_p = handler.AcquireForwardPrimitive(); + + reduction_p->execute(astream, reduction_args); + astream.wait(); + 
in_grad->set_layout(DataLayout::ONEDNN); + in_grad->set_mem_desc( + dst_memory_p->get_desc().reshape(vectorize(in_grad->dims()))); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(expand_grad, + OneDNN, + ALL_LAYOUT, + phi::ExpandGradKernel, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/expand_kernel.cc b/paddle/phi/kernels/onednn/expand_kernel.cc new file mode 100644 index 00000000000000..52d12bb100d0f8 --- /dev/null +++ b/paddle/phi/kernels/onednn/expand_kernel.cc @@ -0,0 +1,83 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/expand_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +std::vector GetExtendedXDims(const std::vector& x_vec_dims, + int new_size) { + std::vector extended_x_dims(new_size, 1); + std::copy(x_vec_dims.begin(), + x_vec_dims.end(), + extended_x_dims.begin() + new_size - x_vec_dims.size()); + + return extended_x_dims; +} + +template +void ExpandKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& shape, + DenseTensor* out) { + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto x_vec_dims = vectorize(x.dims()); + + auto out_new_dims = shape.GetData(); + + for (size_t i = 0; i < out_new_dims.size(); ++i) { + out_new_dims[i] = out_new_dims[i] > 0 ? out_new_dims[i] : x_vec_dims[i]; + } + + if (x_vec_dims.size() != out_new_dims.size()) { + x_vec_dims = GetExtendedXDims(x_vec_dims, out_new_dims.size()); + } + + out->Resize(make_ddim(out_new_dims)); + funcs::BroadcastDataOneDNNHandler handler(dnnl::algorithm::binary_add, + onednn_engine, + dev_ctx.GetPlace(), + &x, + out, + 0.0f, + 1.0f, + x_vec_dims); + + auto src_memory_p = handler.AcquireSrcMemory(&x); + auto dst_memory_p = handler.AcquireZeroedDstMemory(out); + auto binary_p = handler.AcquireForwardPrimitive(); + + const std::unordered_map args = { + {DNNL_ARG_SRC_0, *dst_memory_p}, + {DNNL_ARG_SRC_1, *src_memory_p}, + {DNNL_ARG_DST, *dst_memory_p}}; + + auto& astream = OneDNNContext::tls().get_stream(); + binary_p->execute(astream, args); + astream.wait(); + + out->set_mem_desc(dst_memory_p->get_desc()); +} +} // namespace phi + +PD_REGISTER_KERNEL(expand, + OneDNN, + ALL_LAYOUT, + phi::ExpandKernel, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/full_kernel.cc b/paddle/phi/kernels/onednn/full_kernel.cc new file mode 100644 index 00000000000000..5a444175bfb492 --- /dev/null +++ b/paddle/phi/kernels/onednn/full_kernel.cc @@ -0,0 +1,88 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/full_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +namespace funcs { + +template +class FillConstantOneDNNHandler + : public OneDNNHandlerNoCachingT { + public: + FillConstantOneDNNHandler(DenseTensor* out, + dnnl::engine engine, + Place cpu_place) + : OneDNNHandlerNoCachingT(engine, cpu_place) { + const auto src0_md = dnnl::memory::desc({out->numel(), sizeof(T)}, + OneDNNGetDataType(), + dnnl::memory::format_tag::ab); + + dnnl::primitive_attr attrs; + attrs.set_scales(DNNL_ARG_SRC_0, /* mask = */ 0, {0.0f}); + + this->AcquireForwardPrimitiveDescriptor( + attrs, dnnl::algorithm::binary_add, src0_md, src1_md, src0_md); + } + + static const dnnl::memory::desc src1_md; +}; + +template +const dnnl::memory::desc FillConstantOneDNNHandler::src1_md( + {1, sizeof(T)}, OneDNNGetDataType(), dnnl::memory::format_tag::ab); +} // namespace funcs + +template +void FullKernel(const Context& dev_ctx, + const IntArray& shape, + const Scalar& val, + DataType dtype, + DenseTensor* out) { + const auto& onednn_engine = dev_ctx.GetEngine(); + + T fill_value = val.to(); + out->Resize(make_ddim(shape.GetData())); + + funcs::FillConstantOneDNNHandler handler( + out, onednn_engine, dev_ctx.GetPlace()); + + dnnl::memory constant_value_memory = + dnnl::memory(funcs::FillConstantOneDNNHandler::src1_md, + onednn_engine, + reinterpret_cast(&fill_value)); + + auto src0_memory_p = handler.AcquireDstMemory(out); + auto fill_constant_p = handler.AcquireForwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + fill_constant_p->execute(astream, + {{DNNL_ARG_SRC_0, *src0_memory_p}, + {DNNL_ARG_SRC_1, constant_value_memory}, + {DNNL_ARG_DST, *src0_memory_p}}); + astream.wait(); + + // src0_memory_p's md was just to allow the usage of a binary + // primitive as a memset, and now we need to create a real one + out->set_mem_desc({vectorize(out->dims()), + funcs::OneDNNGetDataType(), + funcs::GetPlainOneDNNFormat(out->dims().size())}); +} +} // namespace phi + +PD_REGISTER_KERNEL(full, OneDNN, ALL_LAYOUT, phi::FullKernel, float) {} diff --git a/paddle/phi/kernels/onednn/gaussian_random_kernel.cc b/paddle/phi/kernels/onednn/gaussian_random_kernel.cc index b8259754d0b84e..abed20b9cb0653 100644 --- a/paddle/phi/kernels/onednn/gaussian_random_kernel.cc +++ b/paddle/phi/kernels/onednn/gaussian_random_kernel.cc @@ -28,8 +28,13 @@ void GaussianRandomKernel(const Context& ctx, DataType dtype, DenseTensor* out) { std::normal_distribution dist(mean, std); - auto engine = std::make_shared(); - engine->seed(seed); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = ctx.GetGenerator()->GetCPUEngine(); + } T* data = ctx.template Alloc(out); for (int64_t i = 0; i < out->numel(); ++i) { diff --git a/paddle/phi/kernels/onednn/interpolate_kernel.cc b/paddle/phi/kernels/onednn/interpolate_kernel.cc new file mode 100644 index 00000000000000..f70b9bcaf1a6cd --- /dev/null +++ b/paddle/phi/kernels/onednn/interpolate_kernel.cc @@ -0,0 
+1,240 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/interpolate_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/interpolate_function.h" + +namespace phi { + +namespace funcs { +template +class InterpolateOneDNNHandler + : public OneDNNHandlerNoCachingT { + public: + InterpolateOneDNNHandler(const dnnl::algorithm algo, + const dnnl::engine engine, + Place cpu_place, + const DenseTensor* x, + DenseTensor* out) + : OneDNNHandlerNoCachingT(engine, + cpu_place) { + const auto dst_tz = vectorize(out->dims()); + const auto dst_md = dnnl::memory::desc( + dst_tz, OneDNNGetDataType(), OneDNNMemoryFormat::any); + this->AcquireForwardPrimitiveDescriptor( + dnnl::prop_kind::forward_inference, algo, x->mem_desc(), dst_md); + } +}; +} // namespace funcs + +std::vector ComputeOutputShape( + const DenseTensor* x, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale_attr) { + const auto& in_dims = x->dims(); + const DDim in_dhw_dims = slice_ddim(in_dims, 2, in_dims.size()); + + std::vector out_dims; + out_dims.reserve(5); + if (in_dhw_dims.size() == 1) { + out_dims.push_back(out_w); + } else if (in_dhw_dims.size() == 2) { + out_dims.push_back(out_h); + out_dims.push_back(out_w); + } else if (in_dhw_dims.size() == 3) { + out_dims.push_back(out_d); + out_dims.push_back(out_h); + out_dims.push_back(out_w); + } + + if (size_tensor && size_tensor.get().size() > 0) { + auto new_size = funcs::get_new_shape(size_tensor.get()); + if (new_size.size() == out_dims.size()) { + out_dims = new_size; + } + } else if (out_size) { + auto out_size_data = + funcs::get_new_data_from_tensor(out_size.get_ptr()); + if (out_size_data.size() == out_dims.size()) { + out_dims = out_size_data; + } + } else { + std::vector scale; + scale.reserve(3); + if (scale_tensor) { + auto scale_data = + funcs::get_new_data_from_tensor(scale_tensor.get_ptr()); + scale.resize(3, scale_data[0]); + std::copy(scale_data.begin(), scale_data.end(), scale.begin()); + } else { + if (scale_attr.size() > 0) { + scale.resize(3, scale_attr[0]); + std::copy(scale_attr.begin(), scale_attr.end(), scale.begin()); + } + } + + if (scale.size() == 3 && scale[0] > 0.0f && scale[1] > 0.0f && + scale[2] > 0.0f) { + int j = 0; + std::vector in_dhw_vec = vectorize(in_dhw_dims); + std::transform( + in_dhw_vec.begin(), + in_dhw_vec.end(), + out_dims.begin(), + [&](int64_t i) -> int { return static_cast(i * scale[j++]); }); + } + } + + PADDLE_ENFORCE_GT( + std::all_of( + out_dims.begin(), out_dims.end(), [](int i) { return i > 0; }), + 0, + errors::InvalidArgument("out_d, out_h, out_w of Op(interpolate) " + "should be greater than 0.")); + + const std::vector nc_dims = {in_dims[0], in_dims[1]}; + 
out_dims.insert(out_dims.begin(), nc_dims.begin(), nc_dims.end()); + return out_dims; +} + +template +void InterpolateKernel( + const Context& dev_ctx, + const DenseTensor& x, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + DenseTensor* out) { + const auto& onednn_engine = dev_ctx.GetEngine(); + + const dnnl::algorithm algo = (interp_method == "nearest") + ? dnnl::algorithm::resampling_nearest + : dnnl::algorithm::resampling_linear; + + const auto out_dims_vec = ComputeOutputShape(&x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale); + DDim dim_out = make_ddim(out_dims_vec); + out->Resize(dim_out); + + funcs::InterpolateOneDNNHandler handler( + algo, onednn_engine, dev_ctx.GetPlace(), &x, out); + + auto src_memory_p = handler.AcquireSrcMemory(&x); + auto dst_memory_p = handler.AcquireDstMemory(out); + + auto resampling_prim = handler.AcquireForwardPrimitive(); + const std::unordered_map args = { + {DNNL_ARG_SRC, *src_memory_p}, {DNNL_ARG_DST, *dst_memory_p}}; + auto& astream = OneDNNContext::tls().get_stream(); + + resampling_prim->execute(astream, args); + astream.wait(); + + out->set_mem_desc(dst_memory_p->get_desc()); +} + +template +void BilinearInterpKernel( + const Context& ctx, + const DenseTensor& x, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + output); +} + +template +void NearestInterpKernel( + const Context& ctx, + const DenseTensor& x, + const paddle::optional& out_size, + const paddle::optional>& size_tensor, + const paddle::optional& scale_tensor, + const std::string& data_layout, + int out_d, + int out_h, + int out_w, + const std::vector& scale, + const std::string& interp_method, + bool align_corners, + int align_mode, + DenseTensor* output) { + InterpolateKernel(ctx, + x, + out_size, + size_tensor, + scale_tensor, + data_layout, + out_d, + out_h, + out_w, + scale, + interp_method, + output); +} +} // namespace phi + +PD_REGISTER_KERNEL( + bilinear_interp, OneDNN, ALL_LAYOUT, phi::BilinearInterpKernel, float) {} + +PD_REGISTER_KERNEL(nearest_interp, + OneDNN, + ALL_LAYOUT, + phi::NearestInterpKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} diff --git a/paddle/phi/kernels/onednn/pad3d_kernel.cc b/paddle/phi/kernels/onednn/pad3d_kernel.cc new file mode 100644 index 00000000000000..2d34e11afc4cde --- /dev/null +++ b/paddle/phi/kernels/onednn/pad3d_kernel.cc @@ -0,0 +1,34 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pad3d_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/onednn/pad_kernel_impl.h" + +namespace phi { + +template +void Pad3dKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& paddings, + const std::string& mode, + float pad_value, + const std::string& data_format, + DenseTensor* out) { + PadOpKernel(dev_ctx, x, paddings.GetData(), pad_value, out); +} +} // namespace phi + +PD_REGISTER_KERNEL(pad3d, OneDNN, ALL_LAYOUT, phi::Pad3dKernel, float) {} diff --git a/paddle/phi/kernels/onednn/pad_kernel.cc b/paddle/phi/kernels/onednn/pad_kernel.cc new file mode 100644 index 00000000000000..4177f000dba147 --- /dev/null +++ b/paddle/phi/kernels/onednn/pad_kernel.cc @@ -0,0 +1,37 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/pad_kernel.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/onednn/pad_kernel_impl.h" + +namespace phi { + +template +void PadKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& paddings, + const Scalar& pad_value, + DenseTensor* out) { + std::vector copied_paddings(paddings.begin(), paddings.end()); + + std::swap(copied_paddings[0], copied_paddings[2]); + std::swap(copied_paddings[1], copied_paddings[3]); + PadOpKernel( + dev_ctx, x, copied_paddings, pad_value.to(), out); +} +} // namespace phi + +PD_REGISTER_KERNEL(pad, OneDNN, ALL_LAYOUT, phi::PadKernel, float) {} diff --git a/paddle/phi/kernels/onednn/pad_kernel_impl.h b/paddle/phi/kernels/onednn/pad_kernel_impl.h new file mode 100644 index 00000000000000..eabe18855b796e --- /dev/null +++ b/paddle/phi/kernels/onednn/pad_kernel_impl.h @@ -0,0 +1,177 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/onednn/onednn_reuse.h" + +namespace phi { + +/* +Pad3D is done by using up to 7 reorders. Following example is done +on 2D data for simplicity, but it is straightforward to extend it to 3D case. 
+ +Let us consider following example: + + N C H W L R T B +X_dims = (1, 1, 3, 3), paddings = (1, 2, 3, 4) in order Left, Right, Top, Bottom + +We have to copy the X tensor into Out tensor, but except from that we have to +fill the rest of the memory with an additional padding. To avoid looping through +the whole Out memory two times, only these parts of Out memory that won't store +X's memory are filled with pad value. That behavior is achieved by using +oneDNN's submemory descriptors which allows us to set offsets for each dimension +and skip some parts of the memory. For 2D case up to 5 reorders will be used in +Pad3D kernel(if padding=0 reorder is skipped). In the following example i'th +number means, that this part of memory was filled by i'th reorder. 4'th reorder +is copying X memory into Out memory. i&j means that both i'th and j'th reorder +will set the padding at that location: + + INDEX + | 0 1 2 3 4 5 + |_______________________ + 0 |0&2 2 2 2 1&2 1&2 + 1 |0&2 2 2 2 1&2 1&2 +I 2 |0&2 2 2 2 1&2 1&2 +N 3 | 0 4 4 4 1 1 +D 4 | 0 4 4 4 1 1 +E 5 | 0 4 4 4 1 1 +X 6 |0&3 3 3 3 1&3 1&3 + 7 |0&3 3 3 3 1&3 1&3 + 8 |0&3 3 3 3 1&3 1&3 + 9 |0&3 3 3 3 1&3 1&3 + +Since oneDNN's reorder cannot set the pad value to the memory by itself, we have +to prefill Out's memory and use it as a temporary buffer, which later is copied +into the rest of Out's memory. At the end last reorder is done which copies X +memory into Out memory. + +*/ + +inline int64_t CalculateNumOfPrefillElems( + const std::vector& out_tz, const std::vector& paddings) { + int64_t max_elems = 0; + int64_t independent_dims = out_tz[0] * out_tz[1]; + + for (size_t i = 0; i < paddings.size() / 2; ++i) { + int64_t elems = std::max(paddings[2 * i], paddings[2 * i + 1]); + for (size_t j = 0; j < paddings.size() / 2; ++j) { + if (j != i) { + elems *= out_tz[out_tz.size() - 1 - j]; + } + } + + if (max_elems < elems) { + max_elems = elems; + } + } + return independent_dims * max_elems; +} + +template +void FillPartOfPadding(const dnnl::engine& onednn_engine, + T* prefilled_mem_ptr, + const std::shared_ptr& out_mem_p, + const std::vector& chunk_tz, + const std::vector& offsets) { + auto& astream = OneDNNContext::tls().get_stream(); + + dnnl::memory::desc prefilled_mem_desc( + chunk_tz, + funcs::OneDNNGetDataType(), + funcs::GetPlainOneDNNFormat(chunk_tz.size())); + dnnl::memory prefilled_mem( + prefilled_mem_desc, onednn_engine, prefilled_mem_ptr); + + dnnl::memory::desc out_slice_md = + out_mem_p->get_desc().submemory_desc(chunk_tz, {offsets}); + dnnl::memory out_slice_mem( + out_slice_md, onednn_engine, out_mem_p->get_data_handle()); + + auto reorder_p = dnnl::reorder(prefilled_mem, out_slice_mem); + reorder_p.execute(astream, prefilled_mem, out_slice_mem); +} + +template +void PadOpKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& paddings, + float pad_value, + DenseTensor* out) { + const auto& onednn_engine = dev_ctx.GetEngine(); + auto& astream = OneDNNContext::tls().get_stream(); + + std::vector x_tz = vectorize(x.dims()); + // due to the need of supporting NDHWC, inferring out shape + // must be done inside the kernel + std::vector out_tz(x_tz); + + for (size_t i = 0; i < paddings.size() / 2; ++i) { + out_tz[out_tz.size() - 1 - i] += paddings[2 * i] + paddings[2 * i + 1]; + } + out->Resize(make_ddim(out_tz)); + + funcs::ReorderOneDNNHandler reorder_handler( + x_tz, x.dtype(), funcs::ToOneDNNDataType(x.dtype()), onednn_engine); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x.mem_desc(), 
funcs::to_void_cast(x.data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + out, + out_tz, + funcs::GetPlainOneDNNFormat(out_tz.size()), + dev_ctx.GetPlace()); + + // to avoid allocating new temporary memory, Out's memory is used as a tmp + // buffer for storing a contiguous memory consisting of pad_value, which + // later is used as a SRC for reorders that are filling Out with padding + T* out_ptr = out->data(); + std::fill(out_ptr, + out_ptr + CalculateNumOfPrefillElems(out_tz, paddings), + pad_value); + + // paddings are in order: left, right, top, bottom, front, back + for (size_t i = 0; i < paddings.size(); ++i) { + if (paddings[i] != 0) { + std::vector offsets(out_tz.size(), 0); + std::vector chunk_tz(out_tz.begin(), out_tz.end()); + + chunk_tz[out_tz.size() - 1 - i / 2] = paddings[i]; + if (i % 2 == 1) { + offsets[out_tz.size() - 1 - i / 2] = + paddings[i - 1] + x_tz[out_tz.size() - 1 - i / 2]; + } + + FillPartOfPadding( + onednn_engine, out_ptr, reorder_dst_memory_p, chunk_tz, offsets); + } + } + astream.wait(); + + std::vector offsets(out_tz.size(), 0); + for (size_t i = 0; i < paddings.size() / 2; ++i) { + offsets[out_tz.size() - 1 - i] = paddings[2 * i]; + } + + auto slice_mem_p = + reorder_handler.AcquireSubmemory(x_tz, offsets, reorder_dst_memory_p); + + auto reorder_p = + reorder_handler.AcquireReorder(slice_mem_p, reorder_src_memory_p); + reorder_p->execute(astream, *reorder_src_memory_p, *slice_mem_p); + astream.wait(); + + out->set_mem_desc(reorder_dst_memory_p->get_desc()); +} +} // namespace phi diff --git a/paddle/phi/kernels/onednn/sgd_kernel.cc b/paddle/phi/kernels/onednn/sgd_kernel.cc new file mode 100644 index 00000000000000..bbb02204105d16 --- /dev/null +++ b/paddle/phi/kernels/onednn/sgd_kernel.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/sgd_kernel.h" + +#include "paddle/phi/backends/onednn/axpy_handler.h" +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SGDDenseKernel(const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& learning_rate, + const DenseTensor& grad, + const paddle::optional& master_param, + bool multi_precision, + DenseTensor* param_out, + DenseTensor* master_param_out) { + auto* out_data = dev_ctx.template Alloc(param_out); + const T* param_data = param.data(); + const auto* grad_data = grad.data(); + const auto* lr = learning_rate.data(); + // Since denese SGD is not in place operation, first copy params to output + // tensor and then update it. 
+ std::memcpy(out_data, param_data, param.memory_size()); + funcs::OneDNNAXPYHandler(param_out->numel(), -lr[0], dev_ctx.GetEngine())( + grad_data, out_data); +} + +template +void SGDDenseParamSparseGradKernel( + const Context& dev_ctx, + const DenseTensor& param, + const DenseTensor& learning_rate, + const SelectedRows& grad, + const paddle::optional& master_param, + bool multi_precision, + DenseTensor* param_out, + DenseTensor* master_param_out) { + const auto& grad_value = grad.value(); + const auto& grad_rows = grad.rows(); + const auto grad_height = grad.height(); + const int64_t grad_val_height = static_cast(grad_rows.size()); + const auto grad_width = grad_value.numel() / grad_val_height; + + const auto* grad_data = grad_value.data(); + auto* out_data = param_out->data(); + const auto* lr = learning_rate.data(); + + funcs::OneDNNAXPYHandler axpy_handler( + grad_width, -lr[0], dev_ctx.GetEngine()); + + for (size_t i = 0; i < grad_rows.size(); ++i) { + PADDLE_ENFORCE_LT( + grad_rows[i], + grad_height, + errors::OutOfRange( + "Grad rows index value should be less than grad height." + "Got [%s], but expected less than [%s]", + grad_rows[i], + grad_height)); + const int64_t row = grad_rows[i]; + const auto* src = grad_data + i * grad_width; + auto* dst = out_data + row * grad_width; + axpy_handler(src, dst); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + sgd, OneDNN, ALL_LAYOUT, phi::SGDDenseKernel, float, phi::dtype::bfloat16) { +} + +PD_REGISTER_KERNEL(sgd_dense_param_sparse_grad, + OneDNN, + ALL_LAYOUT, + phi::SGDDenseParamSparseGradKernel, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/shape_kernel.cc b/paddle/phi/kernels/onednn/shape_kernel.cc new file mode 100644 index 00000000000000..b6fcd32f1c81a5 --- /dev/null +++ b/paddle/phi/kernels/onednn/shape_kernel.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
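The SGD kernels in sgd_kernel.cc above reduce the update to an AXPY, param_out = param - lr * grad, applied either to the whole dense tensor or row by row at the indices listed in grad.rows(). A plain C++ sketch of the dense case on contiguous float buffers (no oneDNN involved), just to spell out the arithmetic the AXPY handler performs:

#include <cstddef>

// param_out = param - lr * grad, the scalar form of what
// OneDNNAXPYHandler(numel, -lr, engine) computes in one call.
void SgdDenseUpdateSketch(const float* param, const float* grad, float lr,
                          std::size_t numel, float* param_out) {
  for (std::size_t i = 0; i < numel; ++i) {
    param_out[i] = param[i] - lr * grad[i];
  }
}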
+ +#include "paddle/phi/kernels/shape_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void ShapeKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + DDim x_dims = x.dims(); + + // Output of shape op is often fed as x to fill_constant ops + // and we need to rotate a shape otherwise Tensors of wrong shape may be + // allocated + if (OneDNNContext::tls().get_cur_paddle_data_layout() == DataLayout::kNHWC && + x_dims.size() >= 3) { + auto rdims = vectorize(x_dims); + std::rotate(rdims.begin() + 1, rdims.begin() + 2, rdims.end()); + x_dims = make_ddim(rdims); + } + + out->Resize({x_dims.size()}); + auto out_data = dev_ctx.template Alloc(out); + for (int i = 0; i < x_dims.size(); ++i) { + out_data[i] = x_dims[i]; + } + + dnnl::memory::desc out_mem_desc( + vectorize(out->dims()), + funcs::ToOneDNNDataType(out->dtype()), + funcs::GetPlainOneDNNFormat(out->dims().size())); + out->set_mem_desc(out_mem_desc); +} +} // namespace phi + +PD_REGISTER_KERNEL(shape, + OneDNN, + ALL_LAYOUT, + phi::ShapeKernel, + float, + phi::dtype::bfloat16, + int8_t, + uint8_t) {} diff --git a/paddle/phi/kernels/onednn/slice_grad_kernel.cc b/paddle/phi/kernels/onednn/slice_grad_kernel.cc new file mode 100644 index 00000000000000..c38a2237e54779 --- /dev/null +++ b/paddle/phi/kernels/onednn/slice_grad_kernel.cc @@ -0,0 +1,86 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/slice_grad_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SliceGradRawKernel(const Context& dev_ctx, + const DenseTensor& input, + const DenseTensor& out_grad, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const std::vector& infer_flags, + const std::vector& decrease_axis, + DenseTensor* input_grad) { + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto dx_dims = vectorize(input_grad->dims()); + + auto starts_vec = starts.GetData(); + auto ends_vec = ends.GetData(); + + std::vector offsets(dx_dims.size(), 0); + std::vector slice_dims(dx_dims); + + for (size_t i = 0; i < axes.size(); ++i) { + starts_vec[i] = + starts_vec[i] < 0 ? dx_dims[axes[i]] + starts_vec[i] : starts_vec[i]; + ends_vec[i] = ends_vec[i] < 0 ? 
dx_dims[axes[i]] + ends_vec[i] + : std::min(ends_vec[i], dx_dims[axes[i]]); + offsets[axes[i]] = starts_vec[i]; + slice_dims[axes[i]] = ends_vec[i] - starts_vec[i]; + } + + dnnl::memory::data_type out_grad_type = + funcs::ToOneDNNDataType(out_grad.dtype()); + + funcs::ReorderOneDNNHandler reorder_handler( + slice_dims, out_grad.dtype(), out_grad_type, onednn_engine); + + auto reorder_src_memory_p = + reorder_handler.AcquireSrcMemory(out_grad.mem_desc().reshape(slice_dims), + funcs::to_void_cast(out_grad.data())); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + input_grad, + dx_dims, + funcs::GetPlainOneDNNFormat(dx_dims.size()), + dev_ctx.GetPlace()); + memset(input_grad->data(), 0, reorder_dst_memory_p->get_desc().get_size()); + + auto slice_mem_p = reorder_handler.AcquireSubmemory( + slice_dims, offsets, reorder_dst_memory_p); + + auto reorder_p = + reorder_handler.AcquireReorder(slice_mem_p, reorder_src_memory_p); + auto& astream = OneDNNContext::tls().get_stream(); + reorder_p->execute(astream, *reorder_src_memory_p, *slice_mem_p); + astream.wait(); + + input_grad->set_mem_desc(reorder_dst_memory_p->get_desc()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(slice_grad, + OneDNN, + ALL_LAYOUT, + phi::SliceGradRawKernel, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/slice_kernel.cc b/paddle/phi/kernels/onednn/slice_kernel.cc new file mode 100644 index 00000000000000..3f74a2fe0be01c --- /dev/null +++ b/paddle/phi/kernels/onednn/slice_kernel.cc @@ -0,0 +1,109 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/slice_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SliceRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const std::vector& axes, + const IntArray& starts, + const IntArray& ends, + const std::vector& infer_flags, + const std::vector& decrease_axis, + DenseTensor* out) { + const auto& onednn_engine = dev_ctx.GetEngine(); + + auto x_vec_dims = vectorize(x.dims()); + + auto starts_vec = starts.GetData(); + auto ends_vec = ends.GetData(); + + std::vector offsets(x_vec_dims.size(), 0); + std::vector slice_dims(x_vec_dims); + + for (size_t i = 0; i < axes.size(); ++i) { + starts_vec[i] = + starts_vec[i] < 0 ? x_vec_dims[axes[i]] + starts_vec[i] : starts_vec[i]; + ends_vec[i] = ends_vec[i] < 0 ? x_vec_dims[axes[i]] + ends_vec[i] + : std::min(ends_vec[i], x_vec_dims[axes[i]]); + offsets[axes[i]] = starts_vec[i]; + slice_dims[axes[i]] = + std::max(static_cast(0), ends_vec[i] - starts_vec[i]); + } + + out->Resize(make_ddim(slice_dims)); + + // Note(0x45f): To support slice Tensors with shapes like [0, 0, 0]. 
+ if (!x.initialized()) { + dev_ctx.Alloc(out, x.dtype()); + out->set_layout(DataLayout::ONEDNN); + return; + } + + dnnl::memory::data_type x_type = funcs::ToOneDNNDataType(x.dtype()); + + funcs::ReorderOneDNNHandler reorder_handler( + x_vec_dims, x.dtype(), x_type, onednn_engine); + + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x.mem_desc(), funcs::to_void_cast(x.data())); + auto slice_mem_p = reorder_handler.AcquireSubmemory( + slice_dims, offsets, reorder_src_memory_p); + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + out, + slice_dims, + funcs::GetPlainOneDNNFormat(x_vec_dims.size()), + dev_ctx.GetPlace()); + + auto reorder_p = + reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); + auto& astream = OneDNNContext::tls().get_stream(); + reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); + + std::vector new_out_dims(slice_dims.size() - decrease_axis.size()); + + if (new_out_dims.size() == 0) { + new_out_dims.emplace_back(1); + } else { + for (const auto& axis : decrease_axis) { + slice_dims[axis] = 0; + } + + int i = 0; + for (const auto& slice_dim : slice_dims) { + if (slice_dim != 0) new_out_dims[i++] = slice_dim; + } + } + + astream.wait(); + out->Resize(make_ddim(new_out_dims)); + out->set_mem_desc(reorder_dst_memory_p->get_desc().reshape(new_out_dims)); +} + +} // namespace phi + +PD_REGISTER_KERNEL(slice, + OneDNN, + ALL_LAYOUT, + phi::SliceRawKernel, + float, + int8_t, + uint8_t, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/softmax_grad_kernel.cc b/paddle/phi/kernels/onednn/softmax_grad_kernel.cc new file mode 100644 index 00000000000000..0209992a679cdc --- /dev/null +++ b/paddle/phi/kernels/onednn/softmax_grad_kernel.cc @@ -0,0 +1,53 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
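SliceRawKernel in slice_kernel.cc above first normalizes negative starts/ends against the sliced dimension and clamps the end to the dimension size before building the sub-memory. A small sketch of that index arithmetic on a single axis, with illustrative numbers:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Mirror of the per-axis start/end normalization used above.
void NormalizeSliceSketch(int64_t dim, int64_t start, int64_t end,
                          int64_t* offset, int64_t* slice_dim) {
  start = start < 0 ? dim + start : start;
  end = end < 0 ? dim + end : std::min(end, dim);
  *offset = start;
  *slice_dim = std::max<int64_t>(0, end - start);
}

void SliceExample() {
  int64_t offset = 0, slice_dim = 0;
  // dim = 5, start = -2, end = 100  ->  offset = 3, slice_dim = 2
  NormalizeSliceSketch(5, -2, 100, &offset, &slice_dim);
  assert(offset == 3 && slice_dim == 2);
}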
+ +#include "paddle/phi/kernels/softmax_grad_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_context.h" +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/common/bfloat16.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SoftmaxGradKernel(const Context& dev_ctx, + const DenseTensor& out, + const DenseTensor& out_grad, + int axis, + DenseTensor* x_grad) { + funcs::SoftmaxOneDNNHandler handler( + dev_ctx.GetEngine(), dev_ctx.GetPlace(), axis, &out, &out_grad); + + auto dst_memory_p = handler.AcquireDstMemory(&out); + auto diff_dst_memory_p = handler.AcquireDiffDstMemory(&out_grad); + auto diff_src_memory_p = handler.AcquireDiffSrcMemory(x_grad); + + auto softmax_bwd_p = handler.AcquireBackwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + softmax_bwd_p->execute(astream, + {{DNNL_ARG_DST, *dst_memory_p}, + {DNNL_ARG_DIFF_DST, *diff_dst_memory_p}, + {DNNL_ARG_DIFF_SRC, *diff_src_memory_p}}); + astream.wait(); + + x_grad->set_mem_desc(diff_src_memory_p->get_desc()); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + softmax_grad, OneDNN, ALL_LAYOUT, phi::SoftmaxGradKernel, float) {} diff --git a/paddle/phi/kernels/onednn/split_kernel.cc b/paddle/phi/kernels/onednn/split_kernel.cc new file mode 100644 index 00000000000000..1d0544758fe96f --- /dev/null +++ b/paddle/phi/kernels/onednn/split_kernel.cc @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/split_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void SplitKernel(const Context& dev_ctx, + const DenseTensor& x, + const IntArray& sections, + const Scalar& split_axis, + std::vector out) { + const auto& onednn_engine = dev_ctx.GetEngine(); + + int axis = split_axis.to(); + + auto outs_number = out.size(); + const auto x_dims = x.dims(); + auto x_vec_dims = vectorize(x_dims); + + dnnl::memory::data_type x_type = funcs::ToOneDNNDataType(x.dtype()); + + auto& astream = OneDNNContext::tls().get_stream(); + + std::vector offset(x_vec_dims.size(), 0); + funcs::ReorderOneDNNHandler reorder_handler( + x_vec_dims, x.dtype(), x_type, onednn_engine); + auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory( + x.mem_desc(), funcs::to_void_cast(x.data())); + + for (size_t i = 0; i < outs_number; ++i) { + auto out_vec_dims = vectorize(out[i]->dims()); + auto slice_mem_p = reorder_handler.AcquireSubmemory( + out_vec_dims, offset, reorder_src_memory_p); + + auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory( + out[i], out_vec_dims, x.format(), dev_ctx.GetPlace()); + auto reorder_p = + reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p); + + reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p); + + offset[axis] += sections.GetData()[i]; + out[i]->set_mem_desc(reorder_dst_memory_p->get_desc()); + } + astream.wait(); +} + +template +void SplitWithNumKernel(const Context& dev_ctx, + const DenseTensor& x, + int num, + const Scalar& axis_scalar, + std::vector outs) { + int axis_value = axis_scalar.to(); + auto input_axis_dim = x.dims().at(axis_value); + std::vector sections_vec; + for (int i = 0; i < num; ++i) { + sections_vec.push_back(input_axis_dim / num); + } + IntArray sections(sections_vec); + SplitKernel(dev_ctx, x, sections, axis_scalar, outs); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + split, OneDNN, ALL_LAYOUT, phi::SplitKernel, float, phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(split_with_num, + OneDNN, + ALL_LAYOUT, + phi::SplitWithNumKernel, + float, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/onednn/stack_kernel.cc b/paddle/phi/kernels/onednn/stack_kernel.cc new file mode 100644 index 00000000000000..6ede31952e88d1 --- /dev/null +++ b/paddle/phi/kernels/onednn/stack_kernel.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/stack_kernel.h" + +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +namespace funcs { +template +class StackOneDNNHandler : public OneDNNHandlerNoCachingT { + public: + StackOneDNNHandler(const Place& cpu_place, + int stack_axis, + const dnnl::engine onednn_engine, + const std::vector& inputs, + DenseTensor* output) + : OneDNNHandlerNoCachingT(onednn_engine, cpu_place) { + int ndims = inputs[0]->dims().size(); + + if (stack_axis < 0) { + stack_axis = ndims + 1 + stack_axis; // +1 to match output's ndims + } + + // in stack op all inputs must have same dims + auto input_dims = vectorize(inputs[0]->dims()); + + dnnl::memory::data_type dt = ToOneDNNDataType(inputs[0]->dtype()); + std::vector srcs_md; + dnnl::memory::desc dst_md; + OneDNNMemoryFormat dst_fmt; + + srcs_md.reserve(inputs.size()); + + // if stack is not done on last(non existing) axis, then we can optimize + // concat primitive by not adding additional dimension, since it causes + // wrong output format deduction and suboptimal performance as a result + if (stack_axis != ndims) { + for (size_t i = 0; i < inputs.size(); ++i) { + srcs_md.push_back(inputs[i]->mem_desc()); + } + + input_dims[stack_axis] *= inputs.size(); + dst_md = dnnl::memory::desc(input_dims, dt, OneDNNMemoryFormat::any); + } else { + auto extended_input_dims = vectorize(output->dims()); + extended_input_dims[stack_axis] = 1; + + for (size_t i = 0; i < inputs.size(); ++i) { + srcs_md.push_back(inputs[i]->mem_desc().reshape(extended_input_dims)); + } + + // concat primitive choses suboptimal format tag because it cannot + // distinguish between f.e. abcd and abdc if last dim is equal to 1 so + // enforcing is needed for better performance + dst_fmt = GetPlainOneDNNFormat(extended_input_dims.size()); + dst_md = dnnl::memory::desc(vectorize(output->dims()), dt, dst_fmt); + } + + this->AcquireForwardPrimitiveDescriptor(dst_md, stack_axis, srcs_md); + } + + // concat oneDNN prim is not having .desc attribute so we cannot use default + // AcquireForwardPrimitiveDescriptor + void AcquireForwardPrimitiveDescriptor( + const memory::desc& dst_md, + const int stack_axis, + const std::vector& srcs_md) { + this->fwd_pd_.reset(new dnnl::concat::primitive_desc( + dst_md, stack_axis, srcs_md, this->engine_)); + } + + std::shared_ptr AcquireSrcMemory(const DenseTensor& input, + int i) { + const T* input_data = input.data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src_desc(i), + to_void_cast(input_data)); + } +}; +} // namespace funcs + +template +void StackKernel(const Context& dev_ctx, + const std::vector& multi_input, + int axis, + DenseTensor* output) { + const auto& onednn_engine = dev_ctx.GetEngine(); + + funcs::StackOneDNNHandler handler( + dev_ctx.GetPlace(), axis, onednn_engine, multi_input, output); + + std::vector> srcs; + srcs.reserve(multi_input.size()); + + auto dst_mem = handler.AcquireDstMemory(output); + auto concat_p = handler.AcquireForwardPrimitive(); + + auto& astream = OneDNNContext::tls().get_stream(); + std::unordered_map args; + for (size_t i = 0; i < multi_input.size(); ++i) { + srcs.push_back(handler.AcquireSrcMemory(*(multi_input[i]), i)); + args.insert({DNNL_ARG_MULTIPLE_SRC + i, *(srcs.at(i))}); + } + args.insert({DNNL_ARG_DST, *dst_mem}); + + concat_p->execute(astream, args); + astream.wait(); + + output->set_mem_desc(dst_mem->get_desc().reshape(vectorize(output->dims()))); +} + +} // namespace phi + +PD_REGISTER_KERNEL(stack, 
OneDNN, ALL_LAYOUT, phi::StackKernel, float) {} diff --git a/paddle/phi/kernels/onednn/transpose_grad_kernel.cc b/paddle/phi/kernels/onednn/transpose_grad_kernel.cc new file mode 100644 index 00000000000000..09f410c61c90fd --- /dev/null +++ b/paddle/phi/kernels/onednn/transpose_grad_kernel.cc @@ -0,0 +1,68 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/transpose_grad_kernel.h" + +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/backends/onednn/onednn_reuse.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { +template +void TransposeGradKernel(const Context& dev_ctx, + const DenseTensor& out_grad, + const std::vector& axis, + DenseTensor* x_grad) { + PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType() == phi::AllocationType::CPU, + true, + errors::PreconditionNotMet( + "Operator DNNL TransposeGrad must use CPUPlace")); + if (!x_grad) return; + + const auto& onednn_engine = dev_ctx.GetEngine(); + std::vector reversed_axis(axis); + if (axis.size() == 1) { + paddle::framework::TensorCopy(out_grad, out_grad.place(), x_grad); + x_grad->set_format(out_grad.format()); + return; + } + + for (size_t i = 0; i < axis.size(); i++) { + reversed_axis[axis[i]] = i; + } + + const T* out_grad_data = out_grad.data(); + dev_ctx.template Alloc(x_grad); + auto nchw_tz = vectorize(out_grad.dims()); + + funcs::TransposeOneDNNHandler handler( + dev_ctx, nchw_tz, reversed_axis, onednn_engine); + + auto transpose_src_memory_p = handler.AcquireSrcMemory( + out_grad.format(), funcs::to_void_cast(out_grad_data)); + auto transpose_dst_memory_p = + handler.AcquireDstMemory(x_grad, dev_ctx.GetPlace()); + auto transpose_p = + handler.AcquireTranspose(transpose_dst_memory_p, transpose_src_memory_p); + + auto& astream = OneDNNContext::tls().get_stream(); + transpose_p->execute( + astream, *transpose_src_memory_p, *transpose_dst_memory_p); + astream.wait(); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + transpose_grad, OneDNN, ALL_LAYOUT, phi::TransposeGradKernel, float) {} diff --git a/paddle/phi/kernels/selected_rows/add_n_kernel.h b/paddle/phi/kernels/selected_rows/add_n_kernel.h new file mode 100644 index 00000000000000..c56985fb0723f9 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/add_n_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
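TransposeGradKernel in transpose_grad_kernel.cc above transposes out_grad with the inverse permutation, built as reversed_axis[axis[i]] = i. A short sketch of that inversion with a concrete, illustrative permutation:

#include <cassert>
#include <cstddef>
#include <vector>

// Invert a permutation: if the forward transpose used `axis`, the backward
// pass must use `reversed` with reversed[axis[i]] = i.
std::vector<int> InversePermutationSketch(const std::vector<int>& axis) {
  std::vector<int> reversed(axis.size());
  for (std::size_t i = 0; i < axis.size(); ++i) {
    reversed[axis[i]] = static_cast<int>(i);
  }
  return reversed;
}

void TransposeAxesExample() {
  // Forward NCHW -> NHWC uses axis = {0, 2, 3, 1}; its inverse is {0, 3, 1, 2}.
  auto rev = InversePermutationSketch({0, 2, 3, 1});
  assert((rev == std::vector<int>{0, 3, 1, 2}));
}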
+ +#pragma once + +#include "paddle/phi/core/selected_rows.h" + +namespace phi { +namespace sr { + +template +void AddNKernel(const Context& dev_ctx, + const std::vector& x, + SelectedRows* out); +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/selected_rows/cpu/add_n_kernel.cc b/paddle/phi/kernels/selected_rows/cpu/add_n_kernel.cc new file mode 100644 index 00000000000000..1cd6529014e0ee --- /dev/null +++ b/paddle/phi/kernels/selected_rows/cpu/add_n_kernel.cc @@ -0,0 +1,25 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h" + +PD_REGISTER_KERNEL(add_n_sr, + CPU, + ALL_LAYOUT, + phi::sr::AddNKernel, + float, + double, + int, + phi::dtype::bfloat16, + int64_t) {} diff --git a/paddle/phi/kernels/selected_rows/gpu/add_n_kernel.cu b/paddle/phi/kernels/selected_rows/gpu/add_n_kernel.cu new file mode 100644 index 00000000000000..43442348d2003d --- /dev/null +++ b/paddle/phi/kernels/selected_rows/gpu/add_n_kernel.cu @@ -0,0 +1,26 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h" + +PD_REGISTER_KERNEL(add_n_sr, + GPU, + ALL_LAYOUT, + phi::sr::AddNKernel, + float, + double, + int, + phi::dtype::bfloat16, + phi::dtype::float16, + int64_t) {} diff --git a/paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h b/paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h new file mode 100644 index 00000000000000..d5bd9f2b2c7c6e --- /dev/null +++ b/paddle/phi/kernels/selected_rows/impl/add_n_kernel_impl.h @@ -0,0 +1,90 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
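The implementation that follows merges several SelectedRows inputs by summing rows that share the same row index (delegating the actual reduction to the MergeAdd functor). As a rough, standalone illustration of that semantics only — the ToySelectedRows struct and the std::map-based accumulation below are hypothetical stand-ins, not Paddle's MergeAdd:

#include <cstdio>
#include <map>
#include <vector>

// Toy stand-in for a SelectedRows input: row indices plus one value per row.
struct ToySelectedRows {
  std::vector<long> rows;
  std::vector<double> values;  // same length as rows
};

// Sum all inputs, merging duplicate row indices (what add_n_sr conceptually does).
ToySelectedRows ToyAddN(const std::vector<ToySelectedRows>& inputs) {
  std::map<long, double> acc;  // row index -> accumulated value
  for (const auto& in : inputs) {
    for (size_t i = 0; i < in.rows.size(); ++i) {
      acc[in.rows[i]] += in.values[i];
    }
  }
  ToySelectedRows out;
  for (const auto& kv : acc) {
    out.rows.push_back(kv.first);
    out.values.push_back(kv.second);
  }
  return out;
}

int main() {
  ToySelectedRows a{{0, 3}, {1.0, 2.0}};
  ToySelectedRows b{{3, 5}, {4.0, 8.0}};
  ToySelectedRows c = ToyAddN({a, b});
  for (size_t i = 0; i < c.rows.size(); ++i) {
    std::printf("row %ld -> %f\n", c.rows[i], c.values[i]);  // row 3 becomes 2 + 4 = 6
  }
  return 0;
}

The real kernel additionally handles the in-place case (the first input aliasing the output) by copying it into a temporary SelectedRows before merging, as the implementation below shows.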
+ +#pragma once + +#include "paddle/phi/kernels/selected_rows/add_n_kernel.h" + +#include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { +namespace sr { +template +void AddNKernel(const Context &dev_ctx, + const std::vector &x, + SelectedRows *out) { + dev_ctx.template Alloc(out->mutable_value()); + + bool in_place = false; + if (x.size() > 0 && x[0]->value().Holder() == out->value().Holder()) { + in_place = true; + } + + if (in_place && x.size() < 2) { + return; + } + + std::vector inputs; + SelectedRows temp_in0; + + if (in_place) { + auto &in0 = *x[0]; + temp_in0.set_height(in0.height()); + temp_in0.set_rows(in0.rows()); + Copy( + dev_ctx, in0.value(), in0.place(), false, temp_in0.mutable_value()); + inputs.push_back(&temp_in0); + for (size_t i = 1; i < x.size(); ++i) { + auto &in = *x[i]; + if (in.rows().size() > 0) { + inputs.push_back(&in); + } + } + } else { + for (auto in_var : x) { + auto &in = *in_var; + if (in.rows().size() > 0) { + inputs.push_back(in_var); + } + } + } + + out->mutable_rows()->clear(); + + bool has_data = false; + for (auto &in : inputs) { + if (in->rows().size() > 0) { + has_data = true; + break; + } + } + if (has_data) { + paddle::operators::math::scatter::MergeAdd merge_add; + merge_add(dev_ctx, inputs, out); + + out->SyncIndex(); + + } else { + // no data, just set a empty out tensor. + auto *out_dense = out->mutable_value(); + out_dense->clear(); + out_dense->Resize(phi::make_ddim({0})); + dev_ctx.template Alloc(out_dense); + } +} +} // namespace sr +} // namespace phi diff --git a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc new file mode 100644 index 00000000000000..f9a96b15eedfe2 --- /dev/null +++ b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.cc @@ -0,0 +1,108 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/batch_norm_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/batch_norm_grad_kernel.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void BatchNormCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const paddle::optional& mean, + const paddle::optional& variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const paddle::optional& reserve_space, + const SparseCooTensor& y_grad, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + SparseCooTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + EmptyLikeCooKernel(dev_ctx, x, x_grad); + *scale_grad = phi::EmptyLike(dev_ctx, scale); + *bias_grad = phi::EmptyLike(dev_ctx, bias); + phi::BatchNormGradKernel(dev_ctx, + x.values(), + scale, + bias, + mean, + variance, + saved_mean, + saved_variance, + reserve_space, + y_grad.values(), + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + fuse_with_relu, + x_grad->mutable_values(), + scale_grad, + bias_grad); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(batch_norm_coo_grad, + CPU, + ALL_LAYOUT, + phi::sparse::BatchNormCooGradKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +#if defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(batch_norm_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::BatchNormCooGradKernel, + float, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} +#endif + +#if defined(PADDLE_WITH_CUDA) +PD_REGISTER_KERNEL(batch_norm_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::BatchNormCooGradKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); // x_grad + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); // scale_grad + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); // bias_grad + } +} +#endif diff --git a/paddle/phi/kernels/sparse/batch_norm_grad_kernel.h b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.h new file mode 100644 index 00000000000000..b7051683170e63 --- /dev/null +++ b/paddle/phi/kernels/sparse/batch_norm_grad_kernel.h @@ -0,0 +1,48 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" + +namespace phi { +namespace sparse { + +template +void BatchNormCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const paddle::optional& mean, + const paddle::optional& variance, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const paddle::optional& reserve_space, + const SparseCooTensor& y_grad, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + SparseCooTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/batch_norm_kernel.cc b/paddle/phi/kernels/sparse/batch_norm_kernel.cc new file mode 100644 index 00000000000000..4f925e83a9b695 --- /dev/null +++ b/paddle/phi/kernels/sparse/batch_norm_kernel.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/kernels/sparse/batch_norm_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/batch_norm_kernel.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void BatchNormCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + SparseCooTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space) { + EmptyLikeCooKernel(dev_ctx, x, y); + phi::BatchNormKernel(dev_ctx, + x.values(), + scale, + bias, + mean, + variance, + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + fuse_with_relu, + y->mutable_values(), + mean_out, + variance_out, + saved_mean, + saved_variance, + reserve_space); + y->SetIndicesDict(x.GetIndicesDict()); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(batch_norm_coo, + CPU, + ALL_LAYOUT, + phi::sparse::BatchNormCooKernel, + float, + double) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +#if defined(PADDLE_WITH_HIP) +PD_REGISTER_KERNEL(batch_norm_coo, + GPU, + ALL_LAYOUT, + phi::sparse::BatchNormCooKernel, + float, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); + 
kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); +} +#endif + +#if defined(PADDLE_WITH_CUDA) +PD_REGISTER_KERNEL(batch_norm_coo, + GPU, + ALL_LAYOUT, + phi::sparse::BatchNormCooKernel, + float, + double, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->InputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->InputAt(4).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(4).SetDataType(phi::DataType::FLOAT32); + } +} +#endif diff --git a/paddle/phi/kernels/sparse/batch_norm_kernel.h b/paddle/phi/kernels/sparse/batch_norm_kernel.h new file mode 100644 index 00000000000000..282a8de7b39d4c --- /dev/null +++ b/paddle/phi/kernels/sparse/batch_norm_kernel.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" + +namespace phi { +namespace sparse { + +template +void BatchNormKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + SparseCooTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc index 9c0939ec1149cc..98afed84d66438 100644 --- a/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/elementwise_grad_kernel.cc @@ -12,6 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#include "paddle/phi/kernels/sparse/elementwise_grad_kernel.h" +#include "paddle/phi/kernels/sparse/elementwise_kernel.h" + #include "glog/logging.h" #include "paddle/phi/backends/cpu/cpu_context.h" @@ -24,7 +27,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/empty_kernel.h" #include "paddle/phi/kernels/funcs/eigen/common.h" -#include "paddle/phi/kernels/sparse/elementwise_kernel.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" namespace phi { namespace sparse { @@ -45,7 +48,7 @@ void AllocCooPtr(const Context& dev_ctx, SparseCooTensor* dx) { DenseTensor dx_indices = phi::EmptyLike(dev_ctx, x.indices()); DenseTensor dx_values = phi::EmptyLike(dev_ctx, x.values()); - dx->SetMember(dx_indices, dx_values, x.dims(), true); + dx->SetMember(dx_indices, dx_values, x.dims(), x.coalesced()); } template @@ -412,3 +415,14 @@ PD_REGISTER_KERNEL(divide_coo_coo_grad, kernel->InputAt(2).SetDataLayout(phi::DataLayout::SPARSE_COO); kernel->InputAt(3).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(add_coo_dense_grad, + CPU, + ALL_LAYOUT, + phi::sparse::ElementWiseAddDenseGradKernel, + float, + double, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc b/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc index 4156e46dc819fc..3addd4bbbfbb0e 100644 --- a/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/elementwise_kernel.cc @@ -13,14 +13,16 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/phi/kernels/sparse/elementwise_kernel.h" - #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/tensor_meta.h" +#include "paddle/phi/core/tensor_utils.h" #include "paddle/phi/core/visit_type.h" +#include "paddle/phi/kernels/elementwise_add_kernel.h" #include "paddle/phi/kernels/elementwise_kernel.h" #include "paddle/phi/kernels/funcs/elementwise_functor.h" #include "paddle/phi/kernels/funcs/sparse/flatten_indices.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" namespace phi { @@ -155,6 +157,21 @@ void ElementWiseCooKernelImpl(const Context& dev_ctx, "shape = [%s], Y's shape = [%s].", x.dims(), y.dims())); + + // temporary policy: for broadcast add + // TODO(zhangkaihuo): implement a correct function + const bool is_add = std::is_same>::value; + if (is_add && x.indices().numel() == y.indices().numel()) { + int compare_indices = memcmp(x.indices().data(), + y.indices().data(), + sizeof(IntT) * x.indices().numel()); + if (compare_indices == 0) { + EmptyLikeCooKernel(dev_ctx, x, out); + phi::AddKernel( + dev_ctx, x.values(), y.values(), out->mutable_values()); + return; + } + } int64_t element_size = 1; for (auto j = 1; j < x.values().dims().size(); ++j) { element_size *= x.values().dims()[j]; @@ -246,9 +263,7 @@ void ElementWiseCooKernelImpl(const Context& dev_ctx, vectorize(slice_ddim(x.values().dims(), 1, x.values().dims().size())); indeces_dim.insert(indeces_dim.begin(), nnz); DenseTensorMeta values_meta( - paddle::experimental::CppTypeToDataType::Type(), - phi::make_ddim(indeces_dim), - DataLayout::NCHW); + x.dtype(), phi::make_ddim(indeces_dim), DataLayout::NCHW); phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); @@ -263,22 +278,16 @@ void ElementWiseCooKernelImpl(const Context& dev_ctx, } } -#define DEFINE_CSR_ELEMENTWISE_CPU_KERNEL(name) \ - template \ - void ElementWise##name##CsrCPUKernel(const Context& dev_ctx, \ - const 
SparseCsrTensor& x, \ - const SparseCsrTensor& y, \ - SparseCsrTensor* out) { \ - funcs::name##Functor functor; \ - auto coo_x = CsrToCoo(dev_ctx, x); \ - auto coo_y = CsrToCoo(dev_ctx, y); \ - DenseTensor indeces; \ - DenseTensor values; \ - SparseCooTensor coo_out; \ - coo_out.SetMember(indeces, values, x.dims()); \ - ElementWiseCooKernelImpl>( \ - dev_ctx, coo_x, coo_y, &coo_out, functor); \ - *out = CooToCsr(dev_ctx, coo_out); \ +#define DEFINE_CSR_ELEMENTWISE_CPU_KERNEL(name) \ + template \ + void ElementWise##name##CsrCPUKernel(const Context& dev_ctx, \ + const SparseCsrTensor& x, \ + const SparseCsrTensor& y, \ + SparseCsrTensor* out) { \ + auto coo_x = CsrToCoo(dev_ctx, x); \ + auto coo_y = CsrToCoo(dev_ctx, y); \ + auto coo_out = ElementWise##name##Coo(dev_ctx, coo_x, coo_y); \ + CooToCsrKernel(dev_ctx, coo_out, out); \ } #define DEFINE_CSR_ELEMENTWISE_KERNEL(name) \ @@ -442,3 +451,14 @@ PD_REGISTER_KERNEL(divide_coo_coo, kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); } + +PD_REGISTER_KERNEL(add_coo_dense, + CPU, + ALL_LAYOUT, + phi::sparse::ElementWiseAddDenseKernel, + float, + double, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/cpu/reshape_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/reshape_grad_kernel.cc new file mode 100644 index 00000000000000..fc843f81c31ee1 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/reshape_grad_kernel.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
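Before moving on to the reshape kernels, a note on the fast path added to ElementWiseCooKernelImpl above: when the two COO operands carry byte-identical index tensors, elementwise addition reduces to a dense add of their values arrays. A standalone sketch of that shortcut, with plain arrays standing in for the indices/values tensors (not Paddle's API):

#include <cstdio>
#include <cstring>
#include <vector>

// If two COO tensors have identical index arrays, x + y is just values_x + values_y.
bool SameIndices(const std::vector<long>& ix, const std::vector<long>& iy) {
  return ix.size() == iy.size() &&
         std::memcmp(ix.data(), iy.data(), sizeof(long) * ix.size()) == 0;
}

int main() {
  std::vector<long> ix = {0, 2, 5}, iy = {0, 2, 5};
  std::vector<float> vx = {1.f, 2.f, 3.f}, vy = {10.f, 20.f, 30.f};
  if (SameIndices(ix, iy)) {
    for (size_t i = 0; i < vx.size(); ++i) {
      std::printf("nnz %zu: %f\n", i, vx[i] + vy[i]);  // 11, 22, 33
    }
  }
  return 0;
}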
+ +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h" + +namespace phi { +namespace sparse { + +template +void ReshapeCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const SparseCooTensor& dout, + SparseCooTensor* dx) { + EmptyLikeCooKernel(dev_ctx, x, dx); + phi::IntArray x_shape(phi::vectorize(x.dims())); + ReshapeCooKernel(dev_ctx, dout, x_shape, dx); +} + +template +void ReshapeCsrGradKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const SparseCsrTensor& dout, + SparseCsrTensor* dx) { + EmptyLikeCsrKernel(dev_ctx, x, dx); + phi::IntArray x_shape(phi::vectorize(x.dims())); + ReshapeCsrKernel(dev_ctx, dout, x_shape, dx); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(reshape_coo_grad, + CPU, + ALL_LAYOUT, + phi::sparse::ReshapeCooGradKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(reshape_csr_grad, + CPU, + ALL_LAYOUT, + phi::sparse::ReshapeCsrGradKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/cpu/reshape_kernel.cc b/paddle/phi/kernels/sparse/cpu/reshape_kernel.cc new file mode 100644 index 00000000000000..4f165156668100 --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/reshape_kernel.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
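The reshape kernel below only rewrites the sparse part of the COO indices: each non-zero's coordinate is flattened to a linear offset using the input's sparse-part strides and then re-expanded using the output's sparse-part strides, while the values tensor is reused unchanged. A minimal standalone sketch of that remapping, using plain arrays and hypothetical helper names:

#include <cstdio>
#include <vector>

// Row-major strides for a given shape, e.g. {2, 3} -> {3, 1}.
std::vector<long> Strides(const std::vector<long>& shape) {
  std::vector<long> s(shape.size(), 1);
  for (int i = static_cast<int>(shape.size()) - 2; i >= 0; --i) {
    s[i] = s[i + 1] * shape[i + 1];
  }
  return s;
}

int main() {
  // One non-zero at coordinate (1, 2) in a 2x3 sparse tensor, reshaped to 3x2.
  std::vector<long> in_shape = {2, 3}, out_shape = {3, 2};
  std::vector<long> coord = {1, 2};
  auto in_strides = Strides(in_shape);
  auto out_strides = Strides(out_shape);

  long location = 0;  // flatten with the input's strides
  for (size_t i = 0; i < coord.size(); ++i) location += coord[i] * in_strides[i];

  std::vector<long> out_coord(out_shape.size());  // unflatten with the output's strides
  for (size_t i = 0; i < out_shape.size(); ++i) {
    out_coord[i] = location / out_strides[i];
    location %= out_strides[i];
  }
  std::printf("(%ld, %ld) -> (%ld, %ld)\n",
              coord[0], coord[1], out_coord[0], out_coord[1]);
  // prints (1, 2) -> (2, 1): linear offset 5 in both layouts
  return 0;
}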
+ +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/core/ddim.h" +#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h" +#include "paddle/phi/kernels/sparse/impl/unary_kernel_impl.h" + +namespace phi { +namespace sparse { + +template +void ReshapeCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const phi::IntArray& shape, + SparseCooTensor* out) { + // TODO(OccupyMars2025): Currently, reshape is only applicable to sparse dims + int64_t x_nnz = x.nnz(); + + // Use DDim::reshape to handle -1 and 0 in the argument "shape" + std::vector new_shape(shape.GetData().begin(), shape.GetData().end()); + phi::DDim out_dims = x.dims().reshape(new_shape); + // get sparse part dimensions of x and out + std::vector x_sparse_part_dims; + std::vector out_sparse_part_dims; + for (int i = 0; i < x.sparse_dim(); ++i) { + x_sparse_part_dims.push_back(x.dims()[i]); + } + for (int i = 0; i < out_dims.size() - x.dense_dim(); ++i) { + out_sparse_part_dims.push_back(out_dims[i]); + } + DenseTensor out_indices = Empty( + dev_ctx, {static_cast(out_sparse_part_dims.size()), x_nnz}); + DenseTensor out_values(x.values()); + out->SetMember(out_indices, out_values, out_dims, x.coalesced()); + + // compute values of indices + const DenseTensor& x_indices = x.indices(); + const auto* x_indices_data = x_indices.data(); + auto* out_indices_data = out_indices.data(); + + const phi::DDim& x_sparse_part_strides = + phi::stride(phi::make_ddim(x_sparse_part_dims)); + const phi::DDim& out_sparse_part_strides = + phi::stride(phi::make_ddim(out_sparse_part_dims)); + int64_t location = 0; + for (int64_t j = 0; j < x_nnz; ++j) { + location = 0; + for (int i = 0; i < x.sparse_dim(); ++i) { + location += x_indices_data[i * x_nnz + j] * x_sparse_part_strides[i]; + } + for (size_t i = 0; i < out_sparse_part_dims.size(); ++i) { + out_indices_data[i * x_nnz + j] = location / out_sparse_part_strides[i]; + location %= out_sparse_part_strides[i]; + } + } +} + +template +void ReshapeCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const phi::IntArray& shape, + SparseCsrTensor* out) { + // transform csr format to coo format, and then use coo kernel + const SparseCooTensor x_coo = CsrToCoo(dev_ctx, x); + SparseCooTensor out_coo; + ReshapeCooKernel(dev_ctx, x_coo, shape, &out_coo); + CooToCsrKernel(dev_ctx, out_coo, out); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(reshape_coo, + CPU, + ALL_LAYOUT, + phi::sparse::ReshapeCooKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(reshape_csr, + CPU, + ALL_LAYOUT, + phi::sparse::ReshapeCsrKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc index 5199f42ed99dd3..dcb4399aa28627 100644 --- a/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc +++ b/paddle/phi/kernels/sparse/cpu/sparse_utils_kernel.cc @@ -103,6 +103,7 @@ void DenseToCooKernel(const Context& dev_ctx, ++index; } } + out->SetMember(indices, values, x_dims, true); } @@ -181,17 +182,12 @@ void CooToCsrCPUKernel(const 
CPUContext& dev_ctx, int batchs = x_dims.size() == 2 ? 1 : x_dims[0]; int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1]; - phi::DenseTensor crows; - crows.Resize({batchs * (rows + 1)}); - IntT* csr_crows_data = dev_ctx.template Alloc(&crows); - - phi::DenseTensor cols; - cols.Resize({non_zero_num}); - IntT* csr_cols_data = dev_ctx.template Alloc(&cols); - - phi::DenseTensor values; - values.Resize({non_zero_num}); - T* csr_values_data = dev_ctx.template Alloc(&values); + phi::DenseTensor crows = phi::Empty(dev_ctx, {batchs * (rows + 1)}); + phi::DenseTensor cols = phi::Empty(dev_ctx, {non_zero_num}); + phi::DenseTensor values = phi::EmptyLike(dev_ctx, x.values()); + IntT* csr_crows_data = crows.data(); + IntT* csr_cols_data = cols.data(); + T* csr_values_data = values.data(); const auto& coo_indices = x.indices(); const auto& coo_values = x.values(); @@ -270,8 +266,7 @@ void CooToDenseCPUKernel(const CPUContext& dev_ctx, const int64_t dense_dim = x.dense_dim(); const T* x_data = values.data(); - *out = phi::Empty(dev_ctx, - DenseTensorMeta(x.dtype(), x.dims(), x.values().layout())); + dev_ctx.template Alloc(out); T* out_data = out->data(); int64_t base_offset = 1; for (int64_t i = 0; i < dense_dim; i++) { @@ -334,7 +329,8 @@ PD_REGISTER_KERNEL(csr_to_coo, int8_t, int16_t, int, - int64_t) {} + int64_t, + bool) {} PD_REGISTER_KERNEL(coo_to_csr, CPU, @@ -347,7 +343,8 @@ PD_REGISTER_KERNEL(coo_to_csr, int8_t, int16_t, int, - int64_t) {} + int64_t, + bool) {} PD_REGISTER_KERNEL(dense_to_csr, CPU, @@ -403,6 +400,21 @@ PD_REGISTER_KERNEL(values_coo, kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } +PD_REGISTER_KERNEL(indices_coo, + CPU, + ALL_LAYOUT, + phi::sparse::IndicesCooKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + PD_REGISTER_KERNEL(values_csr, CPU, ALL_LAYOUT, @@ -415,7 +427,7 @@ PD_REGISTER_KERNEL(values_csr, int16_t, int, int64_t) { - kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); } PD_REGISTER_KERNEL(sparse_coo_tensor, diff --git a/paddle/phi/kernels/sparse/cpu/transpose_grad_kernel.cc b/paddle/phi/kernels/sparse/cpu/transpose_grad_kernel.cc new file mode 100644 index 00000000000000..87822a9375ef5c --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/transpose_grad_kernel.cc @@ -0,0 +1,78 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
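The CPU transpose grad kernels below reuse the forward transpose with the inverse permutation, computed as grad_perm[perm[i]] = i; applying perm and then grad_perm restores the original axis order. A tiny standalone check of that relation (illustration only, not Paddle code):

#include <cstdio>
#include <vector>

std::vector<int> Inverse(const std::vector<int>& perm) {
  std::vector<int> inv(perm.size());
  for (size_t i = 0; i < perm.size(); ++i) inv[perm[i]] = static_cast<int>(i);
  return inv;
}

int main() {
  std::vector<int> perm = {1, 2, 0};
  std::vector<int> inv = Inverse(perm);  // {2, 0, 1}
  std::vector<int> dims = {4, 5, 6};

  // dims transposed by perm, then by inv, comes back to {4, 5, 6}.
  std::vector<int> t(3), back(3);
  for (int i = 0; i < 3; ++i) t[i] = dims[perm[i]];
  for (int i = 0; i < 3; ++i) back[i] = t[inv[i]];
  std::printf("%d %d %d\n", back[0], back[1], back[2]);  // 4 5 6
  return 0;
}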
+ +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h" + +namespace phi { +namespace sparse { + +std::vector get_cpu_grad_perm(std::vector perm) { + std::vector grad_perm(perm.size()); + for (unsigned int i = 0; i < perm.size(); ++i) { + grad_perm[perm[i]] = i; + } + return grad_perm; +} + +template +void TransposeCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& dout, + const std::vector& perm, + SparseCooTensor* dx) { + std::vector grad_perm = get_cpu_grad_perm(perm); + TransposeCooKernel(dev_ctx, dout, grad_perm, dx); +} + +template +void TransposeCsrGradKernel(const Context& dev_ctx, + const SparseCsrTensor& dout, + const std::vector& perm, + SparseCsrTensor* dx) { + std::vector grad_perm = get_cpu_grad_perm(perm); + TransposeCsrKernel(dev_ctx, dout, grad_perm, dx); +} +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(transpose_coo_grad, + CPU, + ALL_LAYOUT, + phi::sparse::TransposeCooGradKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(transpose_csr_grad, + CPU, + ALL_LAYOUT, + phi::sparse::TransposeCsrGradKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/cpu/transpose_kernel.cc b/paddle/phi/kernels/sparse/cpu/transpose_kernel.cc new file mode 100644 index 00000000000000..6dea63ffbce88d --- /dev/null +++ b/paddle/phi/kernels/sparse/cpu/transpose_kernel.cc @@ -0,0 +1,231 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
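The CSR transpose kernel below implements only two 3-D permutations directly ({0, 2, 1} and {1, 0, 2}) and composes them in two stages for the remaining cases, e.g. perm {1, 2, 0} is realized as {1, 0, 2} followed by {0, 2, 1}. A small standalone check that the compositions used in the dispatch really yield the intended permutations (indices only; nothing here is Paddle code):

#include <cstdio>
#include <vector>

// Applying p1 first and then p2 (out[i] = in[p[i]] convention) equals the
// single permutation p1[p2[i]].
std::vector<int> Compose(const std::vector<int>& p1, const std::vector<int>& p2) {
  std::vector<int> out(p1.size());
  for (size_t i = 0; i < p1.size(); ++i) out[i] = p1[p2[i]];
  return out;
}

int main() {
  // {1, 0, 2} followed by {0, 2, 1} must equal {1, 2, 0}.
  auto c = Compose({1, 0, 2}, {0, 2, 1});
  std::printf("%d %d %d\n", c[0], c[1], c[2]);  // 1 2 0

  // {0, 2, 1} followed by {1, 0, 2} must equal {2, 0, 1}.
  auto d = Compose({0, 2, 1}, {1, 0, 2});
  std::printf("%d %d %d\n", d[0], d[1], d[2]);  // 2 0 1
  return 0;
}

The {2, 1, 0} case in the dispatch follows the same idea, with the second stage itself expanded recursively.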
+ +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/eigen/common.h" +#include "paddle/phi/kernels/funcs/eigen/eigen_function.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void TransposeCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& perm, + SparseCooTensor* out) { + // create out sparse tensor + int64_t x_nnz = x.nnz(); + DDim out_dims = x.dims().transpose(perm); + DenseTensor out_indices = EmptyLike(dev_ctx, x.indices()); + DenseTensor out_values(x.values()); + out->SetMember(out_indices, out_values, out_dims, x.coalesced()); + + // compute values of indices + const DenseTensor& x_indices = x.indices(); + const auto* x_indices_data = x_indices.data(); + auto* out_indices_data = out_indices.data(); + for (unsigned int i = 0; i < perm.size(); ++i) { + for (int64_t j = 0; j < x_nnz; ++j) { + out_indices_data[j + i * x_nnz] = x_indices_data[j + perm[i] * x_nnz]; + } + } +} + +template +void TransposeCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const std::vector& perm, + SparseCsrTensor* out) { + unsigned int n_dim = perm.size(); + const DenseTensor& x_crows = x.crows(); + const DenseTensor& x_cols = x.cols(); + const DenseTensor& x_values = x.values(); + DenseTensor out_crows, out_cols, out_values; + // return a copy of x + if (perm[0] == 0 && perm[1] == 1 && (n_dim == 2 || perm[2] == 2)) { + out_crows = x_crows; + out_cols = x_cols; + out_values = x_values; + out->SetMember(out_crows, out_cols, out_values, x.dims()); + return; + } + // create out sparse tensor + DDim out_dims = x.dims().transpose(perm); + if (n_dim == 2) { + out_crows = Empty(dev_ctx, {out_dims[0] + 1}); + } else { + out_crows = + Empty(dev_ctx, {out_dims[0] * (out_dims[1] + 1)}); + } + out_cols = EmptyLike(dev_ctx, x.cols()); + out_values = EmptyLike(dev_ctx, x.values()); + out->SetMember(out_crows, out_cols, out_values, out_dims); + // transpose by two stages + if (perm[0] == 1 && perm[1] == 2) { // perm == {1, 2, 0} + SparseCsrTensor temp; + TransposeCsrKernel(dev_ctx, x, {1, 0, 2}, &temp); + TransposeCsrKernel(dev_ctx, temp, {0, 2, 1}, out); + return; + } else if (perm[0] == 2 && perm[1] == 0) { // perm == {2, 0, 1} + SparseCsrTensor temp; + TransposeCsrKernel(dev_ctx, x, {0, 2, 1}, &temp); + TransposeCsrKernel(dev_ctx, temp, {1, 0, 2}, out); + return; + } else if (perm[0] == 2 && perm[1] == 1) { // perm == {2, 1, 0} + SparseCsrTensor temp; + TransposeCsrKernel(dev_ctx, x, {1, 0, 2}, &temp); + TransposeCsrKernel(dev_ctx, temp, {2, 0, 1}, out); + return; + } + + int64_t* out_crows_data = out_crows.data(); + int64_t* out_cols_data = out_cols.data(); + T* out_values_data = out_values.data(); + const int64_t* x_crows_data = x_crows.data(); + const int64_t* x_cols_data = x_cols.data(); + const T* x_values_data = x_values.data(); + + int64_t x_nnz = x.nnz(); + if (n_dim == 2) { // perm == {1, 0} + // compute out_crows_data by x_cols_data + for (int i = 0; i < out_dims[0]; ++i) { + out_crows_data[i] = 0; + } + for (int i = 0; i < x_nnz; ++i) { + int j = x_cols_data[i]; + out_crows_data[j + 1]++; + } + out_crows_data[out_dims[0]] = x_nnz; + for (int i = 1; i < out_dims[0]; ++i) { + out_crows_data[i] += out_crows_data[i - 1]; + } + // compute out_cols_data and out_values_data by out_crows_data and x + std::unordered_map cols_offset; + 
for (int i = 0; i < x.dims()[0]; ++i) { + int64_t start = x_crows_data[i]; + int64_t end = x_crows_data[i + 1]; + for (int64_t j = start; j < end; ++j) { + int64_t x_cols_j = x_cols_data[j]; + int64_t jjj = out_crows_data[x_cols_j]; + if (cols_offset.count(jjj)) { + cols_offset[jjj]++; + } else { + cols_offset[jjj] = 0; + } + int64_t jjj_offset = jjj + cols_offset[jjj]; + out_cols_data[jjj_offset] = i; + out_values_data[jjj_offset] = x_values_data[j]; + } + } + } else { // n_dim == 3 + int out_n_rows = out_dims[1]; + int x_n_rows = x.dims()[1]; + for (int k = 0; k < out_dims[0]; ++k) { + if (perm[0] == 0) { // perm == {0, 2, 1} + // compute out_crows_data by x_cols_data + for (int i = 0; i < out_n_rows; ++i) { + out_crows_data[i] = 0; + } + for (int i = 0; i < x_crows_data[x_n_rows]; ++i) { + int j = x_cols_data[i]; + out_crows_data[j + 1]++; + } + out_crows_data[out_n_rows] = x_crows_data[x_n_rows]; + for (int i = 1; i < out_n_rows; ++i) { + out_crows_data[i] += out_crows_data[i - 1]; + } + // compute out_cols_data and out_values_data by out_crows_data and x + std::unordered_map cols_offset; + for (int i = 0; i < x_n_rows; ++i) { + int64_t start = x_crows_data[i]; + int64_t end = x_crows_data[i + 1]; + for (int64_t j = start; j < end; ++j) { + int64_t x_cols_j = x_cols_data[j]; + int64_t jjj = out_crows_data[x_cols_j]; + if (cols_offset.count(jjj)) { + cols_offset[jjj]++; + } else { + cols_offset[jjj] = 0; + } + int64_t jjj_offset = jjj + cols_offset[jjj]; + out_cols_data[jjj_offset] = i; + out_values_data[jjj_offset] = x_values_data[j]; + } + } + // x offset + x_cols_data += x_crows_data[x_n_rows]; + x_values_data += x_crows_data[x_n_rows]; + x_crows_data += x_n_rows + 1; + } else if (perm[0] == 1 && perm[1] == 0) { // perm == {1, 0, 2} + for (int i = 0; i < out_n_rows; ++i) { + out_crows_data[i] = 0; + } + int x_cols_offset = 0; + int out_cols_index = 0; + for (int i = 0; i < x.dims()[0]; ++i) { + int x_crows_index = i * (x_n_rows + 1); + int start = x_crows_data[x_crows_index + k]; + int end = x_crows_data[x_crows_index + 1 + k]; + out_crows_data[i + 1] = end - start; + for (int j = start; j < end; ++j) { + out_cols_data[out_cols_index] = x_cols_data[x_cols_offset + j]; + out_values_data[out_cols_index] = x_values_data[x_cols_offset + j]; + out_cols_index++; + } + x_cols_offset += x_crows_data[x_crows_index + x_n_rows]; + } + for (int i = 1; i <= out_n_rows; ++i) { + out_crows_data[i] += out_crows_data[i - 1]; + } + } + // out offset + out_cols_data += out_crows_data[out_n_rows]; + out_values_data += out_crows_data[out_n_rows]; + out_crows_data += out_n_rows + 1; + } + } +} +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(transpose_coo, + CPU, + ALL_LAYOUT, + phi::sparse::TransposeCooKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(transpose_csr, + CPU, + ALL_LAYOUT, + phi::sparse::TransposeCsrKernel, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/elementwise_grad_kernel.h b/paddle/phi/kernels/sparse/elementwise_grad_kernel.h index df3feb597e3c61..f16e2f95d47eb2 100644 --- a/paddle/phi/kernels/sparse/elementwise_grad_kernel.h +++ b/paddle/phi/kernels/sparse/elementwise_grad_kernel.h @@ -14,8 +14,13 @@ limitations under the License. 
*/ #pragma once +#include "paddle/phi/kernels/elementwise_add_grad_kernel.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" + #include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/infermeta/sparse/unary.h" #include "paddle/phi/kernels/empty_kernel.h" namespace phi { @@ -49,6 +54,9 @@ namespace sparse { const Sparse##type##Tensor& dout) { \ Sparse##type##Tensor dx; \ Sparse##type##Tensor dy; \ + MetaTensor meta_dx(&dx), meta_dy(&dy); \ + phi::UnchangedInferMeta(x, &meta_dx); \ + phi::UnchangedInferMeta(y, &meta_dy); \ ElementWise##name##type##GradKernel( \ dev_ctx, x, y, dout, &dx, &dy); \ return std::vector{dx, dy}; \ @@ -89,6 +97,9 @@ std::vector ElementWiseDivideCsrGrad( const SparseCsrTensor& dout) { SparseCsrTensor dx; SparseCsrTensor dy; + MetaTensor meta_dx(&dx), meta_dy(&dy); + phi::UnchangedInferMeta(x, &meta_dx); + phi::UnchangedInferMeta(y, &meta_dy); ElementWiseDivideCsrGradKernel( dev_ctx, x, y, out, dout, &dx, &dy); return std::vector{dx, dy}; @@ -103,10 +114,35 @@ std::vector ElementWiseDivideCooGrad( const SparseCooTensor& dout) { SparseCooTensor dx; SparseCooTensor dy; + MetaTensor meta_dx(&dx), meta_dy(&dy); + phi::UnchangedInferMeta(x, &meta_dx); + phi::UnchangedInferMeta(y, &meta_dy); ElementWiseDivideCooGradKernel( dev_ctx, x, y, out, dout, &dx, &dy); return std::vector{dx, dy}; } +template +void ElementWiseAddDenseGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& y, + const SparseCooTensor& dout, + SparseCooTensor* dx, + DenseTensor* dy) { + DenseTensor* x_values_grad = nullptr; + DenseTensor* y_grad = nullptr; + if (dx) { + EmptyLikeCooKernel(dev_ctx, x, dx); + x_values_grad = dx->mutable_values(); + } + + if (dy) { + *dy = phi::EmptyLike(dev_ctx, y); + y_grad = dy; + } + phi::AddGradKernel( + dev_ctx, x.values(), y, dout.values(), -1, x_values_grad, y_grad); +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/elementwise_kernel.h b/paddle/phi/kernels/sparse/elementwise_kernel.h index 0f9e67f7063bb8..515644d4fcfce2 100644 --- a/paddle/phi/kernels/sparse/elementwise_kernel.h +++ b/paddle/phi/kernels/sparse/elementwise_kernel.h @@ -14,9 +14,14 @@ limitations under the License. 
*/ #pragma once +#include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/phi/kernels/sparse/elementwise_kernel.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" + #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/infermeta/binary.h" namespace phi { namespace sparse { @@ -45,8 +50,10 @@ namespace sparse { const SparseCsrTensor& y) { \ DenseTensor crows; \ DenseTensor cols; \ - DenseTensor non_zero_elements; \ - SparseCsrTensor out(crows, cols, non_zero_elements, x.dims()); \ + DenseTensor values; \ + SparseCsrTensor out(crows, cols, values, x.dims()); \ + MetaTensor meta_out(out); \ + phi::ElementwiseInferMeta(x, y, &meta_out); \ ElementWise##name##CsrKernel(dev_ctx, x, y, &out); \ return out; \ } @@ -57,8 +64,10 @@ namespace sparse { const SparseCooTensor& x, \ const SparseCooTensor& y) { \ DenseTensor indices; \ - DenseTensor non_zero_elements; \ - SparseCooTensor out(indices, non_zero_elements, x.dims()); \ + DenseTensor values; \ + SparseCooTensor out(indices, values, x.dims()); \ + MetaTensor meta_out(out); \ + phi::ElementwiseInferMeta(x, y, &meta_out); \ ElementWise##name##CooKernel(dev_ctx, x, y, &out); \ return out; \ } @@ -73,5 +82,21 @@ DEFINE_ELEMENTWISE_KERNEL_FUNC(Subtract) DEFINE_ELEMENTWISE_KERNEL_FUNC(Multiply) DEFINE_ELEMENTWISE_KERNEL_FUNC(Divide) +template +void ElementWiseAddDenseKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& y, + SparseCooTensor* out) { + // TODO(zhangkaiuo): to support universal sparse + dense + if (y.dims().size() == 1 && y.dims()[0] == x.dims()[x.dims().size() - 1]) { + EmptyLikeCooKernel(dev_ctx, x, out); + phi::AddKernel(dev_ctx, x.values(), y, out->mutable_values()); + out->SetIndicesDict(x.GetIndicesDict()); + } else { + PADDLE_THROW( + errors::Unimplemented("Not support Sparse + Dense in GPU mode")); + } +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/empty_kernel.cc b/paddle/phi/kernels/sparse/empty_kernel.cc index ebe0abc45cee3f..49a377ca70f67d 100644 --- a/paddle/phi/kernels/sparse/empty_kernel.cc +++ b/paddle/phi/kernels/sparse/empty_kernel.cc @@ -26,12 +26,12 @@ template void EmptyLikeCooKernel(const Context& dev_ctx, const SparseCooTensor& x, SparseCooTensor* out) { - out->set_dims(x.dims()); *(out->mutable_indices()) = x.indices(); - const DenseTensor& x_values = x.non_zero_elements(); - DenseTensor* out_values = out->mutable_non_zero_elements(); + const DenseTensor& x_values = x.values(); + DenseTensor* out_values = out->mutable_values(); out_values->Resize(x_values.dims()); + out->set_meta(x.meta()); dev_ctx.template Alloc(out_values); } @@ -39,13 +39,13 @@ template void EmptyLikeCsrKernel(const Context& dev_ctx, const SparseCsrTensor& x, SparseCsrTensor* out) { - out->set_dims(x.dims()); *(out->mutable_crows()) = x.crows(); *(out->mutable_cols()) = x.cols(); - const DenseTensor& x_values = x.non_zero_elements(); - DenseTensor* out_values = out->mutable_non_zero_elements(); + const DenseTensor& x_values = x.values(); + DenseTensor* out_values = out->mutable_values(); out_values->Resize(x_values.dims()); + out->set_meta(x.meta()); dev_ctx.template Alloc(out_values); } diff --git a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu index d369c0ecd99fa0..a348c6aa11e1e4 100644 --- a/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/coalesce_kernel.cu @@ -169,6 
+169,7 @@ void CoalesceGPUKernel(const GPUContext& dev_ctx, indexs_ptr, const_dims, out_nnz, sparse_dim, out_indices.data()); out->SetMember(out_indices, out_values, x.dims(), true); + out->SetIndicesDict(x.GetIndicesDict()); } template diff --git a/paddle/phi/kernels/sparse/gpu/conv.cu.h b/paddle/phi/kernels/sparse/gpu/conv.cu.h index 2a524eb46500d9..161930a06fa854 100644 --- a/paddle/phi/kernels/sparse/gpu/conv.cu.h +++ b/paddle/phi/kernels/sparse/gpu/conv.cu.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include #include "paddle/phi/kernels/sparse/conv_kernel.h" @@ -65,7 +66,6 @@ __global__ void GatherKernelV2(const T* inputs, const int* index_groups, const int non_zero_num, const int kernel_size, - const int max_voxel, const int channels, const int buffer_count, T* output) { @@ -83,11 +83,10 @@ __global__ void GatherKernelV2(const T* inputs, #pragma unroll for (int it = 0; it < buffer_count; it++) { int len = index_counts[indices_i + it * non_zero_num]; - const int group_offset = it * kernel_size * max_voxel * non_zero_num; + const int group_offset = it * kernel_size * non_zero_num; #pragma unroll for (int j = 0; j < len; j++) { - int out_i = index_groups[indices_i * kernel_size * max_voxel + j + - group_offset]; + int out_i = index_groups[indices_i * kernel_size + j + group_offset]; phi::Store( in_vec, output + out_i * channels + channels_i * VecSize); } @@ -129,7 +128,6 @@ inline void GatherV2(const GPUContext& dev_ctx, const int* index_groups, const int non_zero_num, const int kernel_size, - const int max_voxel, const int channels, const int buffer_count, T* output) { @@ -145,7 +143,6 @@ inline void GatherV2(const GPUContext& dev_ctx, index_groups, non_zero_num, kernel_size, - max_voxel, channels, buffer_count, output); @@ -160,7 +157,6 @@ inline void GatherV2(const GPUContext& dev_ctx, index_groups, non_zero_num, kernel_size, - max_voxel, channels, buffer_count, output); @@ -186,8 +182,7 @@ __global__ void UniqueKernel(const IntT* in_indexs, if (i < rulebook_len) { // atomicOr only support int int index = static_cast(in_indexs[i]); - int change_index = index == 0 ? 
-1 : index; - int flag = atomicOr(out_index_table + index, change_index); + int flag = atomicOr(out_index_table + index, 1); if (flag == 0) { int j = atomicAdd(&count, 1); cache[j] = index; @@ -207,7 +202,7 @@ __global__ void UniqueKernel(const IntT* in_indexs, template __global__ void GroupIndexs(const int* out_index_table, const int n, - const int offset, + const int kernel_size, IntT* out_indexs, int* out_index_counts, int* out_index_groups) { @@ -219,7 +214,7 @@ __global__ void GroupIndexs(const int* out_index_table, // kernel_size at most int j = atomicAdd(out_index_counts + real_index, 1); // nnz * kernel_size - out_index_groups[real_index * offset + j] = i; + out_index_groups[real_index * kernel_size + j] = i; } } @@ -303,36 +298,18 @@ __global__ void ProductRuleBookKernel(const T* x_indices, } } -template +template __global__ void GetOutIndexTable(const IntT* indices, const IntT non_zero_num, const Dims4D dims, - int* out_index_table, - int* out_index_table2, - int* max_voxel) { - __shared__ int cache_max; - if (threadIdx.x == 0) { - cache_max = 0; - } - __syncthreads(); - + int* out_index_table) { CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { IntT batch = indices[i]; IntT in_z = indices[i + non_zero_num]; IntT in_y = indices[i + 2 * non_zero_num]; IntT in_x = indices[i + 3 * non_zero_num]; IntT index = PointToIndex(batch, in_x, in_y, in_z, dims); - if (save_out_index) { - out_index_table[index] = i == 0 ? -1 : i; - } - - int count = atomicAdd(out_index_table2 + index, 1); - atomicMax(&cache_max, count); - } - - __syncthreads(); - if (threadIdx.x == 0) { - atomicMax(max_voxel, cache_max + 1); + out_index_table[index] = i == 0 ? -1 : i; } } @@ -341,22 +318,10 @@ __global__ void GetOutIndexTable(int* indexs, const int non_zero_num, const Dims4D out_dims, int* out_index_table, - int* out_index_table2, - int* max_voxel, IntT* out_indices) { - __shared__ int cache_max; - if (threadIdx.x == 0) { - cache_max = 0; - } - __syncthreads(); - CUDA_KERNEL_LOOP_TYPE(i, non_zero_num, int64_t) { IntT index = static_cast(indexs[i]); out_index_table[index] = i; - - int count = atomicAdd(out_index_table2 + index, 1); - atomicMax(&cache_max, count); - IntT batch, x, y, z; phi::funcs::sparse::IndexToPoint( index, out_dims, &batch, &x, &y, &z); @@ -367,11 +332,6 @@ __global__ void GetOutIndexTable(int* indexs, out_indices[i + non_zero_num * 3] = x; indexs[i] = 0; } - - __syncthreads(); - if (threadIdx.x == 0) { - atomicMax(max_voxel, cache_max + 1); - } } template @@ -491,7 +451,7 @@ __global__ void ProductSubmRuleBookKernel(const T* x_indices, template __global__ void GroupIndexs(const int n, - const int offset, + const int kernel_size, const IntT* indexs, int* index_counts, int* index_groups) { @@ -500,7 +460,7 @@ __global__ void GroupIndexs(const int n, // kernel_size at most int j = atomicAdd(index_counts + index, 1); // nnz * kernel_size - index_groups[index * offset + j] = i; + index_groups[index * kernel_size + j] = i; } } @@ -508,7 +468,7 @@ __global__ void GroupIndexs(const int n, template __global__ void GroupIndexsV2(const int rulebook_len, const int non_zero_num, - const int offset, + const int kernel_size, const int half_kernel_offset, const IntT* indexs, int* index_counts, @@ -519,11 +479,11 @@ __global__ void GroupIndexsV2(const int rulebook_len, i < half_kernel_offset ? index_counts : index_counts + non_zero_num; int* groups_ptr = i < half_kernel_offset ? 
index_groups - : index_groups + non_zero_num * offset; + : index_groups + non_zero_num * kernel_size; // conflict kernel_size times at most int j = atomicAdd(counts_ptr + index, 1); // nnz * kernel_size - groups_ptr[index * offset + j] = i; + groups_ptr[index * kernel_size + j] = i; } } @@ -622,10 +582,6 @@ int ProductRuleBook(const Context& dev_ctx, DenseTensor out_index_table = phi::Empty(dev_ctx, {table_size}); int* out_index_table_ptr = out_index_table.data(); - DenseTensor out_index_table2 = phi::Empty(dev_ctx, {table_size + 1}); - int* out_index_table2_ptr = out_index_table2.data(); - int* h_max_voxel = h_counter + kernel_size; - if (subm) { DenseTensor tmp_rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta)); IntT* rulebook_ptr = tmp_rulebook.data(); @@ -636,29 +592,14 @@ int ProductRuleBook(const Context& dev_ctx, phi::backends::gpu::GpuMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); - phi::backends::gpu::GpuMemsetAsync(out_index_table2_ptr, - 0, - sizeof(int) * (table_size + 1), - dev_ctx.stream()); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); - GetOutIndexTable - <<>>(out_indices.data(), - non_zero_num, - d_x_dims, - out_index_table_ptr, - out_index_table2_ptr, - out_index_table2_ptr + table_size); - phi::backends::gpu::GpuMemcpyAsync(h_max_voxel, - out_index_table2_ptr + table_size, - sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); - dev_ctx.Wait(); + GetOutIndexTable<<>>( + out_indices.data(), non_zero_num, d_x_dims, out_index_table_ptr); size_t cache_size = kernel_size * 2 * sizeof(int) + @@ -712,22 +653,6 @@ int ProductRuleBook(const Context& dev_ctx, out_rulebook_ptr); *rulebook = out_rulebook; - unique_value->ResizeAndAllocate( - {static_cast(non_zero_num * h_max_voxel[0] * kernel_size)}); - int* unique_value_ptr = unique_value->data(); - out_index->ResizeAndAllocate({static_cast(rulebook_len)}); - int* out_index_ptr = out_index->data(); - phi::backends::gpu::GpuMemsetAsync( - out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); - GroupIndexs<<>>(rulebook_len, - kernel_size * h_max_voxel[0], - out_rulebook_ptr + rulebook_len, - out_index_ptr, - unique_value_ptr); - return rulebook_len; } else { @@ -772,6 +697,7 @@ int ProductRuleBook(const Context& dev_ctx, phi::backends::gpu::GpuMemsetAsync( out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); + phi::backends::gpu::GpuMemsetAsync( unique_key_ptr, 0, sizeof(int), dev_ctx.stream()); @@ -785,6 +711,7 @@ int ProductRuleBook(const Context& dev_ctx, out_index_table_ptr, out_index_ptr, unique_key_ptr); + int out_nnz = 0; phi::backends::gpu::GpuMemcpyAsync(&out_nnz, unique_key_ptr, @@ -792,6 +719,13 @@ int ProductRuleBook(const Context& dev_ctx, gpuMemcpyDeviceToHost, dev_ctx.stream()); dev_ctx.Wait(); +#ifdef PADDLE_WITH_HIP + thrust::sort(thrust::hip::par.on(dev_ctx.stream()), +#else + thrust::sort(thrust::cuda::par.on(dev_ctx.stream()), +#endif + out_index_ptr, + out_index_ptr + out_nnz); const int64_t sparse_dim = 4; phi::DenseTensor out_indices = @@ -802,35 +736,17 @@ int ProductRuleBook(const Context& dev_ctx, IntT* out_indices_ptr = out_indices.data(); - phi::backends::gpu::GpuMemsetAsync( - out_index_table_ptr, 0, sizeof(int) * table_size, dev_ctx.stream()); - phi::backends::gpu::GpuMemsetAsync(out_index_table2_ptr, - 0, - sizeof(int) * (table_size + 1), - dev_ctx.stream()); - config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_nnz, 1); - GetOutIndexTable - <<>>(out_index_ptr, - out_nnz, - 
d_out_dims, - out_index_table_ptr, - out_index_table2_ptr, - out_index_table2_ptr + table_size, - out_indices_ptr); - phi::backends::gpu::GpuMemcpyAsync(h_max_voxel, - out_index_table2_ptr + table_size, - sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); - dev_ctx.Wait(); - + GetOutIndexTable<<>>(out_index_ptr, + out_nnz, + d_out_dims, + out_index_table_ptr, + out_indices_ptr); config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); - unique_value->ResizeAndAllocate( - {static_cast(out_nnz * h_max_voxel[0] * kernel_size)}); + unique_value->ResizeAndAllocate({static_cast(out_nnz * kernel_size)}); int* unique_value_ptr = unique_value->data(); GroupIndexs<<>>(out_index_table_ptr, rulebook_len, - kernel_size * h_max_voxel[0], + kernel_size, rulebook_ptr + rulebook_len, out_index_ptr, unique_value_ptr); diff --git a/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu index 5d57afab403ad4..adfdb09968cbb6 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_grad_kernel.cu @@ -119,44 +119,10 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, } } - int max_voxel = counter_ptr[kernel_size]; - if (!subm) { - const auto& x_dims = x.dims(); - Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); - int64_t table_size = 1; - for (int i = 0; i < x_dims.size() - 1; i++) { - table_size *= x_dims[i]; - } - DenseTensor in_index_table = phi::Empty(dev_ctx, {table_size + 1}); - int* in_index_table_ptr = in_index_table.data(); - phi::backends::gpu::GpuMemsetAsync(in_index_table_ptr, - 0, - sizeof(int) * (table_size + 1), - dev_ctx.stream()); - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x.nnz(), 1); - GetOutIndexTable - <<>>(x.indices().data(), - x.nnz(), - d_x_dims, - nullptr, - in_index_table_ptr, - in_index_table_ptr + table_size); - - phi::backends::gpu::GpuMemcpyAsync(&max_voxel, - in_index_table_ptr + table_size, - sizeof(int), - gpuMemcpyDeviceToHost, - dev_ctx.stream()); - dev_ctx.Wait(); - } - auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); DenseTensor unique_value = phi::Empty( - dev_ctx, {static_cast(x_grad->nnz() * max_voxel * kernel_size * 2)}); + dev_ctx, {static_cast(x_grad->nnz() * kernel_size * 2)}); DenseTensor out_index = phi::Empty(dev_ctx, {static_cast(x.nnz() * 2)}); int* out_index_ptr = out_index.data(); @@ -169,7 +135,7 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, 0, dev_ctx.stream()>>>(rulebook_len, x.nnz(), - kernel_size * max_voxel, + kernel_size, offsets[kernel_size / 2], rulebook_ptr, out_index_ptr, @@ -181,7 +147,6 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, unique_value_ptr, x.nnz(), kernel_size, - max_voxel, in_channels, 2, in_features_ptr); @@ -242,7 +207,6 @@ void Conv3dCooGradGPUKernel(const GPUContext& dev_ctx, unique_value.data(), x_grad->nnz(), kernel_size, - max_voxel, in_channels, 2, x_grad_values_ptr); diff --git a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu index e5727c4faab526..282033e62e3572 100644 --- a/paddle/phi/kernels/sparse/gpu/conv_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/conv_kernel.cu @@ -66,7 +66,7 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; DenseTensor h_counter, h_offsets; - h_counter.Resize({kernel_size + 1}); + h_counter.Resize({kernel_size}); h_offsets.Resize({kernel_size + 1}); int* 
h_counter_ptr = dev_ctx.template HostAlloc(&h_counter); int* h_offsets_ptr = dev_ctx.template HostAlloc(&h_offsets); @@ -74,7 +74,7 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, // Second algorithm: // https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf // 1. product rulebook - DenseTensor counter_per_kernel = phi::Empty(dev_ctx, {kernel_size + 1}); + DenseTensor counter_per_kernel = phi::Empty(dev_ctx, {kernel_size}); DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, {kernel_size}); DenseTensor out_index = phi::Empty(dev_ctx, {1}); DenseTensor unique_value = phi::Empty(dev_ctx, {1}); @@ -143,6 +143,26 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, T* out_values_ptr = out_values->data(); set_zero(dev_ctx, out_values, static_cast(0.0f)); + if (subm) { + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rulebook_len, 1); + unique_value.ResizeAndAllocate( + {static_cast(out->nnz() * kernel_size)}); + out_index.ResizeAndAllocate({static_cast(rulebook_len)}); + int* out_index_ptr = out_index.data(); + int* unique_value_ptr = unique_value.data(); + phi::backends::gpu::GpuMemsetAsync( + out_index_ptr, 0, sizeof(int) * rulebook_len, dev_ctx.stream()); + GroupIndexs<<>>(rulebook_len, + kernel_size, + rulebook_ptr + rulebook_len, + out_index_ptr, + unique_value_ptr); + } + const T* kernel_ptr = kernel.data(); for (int i = 0; i < kernel_size; i++) { if (h_counter_ptr[i] <= 0) { @@ -176,7 +196,6 @@ void Conv3dCooGPUKernel(const GPUContext& dev_ctx, unique_value.data(), out->nnz(), kernel_size, - h_counter_ptr[kernel_size], out_channels, 1, out_values_ptr); diff --git a/paddle/phi/kernels/sparse/gpu/elementwise_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/elementwise_grad_kernel.cu new file mode 100644 index 00000000000000..e7f0c9d96e9205 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/elementwise_grad_kernel.cu @@ -0,0 +1,71 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/elementwise_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/tensor_utils.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_grad_base.h" +#include "paddle/phi/kernels/funcs/reduce_function.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" + +namespace phi { +namespace sparse { + +template +void ElementWiseAddCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const SparseCooTensor& y, + const SparseCooTensor& dout, + SparseCooTensor* dx, + SparseCooTensor* dy) { + if (dx) { + EmptyLikeCooKernel(dev_ctx, x, dx); + Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx); + } + + if (dy) { + EmptyLikeCooKernel(dev_ctx, y, dy); + Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dy); + } +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(add_coo_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::ElementWiseAddCooGradKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(add_coo_dense_grad, + GPU, + ALL_LAYOUT, + phi::sparse::ElementWiseAddDenseGradKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu b/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu new file mode 100644 index 00000000000000..47daa1eae19eda --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/elementwise_kernel.cu @@ -0,0 +1,102 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include + +#include "paddle/phi/kernels/elementwise_add_kernel.h" +#include "paddle/phi/kernels/sparse/elementwise_kernel.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" + +#include "paddle/phi/core/enforce.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/visit_type.h" + +namespace phi { +namespace sparse { + +template +void ElementWiseAddCooGPUKernel(const GPUContext& dev_ctx, + const SparseCooTensor& x, + const SparseCooTensor& y, + SparseCooTensor* out) { + // TODO(zhangkaiuo): to support universal sparse + sparse + const auto& x_indices = x.indices(); + const auto& y_indices = y.indices(); + PADDLE_ENFORCE_EQ( + x_indices.numel(), + y_indices.numel(), + phi::errors::PreconditionNotMet( + "The numel of x.indices() and y.indices() should be equal")); + const IntT* x_indices_ptr = x_indices.data(); + const IntT* y_indices_ptr = y_indices.data(); +#ifdef PADDLE_WITH_HIP + bool is_same = thrust::equal(thrust::hip::par.on(dev_ctx.stream()), +#else + bool is_same = thrust::equal(thrust::cuda::par.on(dev_ctx.stream()), +#endif + x_indices_ptr, + x_indices_ptr + x_indices.numel(), + y_indices_ptr); + PADDLE_ENFORCE_EQ( + is_same, + true, + phi::errors::PreconditionNotMet( + "Currently, ElementWiseAddCooKernel only supports the case " + "where x and y have the same indices")); + EmptyLikeCooKernel(dev_ctx, x, out); + phi::AddKernel( + dev_ctx, x.values(), y.values(), out->mutable_values()); + out->SetIndicesDict(x.GetIndicesDict()); +} + +template +void ElementWiseAddCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const SparseCooTensor& y, + SparseCooTensor* out) { + PD_VISIT_BASE_INTEGRAL_TYPES(x.indices().dtype(), "VerifyIndices", ([&] { + ElementWiseAddCooGPUKernel( + dev_ctx, x, y, out); + })); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(add_coo_coo, + GPU, + ALL_LAYOUT, + phi::sparse::ElementWiseAddCooKernel, + float, + double, + int16_t, + int, + int64_t, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); + kernel->InputAt(1).SetDataLayout(phi::DataLayout::SPARSE_COO); +} + +PD_REGISTER_KERNEL(add_coo_dense, + GPU, + ALL_LAYOUT, + phi::sparse::ElementWiseAddDenseKernel, + float, + double, + int, + int64_t, + phi::dtype::float16) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); +} diff --git a/paddle/phi/kernels/sparse/gpu/reshape_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/reshape_grad_kernel.cu new file mode 100644 index 00000000000000..bfc81676eb8041 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/reshape_grad_kernel.cu @@ -0,0 +1,77 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
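The add_coo_coo path above only covers operands whose indices match exactly (checked with thrust::equal), after which the values tensors are added densely. A minimal standalone sketch of that contract, in plain C++ with illustrative names rather than the Paddle API:

#include <cassert>
#include <cstdint>
#include <vector>

// Toy COO pair: flattened indices plus one value per nonzero.
struct CooLike {
  std::vector<int64_t> indices;
  std::vector<float> values;
};

// Mirrors the kernel's contract: identical sparsity patterns, value-wise add.
CooLike AddSameIndices(const CooLike& x, const CooLike& y) {
  assert(x.indices == y.indices && "only identical indices are supported");
  CooLike out{x.indices, x.values};
  for (size_t i = 0; i < out.values.size(); ++i) out.values[i] += y.values[i];
  return out;
}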
+ +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h" + +namespace phi { +namespace sparse { + +// just copy from paddle\phi\kernels\sparse\cpu\reshape_grad_kernel.cc +template +void ReshapeCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const SparseCooTensor& dout, + SparseCooTensor* dx) { + EmptyLikeCooKernel(dev_ctx, x, dx); + phi::IntArray x_shape(phi::vectorize(x.dims())); + ReshapeCooKernel(dev_ctx, dout, x_shape, dx); +} + +// just copy from paddle\phi\kernels\sparse\cpu\reshape_grad_kernel.cc +template +void ReshapeCsrGradKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const SparseCsrTensor& dout, + SparseCsrTensor* dx) { + EmptyLikeCsrKernel(dev_ctx, x, dx); + phi::IntArray x_shape(phi::vectorize(x.dims())); + ReshapeCsrKernel(dev_ctx, dout, x_shape, dx); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(reshape_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::ReshapeCooGradKernel, + phi::dtype::float16, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(reshape_csr_grad, + GPU, + ALL_LAYOUT, + phi::sparse::ReshapeCsrGradKernel, + phi::dtype::float16, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/gpu/reshape_kernel.cu b/paddle/phi/kernels/sparse/gpu/reshape_kernel.cu new file mode 100644 index 00000000000000..6e3a9842e8c30b --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/reshape_kernel.cu @@ -0,0 +1,165 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/impl/unary_kernel_impl.h" + +#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" + +namespace phi { +namespace sparse { + +__global__ void ReshapeCooCudaKernel(const int64_t* x_indices_data, + const int num_x_sparse_part_dims, + const int num_out_sparse_part_dims, + const int64_t x_nnz, + const int64_t* x_sparse_part_strides, + const int64_t* out_sparse_part_strides, + int64_t* out_indices_data) { + CUDA_KERNEL_LOOP_TYPE(j, x_nnz, int64_t) { + int64_t location = 0; + for (int i = 0; i < num_x_sparse_part_dims; ++i) { + location += x_indices_data[i * x_nnz + j] * x_sparse_part_strides[i]; + } + for (int i = 0; i < num_out_sparse_part_dims; ++i) { + out_indices_data[i * x_nnz + j] = location / out_sparse_part_strides[i]; + location %= out_sparse_part_strides[i]; + } + } +} + +template +void ReshapeCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const phi::IntArray& shape, + SparseCooTensor* out) { + int64_t x_nnz = x.nnz(); + std::vector new_shape(shape.GetData().begin(), shape.GetData().end()); + phi::DDim out_dims = x.dims().reshape(new_shape); + // get sparse part dimensions of x and out + std::vector x_sparse_part_dims; + std::vector out_sparse_part_dims; + for (int i = 0; i < x.sparse_dim(); ++i) { + x_sparse_part_dims.push_back(x.dims()[i]); + } + for (int i = 0; i < out_dims.size() - x.dense_dim(); ++i) { + out_sparse_part_dims.push_back(out_dims[i]); + } + + DenseTensor out_indices = Empty( + dev_ctx, {static_cast(out_sparse_part_dims.size()), x_nnz}); + DenseTensor out_values(x.values()); + out->SetMember(out_indices, out_values, out_dims, x.coalesced()); + + // compute values of out indices + const auto* x_indices_data = x.indices().data(); + auto* out_indices_data = out_indices.data(); + const phi::DDim& x_sparse_part_strides = + phi::stride(phi::make_ddim(x_sparse_part_dims)); + const phi::DDim& out_sparse_part_strides = + phi::stride(phi::make_ddim(out_sparse_part_dims)); + + int64_t *destination_x_sparse_part_strides, + *destination_out_sparse_part_strides; + +#ifdef PADDLE_WITH_HIP + hipMalloc(reinterpret_cast(&destination_x_sparse_part_strides), + sizeof(int64_t) * x_sparse_part_strides.size()); + hipMemcpy(destination_x_sparse_part_strides, + x_sparse_part_strides.Get(), + sizeof(int64_t) * x_sparse_part_strides.size(), + hipMemcpyHostToDevice); + hipMalloc(reinterpret_cast(&destination_out_sparse_part_strides), + sizeof(int64_t) * out_sparse_part_strides.size()); + hipMemcpy(destination_out_sparse_part_strides, + out_sparse_part_strides.Get(), + sizeof(int64_t) * out_sparse_part_strides.size(), + hipMemcpyHostToDevice); +#else + cudaMalloc(reinterpret_cast(&destination_x_sparse_part_strides), + sizeof(int64_t) * x_sparse_part_strides.size()); + cudaMemcpy(destination_x_sparse_part_strides, + x_sparse_part_strides.Get(), + sizeof(int64_t) * x_sparse_part_strides.size(), + cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast(&destination_out_sparse_part_strides), + sizeof(int64_t) * out_sparse_part_strides.size()); + cudaMemcpy(destination_out_sparse_part_strides, + out_sparse_part_strides.Get(), + sizeof(int64_t) * out_sparse_part_strides.size(), + cudaMemcpyHostToDevice); +#endif + + auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_nnz, 
1); + ReshapeCooCudaKernel<<>>( + x_indices_data, + x_sparse_part_dims.size(), + out_sparse_part_dims.size(), + x_nnz, + destination_x_sparse_part_strides, + destination_out_sparse_part_strides, + out_indices_data); +} + +// just copy from paddle\phi\kernels\sparse\cpu\reshape_kernel.cc +template +void ReshapeCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const phi::IntArray& shape, + SparseCsrTensor* out) { + // transform csr format to coo format, and then use coo kernel + const SparseCooTensor x_coo = CsrToCoo(dev_ctx, x); + SparseCooTensor out_coo; + ReshapeCooKernel(dev_ctx, x_coo, shape, &out_coo); + CooToCsrKernel(dev_ctx, out_coo, out); +} + +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(reshape_coo, + GPU, + ALL_LAYOUT, + phi::sparse::ReshapeCooKernel, + phi::dtype::float16, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(reshape_csr, + GPU, + ALL_LAYOUT, + phi::sparse::ReshapeCsrKernel, + phi::dtype::float16, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu index 2ceda7da750e23..c72a38cd8fd323 100644 --- a/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu +++ b/paddle/phi/kernels/sparse/gpu/sparse_utils_kernel.cu @@ -172,6 +172,7 @@ void DenseToCooKernel(const Context& dev_ctx, temp_indexs_ptr, indices_data, sparse_data); + out->SetMember(indices, values, x_dims, true); } @@ -461,8 +462,8 @@ void CooToDenseGPUKernel(const GPUContext& dev_ctx, const auto place = dev_ctx.GetPlace(); const T* x_data = values.data(); - *out = phi::Empty( - dev_ctx, phi::DenseTensorMeta(x.dtype(), x.dims(), x.values().layout())); + dev_ctx.template Alloc(out); + T* out_data = out->data(); int64_t base_offset = 1; for (int64_t i = 0; i < dense_dim; i++) { @@ -538,7 +539,8 @@ PD_REGISTER_KERNEL(csr_to_coo, int8_t, int16_t, int, - int64_t) {} + int64_t, + bool) {} PD_REGISTER_KERNEL(coo_to_csr, GPU, @@ -551,7 +553,8 @@ PD_REGISTER_KERNEL(coo_to_csr, int8_t, int16_t, int, - int64_t) {} + int64_t, + bool) {} PD_REGISTER_KERNEL(dense_to_csr, GPU, @@ -619,6 +622,21 @@ PD_REGISTER_KERNEL(values_csr, int16_t, int, int64_t) { + kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR); +} + +PD_REGISTER_KERNEL(indices_coo, + GPU, + ALL_LAYOUT, + phi::sparse::IndicesCooKernel, + float, + double, + phi::dtype::float16, + uint8_t, + int8_t, + int16_t, + int, + int64_t) { kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO); } diff --git a/paddle/phi/kernels/sparse/gpu/sync_batch_norm_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/sync_batch_norm_grad_kernel.cu new file mode 100644 index 00000000000000..e0805578a0f86a --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/sync_batch_norm_grad_kernel.cu @@ -0,0 +1,85 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/sync_batch_norm_grad_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sync_batch_norm_grad_kernel.h" + +namespace phi { +namespace sparse { + +template +void SyncBatchNormCooGradKernel( + const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const paddle::optional& reserve_space, + const SparseCooTensor& y_grad, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + SparseCooTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad) { + EmptyLikeCooKernel(dev_ctx, x, x_grad); + *scale_grad = phi::EmptyLike(dev_ctx, scale); + *bias_grad = phi::EmptyLike(dev_ctx, bias); + phi::SyncBatchNormGradKernel(dev_ctx, + x.values(), + scale, + bias, + saved_mean, + saved_variance, + reserve_space, + y_grad.values(), + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + fuse_with_relu, + x_grad->mutable_values(), + scale_grad, + bias_grad); +} + +} // namespace sparse +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SyncBatchNormCooGradKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(sync_batch_norm_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::SyncBatchNormCooGradKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/sparse/gpu/sync_batch_norm_kernel.cu b/paddle/phi/kernels/sparse/gpu/sync_batch_norm_kernel.cu new file mode 100644 index 00000000000000..a518148f2c95bb --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/sync_batch_norm_kernel.cu @@ -0,0 +1,84 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/sparse/sync_batch_norm_kernel.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sync_batch_norm_kernel.h" + +namespace phi { +namespace sparse { + +template +void SyncBatchNormCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + SparseCooTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space) { + EmptyLikeCooKernel(dev_ctx, x, y); + phi::SyncBatchNormKernel(dev_ctx, + x.values(), + scale, + bias, + mean, + variance, + momentum, + epsilon, + data_layout, + is_test, + use_global_stats, + trainable_statistics, + fuse_with_relu, + y->mutable_values(), + mean_out, + variance_out, + saved_mean, + saved_variance, + reserve_space); + y->SetIndicesDict(x.GetIndicesDict()); +} + +} // namespace sparse +} // namespace phi + +#ifdef PADDLE_WITH_HIP +PD_REGISTER_KERNEL(sync_batch_norm_coo, + GPU, + ALL_LAYOUT, + phi::sparse::SyncBatchNormCooKernel, + float, + phi::dtype::float16) {} +#else +PD_REGISTER_KERNEL(sync_batch_norm_coo, + GPU, + ALL_LAYOUT, + phi::sparse::SyncBatchNormCooKernel, + float, + double, + phi::dtype::float16) {} +#endif diff --git a/paddle/phi/kernels/sparse/gpu/transpose_grad_kernel.cu b/paddle/phi/kernels/sparse/gpu/transpose_grad_kernel.cu new file mode 100644 index 00000000000000..32d842161c2e54 --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/transpose_grad_kernel.cu @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h" + +namespace phi { +namespace sparse { + +std::vector get_gpu_grad_perm(std::vector perm) { + std::vector grad_perm(perm.size()); + for (unsigned int i = 0; i < perm.size(); ++i) { + grad_perm[perm[i]] = i; + } + return grad_perm; +} + +template +void TransposeCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& dout, + const std::vector& perm, + SparseCooTensor* dx) { + std::vector grad_perm = get_gpu_grad_perm(perm); + TransposeCooKernel(dev_ctx, dout, grad_perm, dx); +} + +template +void TransposeCsrGradKernel(const Context& dev_ctx, + const SparseCsrTensor& dout, + const std::vector& perm, + SparseCsrTensor* dx) { + std::vector grad_perm = get_gpu_grad_perm(perm); + TransposeCsrKernel(dev_ctx, dout, grad_perm, dx); +} +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(transpose_coo_grad, + GPU, + ALL_LAYOUT, + phi::sparse::TransposeCooGradKernel, + phi::dtype::float16, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(transpose_csr_grad, + GPU, + ALL_LAYOUT, + phi::sparse::TransposeCsrGradKernel, + phi::dtype::float16, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu b/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu new file mode 100644 index 00000000000000..692076b80e9efd --- /dev/null +++ b/paddle/phi/kernels/sparse/gpu/transpose_kernel.cu @@ -0,0 +1,338 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/sparse/unary_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" + +namespace phi { +namespace sparse { + +__global__ void TransposeCooCudaKernel(const int64_t *x_indices_data, + const int *perm, + const std::size_t n_dim, + const int64_t x_nnz, + int64_t *out_indices_data) { + CUDA_KERNEL_LOOP_TYPE(index, x_nnz * n_dim, int64_t) { + int64_t i = index / x_nnz; + int64_t j = index % x_nnz; + out_indices_data[index] = x_indices_data[j + perm[i] * x_nnz]; + } +} + +template +__global__ void TransposeCsr2DCudaKernel(const int64_t *x_crows_data, + const int64_t *x_cols_data, + const T *x_values_data, + const int *perm, + const int64_t *x_dims, + const int64_t *out_dims, + const int64_t x_nnz, + int64_t *out_crows_data, + int64_t *out_cols_data, + T *out_values_data) { + int64_t __index__ = + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + // compute out_crows_data by x_cols_data + for (int64_t i = __index__; i <= out_dims[0]; i += blockDim.x * gridDim.x) { + out_crows_data[i] = 0; + } + __syncthreads(); + if (__index__ == 0) { + for (int64_t i = 0; i < x_nnz; ++i) { + int j = x_cols_data[i]; + out_crows_data[j + 2]++; + } + for (int64_t i = 0; i < out_dims[0]; i += 1) { + out_crows_data[i + 1] += out_crows_data[i]; + } + // compute out_cols_data and out_values_data by out_crows_data and x + for (int i = 0; i < x_dims[0]; ++i) { + int64_t start = x_crows_data[i]; + int64_t end = x_crows_data[i + 1]; + for (int64_t j = start; j < end; ++j) { + int64_t x_cols_j = x_cols_data[j] + 1; + int64_t jjj = out_crows_data[x_cols_j]; + out_cols_data[jjj] = i; + out_values_data[jjj] = x_values_data[j]; + out_crows_data[x_cols_j]++; + } + } + } +} + +template +__global__ void TransposeCsr3DCudaKernel(const int64_t *x_crows_data, + const int64_t *x_cols_data, + const T *x_values_data, + const int *perm, + const int64_t *x_dims, + const int64_t *out_dims, + const std::size_t n_dim, + const int64_t x_nnz, + int64_t *out_crows_data, + int64_t *out_cols_data, + T *out_values_data) { + int64_t __index__ = + static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if (__index__ == 0) { + int out_n_rows = out_dims[1]; + int x_n_rows = x_dims[1]; + for (int k = 0; k < out_dims[0]; ++k) { + if (perm[0] == 0) { // dims == {0, 2, 1} + // compute out_crows_data by x_cols_data + for (int i = 0; i <= out_n_rows; ++i) { + out_crows_data[i] = 0; + } + for (int i = 0; i < x_crows_data[x_n_rows]; ++i) { + int j = x_cols_data[i]; + out_crows_data[j + 2]++; + } + for (int i = 0; i < out_n_rows; ++i) { + out_crows_data[i + 1] += out_crows_data[i]; + } + // compute out_cols_data and out_values_data by out_crows_data and x + for (int i = 0; i < x_n_rows; ++i) { + int64_t start = x_crows_data[i]; + int64_t end = x_crows_data[i + 1]; + for (int64_t j = start; j < end; ++j) { + int64_t x_cols_j = x_cols_data[j] + 1; + int64_t jjj = out_crows_data[x_cols_j]; + out_cols_data[jjj] = i; + out_values_data[jjj] = x_values_data[j]; + out_crows_data[x_cols_j]++; + } + } + // x offset + x_cols_data += x_crows_data[x_n_rows]; + x_values_data += x_crows_data[x_n_rows]; + x_crows_data += x_n_rows + 1; + } else if (perm[0] == 1 && perm[1] == 0) { // perm == {1, 0, 2} + for (int i = 0; i < out_n_rows; ++i) { + out_crows_data[i] = 0; + } + int x_cols_offset = 0; + int out_cols_index = 0; + for (int i = 0; i < 
x_dims[0]; ++i) { + int x_crows_index = i * (x_n_rows + 1); + int start = x_crows_data[x_crows_index + k]; + int end = x_crows_data[x_crows_index + 1 + k]; + out_crows_data[i + 1] = end - start; + for (int j = start; j < end; ++j) { + out_cols_data[out_cols_index] = x_cols_data[x_cols_offset + j]; + out_values_data[out_cols_index] = x_values_data[x_cols_offset + j]; + out_cols_index++; + } + x_cols_offset += x_crows_data[x_crows_index + x_n_rows]; + } + for (int i = 1; i <= out_n_rows; ++i) { + out_crows_data[i] += out_crows_data[i - 1]; + } + } + // out offset + out_cols_data += out_crows_data[out_n_rows]; + out_values_data += out_crows_data[out_n_rows]; + out_crows_data += out_n_rows + 1; + } + } +} + +template +void TransposeCooKernel(const Context &dev_ctx, + const SparseCooTensor &x, + const std::vector &perm, + SparseCooTensor *out) { + // create out sparse tensor + int64_t x_nnz = x.nnz(); + std::size_t n_dim = perm.size(); + DDim out_dims = x.dims().transpose(perm); + DenseTensor out_indices = EmptyLike(dev_ctx, x.indices()); + DenseTensor out_values(x.values()); + out->SetMember(out_indices, out_values, out_dims, x.coalesced()); + + // compute values of indices + const DenseTensor &x_indices = x.indices(); + const auto *x_indices_data = x_indices.data(); + auto *out_indices_data = out_indices.data(); + int *d_perm; +#ifdef PADDLE_WITH_HIP + hipMalloc(reinterpret_cast(&d_perm), sizeof(int) * perm.size()); + hipMemcpy( + d_perm, perm.data(), sizeof(int) * perm.size(), hipMemcpyHostToDevice); +#else + cudaMalloc(reinterpret_cast(&d_perm), sizeof(int) * perm.size()); + cudaMemcpy( + d_perm, perm.data(), sizeof(int) * perm.size(), cudaMemcpyHostToDevice); +#endif + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_nnz * n_dim, 1); + TransposeCooCudaKernel<<>>( + x_indices_data, d_perm, n_dim, x_nnz, out_indices_data); +} + +template +void TransposeCsrKernel(const Context &dev_ctx, + const SparseCsrTensor &x, + const std::vector &perm, + SparseCsrTensor *out) { + std::size_t n_dim = perm.size(); + const DenseTensor &x_crows = x.crows(); + const DenseTensor &x_cols = x.cols(); + const DenseTensor &x_values = x.non_zero_elements(); + DenseTensor out_crows, out_cols, out_values; + // return a copy of x + if (perm[0] == 0 && perm[1] == 1 && (n_dim == 2 || perm[2] == 2)) { + out_crows = x_crows; + out_cols = x_cols; + out_values = x_values; + out->SetMember(out_crows, out_cols, out_values, x.dims()); + return; + } + // create out sparse tensor + DDim out_dims = x.dims().transpose(perm); + if (n_dim == 2) { + out_crows = Empty(dev_ctx, {out_dims[0] + 1}); + } else { + out_crows = + Empty(dev_ctx, {out_dims[0] * (out_dims[1] + 1)}); + } + out_cols = EmptyLike(dev_ctx, x.cols()); + out_values = EmptyLike(dev_ctx, x.values()); + out->SetMember(out_crows, out_cols, out_values, out_dims); + // transpose by two stages + if (perm[0] == 1 && perm[1] == 2) { // perm == {1, 2, 0} + SparseCsrTensor temp; + TransposeCsrKernel(dev_ctx, x, {1, 0, 2}, &temp); + TransposeCsrKernel(dev_ctx, temp, {0, 2, 1}, out); + return; + } else if (perm[0] == 2 && perm[1] == 0) { // perm == {2, 0, 1} + SparseCsrTensor temp; + TransposeCsrKernel(dev_ctx, x, {0, 2, 1}, &temp); + TransposeCsrKernel(dev_ctx, temp, {1, 0, 2}, out); + return; + } else if (perm[0] == 2 && perm[1] == 1) { // perm == {2, 1, 0} + SparseCsrTensor temp; + TransposeCsrKernel(dev_ctx, x, {1, 0, 2}, &temp); + TransposeCsrKernel(dev_ctx, temp, {2, 0, 1}, out); + return; + } + int64_t *out_crows_data = out_crows.data(); + int64_t 
*out_cols_data = out_cols.data(); + T *out_values_data = out_values.data(); + const int64_t *x_crows_data = x_crows.data(); + const int64_t *x_cols_data = x_cols.data(); + const T *x_values_data = x_values.data(); + int *d_perm; + int64_t *d_x_dims, *d_out_dims; +#ifdef PADDLE_WITH_HIP + hipMalloc(reinterpret_cast(&d_perm), sizeof(int) * perm.size()); + hipMemcpy( + d_perm, perm.data(), sizeof(int) * perm.size(), hipMemcpyHostToDevice); + hipMalloc(reinterpret_cast(&d_x_dims), + sizeof(int64_t) * x.dims().size()); + hipMemcpy(d_x_dims, + x.dims().Get(), + sizeof(int64_t) * x.dims().size(), + hipMemcpyHostToDevice); + hipMalloc(reinterpret_cast(&d_out_dims), + sizeof(int64_t) * out_dims.size()); + hipMemcpy(d_out_dims, + out_dims.Get(), + sizeof(int64_t) * out_dims.size(), + hipMemcpyHostToDevice); +#else + cudaMalloc(reinterpret_cast(&d_perm), sizeof(int) * perm.size()); + cudaMemcpy( + d_perm, perm.data(), sizeof(int) * perm.size(), cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast(&d_x_dims), + sizeof(int64_t) * x.dims().size()); + cudaMemcpy(d_x_dims, + x.dims().Get(), + sizeof(int64_t) * x.dims().size(), + cudaMemcpyHostToDevice); + cudaMalloc(reinterpret_cast(&d_out_dims), + sizeof(int64_t) * out_dims.size()); + cudaMemcpy(d_out_dims, + out_dims.Get(), + sizeof(int64_t) * out_dims.size(), + cudaMemcpyHostToDevice); +#endif + int64_t x_nnz = x.nnz(); + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_dims[0], 1); + if (perm.size() == 2) { + TransposeCsr2DCudaKernel<<>>(x_crows_data, + x_cols_data, + x_values_data, + d_perm, + d_x_dims, + d_out_dims, + x_nnz, + out_crows_data, + out_cols_data, + out_values_data); + } else { + TransposeCsr3DCudaKernel<<<1, 1, 0, dev_ctx.stream()>>>(x_crows_data, + x_cols_data, + x_values_data, + d_perm, + d_x_dims, + d_out_dims, + perm.size(), + x_nnz, + out_crows_data, + out_cols_data, + out_values_data); + } +} +} // namespace sparse +} // namespace phi + +PD_REGISTER_KERNEL(transpose_coo, + GPU, + ALL_LAYOUT, + phi::sparse::TransposeCooKernel, + phi::dtype::float16, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} + +PD_REGISTER_KERNEL(transpose_csr, + GPU, + ALL_LAYOUT, + phi::sparse::TransposeCsrKernel, + phi::dtype::float16, + float, + double, + int8_t, + uint8_t, + int16_t, + int, + int64_t, + bool) {} diff --git a/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h index c965be21fb0298..a4b89fd813270e 100644 --- a/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h +++ b/paddle/phi/kernels/sparse/impl/unary_kernel_impl.h @@ -37,6 +37,7 @@ namespace sparse { EmptyLikeCooKernel(dev_ctx, x, out); \ phi::prefix##Kernel( \ dev_ctx, x.non_zero_elements(), out->mutable_non_zero_elements()); \ + out->SetIndicesDict(x.GetIndicesDict()); \ } \ \ template \ @@ -105,6 +106,7 @@ void ScaleCooKernel(const Context& dev_ctx, bias, bias_after_scale, out->mutable_non_zero_elements()); + out->SetIndicesDict(x.GetIndicesDict()); } template @@ -129,8 +131,6 @@ void CastCooKernel(const Context& dev_ctx, DataType index_dtype, DataType value_dtype, SparseCooTensor* out) { - out->set_dims(x.dims()); - const DenseTensor& x_indices = x.indices(); const DenseTensor& x_values = x.non_zero_elements(); DenseTensor* out_indices = out->mutable_indices(); @@ -157,6 +157,7 @@ void CastCooKernel(const Context& dev_ctx, meta.set_dtype(value_dtype); phi::CastKernel(dev_ctx, x_values, value_dtype, out_values); } + out->SetIndicesDict(x.GetIndicesDict()); } template @@ -165,8 
+166,6 @@ void CastCsrKernel(const Context& dev_ctx, DataType index_dtype, DataType value_dtype, SparseCsrTensor* out) { - out->set_dims(x.dims()); - const DenseTensor& x_crows = x.crows(); const DenseTensor& x_cols = x.cols(); const DenseTensor& x_values = x.non_zero_elements(); diff --git a/paddle/phi/kernels/sparse/sparse_utils_kernel.h b/paddle/phi/kernels/sparse/sparse_utils_kernel.h index 932427d42cd157..8639f91469454d 100644 --- a/paddle/phi/kernels/sparse/sparse_utils_kernel.h +++ b/paddle/phi/kernels/sparse/sparse_utils_kernel.h @@ -18,6 +18,7 @@ limitations under the License. */ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/kernels/empty_kernel.h" namespace phi { @@ -36,6 +37,8 @@ SparseCooTensor DenseToCoo(const Context& dev_ctx, DenseTensor indices; DenseTensor values; SparseCooTensor coo(indices, values, x.dims()); + MetaTensor meta_out(&coo); + phi::UnchangedInferMeta(x, &meta_out); DenseToCooKernel(dev_ctx, x, sparse_dim, &coo); return coo; } @@ -50,6 +53,8 @@ SparseCooTensor CsrToCoo(const Context& dev_ctx, const SparseCsrTensor& x) { DenseTensor indices; DenseTensor values; SparseCooTensor coo(indices, values, x.dims()); + MetaTensor meta_out(&coo); + phi::UnchangedInferMeta(x, &meta_out); CsrToCooKernel(dev_ctx, x, &coo); return coo; } @@ -65,6 +70,8 @@ SparseCsrTensor CooToCsr(const Context& dev_ctx, const SparseCooTensor& x) { DenseTensor cols; DenseTensor non_zero_elements; SparseCsrTensor csr(crows, cols, non_zero_elements, x.dims()); + MetaTensor meta_out(&csr); + phi::UnchangedInferMeta(x, &meta_out); CooToCsrKernel(dev_ctx, x, &csr); return csr; } @@ -79,10 +86,13 @@ void DenseToCsrKernel(const Context& dev_ctx, true, phi::errors::InvalidArgument( "SparseCsrTensor only support 2-D or 3-D Tensor.")); + const int64_t sparse_dim = x_dims.size() == 2 ? 
2 : 3; DenseTensor indices; DenseTensor values; SparseCooTensor coo(indices, values, x.dims()); + MetaTensor meta_out(&coo); + phi::UnchangedInferMeta(x, &meta_out); DenseToCooKernel(dev_ctx, x, sparse_dim, &coo); CooToCsrKernel(dev_ctx, coo, out); } @@ -93,6 +103,8 @@ SparseCsrTensor DenseToCsr(const Context& dev_ctx, const DenseTensor& x) { DenseTensor cols; DenseTensor non_zero_elements; SparseCsrTensor csr(crows, cols, non_zero_elements, x.dims()); + MetaTensor meta_out(&csr); + phi::UnchangedInferMeta(x, &meta_out); DenseToCsrKernel(dev_ctx, x, &csr); return csr; } @@ -117,6 +129,8 @@ void CsrToDenseKernel(const Context& dev_ctx, DenseTensor indices; DenseTensor values; SparseCooTensor coo(indices, values, x.dims()); + MetaTensor meta_out(&coo); + phi::UnchangedInferMeta(x, &meta_out); CsrToCooKernel(dev_ctx, x, &coo); CooToDenseKernel(dev_ctx, coo, out); } @@ -143,14 +157,20 @@ void ValuesCsrKernel(const Context& dev_ctx, *out = x.non_zero_elements(); } +template +void IndicesCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + DenseTensor* out) { + *out = x.indices(); +} + template void SparseCooTensorKernel(const Context& dev_ctx, const DenseTensor& values, const DenseTensor& indices, - const IntArray& dense_shape, + const std::vector& shape, SparseCooTensor* out) { - *out = - SparseCooTensor(indices, values, phi::make_ddim(dense_shape.GetData())); + *out = SparseCooTensor(indices, values, phi::make_ddim(shape)); } } // namespace sparse diff --git a/paddle/phi/kernels/sparse/sync_batch_norm_grad_kernel.h b/paddle/phi/kernels/sparse/sync_batch_norm_grad_kernel.h new file mode 100644 index 00000000000000..9591e6f035ca79 --- /dev/null +++ b/paddle/phi/kernels/sparse/sync_batch_norm_grad_kernel.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" + +namespace phi { +namespace sparse { + +template +void SyncBatchNormCooGradKernel( + const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& saved_mean, + const DenseTensor& saved_variance, + const paddle::optional& reserve_space, + const SparseCooTensor& y_grad, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + SparseCooTensor* x_grad, + DenseTensor* scale_grad, + DenseTensor* bias_grad); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/sync_batch_norm_kernel.h b/paddle/phi/kernels/sparse/sync_batch_norm_kernel.h new file mode 100644 index 00000000000000..7ee4baa107971f --- /dev/null +++ b/paddle/phi/kernels/sparse/sync_batch_norm_kernel.h @@ -0,0 +1,47 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include + +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/sparse_coo_tensor.h" + +namespace phi { +namespace sparse { + +template +void SyncBatchNormCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const DenseTensor& scale, + const DenseTensor& bias, + const DenseTensor& mean, + const DenseTensor& variance, + float momentum, + float epsilon, + const std::string& data_layout, + bool is_test, + bool use_global_stats, + bool trainable_statistics, + bool fuse_with_relu, + SparseCooTensor* y, + DenseTensor* mean_out, + DenseTensor* variance_out, + DenseTensor* saved_mean, + DenseTensor* saved_variance, + DenseTensor* reserve_space); + +} // namespace sparse +} // namespace phi diff --git a/paddle/phi/kernels/sparse/unary_grad_kernel.h b/paddle/phi/kernels/sparse/unary_grad_kernel.h index eb2cf9ed697e9d..b446e1b99ed411 100644 --- a/paddle/phi/kernels/sparse/unary_grad_kernel.h +++ b/paddle/phi/kernels/sparse/unary_grad_kernel.h @@ -77,5 +77,29 @@ void CastCsrGradKernel(const Context& dev_ctx, DataType value_dtype, SparseCsrTensor* dx); +template +void TransposeCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& dout, + const std::vector& perm, + SparseCooTensor* dx); + +template +void TransposeCsrGradKernel(const Context& dev_ctx, + const SparseCsrTensor& dout, + const std::vector& perm, + SparseCsrTensor* dx); + +template +void ReshapeCooGradKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const SparseCooTensor& dout, + SparseCooTensor* dx); + +template +void ReshapeCsrGradKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const SparseCsrTensor& dout, + SparseCsrTensor* dx); + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/sparse/unary_kernel.h b/paddle/phi/kernels/sparse/unary_kernel.h index fdb6b21a44427c..a81e724d1fe481 100644 --- a/paddle/phi/kernels/sparse/unary_kernel.h +++ b/paddle/phi/kernels/sparse/unary_kernel.h @@ -14,6 +14,8 @@ #pragma once +#include "paddle/phi/common/int_array.h" +#include "paddle/phi/core/ddim.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" @@ -99,6 +101,48 @@ void CastCsrKernel(const Context& dev_ctx, DataType value_dtype, SparseCsrTensor* out); +template +void TransposeCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& perm, + SparseCooTensor* out); + +template +void TransposeCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const std::vector& perm, + SparseCsrTensor* out); + +template +SparseCooTensor TransposeCoo(const Context& dev_ctx, + const SparseCooTensor& x, + const std::vector& perm) { + PADDLE_ENFORCE_EQ(x.sparse_dim(), + perm.size(), + phi::errors::InvalidArgument( + "size of perm must be equal than the x.sparse_dim()")); + SparseCooTensor coo; + TransposeCooKernel(dev_ctx, x, perm, &coo); + return coo; +} + +template +SparseCsrTensor TransposeCsr(const Context& dev_ctx, + const SparseCsrTensor& x, + const 
std::vector& perm) { + PADDLE_ENFORCE_LE( + 2, + perm.size(), + phi::errors::InvalidArgument("size of perm must be equal to 2 or 3")); + PADDLE_ENFORCE_GE( + 3, + perm.size(), + phi::errors::InvalidArgument("size of perm must be equal to 2 or 3")); + SparseCsrTensor csr; + TransposeCsrKernel(dev_ctx, x, perm, &csr); + return csr; +} + template SparseCooTensor ReluCoo(const Context& dev_ctx, const SparseCooTensor& x) { SparseCooTensor coo; @@ -113,5 +157,43 @@ SparseCooTensor ReluCsr(const Context& dev_ctx, const SparseCooTensor& x) { return csr; } +template +void ReshapeCooKernel(const Context& dev_ctx, + const SparseCooTensor& x, + const phi::IntArray& shape, + SparseCooTensor* out); + +template +void ReshapeCsrKernel(const Context& dev_ctx, + const SparseCsrTensor& x, + const phi::IntArray& shape, + SparseCsrTensor* out); + +template +SparseCooTensor ReshapeCoo(const Context& dev_ctx, + const SparseCooTensor& x, + const phi::IntArray& shape) { + SparseCooTensor coo; + ReshapeCooKernel(dev_ctx, x, shape, &coo); + return coo; +} + +template +SparseCsrTensor ReshapeCsr(const Context& dev_ctx, + const SparseCsrTensor& x, + const phi::IntArray& shape) { + PADDLE_ENFORCE_LE( + 2, + shape.size(), + phi::errors::InvalidArgument("size of shape must be equal to 2 or 3")); + PADDLE_ENFORCE_GE( + 3, + shape.size(), + phi::errors::InvalidArgument("size of shape must be equal to 2 or 3")); + SparseCsrTensor csr; + ReshapeCsrKernel(dev_ctx, x, shape, &csr); + return csr; +} + } // namespace sparse } // namespace phi diff --git a/paddle/phi/kernels/strided_slice_grad_kernel.cc b/paddle/phi/kernels/strided_slice_grad_kernel.cc index 38dd360ea66c21..8551fc6e845743 100644 --- a/paddle/phi/kernels/strided_slice_grad_kernel.cc +++ b/paddle/phi/kernels/strided_slice_grad_kernel.cc @@ -64,6 +64,7 @@ PD_REGISTER_KERNEL(strided_slice_grad, int64_t, float, double, + phi::dtype::float16, phi::dtype::complex, phi::dtype::complex) {} #endif diff --git a/paddle/phi/kernels/transfer_layout_kernel.cc b/paddle/phi/kernels/transfer_layout_kernel.cc index 25a986ea82fb02..be232b7c671e9b 100644 --- a/paddle/phi/kernels/transfer_layout_kernel.cc +++ b/paddle/phi/kernels/transfer_layout_kernel.cc @@ -121,8 +121,10 @@ void TransferLayoutMKLDNN(const Context& dev_ctx, OneDNNContext::tls().set_cur_paddle_data_layout(src_layout); } - out->set_layout(DataLayout::ONEDNN); - out->set_format(out_format); + dnnl::memory::desc out_mem_desc(vectorize(out->dims()), + funcs::ToOneDNNDataType(x.dtype()), + out_format); + out->set_mem_desc(out_mem_desc); } else if (src_layout == DataLayout::ONEDNN && dst_layout != DataLayout::ONEDNN) { // Case2 - transfrom from MKLDNN OPKernel to Non-MKLDNN OPKernel diff --git a/paddle/phi/kernels/uniform_random_kernel.cc b/paddle/phi/kernels/uniform_random_kernel.cc index 11f61e5b4a03b9..6669438cc3b7b2 100644 --- a/paddle/phi/kernels/uniform_random_kernel.cc +++ b/paddle/phi/kernels/uniform_random_kernel.cc @@ -51,8 +51,13 @@ PD_REGISTER_KERNEL(uniform_random, phi::dtype::bfloat16) {} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PD_REGISTER_KERNEL( - uniform_random, GPU, ALL_LAYOUT, phi::UniformRandomKernel, float, double) {} +PD_REGISTER_KERNEL(uniform_random, + GPU, + ALL_LAYOUT, + phi::UniformRandomKernel, + float, + double, + phi::dtype::float16) {} #endif #ifdef PADDLE_WITH_XPU diff --git a/paddle/phi/kernels/xpu/activation_kernel.cc b/paddle/phi/kernels/xpu/activation_kernel.cc index 514d5e0b281586..3bb59f52bbf886 100644 --- a/paddle/phi/kernels/xpu/activation_kernel.cc +++ 
b/paddle/phi/kernels/xpu/activation_kernel.cc @@ -82,18 +82,43 @@ int xpu_activation_func( } template -int xpu_activation_1attr_func( +int xpu_activation_func_with_max_x_y( const Context& dev_ctx, const DenseTensor& x, DenseTensor* out, - float attr, - std::function + std::function< + int(xpu::Context*, const XPUType*, XPUType*, int, const float*, float*)> func) { + // does not support "const float* max_x, float* max_y" now int r = func(dev_ctx.x_context(), reinterpret_cast(x.data()), reinterpret_cast(out->data()), x.numel(), - attr); + nullptr, + nullptr); + return r; +} + +template +int xpu_activation_1attr_func(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out, + float attr, + std::function func) { + // does not support "const float* max_x, float* max_y" now + int r = func(dev_ctx.x_context(), + reinterpret_cast(x.data()), + reinterpret_cast(out->data()), + x.numel(), + attr, + nullptr, + nullptr); return r; } @@ -213,7 +238,7 @@ struct XPUHardSwishFunctor : public funcs::BaseActivationFunctor { offset, 3.0f, errors::External("Not support offset [%f] in XPU", offset)); - int r = xpu_activation_func( + int r = xpu_activation_func_with_max_x_y( dev_ctx, x, out, xpu::hard_swish); PADDLE_ENFORCE_XDNN_SUCCESS(r, "hard_swish"); } @@ -259,7 +284,7 @@ struct XPURelu6Functor : public funcs::BaseActivationFunctor { void operator()(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) const { - int r = xpu_activation_func( + int r = xpu_activation_func_with_max_x_y( dev_ctx, x, out, xpu::relu6); PADDLE_ENFORCE_XDNN_SUCCESS(r, "relu6"); } @@ -272,7 +297,7 @@ struct XPUSigmoidFunctor : public funcs::BaseActivationFunctor { void operator()(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) const { - int r = xpu_activation_func( + int r = xpu_activation_func_with_max_x_y( dev_ctx, x, out, xpu::sigmoid); PADDLE_ENFORCE_XDNN_SUCCESS(r, "sigmoid"); } @@ -363,7 +388,7 @@ struct XPUTanhFunctor : public funcs::BaseActivationFunctor { void operator()(const Context& dev_ctx, const DenseTensor& x, DenseTensor* out) const { - int r = xpu_activation_func( + int r = xpu_activation_func_with_max_x_y( dev_ctx, x, out, xpu::tanh); PADDLE_ENFORCE_XDNN_SUCCESS(r, "tanh"); } diff --git a/paddle/phi/kernels/xpu/adam_kernel.cc b/paddle/phi/kernels/xpu/adam_kernel.cc index b4d3301667e234..e17af77abac0de 100644 --- a/paddle/phi/kernels/xpu/adam_kernel.cc +++ b/paddle/phi/kernels/xpu/adam_kernel.cc @@ -65,7 +65,7 @@ void AdamDenseKernel(const Context& dev_ctx, const float* beta1_const_pow_ptr = nullptr; if (beta1_pow.place() == CPUPlace()) { DenseTensor xpu_beta1_pow; - phi::Copy(dev_ctx, beta1_pow, beta1_pow.place(), false, &xpu_beta1_pow); + phi::Copy(dev_ctx, beta1_pow, dev_ctx.GetPlace(), false, &xpu_beta1_pow); if (xpu_beta1_pow.dtype() == DataType::FLOAT16) funcs::GetDataPointer( xpu_beta1_pow, &beta1_pow_ptr, dev_ctx); @@ -82,7 +82,7 @@ void AdamDenseKernel(const Context& dev_ctx, const float* beta2_const_pow_ptr = nullptr; if (beta2_pow.place() == CPUPlace()) { DenseTensor xpu_beta2_pow; - phi::Copy(dev_ctx, beta2_pow, beta2_pow.place(), false, &xpu_beta2_pow); + phi::Copy(dev_ctx, beta2_pow, dev_ctx.GetPlace(), false, &xpu_beta2_pow); if (xpu_beta2_pow.dtype() == DataType::FLOAT16) funcs::GetDataPointer( xpu_beta2_pow, &beta2_pow_ptr, dev_ctx); diff --git a/paddle/phi/kernels/xpu/arange_kernel.cc b/paddle/phi/kernels/xpu/arange_kernel.cc index 58a133c1c9a22f..84896ca1993a56 100644 --- a/paddle/phi/kernels/xpu/arange_kernel.cc +++ b/paddle/phi/kernels/xpu/arange_kernel.cc @@ 
-20,6 +20,18 @@ limitations under the License. */ namespace phi { +template +inline T GetValue(const Context& dev_ctx, const DenseTensor& x) { + T value = static_cast(0); + if (x.place() != CPUPlace()) { + DenseTensor cpu_x; + Copy(dev_ctx, x, CPUPlace(), true, &cpu_x); + value = cpu_x.data()[0]; + } else { + value = x.data()[0]; + } + return value; +} template void ArangeKernel(const Context& dev_ctx, const DenseTensor& start, @@ -29,19 +41,9 @@ void ArangeKernel(const Context& dev_ctx, auto place = dev_ctx.GetPlace(); auto cpu_place = phi::CPUPlace(); - DenseTensor n_cpu; - n_cpu.Resize({start.numel()}); - T* n_cpu_data = dev_ctx.template HostAlloc(&n_cpu); - - paddle::memory::Copy( - cpu_place, n_cpu_data, place, start.data(), sizeof(T) * start.numel()); - T start_value = n_cpu_data[0]; - paddle::memory::Copy( - cpu_place, n_cpu_data, place, end.data(), sizeof(T) * end.numel()); - T end_value = n_cpu_data[0]; - paddle::memory::Copy( - cpu_place, n_cpu_data, place, step.data(), sizeof(T) * step.numel()); - T step_value = n_cpu_data[0]; + T start_value = GetValue(dev_ctx, start); + T end_value = GetValue(dev_ctx, end); + T step_value = GetValue(dev_ctx, step); int64_t size = 0; phi::funcs::GetSize(start_value, end_value, step_value, &size); @@ -50,7 +52,9 @@ void ArangeKernel(const Context& dev_ctx, DenseTensor out_cpu; out_cpu.Resize({out->numel()}); - T* out_cpu_data = dev_ctx.template HostAlloc(&out_cpu); + dev_ctx.template HostAlloc(&out_cpu); + T* out_cpu_data = out_cpu.data(); + T value = start_value; for (int64_t i = 0; i < size; ++i) { out_cpu_data[i] = value; @@ -63,4 +67,8 @@ void ArangeKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - arange, XPU, ALL_LAYOUT, phi::ArangeKernel, float, double, int, int64_t) {} + arange, XPU, ALL_LAYOUT, phi::ArangeKernel, float, double, int, int64_t) { + kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(1).SetBackend(phi::Backend::ALL_BACKEND); + kernel->InputAt(2).SetBackend(phi::Backend::ALL_BACKEND); +} diff --git a/paddle/phi/kernels/xpu/bmm_grad_kernel.cc b/paddle/phi/kernels/xpu/bmm_grad_kernel.cc new file mode 100644 index 00000000000000..246da888d2ca57 --- /dev/null +++ b/paddle/phi/kernels/xpu/bmm_grad_kernel.cc @@ -0,0 +1,107 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
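The XPU arange changes above read the scalar start/end/step tensors on the host (copying them off-device when needed), size the output, fill a CPU buffer, and copy it back to the device. A rough host-only sketch of that fill loop, with the element count computed the usual way (the kernel itself defers this to phi::funcs::GetSize):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>

// Count of elements in [start, end) with the given (nonzero) step.
int64_t RangeSize(double start, double end, double step) {
  return std::max<int64_t>(
      0, static_cast<int64_t>(std::ceil((end - start) / step)));
}

std::vector<double> HostArange(double start, double end, double step) {
  std::vector<double> out(RangeSize(start, end, step));
  double v = start;
  for (double& e : out) { e = v; v += step; }
  return out;
}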
+ +#include "paddle/phi/kernels/bmm_grad_kernel.h" + +#include "paddle/phi/kernels/xpu/bmm_xpu_utils.h" + +namespace phi { + +template +void MatMul(const Context& dev_ctx, + const DenseTensor& a, + bool trans_a, + const DenseTensor& b, + bool trans_b, + DenseTensor* out) { + dev_ctx.template Alloc(out); + xpu::Context* xpu_ctx = dev_ctx.x_context(); + if (std::is_same::value) { + MatMulXPUFunction(a, b, out, trans_a, trans_b, xpu_ctx); + } else { + if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { + MatMulXPUFunction(a, b, out, trans_a, trans_b, xpu_ctx); + } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { + MatMulXPUFunction(a, b, out, trans_a, trans_b, xpu_ctx); + } else { + MatMulXPUFunction(a, b, out, trans_a, trans_b, xpu_ctx); + } + } +} + +template +void CalcInputGrad(const Context& dev_ctx, + const DenseTensor& a, + bool trans_a, + const DenseTensor& b, + bool trans_b, + DenseTensor* out) { + if (out == nullptr) return; + MatMul(dev_ctx, a, trans_a, b, trans_b, out); +} + +template +void BmmGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + DenseTensor x_help = x; + DenseTensor y_help = y; + DenseTensor out_grad_help = out_grad; + ReshapeXYOutIntoMatrixSequence( + &x_help, &y_help, &out_grad_help, false, false); + + phi::DDim dx_dims; + if (x_grad) { + dx_dims = x_grad->dims(); + if (dx_dims != x_help.dims()) { + x_grad->Resize(x_help.dims()); + } + } + + phi::DDim dy_dims; + if (y_grad) { + dy_dims = y_grad->dims(); + if (dy_dims != y_help.dims()) { + y_grad->Resize(y_help.dims()); + } + } + + CalcInputGrad( + dev_ctx, out_grad_help, false, y_help, true, x_grad); + CalcInputGrad( + dev_ctx, x_help, true, out_grad_help, false, y_grad); + + if (x_grad) { + if (dx_dims != x_help.dims()) { + x_grad->Resize(dx_dims); + } + } + if (y_grad) { + if (dy_dims != y_help.dims()) { + y_grad->Resize(dy_dims); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(bmm_grad, + XPU, + ALL_LAYOUT, + phi::BmmGradKernel, + float, + paddle::platform::float16) {} diff --git a/paddle/phi/kernels/xpu/bmm_kernel.cc b/paddle/phi/kernels/xpu/bmm_kernel.cc new file mode 100644 index 00000000000000..b75383bbaa5aa1 --- /dev/null +++ b/paddle/phi/kernels/xpu/bmm_kernel.cc @@ -0,0 +1,80 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/bmm_kernel.h" +#include "paddle/phi/kernels/xpu/bmm_xpu_utils.h" +namespace phi { +template +void BmmKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + dev_ctx.template Alloc(out); + if (x.numel() == 0 || y.numel() == 0) { + return; + } + bool trans_x = false; + bool trans_y = false; + + auto x_dims = x.dims(); + auto y_dims = y.dims(); + + PADDLE_ENFORCE_EQ(x_dims.size(), + 3, + phi::errors::InvalidArgument( + "Input(X) of BmmOp must be 3-dimensional in BmmOp, " + "but received X's shape: [%s]", + x_dims)); + PADDLE_ENFORCE_EQ(y_dims.size(), + 3, + phi::errors::InvalidArgument( + "Input(Y) of BmmOp must be 3-dimensional in BmmOp, " + "but received Y's shape: [%s].", + y_dims)); + PADDLE_ENFORCE_EQ( + x_dims[0], + y_dims[0], + phi::errors::InvalidArgument( + "Input(X) and Input(Y) must have the same batch size in BmmOp, " + "but received X's batch size: [%s]," + "Y's batch size [%s]", + x_dims[0], + y_dims[0])); + PADDLE_ENFORCE_EQ( + x_dims[2], + y_dims[1], + phi::errors::InvalidArgument( + "Input(X)'s width must be equal with Input(Y)'s height in BmmOp," + "but receive X's width: [%s]," + "Y's height: [%s].", + x_dims[2], + y_dims[1])); + + xpu::Context* xpu_ctx = dev_ctx.x_context(); + if (std::is_same::value) { + MatMulXPUFunction(x, y, out, trans_x, trans_y, xpu_ctx); + } else { + if (std::getenv("XPU_PADDLE_FC_INT32") != nullptr) { + MatMulXPUFunction(x, y, out, trans_x, trans_y, xpu_ctx); + } else if (std::getenv("XPU_PADDLE_FC_LOCAL_INT16") != nullptr) { + MatMulXPUFunction(x, y, out, trans_x, trans_y, xpu_ctx); + } else { + MatMulXPUFunction(x, y, out, trans_x, trans_y, xpu_ctx); + } + } +} +} // namespace phi + +PD_REGISTER_KERNEL( + bmm, XPU, ALL_LAYOUT, phi::BmmKernel, float, paddle::platform::float16) {} diff --git a/paddle/phi/kernels/xpu/bmm_xpu_utils.h b/paddle/phi/kernels/xpu/bmm_xpu_utils.h new file mode 100644 index 00000000000000..f0ac5c7e14ea1e --- /dev/null +++ b/paddle/phi/kernels/xpu/bmm_xpu_utils.h @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
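Editor's note: the three PADDLE_ENFORCE_EQ checks in the forward kernel above express the usual bmm contract (3-D inputs, matching batch sizes, inner dimensions aligned). A short sketch of the shapes they accept and reject:

    import paddle

    x = paddle.randn([10, 3, 4])   # [batch, M, K]
    y = paddle.randn([10, 4, 5])   # [batch, K, N]: batch and K must match x
    out = paddle.bmm(x, y)
    print(out.shape)               # [10, 3, 5]

    # These would trip the checks above (shown commented out):
    # paddle.bmm(paddle.randn([10, 3, 4]), paddle.randn([8, 4, 5]))   # batch mismatch
    # paddle.bmm(paddle.randn([10, 3, 4]), paddle.randn([10, 6, 5]))  # K mismatch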
+#pragma once +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/impl/matmul_grad_kernel_impl.h" +#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h" + +namespace phi { +template +static void MatMulXPUFunction(const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out, + bool trans_x, + bool trans_y, + xpu::Context* xpu_ctx) { + using XPUType = typename XPUTypeTrait::Type; + const auto& x_dims = x.dims(); + const auto& y_dims = y.dims(); + + auto mat_dim_a = phi::funcs::CreateMatrixDescriptor( + RowMatrixFromVector(x_dims), 0, trans_x); + auto mat_dim_b = phi::funcs::CreateMatrixDescriptor( + ColumnMatrixFromVector(y_dims), 0, trans_y); + + T* data_c = out->data(); + int m = mat_dim_a.height_; + int n = mat_dim_b.width_; + int k = mat_dim_a.width_; + int batch_size = mat_dim_a.batch_size_; + // batch matmul + int r = xpu::fc_batched( + xpu_ctx, // Context* ctx, + batch_size, // int batch_size, + mat_dim_a.trans_, // bool x_trans, + mat_dim_b.trans_, // bool w_trans, + m, // int m, + n, // int n, + k, // int k, + 1.0, // float alpha, + reinterpret_cast(x.data()), // const TX* x, + mat_dim_a.stride_, // int stride_a, + reinterpret_cast(y.data()), // const TW* w, + mat_dim_b.stride_, // int stride_b, + 0.0, // float beta, + reinterpret_cast(data_c), // TY* y, + m * n, // int stride_c, + nullptr, // const float* x_maxptr, + nullptr); // const float* w_maxptr + + PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_batched"); +} +} // namespace phi diff --git a/paddle/phi/kernels/xpu/gaussian_random_kernel.cc b/paddle/phi/kernels/xpu/gaussian_random_kernel.cc index ee216e75883108..913ad2472e9a85 100644 --- a/paddle/phi/kernels/xpu/gaussian_random_kernel.cc +++ b/paddle/phi/kernels/xpu/gaussian_random_kernel.cc @@ -42,7 +42,7 @@ void GaussianRandomKernel(const Context& ctx, for (int64_t i = 0; i < size; ++i) { data_cpu[i] = dist(*engine); } - paddle::memory::Copy(phi::XPUPlace(), + paddle::memory::Copy(ctx.GetPlace(), data, phi::CPUPlace(), reinterpret_cast(data_cpu.get()), diff --git a/paddle/phi/kernels/xpu/roi_align_grad_kernel.cc b/paddle/phi/kernels/xpu/roi_align_grad_kernel.cc new file mode 100644 index 00000000000000..7f2b08538a93c0 --- /dev/null +++ b/paddle/phi/kernels/xpu/roi_align_grad_kernel.cc @@ -0,0 +1,114 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
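Editor's note: for reference, the batched GEMM that MatMulXPUFunction hands to xpu::fc_batched (batch_size, m, n, k with strides m*k, k*n, m*n) is equivalent to this per-batch loop. This is a plain NumPy sketch of the math, not Paddle code.

    import numpy as np

    batch, m, k, n = 4, 2, 3, 5
    x = np.random.rand(batch, m, k).astype("float32")
    y = np.random.rand(batch, k, n).astype("float32")
    # One GEMM per batch element, which fc_batched fuses into a single call.
    out = np.stack([x[b] @ y[b] for b in range(batch)])
    assert out.shape == (batch, m, n)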
+ +#include "paddle/phi/kernels/roi_align_kernel.h" + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RoiAlignGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + const paddle::optional& boxes_num, + const DenseTensor& out_grad, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* dx) { + int rois_num = boxes.dims()[0]; + int channels = x.dims()[1]; + int height = x.dims()[2]; + int width = x.dims()[3]; + + if (!dx) { + return; + } + DenseTensor roi_batch_id_list; + roi_batch_id_list.Resize({rois_num}); + auto cplace = phi::CPUPlace(); + auto xplace = dev_ctx.GetPlace(); + + int rois_batch_size = 0; + int* cpu_lod = nullptr; + if (boxes_num) { + rois_batch_size = boxes_num->numel(); + std::vector rois_num_list(rois_batch_size); + paddle::memory::Copy(cplace, + rois_num_list.data(), + xplace, + boxes_num->data(), + sizeof(int) * rois_batch_size); + cpu_lod = new int[rois_batch_size + 1]; + cpu_lod[0] = 0; + for (int i = 0; i < rois_batch_size; i++) { + cpu_lod[i + 1] = cpu_lod[i] + rois_num_list[i]; + } + } else { + auto rois_lod = boxes.lod().back(); + rois_batch_size = rois_lod.size() - 1; + cpu_lod = new int[rois_batch_size + 1]; + for (int i = 0; i < rois_batch_size + 1; i++) { + cpu_lod[i] = rois_lod[i]; + } + } + int* roi_id_data = nullptr; + int r = xpu_malloc(reinterpret_cast(&roi_id_data), + (rois_batch_size + 1) * sizeof(int)); + PADDLE_ENFORCE_XPU_SUCCESS(r); + paddle::memory::Copy(xplace, + roi_id_data, + cplace, + cpu_lod, + (rois_batch_size + 1) * sizeof(int)); + dev_ctx.template Alloc(dx); + + int output_grad_size = out_grad.numel(); + + delete[] cpu_lod; + if (output_grad_size > 0) { + r = xpu::roi_align_grad(dev_ctx.x_context(), + out_grad.data(), + dx->data(), + boxes.data(), + roi_id_data, + x.dims()[0], + channels, + height, + width, + out_grad.dims()[0], + pooled_height, + pooled_width, + spatial_scale, + sampling_ratio, + true, + aligned); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "roi_align_grad"); + } + if (dev_ctx.x_context()->xpu_stream) { + dev_ctx.Wait(); + } + xpu_free(roi_id_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + roi_align_grad, XPU, ALL_LAYOUT, phi::RoiAlignGradKernel, float) {} diff --git a/paddle/phi/kernels/xpu/roi_align_kernel.cc b/paddle/phi/kernels/xpu/roi_align_kernel.cc new file mode 100644 index 00000000000000..dacb676693cd0c --- /dev/null +++ b/paddle/phi/kernels/xpu/roi_align_kernel.cc @@ -0,0 +1,149 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
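Editor's note: the only host-side bookkeeping in the grad kernel above is converting boxes_num into exclusive prefix sums (the cpu_lod array passed to xpu::roi_align_grad). A one-liner sketch of that mapping, not part of the patch:

    boxes_num = [3, 0, 2]          # RoIs per image in the batch
    cpu_lod = [0]
    for n in boxes_num:
        cpu_lod.append(cpu_lod[-1] + n)
    print(cpu_lod)                 # [0, 3, 3, 5]; image i owns rois [lod[i], lod[i+1])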
+ +#include "paddle/phi/kernels/roi_align_kernel.h" + +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/backends/xpu/xpu_context.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void RoiAlignKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& boxes, + const paddle::optional& boxes_num, + int pooled_height, + int pooled_width, + float spatial_scale, + int sampling_ratio, + bool aligned, + DenseTensor* out) { + const auto& in_dims = x.dims(); + int batch_size = in_dims[0]; + int channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + int rois_num = boxes.dims()[0]; + + if (rois_num == 0) return; + + DenseTensor roi_batch_id_list; + roi_batch_id_list.Resize({rois_num}); + auto cplace = phi::CPUPlace(); + int* roi_batch_id_data = dev_ctx.template HostAlloc(&roi_batch_id_list); + auto xplace = dev_ctx.GetPlace(); + int rois_batch_size = 0; + int* cpu_lod = nullptr; + if (boxes_num) { + rois_batch_size = boxes_num->numel(); + PADDLE_ENFORCE_EQ( + rois_batch_size, + batch_size, + errors::InvalidArgument( + "The rois_batch_size and imgs " + "batch_size must be the same. But received rois_batch_size = %d, " + "batch_size = %d", + rois_batch_size, + batch_size)); + + std::vector rois_num_list(rois_batch_size); + paddle::memory::Copy(cplace, + rois_num_list.data(), + xplace, + boxes_num->data(), + sizeof(int) * rois_batch_size); + cpu_lod = new int[rois_batch_size + 1]; + cpu_lod[0] = 0; + for (int i = 0; i < rois_batch_size; i++) { + cpu_lod[i + 1] = cpu_lod[i] + rois_num_list[i]; + } + } else { + auto lod = boxes.lod(); + PADDLE_ENFORCE_EQ(lod.empty(), + false, + errors::InvalidArgument("Input(ROIs) in ROIAlignOp does " + "not contain LoD information.")); + auto rois_lod = lod.back(); + rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, + batch_size, + errors::InvalidArgument( + "The batch size of rois and batch size " + "of images must be the same. But received rois batch size = %d, " + "and images batch size = %d", + rois_batch_size, + batch_size)); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ( + rois_num, + rois_num_with_lod, + errors::InvalidArgument( + "The actual number of rois and the number of rois " + "provided from Input(RoIsLoD) in RoIAlign must be the same." 
+ " But received actual number of rois is %d, and the number " + "of rois from RoIsLoD is %d", + rois_num, + rois_num_with_lod)); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + roi_batch_id_data[i] = n; + } + } + cpu_lod = new int[rois_batch_size + 1]; + for (int i = 0; i < rois_batch_size + 1; i++) { + cpu_lod[i] = rois_lod[i]; + } + } + + int* roi_id_data = nullptr; + int r = xpu_malloc(reinterpret_cast(&roi_id_data), + (rois_batch_size + 1) * sizeof(int)); + PADDLE_ENFORCE_XPU_SUCCESS(r); + paddle::memory::Copy(xplace, + roi_id_data, + cplace, + cpu_lod, + (rois_batch_size + 1) * sizeof(int)); + delete[] cpu_lod; + r = xpu::roi_align(dev_ctx.x_context(), + x.data(), + dev_ctx.template Alloc(out), + boxes.data(), + roi_id_data, + batch_size, + channels, + height, + width, + out->dims()[0], + pooled_height, + pooled_width, + spatial_scale, + sampling_ratio, + true, + aligned); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "roi_align_grad"); + if (dev_ctx.x_context()->xpu_stream) { + dev_ctx.Wait(); + } + xpu_free(roi_id_data); +} + +} // namespace phi + +PD_REGISTER_KERNEL(roi_align, XPU, ALL_LAYOUT, phi::RoiAlignKernel, float) {} diff --git a/paddle/phi/ops/compat/fused_softmax_mask_sig.cc b/paddle/phi/ops/compat/fused_softmax_mask_sig.cc new file mode 100644 index 00000000000000..415df81763a083 --- /dev/null +++ b/paddle/phi/ops/compat/fused_softmax_mask_sig.cc @@ -0,0 +1,28 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature SoftmaxMaskFuseGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature( + "fused_softmax_mask_grad", {"Softmax", "Out@GRAD"}, {}, {"X@GRAD"}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(fused_softmax_mask_grad, + phi::SoftmaxMaskFuseGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/gumbel_softmax_sig.cc b/paddle/phi/ops/compat/gumbel_softmax_sig.cc index 65537f8c8948a8..b4afa64c1d2b39 100644 --- a/paddle/phi/ops/compat/gumbel_softmax_sig.cc +++ b/paddle/phi/ops/compat/gumbel_softmax_sig.cc @@ -16,6 +16,23 @@ limitations under the License. 
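Editor's note: both ROI Align XPU kernels above are reached from Python through paddle.vision.ops.roi_align; a minimal forward/backward sketch follows. Device selection is omitted, so on a non-XPU build the same code runs on whichever roi_align kernel is registered.

    import paddle
    from paddle.vision.ops import roi_align

    x = paddle.rand([1, 256, 32, 32])
    x.stop_gradient = False
    boxes = paddle.to_tensor([[4., 4., 12., 12.], [6., 6., 20., 20.]])
    boxes_num = paddle.to_tensor([2], dtype="int32")   # 2 boxes for the single image
    pooled = roi_align(x, boxes, boxes_num, output_size=(7, 7), spatial_scale=1.0)
    print(pooled.shape)       # [2, 256, 7, 7]
    pooled.sum().backward()   # exercises RoiAlignGradKernel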
*/ namespace phi { +KernelSignature GumbelSoftmaxOpArgumentMapping( + const ArgumentMappingContext& ctx) { + bool is_test = false; + if (ctx.HasAttr("is_test")) { + is_test = paddle::any_cast(ctx.Attr("is_test")); + } + if (is_test) { + return KernelSignature("gumbel_softmax_infer", + {"X"}, + {"temperature", "hard", "axis"}, + {"Out"}); + } else { + return KernelSignature( + "gumbel_softmax", {"X"}, {"temperature", "hard", "axis"}, {"Out"}); + } +} + KernelSignature GumbelSoftmaxGradOpArgumentMapping( const ArgumentMappingContext& ctx) { return KernelSignature( @@ -24,5 +41,6 @@ KernelSignature GumbelSoftmaxGradOpArgumentMapping( } // namespace phi +PD_REGISTER_ARG_MAPPING_FN(gumbel_softmax, phi::GumbelSoftmaxOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(gumbel_softmax_grad, phi::GumbelSoftmaxGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/memcpy_d2h_sig.cc b/paddle/phi/ops/compat/memcpy_sig.cc similarity index 77% rename from paddle/phi/ops/compat/memcpy_d2h_sig.cc rename to paddle/phi/ops/compat/memcpy_sig.cc index 38b0f01082e757..96da0abbd4f34b 100644 --- a/paddle/phi/ops/compat/memcpy_d2h_sig.cc +++ b/paddle/phi/ops/compat/memcpy_sig.cc @@ -27,6 +27,15 @@ KernelSignature MemcpyD2HOpArgumentMapping(const ArgumentMappingContext& ctx) { return KernelSignature("memcpy_d2h", {"X"}, {"dst_place_type"}, {"Out"}); } +KernelSignature MemcpyOpArgumentMapping(const ArgumentMappingContext& ctx) { + if (ctx.IsDenseTensorInput("X")) { + return KernelSignature("memcpy", {"X"}, {"dst_place_type"}, {"Out"}); + } + + return KernelSignature("unregistered", {}, {}, {}); +} + } // namespace phi PD_REGISTER_ARG_MAPPING_FN(memcpy_d2h, phi::MemcpyD2HOpArgumentMapping); +PD_REGISTER_ARG_MAPPING_FN(memcpy, phi::MemcpyOpArgumentMapping); diff --git a/paddle/phi/ops/compat/set_value_sig.cc b/paddle/phi/ops/compat/set_value_sig.cc index 6ff94a6e263f45..8c98606600a98f 100644 --- a/paddle/phi/ops/compat/set_value_sig.cc +++ b/paddle/phi/ops/compat/set_value_sig.cc @@ -724,6 +724,21 @@ KernelSignature SetValueOpArgumentMapping(const ArgumentMappingContext& ctx) { "shape", "bool_values"}, {"Out"}); + } else if (ctx.HasAttr("fp16_values") && + !paddle::any_cast>( + ctx.Attr("fp16_values")) + .empty()) { + return KernelSignature("set_value", + {"Input"}, + {"starts", + "ends", + "steps", + "axes", + "decrease_axes", + "none_axes", + "shape", + "fp16_values"}, + {"Out"}); } } } diff --git a/paddle/phi/ops/compat/sparse_manual_op_sig.cc b/paddle/phi/ops/compat/sparse_manual_op_sig.cc new file mode 100644 index 00000000000000..6e520cbdd96cdb --- /dev/null +++ b/paddle/phi/ops/compat/sparse_manual_op_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
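Editor's note: the new forward argument mapping above routes the op to gumbel_softmax_infer only when the program carries is_test=True (typically a saved inference program); ordinary eager or training use resolves to the regular kernel. A short usage sketch:

    import paddle
    import paddle.nn.functional as F

    x = paddle.randn([4, 10])
    y_soft = F.gumbel_softmax(x, temperature=0.5)             # differentiable soft samples
    y_hard = F.gumbel_softmax(x, temperature=0.5, hard=True)  # one-hot via straight-through
    print(y_soft.shape, y_hard.sum(axis=-1))                  # [4, 10], each row sums to 1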
+ +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature SparseIndicesOpArgumentMapping( + const ArgumentMappingContext& ctx) { + if (ctx.IsSparseCooTensorInput("x")) { + return KernelSignature("indices_coo", {"x"}, {}, {"out"}); + } else { + return KernelSignature("unregistered", {}, {}, {}); + } +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(sparse_indices, phi::SparseIndicesOpArgumentMapping); diff --git a/paddle/phi/ops/compat/sum_sig.cc b/paddle/phi/ops/compat/sum_sig.cc index d71111408f8546..ff68d43ed7b33e 100644 --- a/paddle/phi/ops/compat/sum_sig.cc +++ b/paddle/phi/ops/compat/sum_sig.cc @@ -18,10 +18,13 @@ namespace phi { KernelSignature SumOpArgumentMapping(const ArgumentMappingContext& ctx) { - if (ctx.IsDenseTensorInputs("X")) { + if (ctx.IsSelectedRowsInputs("X")) { + return KernelSignature("add_n_sr", {"X"}, {}, {"Out"}); + } else if (ctx.IsDenseTensorVectorInput("X")) { + return KernelSignature("add_n_array", {"X"}, {}, {"Out"}); + } else { return KernelSignature("add_n", {"X"}, {}, {"Out"}); } - return KernelSignature("unregistered", {}, {}, {}); } } // namespace phi diff --git a/paddle/phi/tests/api/CMakeLists.txt b/paddle/phi/tests/api/CMakeLists.txt index a337e4ee4bd1b6..e8fb6359004de6 100644 --- a/paddle/phi/tests/api/CMakeLists.txt +++ b/paddle/phi/tests/api/CMakeLists.txt @@ -109,3 +109,7 @@ cc_test( test_strings_lower_upper_api SRCS test_strings_lower_upper_api.cc DEPS ${COMMON_API_TEST_DEPS}) +cc_test( + test_add_n_api + SRCS test_add_n_api.cc + DEPS ${COMMON_API_TEST_DEPS}) diff --git a/paddle/phi/tests/api/test_add_n_api.cc b/paddle/phi/tests/api/test_add_n_api.cc new file mode 100644 index 00000000000000..bfa957667f67a3 --- /dev/null +++ b/paddle/phi/tests/api/test_add_n_api.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/api/lib/api_custom_impl.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/core/selected_rows.h" + +PD_DECLARE_KERNEL(add_n_sr, CPU, ALL_LAYOUT); + +namespace paddle { +namespace tests { + +TEST(API, add_n) { + // 1. 
create tensor + std::vector rows = {0, 1, 2, 3, 4, 5, 6}; + int64_t row_numel = 12; + auto x_sr = std::make_shared(rows, 10); + auto x_meta = phi::DenseTensorMeta( + phi::DataType::FLOAT32, + phi::make_ddim({static_cast(rows.size()), row_numel}), + phi::DataLayout::NCHW); + x_sr->mutable_value()->set_meta(x_meta); + x_sr->AllocateFrom(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get(), + phi::DataType::FLOAT32); + auto* dense_x_data = x_sr->mutable_value()->data(); + + auto y_sr = std::make_shared(rows, 10); + y_sr->mutable_value()->set_meta(x_meta); + y_sr->AllocateFrom(paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get(), + phi::DataType::FLOAT32); + auto* dense_y_data = y_sr->mutable_value()->data(); + + float sum[84] = {0.0}; + for (size_t i = 0; i < 7; ++i) { + for (size_t j = 0; j < 12; ++j) { + dense_x_data[i * 12 + j] = (i * 4 + j); + dense_y_data[i * 12 + j] = (i * 4 + j); + sum[i * 12 + j] += (i * 4 + j) * 2; + } + } + + paddle::experimental::Tensor x(x_sr); + paddle::experimental::Tensor y(y_sr); + auto out = paddle::experimental::add_n_impl({x, y}); + + // check slice result + ASSERT_EQ( + static_cast(std::dynamic_pointer_cast(out.impl()) + ->rows() + .size()), + 7); + for (int64_t i = 0; i < 84; ++i) { + ASSERT_EQ(sum[i], + std::dynamic_pointer_cast(out.impl()) + ->value() + .data()[i]); + } +} + +} // namespace tests +} // namespace paddle diff --git a/paddle/phi/tests/core/test_sparse_coo_tensor.cc b/paddle/phi/tests/core/test_sparse_coo_tensor.cc index e9ee1dde6b2a56..81e58843f5475c 100644 --- a/paddle/phi/tests/core/test_sparse_coo_tensor.cc +++ b/paddle/phi/tests/core/test_sparse_coo_tensor.cc @@ -52,7 +52,6 @@ TEST(sparse_coo_tensor, construct) { CHECK_EQ(sparse.numel(), 9); CHECK(sparse.dims() == dense_dims); CHECK(sparse.dtype() == DataType::FLOAT32); - CHECK(sparse.layout() == DataLayout::SPARSE_COO); CHECK(sparse.place() == phi::CPUPlace()); } diff --git a/paddle/phi/tests/core/test_sparse_csr_tensor.cc b/paddle/phi/tests/core/test_sparse_csr_tensor.cc index 7fad7bac399cd4..42f87fc5aae127 100644 --- a/paddle/phi/tests/core/test_sparse_csr_tensor.cc +++ b/paddle/phi/tests/core/test_sparse_csr_tensor.cc @@ -62,7 +62,6 @@ TEST(sparse_csr_tensor, construct) { CHECK_EQ(sparse.numel(), 9); CHECK(sparse.dims() == dense_dims); CHECK(sparse.dtype() == DataType::FLOAT32); - CHECK(sparse.layout() == DataLayout::SPARSE_CSR); CHECK(sparse.place() == paddle::platform::CPUPlace()); CHECK(sparse.initialized() == true); } diff --git a/paddle/phi/tests/kernels/CMakeLists.txt b/paddle/phi/tests/kernels/CMakeLists.txt index d1c9d25483fec1..600638cd84507c 100644 --- a/paddle/phi/tests/kernels/CMakeLists.txt +++ b/paddle/phi/tests/kernels/CMakeLists.txt @@ -74,6 +74,10 @@ cc_test( test_sparse_elementwise_dev_api SRCS test_sparse_elementwise_dev_api.cc DEPS phi phi_api_utils) +cc_test( + test_sparse_transpose_dev_api + SRCS test_sparse_transpose_dev_api.cc + DEPS phi phi_api_utils) cc_test( test_math_function diff --git a/paddle/phi/tests/kernels/test_sparse_transpose_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_transpose_dev_api.cc new file mode 100644 index 00000000000000..b2d5ed1d61b494 --- /dev/null +++ b/paddle/phi/tests/kernels/test_sparse_transpose_dev_api.cc @@ -0,0 +1,165 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
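Editor's note: the remapped sum op above is what paddle.add_n lowers to. The SelectedRows branch (add_n_sr) is the one the new C++ test exercises and is typically hit when summing sparse gradients; a dense-tensor sketch of the same public API:

    import paddle

    x = paddle.to_tensor([[1., 2.], [3., 4.]])
    y = paddle.to_tensor([[10., 20.], [30., 40.]])
    # add_n sums a list of tensors elementwise; here it maps to the add_n kernel.
    print(paddle.add_n([x, y]))   # [[11., 22.], [33., 44.]]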
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include + +#include + +#include "paddle/fluid/memory/allocation/allocator_facade.h" +#include "paddle/phi/api/lib/utils/allocator.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/place.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/empty_kernel.h" +#include "paddle/phi/kernels/sparse/empty_kernel.h" +#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h" +#include "paddle/phi/kernels/sparse/unary_grad_kernel.h" +#include "paddle/phi/kernels/sparse/unary_kernel.h" +#include "paddle/phi/kernels/transpose_grad_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" +namespace phi { +namespace tests { + +TEST(DEV_API, sparse_transpose_coo) { + std::vector data = {0, -1, 0, 2, 0, 0, -3, 0, 4, 5, 0, 0}; + phi::CPUContext dev_ctx_cpu; + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx_cpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + + DenseTensor dense_x = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta( + DataType::FLOAT32, phi::make_ddim({3, 2, 2}), DataLayout::NCHW)); + memcpy(dense_x.data(), data.data(), data.size() * sizeof(float)); + auto sparse_coo = sparse::DenseToCoo(dev_ctx_cpu, dense_x, 3); + auto sparse_out = + sparse::TransposeCoo(dev_ctx_cpu, sparse_coo, {2, 1, 0}); + DenseTensor dense_out = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta( + DataType::FLOAT32, phi::make_ddim({2, 2, 3}), DataLayout::NCHW)); + TransposeKernel(dev_ctx_cpu, dense_x, {2, 1, 0}, &dense_out); + + // backward + DenseTensor dense_grad_x = phi::EmptyLike(dev_ctx_cpu, dense_out); + TransposeGradKernel(dev_ctx_cpu, dense_out, {2, 1, 0}, &dense_grad_x); + SparseCooTensor sparse_grad_x; + sparse::EmptyLikeCooKernel(dev_ctx_cpu, sparse_coo, &sparse_grad_x); + + SparseCooTensor sparse_out_grad( + sparse_coo.indices(), sparse_coo.values(), {2, 2, 3}); + sparse::TransposeCooGradKernel( + dev_ctx_cpu, sparse_out_grad, {2, 1, 0}, &sparse_grad_x); +} + +TEST(DEV_API, sparse_transpose_csr_case1) { + std::vector data = {0, -1, 0, 2, 0, 0, -3, 0, 4, 5, 0, 0}; + phi::CPUContext dev_ctx_cpu; + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx_cpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + + DenseTensor dense_x = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta( + DataType::FLOAT32, phi::make_ddim({3, 2, 2}), DataLayout::NCHW)); + memcpy(dense_x.data(), data.data(), data.size() * sizeof(float)); + auto sparse_csr = sparse::DenseToCsr(dev_ctx_cpu, dense_x); + + auto sparse_out = + sparse::TransposeCsr(dev_ctx_cpu, sparse_csr, {2, 1, 0}); + DenseTensor dense_out = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta( + DataType::FLOAT32, phi::make_ddim({2, 2, 3}), 
DataLayout::NCHW)); + TransposeKernel(dev_ctx_cpu, dense_x, {2, 1, 0}, &dense_out); + + // backward + DenseTensor dense_grad_x = phi::EmptyLike(dev_ctx_cpu, dense_out); + TransposeGradKernel(dev_ctx_cpu, dense_out, {2, 1, 0}, &dense_grad_x); + SparseCsrTensor sparse_grad_x; + sparse::EmptyLikeCsrKernel(dev_ctx_cpu, sparse_csr, &sparse_grad_x); + sparse::TransposeCsrGradKernel( + dev_ctx_cpu, sparse_out, {2, 1, 0}, &sparse_grad_x); +} + +TEST(DEV_API, sparse_transpose_csr_case2) { + std::vector data = {0, -1, 0, 2, 0, 0, -3, 0, 4, 5, 0, 0}; + phi::CPUContext dev_ctx_cpu; + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx_cpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + + DenseTensor dense_x = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta( + DataType::FLOAT32, phi::make_ddim({3, 2, 2}), DataLayout::NCHW)); + memcpy(dense_x.data(), data.data(), data.size() * sizeof(float)); + auto sparse_csr = sparse::DenseToCsr(dev_ctx_cpu, dense_x); + + auto sparse_out = + sparse::TransposeCsr(dev_ctx_cpu, sparse_csr, {1, 2, 0}); + DenseTensor dense_out = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta( + DataType::FLOAT32, phi::make_ddim({2, 2, 3}), DataLayout::NCHW)); + TransposeKernel(dev_ctx_cpu, dense_x, {1, 2, 0}, &dense_out); +} + +TEST(DEV_API, sparse_transpose_csr_case3) { + std::vector data = {0, -1, 0, 2, 0, 0, -3, 0, 4, 5, 0, 0}; + phi::CPUContext dev_ctx_cpu; + dev_ctx_cpu.SetAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + dev_ctx_cpu.SetHostAllocator( + paddle::memory::allocation::AllocatorFacade::Instance() + .GetAllocator(paddle::platform::CPUPlace()) + .get()); + + DenseTensor dense_x = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta( + DataType::FLOAT32, phi::make_ddim({3, 4}), DataLayout::NCHW)); + memcpy(dense_x.data(), data.data(), data.size() * sizeof(float)); + auto sparse_csr = sparse::DenseToCsr(dev_ctx_cpu, dense_x); + + auto sparse_out = + sparse::TransposeCsr(dev_ctx_cpu, sparse_csr, {1, 0}); + DenseTensor dense_out = phi::Empty( + dev_ctx_cpu, + DenseTensorMeta( + DataType::FLOAT32, phi::make_ddim({4, 3}), DataLayout::NCHW)); + TransposeKernel(dev_ctx_cpu, dense_x, {1, 0}, &dense_out); +} + +} // namespace tests +} // namespace phi diff --git a/paddle/phi/tests/ops/test_op_signature.h b/paddle/phi/tests/ops/test_op_signature.h index 745f263208fc28..7f89fb34994fcd 100644 --- a/paddle/phi/tests/ops/test_op_signature.h +++ b/paddle/phi/tests/ops/test_op_signature.h @@ -77,11 +77,23 @@ class TestArgumentMappingContext : public phi::ArgumentMappingContext { return selected_rows_inputs.count(name) > 0; } + bool IsSelectedRowsInputs(const std::string& name) const override { + return selected_rows_inputs.count(name) > 0; + } + // add member if needed bool IsDenseTensorVectorInput(const std::string& name) const override { return false; } + bool IsSparseCooTensorInput(const std::string& name) const override { + return false; + } + + bool IsSparseCsrTensorInput(const std::string& name) const override { + return false; + } + bool IsDenseTensorOutput(const std::string& name) const override { return dense_tensor_outputs.count(name) > 0; } diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 4bd9bfb670fc1a..27308622055755 100644 --- a/paddle/scripts/paddle_build.bat +++ 
b/paddle/scripts/paddle_build.bat @@ -116,7 +116,6 @@ if "%WITH_PYTHON%" == "ON" ( rem -------Caching strategy 1: keep build directory for incremental compilation----------- rmdir %BUILD_DIR%\python /s/q rmdir %BUILD_DIR%\paddle\third_party\externalError /s/q -rem rmdir %BUILD_DIR%\paddle\fluid\pybind /s/q rmdir %BUILD_DIR%\paddle_install_dir /s/q rmdir %BUILD_DIR%\paddle_inference_install_dir /s/q rmdir %BUILD_DIR%\paddle_inference_c_install_dir /s/q @@ -172,14 +171,16 @@ echo ipipe_log_param_Windows_Build_Cache: %Windows_Build_Cache% cd /d %BUILD_DIR% dir . dir %cache_dir% -dir paddle\fluid\pybind\Release rem -------Caching strategy 1: End -------------------------------- rem -------Caching strategy 2: sccache decorate compiler----------- if not defined SCCACHE_ROOT set SCCACHE_ROOT=D:\sccache +set PATH=%SCCACHE_ROOT%;%PATH% if "%WITH_SCCACHE%"=="ON" ( cmd /C sccache -V || call :install_sccache + cmd /C sccache -V || echo install sccache failed! + sccache --stop-server 2> NUL del %SCCACHE_ROOT%\sccache_log.txt @@ -203,7 +204,7 @@ if "%WITH_SCCACHE%"=="ON" ( sccache -z goto :CASE_%1 ) else ( - del %PYTHON_ROOT%\sccache.exe 2> NUL + del %SCCACHE_ROOT%\sccache.exe 2> NUL goto :CASE_%1 ) @@ -211,7 +212,7 @@ if "%WITH_SCCACHE%"=="ON" ( echo There is not sccache in this PC, will install sccache. echo Download package from https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe %PYTHON_ROOT%\python.exe -c "import wget;wget.download('https://paddle-ci.gz.bcebos.com/window_requirement/sccache.exe')" -xcopy sccache.exe %PYTHON_ROOT%\ /Y +xcopy sccache.exe %SCCACHE_ROOT%\ /Y del sccache.exe goto:eof rem -------Caching strategy 2: End -------------------------------- @@ -231,7 +232,7 @@ set WITH_MKL=ON set WITH_GPU=ON set WITH_AVX=ON set MSVC_STATIC_CRT=OFF -set ON_INFER=OFF +set ON_INFER=ON set WITH_TENSORRT=ON set WITH_INFERENCE_API_TEST=OFF if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto @@ -248,7 +249,7 @@ set WITH_MKL=OFF set WITH_GPU=OFF set WITH_AVX=OFF set MSVC_STATIC_CRT=ON -set ON_INFER=OFF +set ON_INFER=ON if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=Auto call :cmake || goto cmake_error @@ -281,7 +282,7 @@ goto:success rem ------Build windows avx whl package------ :CASE_build_avx_whl set WITH_AVX=ON -set ON_INFER=OFF +set ON_INFER=ON if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=All call :cmake || goto cmake_error @@ -292,7 +293,7 @@ goto:success rem ------Build windows no-avx whl package------ :CASE_build_no_avx_whl set WITH_AVX=OFF -set ON_INFER=OFF +set ON_INFER=ON if not defined CUDA_ARCH_NAME set CUDA_ARCH_NAME=All call :cmake || goto cmake_error diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 4ee05ba079e344..271cdd23100d06 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -68,6 +68,9 @@ function cmake_base() { # Delete previous built whl packages rm -rf python/dist 2>/dev/null || true + # Delete previous built paddle cache + rm -rf python/paddle 2>/dev/null || true + # Support build for all python3 versions PYTHON_FLAGS="" SYSTEM=`uname -s` @@ -76,7 +79,7 @@ function cmake_base() { if [ "$1" == "cp36-cp36m" ] || [ "$1" == "" ]; then if [ -d "/Library/Frameworks/Python.framework/Versions/3.6" ]; then export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/ - export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/ + export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.6/lib/ export 
PATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/:${PATH} PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/ @@ -88,7 +91,7 @@ function cmake_base() { elif [ "$1" == "cp37-cp37m" ]; then if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ - export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ + export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.7/lib/ export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH} PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ @@ -100,7 +103,7 @@ function cmake_base() { elif [ "$1" == "cp38-cp38" ]; then if [ -d "/Library/Frameworks/Python.framework/Versions/3.8" ]; then export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/ - export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.8/lib/ + export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.8/lib/ export PATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/:${PATH} PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.8/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.8/include/python3.8/ @@ -112,7 +115,7 @@ function cmake_base() { elif [ "$1" == "cp39-cp39" ]; then if [ -d "/Library/Frameworks/Python.framework/Versions/3.9" ]; then export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/ - export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.9/lib/ + export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.9/lib/ export PATH=/Library/Frameworks/Python.framework/Versions/3.9/bin/:${PATH} PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.9/include/python3.9/ @@ -124,7 +127,7 @@ function cmake_base() { elif [ "$1" == "cp310-cp310" ]; then if [ -d "/Library/Frameworks/Python.framework/Versions/3.10" ]; then export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.10/lib/ - export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.10/lib/ + export DYLD_LIBRARY_PATH=${DYLD_LIBRARY_PATH}:/Library/Frameworks/Python.framework/Versions/3.10/lib/ export PATH=/Library/Frameworks/Python.framework/Versions/3.10/bin/:${PATH} PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.10/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.10/include/python3.10/ @@ -272,7 +275,6 @@ EOF -DWITH_DISTRIBUTE=${distibuted_flag} \ -DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_AVX=${WITH_AVX:-OFF} \ - -DNOAVX_CORE_FILE=${NOAVX_CORE_FILE:-""} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ -DNEW_RELEASE_PYPI=${NEW_RELEASE_PYPI:-OFF} \ -DNEW_RELEASE_ALL=${NEW_RELEASE_ALL:-OFF} \ @@ -546,23 +548,26 @@ EOF } -function combine_avx_noavx_build() { - mkdir -p ${PADDLE_ROOT}/build.noavx - cd ${PADDLE_ROOT}/build.noavx - WITH_AVX=OFF +function avx_build() { + mkdir -p ${PADDLE_ROOT}/build + cd ${PADDLE_ROOT}/build + 
WITH_AVX=ON + cmake_base ${PYTHON_ABI:-""} build_base +} + - # build combined one +function noavx_build() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build - NOAVX_CORE_FILE=`find ${PADDLE_ROOT}/build.noavx/python/paddle/fluid/ -name "core_noavx.*"` - WITH_AVX=ON + WITH_AVX=OFF cmake_base ${PYTHON_ABI:-""} build_base } + function mac_m1_arm_build() { mkdir -p ${PADDLE_ROOT}/build cd ${PADDLE_ROOT}/build @@ -989,12 +994,13 @@ function generate_upstream_develop_api_spec() { mkdir -p ${PADDLE_ROOT}/build/pr_whl && mv ${PADDLE_ROOT}/build/python/dist/*.whl ${PADDLE_ROOT}/build/pr_whl/ echo "pr_whl_size: ${pr_whl_size}" - rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt + rm -rf ${PADDLE_ROOT}/build/Makefile ${PADDLE_ROOT}/build/CMakeCache.txt ${PADDLE_ROOT}/build/python cmake_change=`git diff --name-only upstream/$BRANCH | grep "cmake/external" || true` cd ${PADDLE_ROOT} - git checkout . - git checkout -b develop_base_pr upstream/$BRANCH + git fetch upstream $BRANCH + git checkout -b develop_base_pr -t upstream/$BRANCH + git log --pretty=oneline -10 dev_commit=`git log -1|head -1|awk '{print $2}'` dev_url="https://xly-devops.bj.bcebos.com/PR/build_whl/0/${dev_commit}/paddlepaddle_gpu-0.0.0-cp37-cp37m-linux_x86_64.whl" @@ -2887,12 +2893,12 @@ EOF local LIB_TYPE=$1 case $LIB_TYPE in full) - # Build full Paddle Python module. Will timeout without caching 'copy_paddle_pybind' first - make -j `nproc` framework_py_proto copy_paddle_pybind paddle_python + # Build full Paddle Python module. Will timeout without caching 'copy_libpaddle' first + make -j `nproc` framework_py_proto copy_libpaddle paddle_python ;; pybind) # Build paddle pybind library. Takes 49 minutes to build. Might timeout - make -j `nproc` copy_paddle_pybind + make -j `nproc` copy_libpaddle ;; proto) # Even smaller library. 
@@ -3485,16 +3491,25 @@ function main() { gen_dockerfile ${PYTHON_ABI:-""} assert_api_spec_approvals ;; - combine_avx_noavx) - combine_avx_noavx_build + avx_build) + avx_build + gen_dockerfile ${PYTHON_ABI:-""} + ;; + noavx_build) + noavx_build gen_dockerfile ${PYTHON_ABI:-""} ;; mac_m1_arm) mac_m1_arm_build gen_dockerfile ${PYTHON_ABI:-""} ;; - combine_avx_noavx_build_and_test) - combine_avx_noavx_build + avx_build_and_test) + avx_build + gen_dockerfile ${PYTHON_ABI:-""} + parallel_test_base + ;; + noavx_build_and_test) + noavx_build gen_dockerfile ${PYTHON_ABI:-""} parallel_test_base ;; diff --git a/paddle/scripts/windows_build/build.bat b/paddle/scripts/windows_build/build.bat index 9a2ed349e5b926..0aeacfef7f9bd9 100644 --- a/paddle/scripts/windows_build/build.bat +++ b/paddle/scripts/windows_build/build.bat @@ -82,8 +82,8 @@ echo Current directory : %cd% call:rest_env -echo cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All -DNOAVX_CORE_FILE=%dst_path%_noavx\python\paddle\fluid\core_noavx.pyd -cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All -DNOAVX_CORE_FILE=%dst_path%_noavx\python\paddle\fluid\core_noavx.pyd +echo cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All +cmake %dst_path%\..\Paddle -G "Visual Studio 15 2017 Win64" -T host=x64 -DWITH_GPU=%WITH_GPU% -DWITH_MKL=%WITH_MKL% -DWITH_AVX=ON -DPYTHON_INCLUDE_DIR=%PYTHON_DIR%\include\ -DPYTHON_LIBRARY=%PYTHON_DIR%\libs\ -DPYTHON_EXECUTABLE=%PYTHON_DIR%\python.exe -DCMAKE_BUILD_TYPE=Release -DWITH_TESTING=OFF -DWITH_PYTHON=ON -DCUDA_TOOLKIT_ROOT_DIR=%CUDA_DIR% -DCUDA_ARCH_NAME=All set MSBUILDDISABLENODEREUSE=1 @@ -184,4 +184,4 @@ exit /b 1 :END echo BUILD SUCCESSFULLY -ENDLOCAL \ No newline at end of file +ENDLOCAL diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000000000..e9513d0648ba1f --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,4 @@ +[tool.black] +exclude = "build" +line-length = 80 +skip-string-normalization = true diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index e33af8b1bd52e3..b7d8eb1dcbc594 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -20,28 +20,7 @@ endif() set(SETUP_LOG_FILE "setup.py.log") -set(FLUID_CORE_NAME "core") -if(WITH_AVX AND AVX_FOUND) - set(FLUID_CORE_NAME "${FLUID_CORE_NAME}_avx") - if(NOT DEFINED NOAVX_CORE_FILE OR NOAVX_CORE_FILE STREQUAL "") - message( - STATUS - "MESSAGE: This is just a message for publishing release. - You are building AVX version without NOAVX core. - So the wheel package may fail on NOAVX machine. 
- You can add -DNOAVX_CORE_FILE=/path/to/your/core_noavx.* in cmake command - to get a full wheel package to resolve this warning. - While, this version will still work on local machine.") - endif() - - if(NOAVX_CORE_FILE AND NOT EXISTS "${NOAVX_CORE_FILE}") - message(FATAL_ERROR "The file ${NOAVX_CORE_FILE} does not exist!") - endif() - - set(HAS_NOAVX_CORE ON) -else() - set(FLUID_CORE_NAME "${FLUID_CORE_NAME}_noavx") -endif() +set(FLUID_CORE_NAME "libpaddle") configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in ${CMAKE_CURRENT_BINARY_DIR}/setup.py) @@ -52,51 +31,22 @@ if(WIN32) # Python would use the .pyd by default under Windows series platform set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.pyd) set(FLUID_CORE_LIB ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.lib) - add_custom_command( OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE_LIB} - DEPENDS paddle_pybind) - - set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.pyd) + COMMAND cmake -E copy $ ${FLUID_CORE} + COMMAND cmake -E copy $ ${FLUID_CORE_LIB} + DEPENDS libpaddle) else() set(FLUID_CORE ${FLUID_DST_DIR}/${FLUID_CORE_NAME}.so) add_custom_command( OUTPUT ${FLUID_CORE} - COMMAND cmake -E copy $ ${FLUID_CORE} - DEPENDS paddle_pybind) - - set(FLUID_NOAVX_CORE ${FLUID_DST_DIR}/core_noavx.so) + COMMAND cmake -E copy $ ${FLUID_CORE} + DEPENDS paddle) endif() set(FLUID_CORE_DEPS ${FLUID_CORE}) -if(HAS_NOAVX_CORE AND EXISTS "${NOAVX_CORE_FILE}") - get_filename_component(NOAVX_CORE_NAME ${NOAVX_CORE_FILE} NAME) - get_filename_component(NOAVX_CORE_EXT ${NOAVX_CORE_FILE} EXT) - if(WIN32) - if(NOT NOAVX_CORE_EXT STREQUAL ".pyd") - message( - FATAL_ERROR - "Wrong file ${NOAVX_CORE_NAME}, the ext does not match windows *.pyd!" - ) - endif() - else() - if(NOT NOAVX_CORE_EXT STREQUAL ".so") - message( - FATAL_ERROR - "Wrong file ${NOAVX_CORE_NAME}, the ext does not match *.so!") - endif() - endif() - add_custom_command( - OUTPUT ${FLUID_NOAVX_CORE} - COMMAND cmake -E copy ${NOAVX_CORE_FILE} ${FLUID_NOAVX_CORE} - DEPENDS paddle_pybind) - list(APPEND FLUID_CORE_DEPS ${FLUID_NOAVX_CORE}) -endif() - -add_custom_target(copy_paddle_pybind ALL DEPENDS ${FLUID_CORE_DEPS}) +add_custom_target(copy_libpaddle ALL DEPENDS ${FLUID_CORE_DEPS}) if(WIN32) add_custom_command( @@ -107,8 +57,8 @@ if(WIN32) COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMENT "Packing whl packages------>>>" - DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto - profiler_py_proto pass_desc_py_proto ${PY_FILES}) + DEPENDS copy_libpaddle ${FLUID_CORE} framework_py_proto profiler_py_proto + pass_desc_py_proto ${PY_FILES}) else() add_custom_command( OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp @@ -116,8 +66,8 @@ else() COMMAND cp -r ${PADDLE_SOURCE_DIR}/python/paddle ${PADDLE_BINARY_DIR}/python COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMENT "Packing whl packages------>>>" - DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto - profiler_py_proto pass_desc_py_proto ${PY_FILES}) + DEPENDS copy_libpaddle ${FLUID_CORE} framework_py_proto profiler_py_proto + pass_desc_py_proto ${PY_FILES}) endif() add_custom_target(paddle_python ALL diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index b39f4161eee978..c8286c09b10fac 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -38,6 +38,7 @@ from .fluid.dataset import * # noqa: F401 from .fluid.lazy_init import LazyGuard # noqa: F401 +from .framework.dtype import iinfo # noqa: F401 from 
.framework.dtype import dtype as dtype # noqa: F401 from .framework.dtype import uint8 # noqa: F401 from .framework.dtype import int8 # noqa: F401 @@ -79,7 +80,9 @@ import paddle.reader # noqa: F401 import paddle.static # noqa: F401 import paddle.vision # noqa: F401 +import paddle.audio # noqa: F401 import paddle.geometric # noqa: F401 +import paddle.sparse from .tensor.attribute import is_complex # noqa: F401 from .tensor.attribute import is_integer # noqa: F401 @@ -386,6 +389,7 @@ disable_static() __all__ = [ # noqa + 'iinfo', 'dtype', 'uint8', 'int8', diff --git a/python/paddle/audio/__init__.py b/python/paddle/audio/__init__.py new file mode 100644 index 00000000000000..ee768ab6d029c7 --- /dev/null +++ b/python/paddle/audio/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import features +from . import functional +from . import datasets +from . import backends + +from .backends.backend import info, load, save + +__all__ = [ + "functional", "features", "datasets", "backends", "load", "info", "save" +] diff --git a/python/paddle/audio/backends/__init__.py b/python/paddle/audio/backends/__init__.py new file mode 100644 index 00000000000000..ac19a14c69a01a --- /dev/null +++ b/python/paddle/audio/backends/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from . import init_backend +from .init_backend import get_current_backend # noqa: F401 +from .init_backend import list_available_backends # noqa: F401 +from .init_backend import set_backend + +init_backend._init_set_audio_backend() + +__all__ = [ + 'get_current_backend', + 'list_available_backends', + 'set_backend', +] diff --git a/python/paddle/audio/backends/backend.py b/python/paddle/audio/backends/backend.py new file mode 100644 index 00000000000000..fbfd11d20e0b54 --- /dev/null +++ b/python/paddle/audio/backends/backend.py @@ -0,0 +1,146 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +import paddle + +from pathlib import Path +from typing import Optional, Tuple, Union + + +class AudioInfo: + """ Audio info, return type of backend info function """ + + def __init__(self, sample_rate: int, num_samples: int, num_channels: int, + bits_per_sample: int, encoding: str): + self.sample_rate = sample_rate + self.num_samples = num_samples + self.num_channels = num_channels + self.bits_per_sample = bits_per_sample + self.encoding = encoding + + +def info(filepath: str) -> AudioInfo: + """Get signal information of input audio file. + + Args: + filepath: audio path or file object. + + Returns: + AudioInfo: info of the given audio. + + Example: + .. code-block:: python + + import os + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + base_dir = os.getcwd() + filepath = os.path.join(base_dir, "test.wav") + + paddle.audio.save(filepath, waveform, sample_rate) + wav_info = paddle.audio.info(filepath) + """ + # for API doc + raise NotImplementedError("please set audio backend") + + +def load(filepath: Union[str, Path], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True) -> Tuple[paddle.Tensor, int]: + """Load audio data from file.Load the audio content start form frame_offset, and get num_frames. + + Args: + frame_offset: from 0 to total frames, + num_frames: from -1 (means total frames) or number frames which want to read, + normalize: + if True: return audio which norm to (-1, 1), dtype=float32 + if False: return audio with raw data, dtype=int16 + + channels_first: + if True: return audio with shape (channels, time) + + Return: + Tuple[paddle.Tensor, int]: (audio_content, sample rate) + + Exampels: + .. code-block:: python + + import os + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + base_dir = os.getcwd() + filepath = os.path.join(base_dir, "test.wav") + + paddle.audio.save(filepath, waveform, sample_rate) + wav_data_read, sr = paddle.audio.load(filepath) + """ + # for API doc + raise NotImplementedError("please set audio backend") + + +def save( + filepath: str, + src: paddle.Tensor, + sample_rate: int, + channels_first: bool = True, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = 16, +): + """ + Save audio tensor to file. + + Args: + filepath: saved path + src: the audio tensor + sample_rate: the number of samples of audio per second. + channels_first: src channel infomation + if True, means input tensor is (channels, time) + if False, means input tensor is (time, channels) + encoding:encoding format, wave_backend only support PCM16 now. + bits_per_sample: bits per sample, wave_backend only support 16 bits now. + + Returns: + None + + Examples: + .. 
code-block:: python + + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + filepath = "./test.wav" + + paddle.audio.save(filepath, waveform, sample_rate) + """ + # for API doc + raise NotImplementedError("please set audio backend") diff --git a/python/paddle/audio/backends/init_backend.py b/python/paddle/audio/backends/init_backend.py new file mode 100644 index 00000000000000..a066e4e23a64e5 --- /dev/null +++ b/python/paddle/audio/backends/init_backend.py @@ -0,0 +1,185 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import warnings +from . import wave_backend +from . import backend +from typing import List + +import paddle + + +def _check_version(version: str) -> bool: + # require paddleaudio >= 1.0.2 + ver_arr = version.split('.') + v0 = int(ver_arr[0]) + v1 = int(ver_arr[1]) + v2 = int(ver_arr[2]) + if v0 < 1: + return False + if v0 == 1 and v1 == 0 and v2 <= 1: + return False + return True + + +def list_available_backends() -> List[str]: + """ List available backends, the backends in paddleaudio and the default backend. + + Returns: + List[str]: The list of available backends. + + Examples: + .. code-block:: python + + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + wav_path = "./test.wav" + + current_backend = paddle.audio.backends.get_current_backend() + print(current_backend) # wave_backend, the default backend. + backends = paddle.audio.backends.list_available_backends() + # default backends is ['wave_backend'] + # backends is ['wave_backend', 'soundfile'], if have installed paddleaudio >= 1.0.2 + if 'soundfile' in backends: + paddle.audio.backends.set_backend('soundfile') + + paddle.audio.save(wav_path, waveform, sample_rate) + + """ + backends = [] + try: + import paddleaudio + except ImportError: + package = "paddleaudio" + warn_msg = ( + "Failed importing {}. \n" + "only wave_banckend(only can deal with PCM16 WAV) supportted.\n" + "if want soundfile_backend(more audio type suppported),\n" + "please manually installed (usually with `pip install {} >= 1.0.2`). 
" + ).format(package, package) + warnings.warn(warn_msg) + + if "paddleaudio" in sys.modules: + version = paddleaudio.__version__ + if _check_version(version) == False: + err_msg = ( + "the version of paddleaudio installed is {},\n" + "please ensure the paddleaudio >= 1.0.2.").format(version) + raise ImportError(err_msg) + backends = paddleaudio.backends.list_audio_backends() + backends.append("wave_backend") + return backends + + +def get_current_backend() -> str: + """ Get the name of the current audio backend + + Returns: + str: The name of the current backend, + the wave_backend or backend imported from paddleaudio + + Examples: + .. code-block:: python + + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + wav_path = "./test.wav" + + current_backend = paddle.audio.backends.get_current_backend() + print(current_backend) # wave_backend, the default backend. + backends = paddle.audio.backends.list_available_backends() + # default backends is ['wave_backend'] + # backends is ['wave_backend', 'soundfile'], if have installed paddleaudio >= 1.0.2 + + if 'soundfile' in backends: + paddle.audio.backends.set_backend('soundfile') + + paddle.audio.save(wav_path, waveform, sample_rate) + + """ + current_backend = None + if "paddleaudio" in sys.modules: + import paddleaudio + current_backend = paddleaudio.backends.get_audio_backend() + if paddle.audio.load == paddleaudio.load: + return current_backend + return "wave_backend" + + +def set_backend(backend_name: str): + """Set the backend by one of the list_audio_backend return. + + Args: + backend (str): one of the list_audio_backend. "wave_backend" is the default. "soundfile" imported from paddleaudio. + + Returns: + None + + Examples: + .. code-block:: python + + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + wav_path = "./test.wav" + + current_backend = paddle.audio.backends.get_current_backend() + print(current_backend) # wave_backend, the default backend. + backends = paddle.audio.backends.list_available_backends() + # default backends is ['wave_backend'] + # backends is ['wave_backend', 'soundfile'], if have installed paddleaudio >= 1.0.2 + + if 'soundfile' in backends: + paddle.audio.backends.set_backend('soundfile') + + paddle.audio.save(wav_path, waveform, sample_rate) + + """ + if backend_name not in list_available_backends(): + raise NotImplementedError() + + if backend_name == "wave_backend": + module = wave_backend + else: + import paddleaudio + paddleaudio.backends.set_audio_backend(backend_name) + module = paddleaudio + + for func in ["save", "load", "info"]: + setattr(backend, func, getattr(module, func)) + setattr(paddle.audio, func, getattr(module, func)) + + +def _init_set_audio_backend(): + # init the default wave_backend. + for func in ["save", "load", "info"]: + setattr(backend, func, getattr(wave_backend, func)) diff --git a/python/paddle/audio/backends/wave_backend.py b/python/paddle/audio/backends/wave_backend.py new file mode 100644 index 00000000000000..66f2d48fe19a55 --- /dev/null +++ b/python/paddle/audio/backends/wave_backend.py @@ -0,0 +1,226 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle + +import wave +import numpy as np +from pathlib import Path + +from typing import Optional, Tuple, Union +from .backend import AudioInfo + + +def _error_message(): + package = "paddleaudio" + warn_msg = ( + "only PCM16 WAV is supported. \n" + "if you want support for more audio types, please " + "install it manually (usually with `pip install {}`). \n " + "and then use paddle.audio.backends.set_backend('soundfile') to set the audio backend" + ).format(package) + return warn_msg + + +def info(filepath: str) -> AudioInfo: + """Get signal information of input audio file. + + Args: + filepath: audio path or file object. + + Returns: + AudioInfo: info of the given audio. + + Example: + .. code-block:: python + + import os + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + base_dir = os.getcwd() + filepath = os.path.join(base_dir, "test.wav") + + paddle.audio.save(filepath, waveform, sample_rate) + wav_info = paddle.audio.info(filepath) + """ + + if hasattr(filepath, 'read'): + file_obj = filepath + else: + file_obj = open(filepath, 'rb') + + try: + file_ = wave.open(file_obj) + except wave.Error: + file_obj.seek(0) + file_obj.close() + err_msg = _error_message() + raise NotImplementedError(err_msg) + + channels = file_.getnchannels() + sample_rate = file_.getframerate() + sample_frames = file_.getnframes() # audio frame + bits_per_sample = file_.getsampwidth() * 8 + encoding = "PCM_S" # default WAV encoding, the only supported one + file_obj.close() + return AudioInfo(sample_rate, sample_frames, channels, bits_per_sample, + encoding) + + +def load(filepath: Union[str, Path], + frame_offset: int = 0, + num_frames: int = -1, + normalize: bool = True, + channels_first: bool = True) -> Tuple[paddle.Tensor, int]: + """Load audio data from file. Load the audio content starting from frame_offset and read num_frames frames. + + Args: + filepath: audio path or file object. + frame_offset: the frame index at which to start reading. Defaults to 0. + num_frames: the number of frames to read; -1 means read to the last frame. Defaults to -1. + normalize: + if True: return audio normalized to (-1, 1), dtype=float32 + if False: return audio with raw data, dtype=int16 + + channels_first: + if True: return audio with shape (channels, time) + if False: return audio with shape (time, channels) + + Returns: + Tuple[paddle.Tensor, int]: (audio_content, sample rate) + + Examples: + .. 
code-block:: python + + import os + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + base_dir = os.getcwd() + filepath = os.path.join(base_dir, "test.wav") + + paddle.audio.save(filepath, waveform, sample_rate) + wav_data_read, sr = paddle.audio.load(filepath) + """ + if hasattr(filepath, 'read'): + file_obj = filepath + else: + file_obj = open(filepath, 'rb') + + try: + file_ = wave.open(file_obj) + except wave.Error: + file_obj.seek(0) + file_obj.close() + err_msg = _error_message() + raise NotImplementedError(err_msg) + + channels = file_.getnchannels() + sample_rate = file_.getframerate() + frames = file_.getnframes() # audio frame + + audio_content = file_.readframes(frames) + file_obj.close() + + # default_subtype = "PCM_16", only support PCM16 WAV + audio_as_np16 = np.frombuffer(audio_content, dtype=np.int16) + audio_as_np32 = audio_as_np16.astype(np.float32) + if normalize: + # dtype = "float32" + audio_norm = audio_as_np32 / (2**15) + else: + # dtype = "int16" + audio_norm = audio_as_np32 + + waveform = np.reshape(audio_norm, (frames, channels)) + if num_frames != -1: + waveform = waveform[frame_offset:frame_offset + num_frames, :] + waveform = paddle.to_tensor(waveform) + if channels_first: + waveform = paddle.transpose(waveform, perm=[1, 0]) + return waveform, sample_rate + + +def save( + filepath: str, + src: paddle.Tensor, + sample_rate: int, + channels_first: bool = True, + encoding: Optional[str] = None, + bits_per_sample: Optional[int] = 16, +): + """ + Save audio tensor to file. + + Args: + filepath: saved path + src: the audio tensor + sample_rate: the number of samples of audio per second. + channels_first: src channel infomation + if True, means input tensor is (channels, time) + if False, means input tensor is (time, channels) + encoding: audio encoding format, wave_backend only support PCM16 now. + bits_per_sample: bits per sample, wave_backend only support 16 bits now. + + Returns: + None + + Examples: + .. 
code-block:: python + + import paddle + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + filepath = "./test.wav" + + paddle.audio.save(filepath, waveform, sample_rate) + """ + assert src.ndim == 2, "Expected 2D tensor" + + audio_numpy = src.numpy() + + # change src shape to (time, channels) + if channels_first: + audio_numpy = np.transpose(audio_numpy) + + channels = audio_numpy.shape[1] + + # only support PCM16 + if bits_per_sample not in (None, 16): + raise ValueError("Invalid bits_per_sample, only supprt 16 bit") + + sample_width = int(bits_per_sample / 8) # 2 + + if src.dtype == paddle.float32: + audio_numpy = (audio_numpy * (2**15)).astype(" List[collections.namedtuple]: + ret = [] + with open(os.path.join(DATA_HOME, self.meta), 'r') as rf: + for line in rf.readlines()[1:]: + ret.append(self.meta_info(*line.strip().split(','))) + return ret + + def _get_data(self, mode: str, split: int) -> Tuple[List[str], List[int]]: + if not os.path.isdir( + os.path.join(DATA_HOME, self.audio_path) + ) or not os.path.isfile(os.path.join(DATA_HOME, self.meta)): + download.get_path_from_url( + self.archive['url'], + DATA_HOME, + self.archive['md5'], + decompress=True, + ) + + meta_info = self._get_meta_info() + + files = [] + labels = [] + for sample in meta_info: + filename, fold, target, _, _, _, _ = sample + if mode == 'train' and int(fold) != split: + files.append(os.path.join(DATA_HOME, self.audio_path, filename)) + labels.append(int(target)) + + if mode != 'train' and int(fold) == split: + files.append(os.path.join(DATA_HOME, self.audio_path, filename)) + labels.append(int(target)) + + return files, labels diff --git a/python/paddle/audio/datasets/tess.py b/python/paddle/audio/datasets/tess.py new file mode 100644 index 00000000000000..a379aedc6026bc --- /dev/null +++ b/python/paddle/audio/datasets/tess.py @@ -0,0 +1,155 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import collections +import os +from typing import List +from typing import Tuple + +from paddle.utils import download +from paddle.dataset.common import DATA_HOME +from .dataset import AudioClassificationDataset + +__all__ = [] + + +class TESS(AudioClassificationDataset): + """ + TESS is a set of 200 target words were spoken in the carrier phrase + "Say the word _____' by two actresses (aged 26 and 64 years) and + recordings were made of the set portraying each of seven emotions(anger, + disgust, fear, happiness, pleasant surprise, sadness, and neutral). + There are 2800 stimuli in total. + + Reference: + Toronto emotional speech set (TESS) https://tspace.library.utoronto.ca/handle/1807/24487 + https://doi.org/10.5683/SP2/E8H2MF + + Args: + mode (str, optional): It identifies the dataset mode (train or dev). Defaults to train. + n_folds (int, optional): Split the dataset into n folds. 
1 fold for dev dataset and n-1 for train dataset. Defaults to 5. + split (int, optional): It specify the fold of dev dataset. Defaults to 1. + feat_type (str, optional): It identifies the feature type that user wants to extrace of an audio file. Defaults to raw. + archive(dict): it tells where to download the audio archive. Defaults to None. + + Returns: + :ref:`api_paddle_io_Dataset`. An instance of TESS dataset. + + Examples: + + .. code-block:: python + + import paddle + + mode = 'dev' + tess_dataset = paddle.audio.datasets.TESS(mode=mode, + feat_type='raw') + for idx in range(5): + audio, label = tess_dataset[idx] + # do something with audio, label + print(audio.shape, label) + # [audio_data_length] , label_id + + tess_dataset = paddle.audio.datasets.TESS(mode=mode, + feat_type='mfcc', + n_mfcc=40) + for idx in range(5): + audio, label = tess_dataset[idx] + # do something with mfcc feature, label + print(audio.shape, label) + # [feature_dim, num_frames] , label_id + """ + + archive = { + 'url': 'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip', + 'md5': '1465311b24d1de704c4c63e4ccc470c7', + } + + label_list = [ + 'angry', + 'disgust', + 'fear', + 'happy', + 'neutral', + 'ps', # pleasant surprise + 'sad', + ] + meta_info = collections.namedtuple( + 'META_INFO', ('speaker', 'word', 'emotion') + ) + audio_path = 'TESS_Toronto_emotional_speech_set' + + def __init__( + self, + mode: str = 'train', + n_folds: int = 5, + split: int = 1, + feat_type: str = 'raw', + archive=None, + **kwargs, + ): + assert isinstance(n_folds, int) and ( + n_folds >= 1 + ), f'the n_folds should be integer and n_folds >= 1, but got {n_folds}' + assert split in range( + 1, n_folds + 1 + ), f'The selected split should be integer and should be 1 <= split <= {n_folds}, but got {split}' + if archive is not None: + self.archive = archive + files, labels = self._get_data(mode, n_folds, split) + super(TESS, self).__init__( + files=files, labels=labels, feat_type=feat_type, **kwargs + ) + + def _get_meta_info(self, files) -> List[collections.namedtuple]: + ret = [] + for file in files: + basename_without_extend = os.path.basename(file)[:-4] + ret.append(self.meta_info(*basename_without_extend.split('_'))) + return ret + + def _get_data( + self, mode: str, n_folds: int, split: int + ) -> Tuple[List[str], List[int]]: + if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)): + download.get_path_from_url( + self.archive['url'], + DATA_HOME, + self.archive['md5'], + decompress=True, + ) + + wav_files = [] + for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)): + for file in files: + if file.endswith('.wav'): + wav_files.append(os.path.join(root, file)) + + meta_info = self._get_meta_info(wav_files) + + files = [] + labels = [] + for idx, sample in enumerate(meta_info): + _, _, emotion = sample + target = self.label_list.index(emotion) + fold = idx % n_folds + 1 + + if mode == 'train' and int(fold) != split: + files.append(wav_files[idx]) + labels.append(target) + + if mode != 'train' and int(fold) == split: + files.append(wav_files[idx]) + labels.append(target) + + return files, labels diff --git a/python/paddle/audio/features/__init__.py b/python/paddle/audio/features/__init__.py new file mode 100644 index 00000000000000..3c0bf499f1eff4 --- /dev/null +++ b/python/paddle/audio/features/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .layers import LogMelSpectrogram # noqa: F401 +from .layers import MelSpectrogram # noqa: F401 +from .layers import MFCC # noqa: F401 +from .layers import Spectrogram # noqa: F401 + +__all__ = [ # noqa + 'LogMelSpectrogram', + 'MelSpectrogram', + 'MFCC', + 'Spectrogram', +] diff --git a/python/paddle/audio/features/layers.py b/python/paddle/audio/features/layers.py new file mode 100644 index 00000000000000..d21a24d34241fe --- /dev/null +++ b/python/paddle/audio/features/layers.py @@ -0,0 +1,394 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from functools import partial +from typing import Optional +from typing import Union + +import paddle +import paddle.nn as nn +from paddle import Tensor + +from ..functional import compute_fbank_matrix +from ..functional import create_dct +from ..functional import power_to_db +from ..functional.window import get_window + + +class Spectrogram(nn.Layer): + """Compute spectrogram of given signals, typically audio waveforms. + The spectorgram is defined as the complex norm of the short-time Fourier transformation. + + Args: + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. + + Returns: + :ref:`api_paddle_nn_Layer`. An instance of Spectrogram. + + + + Examples: + .. 
code-block:: python + + import paddle + from paddle.audio.features import Spectrogram + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + + feature_extractor = Spectrogram(n_fft=512, window = 'hann', power = 1.0) + feats = feature_extractor(waveform) + """ + + def __init__(self, + n_fft: int = 512, + hop_length: Optional[int] = 512, + win_length: Optional[int] = None, + window: str = 'hann', + power: float = 1.0, + center: bool = True, + pad_mode: str = 'reflect', + dtype: str = 'float32') -> None: + super(Spectrogram, self).__init__() + + assert power > 0, 'Power of spectrogram must be > 0.' + self.power = power + + if win_length is None: + win_length = n_fft + + self.fft_window = get_window(window, + win_length, + fftbins=True, + dtype=dtype) + self._stft = partial(paddle.signal.stft, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=self.fft_window, + center=center, + pad_mode=pad_mode) + self.register_buffer('fft_window', self.fft_window) + + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Spectrograms with shape `(N, n_fft//2 + 1, num_frames)`. + """ + stft = self._stft(x) + spectrogram = paddle.pow(paddle.abs(stft), self.power) + return spectrogram + + +class MelSpectrogram(nn.Layer): + """Compute the melspectrogram of given signals, typically audio waveforms. It is computed by multiplying spectrogram with Mel filter bank matrix. + + Args: + sr (int, optional): Sample rate. Defaults to 22050. + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. + norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. + + Returns: + :ref:`api_paddle_nn_Layer`. An instance of MelSpectrogram. + + Examples: + .. 
code-block:: python + + import paddle + from paddle.audio.features import MelSpectrogram + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + + feature_extractor = MelSpectrogram(sr=sample_rate, n_fft=512, window = 'hann', power = 1.0) + feats = feature_extractor(waveform) + """ + + def __init__(self, + sr: int = 22050, + n_fft: int = 2048, + hop_length: Optional[int] = 512, + win_length: Optional[int] = None, + window: str = 'hann', + power: float = 2.0, + center: bool = True, + pad_mode: str = 'reflect', + n_mels: int = 64, + f_min: float = 50.0, + f_max: Optional[float] = None, + htk: bool = False, + norm: Union[str, float] = 'slaney', + dtype: str = 'float32') -> None: + super(MelSpectrogram, self).__init__() + + self._spectrogram = Spectrogram(n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + power=power, + center=center, + pad_mode=pad_mode, + dtype=dtype) + self.n_mels = n_mels + self.f_min = f_min + self.f_max = f_max + self.htk = htk + self.norm = norm + if f_max is None: + f_max = sr // 2 + self.fbank_matrix = compute_fbank_matrix(sr=sr, + n_fft=n_fft, + n_mels=n_mels, + f_min=f_min, + f_max=f_max, + htk=htk, + norm=norm, + dtype=dtype) + self.register_buffer('fbank_matrix', self.fbank_matrix) + + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Mel spectrograms with shape `(N, n_mels, num_frames)`. + """ + spect_feature = self._spectrogram(x) + mel_feature = paddle.matmul(self.fbank_matrix, spect_feature) + return mel_feature + + +class LogMelSpectrogram(nn.Layer): + """Compute log-mel-spectrogram feature of given signals, typically audio waveforms. + + Args: + sr (int, optional): Sample rate. Defaults to 22050. + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. + norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. 
Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10. + top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. + + Returns: + :ref:`api_paddle_nn_Layer`. An instance of LogMelSpectrogram. + + Examples: + .. code-block:: python + + import paddle + from paddle.audio.features import LogMelSpectrogram + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + + feature_extractor = LogMelSpectrogram(sr=sample_rate, n_fft=512, window = 'hann', power = 1.0) + feats = feature_extractor(waveform) + """ + + def __init__(self, + sr: int = 22050, + n_fft: int = 512, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: str = 'hann', + power: float = 2.0, + center: bool = True, + pad_mode: str = 'reflect', + n_mels: int = 64, + f_min: float = 50.0, + f_max: Optional[float] = None, + htk: bool = False, + norm: Union[str, float] = 'slaney', + ref_value: float = 1.0, + amin: float = 1e-10, + top_db: Optional[float] = None, + dtype: str = 'float32') -> None: + super(LogMelSpectrogram, self).__init__() + + self._melspectrogram = MelSpectrogram(sr=sr, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + power=power, + center=center, + pad_mode=pad_mode, + n_mels=n_mels, + f_min=f_min, + f_max=f_max, + htk=htk, + norm=norm, + dtype=dtype) + + self.ref_value = ref_value + self.amin = amin + self.top_db = top_db + + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Log mel spectrograms with shape `(N, n_mels, num_frames)`. + """ + mel_feature = self._melspectrogram(x) + log_mel_feature = power_to_db(mel_feature, + ref_value=self.ref_value, + amin=self.amin, + top_db=self.top_db) + return log_mel_feature + + +class MFCC(nn.Layer): + """Compute mel frequency cepstral coefficients(MFCCs) feature of given waveforms. + + Args: + sr (int, optional): Sample rate. Defaults to 22050. + n_mfcc (int, optional): [description]. Defaults to 40. + n_fft (int, optional): The number of frequency components of the discrete Fourier transform. Defaults to 512. + hop_length (Optional[int], optional): The hop length of the short time FFT. If `None`, it is set to `win_length//4`. Defaults to None. + win_length (Optional[int], optional): The window length of the short time FFT. If `None`, it is set to same as `n_fft`. Defaults to None. + window (str, optional): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'kaiser', 'gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. Defaults to 'hann'. + power (float, optional): Exponent for the magnitude spectrogram. Defaults to 2.0. + center (bool, optional): Whether to pad `x` to make that the :math:`t \times hop\\_length` at the center of `t`-th frame. Defaults to True. + pad_mode (str, optional): Choose padding pattern when `center` is `True`. Defaults to 'reflect'. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 50.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. 
+ htk (bool, optional): Use HTK formula in computing fbank matrix. Defaults to False. + norm (Union[str, float], optional): Type of normalization in computing fbank matrix. Slaney-style is used by default. You can specify norm=1.0/2.0 to use customized p-norm normalization. Defaults to 'slaney'. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): The minimum value of input magnitude. Defaults to 1e-10. + top_db (Optional[float], optional): The maximum db value of spectrogram. Defaults to None. + dtype (str, optional): Data type of input and window. Defaults to 'float32'. + + Returns: + :ref:`api_paddle_nn_Layer`. An instance of MFCC. + + Examples: + .. code-block:: python + + import paddle + from paddle.audio.features import MFCC + + sample_rate = 16000 + wav_duration = 0.5 + num_channels = 1 + num_frames = sample_rate * wav_duration + wav_data = paddle.linspace(-1.0, 1.0, num_frames) * 0.1 + waveform = wav_data.tile([num_channels, 1]) + + feature_extractor = MFCC(sr=sample_rate, n_fft=512, window = 'hann') + feats = feature_extractor(waveform) + """ + + def __init__(self, + sr: int = 22050, + n_mfcc: int = 40, + n_fft: int = 512, + hop_length: Optional[int] = None, + win_length: Optional[int] = None, + window: str = 'hann', + power: float = 2.0, + center: bool = True, + pad_mode: str = 'reflect', + n_mels: int = 64, + f_min: float = 50.0, + f_max: Optional[float] = None, + htk: bool = False, + norm: Union[str, float] = 'slaney', + ref_value: float = 1.0, + amin: float = 1e-10, + top_db: Optional[float] = None, + dtype: str = 'float32') -> None: + super(MFCC, self).__init__() + assert n_mfcc <= n_mels, 'n_mfcc cannot be larger than n_mels: %d vs %d' % ( + n_mfcc, n_mels) + self._log_melspectrogram = LogMelSpectrogram(sr=sr, + n_fft=n_fft, + hop_length=hop_length, + win_length=win_length, + window=window, + power=power, + center=center, + pad_mode=pad_mode, + n_mels=n_mels, + f_min=f_min, + f_max=f_max, + htk=htk, + norm=norm, + ref_value=ref_value, + amin=amin, + top_db=top_db, + dtype=dtype) + self.dct_matrix = create_dct(n_mfcc=n_mfcc, n_mels=n_mels, dtype=dtype) + self.register_buffer('dct_matrix', self.dct_matrix) + + def forward(self, x: Tensor) -> Tensor: + """ + Args: + x (Tensor): Tensor of waveforms with shape `(N, T)` + + Returns: + Tensor: Mel frequency cepstral coefficients with shape `(N, n_mfcc, num_frames)`. + """ + log_mel_feature = self._log_melspectrogram(x) + mfcc = paddle.matmul(log_mel_feature.transpose( + (0, 2, 1)), self.dct_matrix).transpose((0, 2, 1)) # (B, n_mels, L) + return mfcc diff --git a/python/paddle/audio/functional/__init__.py b/python/paddle/audio/functional/__init__.py new file mode 100644 index 00000000000000..b7db53d6c22a6f --- /dev/null +++ b/python/paddle/audio/functional/__init__.py @@ -0,0 +1,32 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from .functional import compute_fbank_matrix # noqa: F401 +from .functional import create_dct # noqa: F401 +from .functional import fft_frequencies # noqa: F401 +from .functional import hz_to_mel # noqa: F401 +from .functional import mel_frequencies # noqa: F401 +from .functional import mel_to_hz # noqa: F401 +from .functional import power_to_db # noqa: F401 +from .window import get_window # noqa: F401 + +__all__ = [ # noqa + 'compute_fbank_matrix', + 'create_dct', + 'fft_frequencies', + 'hz_to_mel', + 'mel_frequencies', + 'mel_to_hz', + 'power_to_db', + 'get_window', +] diff --git a/python/paddle/audio/functional/functional.py b/python/paddle/audio/functional/functional.py new file mode 100644 index 00000000000000..bb6a7856f429cb --- /dev/null +++ b/python/paddle/audio/functional/functional.py @@ -0,0 +1,327 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# Modified from librosa(https://github.com/librosa/librosa) +import math +from typing import Optional +from typing import Union + +import paddle +from paddle import Tensor + + +def hz_to_mel(freq: Union[Tensor, float], + htk: bool = False) -> Union[Tensor, float]: + """Convert Hz to Mels. + + Args: + freq (Union[Tensor, float]): The input tensor with arbitrary shape. + htk (bool, optional): Use htk scaling. Defaults to False. + + Returns: + Union[Tensor, float]: Frequency in mels. + + Examples: + .. code-block:: python + + import paddle + + val = 3.0 + htk_flag = True + mel_paddle_tensor = paddle.audio.functional.hz_to_mel( + paddle.to_tensor(val), htk_flag) + """ + + if htk: + if isinstance(freq, Tensor): + return 2595.0 * paddle.log10(1.0 + freq / 700.0) + else: + return 2595.0 * math.log10(1.0 + freq / 700.0) + + # Fill in the linear part + f_min = 0.0 + f_sp = 200.0 / 3 + + mels = (freq - f_min) / f_sp + + # Fill in the log-scale part + + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + + if isinstance(freq, Tensor): + target = min_log_mel + paddle.log( + freq / min_log_hz + 1e-10) / logstep # prevent nan with 1e-10 + mask = (freq > min_log_hz).astype(freq.dtype) + mels = target * mask + mels * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if freq >= min_log_hz: + mels = min_log_mel + math.log(freq / min_log_hz + 1e-10) / logstep + + return mels + + +def mel_to_hz(mel: Union[float, Tensor], + htk: bool = False) -> Union[float, Tensor]: + """Convert mel bin numbers to frequencies. + + Args: + mel (Union[float, Tensor]): The mel frequency represented as a tensor with arbitrary shape. + htk (bool, optional): Use htk scaling. Defaults to False. + + Returns: + Union[float, Tensor]: Frequencies in Hz. + + Examples: + .. 
code-block:: python + + import paddle + + val = 3.0 + htk_flag = True + mel_paddle_tensor = paddle.audio.functional.mel_to_hz( + paddle.to_tensor(val), htk_flag) + + """ + if htk: + return 700.0 * (10.0**(mel / 2595.0) - 1.0) + + f_min = 0.0 + f_sp = 200.0 / 3 + freqs = f_min + f_sp * mel + # And now the nonlinear scale + min_log_hz = 1000.0 # beginning of log region (Hz) + min_log_mel = (min_log_hz - f_min) / f_sp # same (Mels) + logstep = math.log(6.4) / 27.0 # step size for log region + if isinstance(mel, Tensor): + target = min_log_hz * paddle.exp(logstep * (mel - min_log_mel)) + mask = (mel > min_log_mel).astype(mel.dtype) + freqs = target * mask + freqs * ( + 1 - mask) # will replace by masked_fill OP in future + else: + if mel >= min_log_mel: + freqs = min_log_hz * math.exp(logstep * (mel - min_log_mel)) + return freqs + + +def mel_frequencies(n_mels: int = 64, + f_min: float = 0.0, + f_max: float = 11025.0, + htk: bool = False, + dtype: str = 'float32') -> Tensor: + """Compute mel frequencies. + + Args: + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. + fmax (float, optional): Maximum frequency in Hz. Defaults to 11025.0. + htk (bool, optional): Use htk scaling. Defaults to False. + dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. + + Returns: + Tensor: Tensor of n_mels frequencies in Hz with shape `(n_mels,)`. + + Examples: + .. code-block:: python + + import paddle + + n_mels = 64 + f_min = 0.5 + f_max = 10000 + htk_flag = True + + paddle_mel_freq = paddle.audio.functional.mel_frequencies( + n_mels, f_min, f_max, htk_flag, 'float64') + """ + # 'Center freqs' of mel bands - uniformly spaced between limits + min_mel = hz_to_mel(f_min, htk=htk) + max_mel = hz_to_mel(f_max, htk=htk) + mels = paddle.linspace(min_mel, max_mel, n_mels, dtype=dtype) + freqs = mel_to_hz(mels, htk=htk) + return freqs + + +def fft_frequencies(sr: int, n_fft: int, dtype: str = 'float32') -> Tensor: + """Compute fourier frequencies. + + Args: + sr (int): Sample rate. + n_fft (int): Number of fft bins. + dtype (str, optional): The data type of the return frequencies. Defaults to 'float32'. + + Returns: + Tensor: FFT frequencies in Hz with shape `(n_fft//2 + 1,)`. + + Examples: + .. code-block:: python + + import paddle + + sr = 16000 + n_fft = 128 + fft_freq = paddle.audio.functional.fft_frequencies(sr, n_fft) + """ + return paddle.linspace(0, float(sr) / 2, int(1 + n_fft // 2), dtype=dtype) + + +def compute_fbank_matrix(sr: int, + n_fft: int, + n_mels: int = 64, + f_min: float = 0.0, + f_max: Optional[float] = None, + htk: bool = False, + norm: Union[str, float] = 'slaney', + dtype: str = 'float32') -> Tensor: + """Compute fbank matrix. + + Args: + sr (int): Sample rate. + n_fft (int): Number of fft bins. + n_mels (int, optional): Number of mel bins. Defaults to 64. + f_min (float, optional): Minimum frequency in Hz. Defaults to 0.0. + f_max (Optional[float], optional): Maximum frequency in Hz. Defaults to None. + htk (bool, optional): Use htk scaling. Defaults to False. + norm (Union[str, float], optional): Type of normalization. Defaults to 'slaney'. + dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. + + Returns: + Tensor: Mel transform matrix with shape `(n_mels, n_fft//2 + 1)`. + + Examples: + .. 
code-block:: python + + import paddle + + sr = 16000 + n_fft = 512 + fbank_matrix = paddle.audio.functional.compute_fbank_matrix(sr, n_fft) + """ + + if f_max is None: + f_max = float(sr) / 2 + + # Initialize the weights + weights = paddle.zeros((n_mels, int(1 + n_fft // 2)), dtype=dtype) + + # Center freqs of each FFT bin + fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft, dtype=dtype) + + # 'Center freqs' of mel bands - uniformly spaced between limits + mel_f = mel_frequencies(n_mels + 2, + f_min=f_min, + f_max=f_max, + htk=htk, + dtype=dtype) + + fdiff = mel_f[1:] - mel_f[:-1] #np.diff(mel_f) + ramps = mel_f.unsqueeze(1) - fftfreqs.unsqueeze(0) + #ramps = np.subtract.outer(mel_f, fftfreqs) + + for i in range(n_mels): + # lower and upper slopes for all bins + lower = -ramps[i] / fdiff[i] + upper = ramps[i + 2] / fdiff[i + 1] + + # .. then intersect them with each other and zero + weights[i] = paddle.maximum(paddle.zeros_like(lower), + paddle.minimum(lower, upper)) + + # Slaney-style mel is scaled to be approx constant energy per channel + if norm == 'slaney': + enorm = 2.0 / (mel_f[2:n_mels + 2] - mel_f[:n_mels]) + weights *= enorm.unsqueeze(1) + elif isinstance(norm, int) or isinstance(norm, float): + weights = paddle.nn.functional.normalize(weights, p=norm, axis=-1) + + return weights + + +def power_to_db(spect: Tensor, + ref_value: float = 1.0, + amin: float = 1e-10, + top_db: Optional[float] = 80.0) -> Tensor: + """Convert a power spectrogram (amplitude squared) to decibel (dB) units. The function computes the scaling `10 * log10(x / ref)` in a numerically stable way. + + Args: + spect (Tensor): STFT power spectrogram. + ref_value (float, optional): The reference value. If smaller than 1.0, the db level of the signal will be pulled up accordingly. Otherwise, the db level is pushed down. Defaults to 1.0. + amin (float, optional): Minimum threshold. Defaults to 1e-10. + top_db (Optional[float], optional): Threshold the output at `top_db` below the peak. Defaults to 80.0. + + Returns: + Tensor: Power spectrogram in dB scale. + + Examples: + .. code-block:: python + + import paddle + + val = 3.0 + decibel_paddle = paddle.audio.functional.power_to_db( + paddle.to_tensor(val)) + """ + if amin <= 0: + raise Exception("amin must be strictly positive") + + if ref_value <= 0: + raise Exception("ref_value must be strictly positive") + + ones = paddle.ones_like(spect) + log_spec = 10.0 * paddle.log10(paddle.maximum(ones * amin, spect)) + log_spec -= 10.0 * math.log10(max(ref_value, amin)) + + if top_db is not None: + if top_db < 0: + raise Exception("top_db must be non-negative") + log_spec = paddle.maximum(log_spec, ones * (log_spec.max() - top_db)) + + return log_spec + + +def create_dct(n_mfcc: int, + n_mels: int, + norm: Optional[str] = 'ortho', + dtype: str = 'float32') -> Tensor: + """Create a discrete cosine transform(DCT) matrix. + + Args: + n_mfcc (int): Number of mel frequency cepstral coefficients. + n_mels (int): Number of mel filterbanks. + norm (Optional[str], optional): Normalization type. Defaults to 'ortho'. + dtype (str, optional): The data type of the return matrix. Defaults to 'float32'. + + Returns: + Tensor: The DCT matrix with shape `(n_mels, n_mfcc)`. + + Examples: + .. 
code-block:: python + + import paddle + n_mfcc = 23 + n_mels = 257 + dct = paddle.audio.functional.create_dct(n_mfcc, n_mels) + """ + n = paddle.arange(n_mels, dtype=dtype) + k = paddle.arange(n_mfcc, dtype=dtype).unsqueeze(1) + dct = paddle.cos(math.pi / float(n_mels) * (n + 0.5) * + k) # size (n_mfcc, n_mels) + if norm is None: + dct *= 2.0 + else: + assert norm == "ortho" + dct[0] *= 1.0 / math.sqrt(2.0) + dct *= math.sqrt(2.0 / float(n_mels)) + return dct.T diff --git a/python/paddle/audio/functional/window.py b/python/paddle/audio/functional/window.py new file mode 100644 index 00000000000000..472c56b87acf95 --- /dev/null +++ b/python/paddle/audio/functional/window.py @@ -0,0 +1,385 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +import math +from typing import List +from typing import Tuple +from typing import Union + +import paddle +from paddle import Tensor + + +class WindowFunctionRegister(object): + def __init__(self): + self._functions_dict = dict() + + def register(self, func=None): + def add_subfunction(func): + name = func.__name__ + self._functions_dict[name] = func + return func + + return add_subfunction + + def get(self, name): + return self._functions_dict[name] + + +window_function_register = WindowFunctionRegister() + + +@window_function_register.register() +def _cat(x: List[Tensor], data_type: str) -> Tensor: + l = [paddle.to_tensor(_, data_type) for _ in x] + return paddle.concat(l) + + +@window_function_register.register() +def _acosh(x: Union[Tensor, float]) -> Tensor: + if isinstance(x, float): + return math.log(x + math.sqrt(x**2 - 1)) + return paddle.log(x + paddle.sqrt(paddle.square(x) - 1)) + + +@window_function_register.register() +def _extend(M: int, sym: bool) -> bool: + """Extend window by 1 sample if needed for DFT-even symmetry.""" + if not sym: + return M + 1, True + else: + return M, False + + +@window_function_register.register() +def _len_guards(M: int) -> bool: + """Handle small or incorrect window lengths.""" + if int(M) != M or M < 0: + raise ValueError('Window length M must be a non-negative integer') + + return M <= 1 + + +@window_function_register.register() +def _truncate(w: Tensor, needed: bool) -> Tensor: + """Truncate window by 1 sample if needed for DFT-even symmetry.""" + if needed: + return w[:-1] + else: + return w + + +@window_function_register.register() +def _general_gaussian( + M: int, p, sig, sym: bool = True, dtype: str = 'float64' +) -> Tensor: + """Compute a window with a generalized Gaussian shape. + This function is consistent with scipy.signal.windows.general_gaussian(). 
+ """ + if _len_guards(M): + return paddle.ones((M,), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 + w = paddle.exp(-0.5 * paddle.abs(n / sig) ** (2 * p)) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _general_cosine( + M: int, a: float, sym: bool = True, dtype: str = 'float64' +) -> Tensor: + """Compute a generic weighted sum of cosine terms window. + This function is consistent with scipy.signal.windows.general_cosine(). + """ + if _len_guards(M): + return paddle.ones((M,), dtype=dtype) + M, needs_trunc = _extend(M, sym) + fac = paddle.linspace(-math.pi, math.pi, M, dtype=dtype) + w = paddle.zeros((M,), dtype=dtype) + for k in range(len(a)): + w += a[k] * paddle.cos(k * fac) + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _general_hamming( + M: int, alpha: float, sym: bool = True, dtype: str = 'float64' +) -> Tensor: + """Compute a generalized Hamming window. + This function is consistent with scipy.signal.windows.general_hamming() + """ + return _general_cosine(M, [alpha, 1.0 - alpha], sym, dtype=dtype) + + +@window_function_register.register() +def _taylor( + M: int, nbar=4, sll=30, norm=True, sym: bool = True, dtype: str = 'float64' +) -> Tensor: + """Compute a Taylor window. + The Taylor window taper function approximates the Dolph-Chebyshev window's + constant sidelobe level for a parameterized number of near-in sidelobes. + """ + if _len_guards(M): + return paddle.ones((M,), dtype=dtype) + M, needs_trunc = _extend(M, sym) + # Original text uses a negative sidelobe level parameter and then negates + # it in the calculation of B. To keep consistent with other methods we + # assume the sidelobe level parameter to be positive. + B = 10 ** (sll / 20) + A = _acosh(B) / math.pi + s2 = nbar**2 / (A**2 + (nbar - 0.5) ** 2) + ma = paddle.arange(1, nbar, dtype=dtype) + + Fm = paddle.empty((nbar - 1,), dtype=dtype) + signs = paddle.empty_like(ma) + signs[::2] = 1 + signs[1::2] = -1 + m2 = ma * ma + for mi in range(len(ma)): + numer = signs[mi] * paddle.prod( + 1 - m2[mi] / s2 / (A**2 + (ma - 0.5) ** 2) + ) + if mi == 0: + denom = 2 * paddle.prod(1 - m2[mi] / m2[mi + 1 :]) + elif mi == len(ma) - 1: + denom = 2 * paddle.prod(1 - m2[mi] / m2[:mi]) + else: + denom = ( + 2 + * paddle.prod(1 - m2[mi] / m2[:mi]) + * paddle.prod(1 - m2[mi] / m2[mi + 1 :]) + ) + + Fm[mi] = numer / denom + + def W(n): + return 1 + 2 * paddle.matmul( + Fm.unsqueeze(0), + paddle.cos(2 * math.pi * ma.unsqueeze(1) * (n - M / 2.0 + 0.5) / M), + ) + + w = W(paddle.arange(0, M, dtype=dtype)) + + # normalize (Note that this is not described in the original text [1]) + if norm: + scale = 1.0 / W((M - 1) / 2) + w *= scale + w = w.squeeze() + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _hamming(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: + """Compute a Hamming window. + The Hamming window is a taper formed by using a raised cosine with + non-zero endpoints, optimized to minimize the nearest side lobe. + """ + return _general_hamming(M, 0.54, sym, dtype=dtype) + + +@window_function_register.register() +def _hann(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: + """Compute a Hann window. + The Hann window is a taper formed by using a raised cosine or sine-squared + with ends that touch zero. 
+ """ + return _general_hamming(M, 0.5, sym, dtype=dtype) + + +@window_function_register.register() +def _tukey( + M: int, alpha=0.5, sym: bool = True, dtype: str = 'float64' +) -> Tensor: + """Compute a Tukey window. + The Tukey window is also known as a tapered cosine window. + """ + if _len_guards(M): + return paddle.ones((M,), dtype=dtype) + + if alpha <= 0: + return paddle.ones((M,), dtype=dtype) + elif alpha >= 1.0: + return hann(M, sym=sym) + + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) + width = int(alpha * (M - 1) / 2.0) + n1 = n[0 : width + 1] + n2 = n[width + 1 : M - width - 1] + n3 = n[M - width - 1 :] + + w1 = 0.5 * (1 + paddle.cos(math.pi * (-1 + 2.0 * n1 / alpha / (M - 1)))) + w2 = paddle.ones(n2.shape, dtype=dtype) + w3 = 0.5 * ( + 1 + + paddle.cos(math.pi * (-2.0 / alpha + 1 + 2.0 * n3 / alpha / (M - 1))) + ) + w = paddle.concat([w1, w2, w3]) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _gaussian( + M: int, std: float, sym: bool = True, dtype: str = 'float64' +) -> Tensor: + """Compute a Gaussian window. + The Gaussian widows has a Gaussian shape defined by the standard deviation(std). + """ + if _len_guards(M): + return paddle.ones((M,), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(0, M, dtype=dtype) - (M - 1.0) / 2.0 + sig2 = 2 * std * std + w = paddle.exp(-(n**2) / sig2) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _exponential( + M: int, center=None, tau=1.0, sym: bool = True, dtype: str = 'float64' +) -> Tensor: + """Compute an exponential (or Poisson) window.""" + if sym and center is not None: + raise ValueError("If sym==True, center must be None.") + if _len_guards(M): + return paddle.ones((M,), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + if center is None: + center = (M - 1) / 2 + + n = paddle.arange(0, M, dtype=dtype) + w = paddle.exp(-paddle.abs(n - center) / tau) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _triang(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: + """Compute a triangular window.""" + if _len_guards(M): + return paddle.ones((M,), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + n = paddle.arange(1, (M + 1) // 2 + 1, dtype=dtype) + if M % 2 == 0: + w = (2 * n - 1.0) / M + w = paddle.concat([w, w[::-1]]) + else: + w = 2 * n / (M + 1.0) + w = paddle.concat([w, w[-2::-1]]) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _bohman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: + """Compute a Bohman window. + The Bohman window is the autocorrelation of a cosine window. + """ + if _len_guards(M): + return paddle.ones((M,), dtype=dtype) + M, needs_trunc = _extend(M, sym) + + fac = paddle.abs(paddle.linspace(-1, 1, M, dtype=dtype)[1:-1]) + w = (1 - fac) * paddle.cos(math.pi * fac) + 1.0 / math.pi * paddle.sin( + math.pi * fac + ) + w = _cat([0, w, 0], dtype) + + return _truncate(w, needs_trunc) + + +@window_function_register.register() +def _blackman(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: + """Compute a Blackman window. + The Blackman window is a taper formed by using the first three terms of + a summation of cosines. It was designed to have close to the minimal + leakage possible. It is close to optimal, only slightly worse than a + Kaiser window. 
+ """ + return _general_cosine(M, [0.42, 0.50, 0.08], sym, dtype=dtype) + + +@window_function_register.register() +def _cosine(M: int, sym: bool = True, dtype: str = 'float64') -> Tensor: + """Compute a window with a simple cosine shape.""" + if _len_guards(M): + return paddle.ones((M,), dtype=dtype) + M, needs_trunc = _extend(M, sym) + w = paddle.sin(math.pi / M * (paddle.arange(0, M, dtype=dtype) + 0.5)) + + return _truncate(w, needs_trunc) + + +def get_window( + window: Union[str, Tuple[str, float]], + win_length: int, + fftbins: bool = True, + dtype: str = 'float64', +) -> Tensor: + """Return a window of a given length and type. + + Args: + window (Union[str, Tuple[str, float]]): The window function applied to the signal before the Fourier transform. Supported window functions: 'hamming', 'hann', 'gaussian', 'general_gaussian', 'exponential', 'triang', 'bohman', 'blackman', 'cosine', 'tukey', 'taylor'. + win_length (int): Number of samples. + fftbins (bool, optional): If True, create a "periodic" window. Otherwise, create a "symmetric" window, for use in filter design. Defaults to True. + dtype (str, optional): The data type of the return window. Defaults to 'float64'. + + Returns: + Tensor: The window represented as a tensor. + + Examples: + .. code-block:: python + + import paddle + + n_fft = 512 + cosine_window = paddle.audio.functional.get_window('cosine', n_fft) + + std = 7 + gaussian_window = paddle.audio.functional.get_window(('gaussian',std), n_fft) + """ + sym = not fftbins + + args = () + if isinstance(window, tuple): + winstr = window[0] + if len(window) > 1: + args = window[1:] + elif isinstance(window, str): + if window in ['gaussian', 'exponential']: + raise ValueError( + "The '" + window + "' window needs one or " + "more parameters -- pass a tuple." + ) + else: + winstr = window + else: + raise ValueError( + "%s as window type is not supported." % str(type(window)) + ) + + try: + winfunc = window_function_register.get('_' + winstr) + except KeyError as e: + raise ValueError("Unknown window type.") from e + + params = (win_length,) + args + kwargs = {'sym': sym} + return winfunc(*params, dtype=dtype, **kwargs) diff --git a/python/paddle/autograd/py_layer.py b/python/paddle/autograd/py_layer.py index 673b047d5a3bad..254aabb04b3cb1 100644 --- a/python/paddle/autograd/py_layer.py +++ b/python/paddle/autograd/py_layer.py @@ -54,16 +54,16 @@ def __init__(self): def save_for_backward(self, *tensors): """ Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors. - - .. note:: - This API should be called at most once, and only inside `forward`. + + Note: + This API should be called at most once, and only inside `forward`. Args: tensors(list of Tensors): Tensors to be stored. Returns: None - + Examples: .. code-block:: python @@ -94,7 +94,7 @@ def saved_tensor(self): Get the tensors stored by ``save_for_backward``. Returns: - list of Tensors or None: If context contains tensors stored by `save_for_backward`, + list of Tensors or None: If context contains tensors stored by `save_for_backward`, then return these tensors, otherwise return None. 
Examples: @@ -124,9 +124,7 @@ def backward(ctx, dy): def with_mateclass(meta, *bases): - class impl(meta): - def __new__(cls, name, temp_bases, attrs): return meta(name, bases, attrs) @@ -134,7 +132,6 @@ def __new__(cls, name, temp_bases, attrs): class CPyLayer(object): - @classmethod @dygraph_only def apply(cls, *args, **kwargs): @@ -147,7 +144,7 @@ def apply(cls, *args, **kwargs): Returns: tensors or other types : output of PyLayer. - + Examples: .. code-block:: python @@ -182,12 +179,14 @@ def backward(ctx, dy): class PyLayerBackward(LegacyPyLayerContext): - def backward(self, *args, **kwargs): with paddle.fluid.dygraph.guard(): with paddle.fluid.dygraph.no_grad(): - if self._amp_state and 'enable' in self._amp_state and self._amp_state[ - 'enable']: + if ( + self._amp_state + and 'enable' in self._amp_state + and self._amp_state['enable'] + ): with auto_cast(**args[0]._amp_state): return self._forward_cls.backward(*args, **kwargs) else: @@ -197,10 +196,10 @@ def backward(self, *args, **kwargs): class LayerMeta(type): - def __init__(cls, name, bases, attrs): - cls._backward_function = type(name + '_backward', (PyLayerBackward, ), - {"_forward_cls": cls}) + cls._backward_function = type( + name + '_backward', (PyLayerBackward,), {"_forward_cls": cls} + ) return super(LayerMeta, cls).__init__(name, bases, attrs) @@ -210,15 +209,15 @@ class LegacyPyLayer(with_mateclass(LayerMeta, CPyLayer)): Build a custom `Layer` by creating subclasses. Subclasses need to follow the following rules: 1. Subclasses contain `forward` and `backward` function. Both forward and backward are @staticmethod. Their first argument should be a context and `None` can not be included in the returned result. - 2. Input of backward contains a context as the first argument, and the rest arguments are the - gradient of forward's output tensors. so the number of backward's input tensors equal to - the number of forward output tensors. If you need the forward's inputs or outputs in `backward`, + 2. Input of backward contains a context as the first argument, and the rest arguments are the + gradient of forward's output tensors. so the number of backward's input tensors equal to + the number of forward output tensors. If you need the forward's inputs or outputs in `backward`, you can use `save_for_backward` to store the required tensors, and then use them in the backward. 3. Output of backward function can only be `Tensor` or tuple/list of `Tensor`. - Output tensors of backward are the gradient of forward's input tensors, + Output tensors of backward are the gradient of forward's input tensors, so the number of backward's output tensors equal to the number of forward input tensors. After building the custom Layer, run it through the `apply` method. - + Examples: .. code-block:: python @@ -259,8 +258,8 @@ def backward(ctx, dy): @staticmethod def forward(ctx, *args, **kwargs): """ - It is to be overloaded by subclasses. It must accept a object of `PyLayerContext` as - the first argument, followed by any number of arguments (tensors or other types). + It is to be overloaded by subclasses. It must accept a object of `PyLayerContext` as + the first argument, followed by any number of arguments (tensors or other types). `None` can not be included in the returned result. Args: @@ -269,7 +268,7 @@ def forward(ctx, *args, **kwargs): Returns: tensors or other types : output of PyLayer. - + Examples: .. 
code-block:: python @@ -292,14 +291,15 @@ def backward(ctx, dy): return grad """ raise NotImplementedError( - "You must implement the forward function for PyLayer.") + "You must implement the forward function for PyLayer." + ) @staticmethod def backward(ctx, *args, **kwargs): """ - This is a function to calculate the gradient. It is to be overloaded by subclasses. - It must accept a object of `PyLayerContext` as the first argument, and the rest - arguments are the gradient of forward's output tensors. Output tensors of backward + This is a function to calculate the gradient. It is to be overloaded by subclasses. + It must accept a object of `PyLayerContext` as the first argument, and the rest + arguments are the gradient of forward's output tensors. Output tensors of backward are the gradient of forward's input tensors. Args: @@ -308,7 +308,7 @@ def backward(ctx, *args, **kwargs): Returns: Tensor or list of Tensors: The gradient of forward's input tensor(s). - + Examples: .. code-block:: python @@ -332,24 +332,24 @@ def backward(ctx, dy): """ raise NotImplementedError( - "You must implement the backward function for PyLayer.") + "You must implement the backward function for PyLayer." + ) class EagerPyLayerContext(object): - def save_for_backward(self, *tensors): """ Saves given tensors that backward need. Use ``saved_tensor`` in the `backward` to get the saved tensors. - - .. note:: - This API should be called at most once, and only inside `forward`. + + Note: + This API should be called at most once, and only inside `forward`. Args: tensors(list of Tensors): Tensors to be stored. Returns: None - + Examples: .. code-block:: python @@ -380,7 +380,7 @@ def saved_tensor(self): Get the tensors stored by ``save_for_backward``. Returns: - list of Tensors or None: If context contains tensors stored by `save_for_backward`, + list of Tensors or None: If context contains tensors stored by `save_for_backward`, then return these tensors, otherwise return None. Examples: @@ -410,11 +410,11 @@ def backward(ctx, dy): def mark_not_inplace(self, *args): """ Marks inputs as not inplace. - This should be called at most once, only from inside the `forward` method, + This should be called at most once, only from inside the `forward` method, and all arguments should be Tensor inputs. - If the Tensor returned by `forward` method is the same as the Tensor input of forward, - and this Tensor is marked as not_inplace, then Paddle will help the user create a new Tensor as output. + If the Tensor returned by `forward` method is the same as the Tensor input of forward, + and this Tensor is marked as not_inplace, then Paddle will help the user create a new Tensor as output. Thereby preventing the auto grad information of the input Tensor from being overwritten. Examples: @@ -427,7 +427,7 @@ class Exp(paddle.autograd.PyLayer): def forward(ctx, x): ctx.mark_not_inplace(x) return x - + @staticmethod def backward(ctx, grad_output): out = grad_output.exp() @@ -438,7 +438,7 @@ def backward(ctx, grad_output): attn_layers = [] for idx in range(0, 2): attn_layers.append(Exp()) - + for step in range(0, 2): a = x for j in range(0,2): @@ -450,7 +450,7 @@ def backward(ctx, grad_output): def mark_non_differentiable(self, *args): """ Marks outputs as non-differentiable. - This should be called at most once, only from inside the `forward` method, + This should be called at most once, only from inside the `forward` method, and all arguments should be tensor outputs. 
This will mark outputs as not requiring gradients, increasing the @@ -542,30 +542,27 @@ def backward(ctx, grad, grad2): class EagerPyLayerBackward(core.eager.PyLayer, EagerPyLayerContext): - def backward(self, *args): return self._forward_cls.backward(self, *args) class EagerPyLayerMeta(type): - def __init__(cls, name, bases, attrs): - cls._backward_function = type(name + '_backward', - (EagerPyLayerBackward, ), - {"_forward_cls": cls}) + cls._backward_function = type( + name + '_backward', (EagerPyLayerBackward,), {"_forward_cls": cls} + ) return super(EagerPyLayerMeta, cls).__init__(name, bases, attrs) class EagerPyLayer( - with_mateclass(EagerPyLayerMeta, core.eager.PyLayer, - EagerPyLayerContext)): - + with_mateclass(EagerPyLayerMeta, core.eager.PyLayer, EagerPyLayerContext) +): @staticmethod def forward(ctx, *args, **kwargs): """ - It is to be overloaded by subclasses. It must accept a object of `PyLayerContext` as - the first argument, followed by any number of arguments (tensors or other types). + It is to be overloaded by subclasses. It must accept a object of `PyLayerContext` as + the first argument, followed by any number of arguments (tensors or other types). `None` can not be included in the returned result. Args: @@ -574,7 +571,7 @@ def forward(ctx, *args, **kwargs): Returns: tensors or other types : output of PyLayer. - + Examples: .. code-block:: python @@ -597,14 +594,15 @@ def backward(ctx, dy): return grad """ raise NotImplementedError( - "You must implement the forward function for PyLayer.") + "You must implement the forward function for PyLayer." + ) @staticmethod def backward(ctx, *args): """ - This is a function to calculate the gradient. It is to be overloaded by subclasses. - It must accept a object of `PyLayerContext` as the first argument, and the rest - arguments are the gradient of forward's output tensors. Output tensors of backward + This is a function to calculate the gradient. It is to be overloaded by subclasses. + It must accept a object of `PyLayerContext` as the first argument, and the rest + arguments are the gradient of forward's output tensors. Output tensors of backward are the gradient of forward's input tensors. Args: @@ -613,7 +611,7 @@ def backward(ctx, *args): Returns: Tensor or list of Tensors: The gradient of forward's input tensor(s). - + Examples: .. code-block:: python @@ -637,11 +635,11 @@ def backward(ctx, dy): """ raise NotImplementedError( - "You must implement the backward function for PyLayer.") + "You must implement the backward function for PyLayer." + ) def once_differentiable(backward): - def wrapper(ctx, *args): with paddle.fluid.dygraph.no_grad(): outputs = backward(ctx, *args) diff --git a/python/paddle/device/cuda/__init__.py b/python/paddle/device/cuda/__init__.py index d867f071229925..316f9de6122654 100644 --- a/python/paddle/device/cuda/__init__.py +++ b/python/paddle/device/cuda/__init__.py @@ -42,12 +42,12 @@ def current_stream(device=None): Return the current CUDA stream by the device. Parameters: - device(paddle.CUDAPlace()|int, optional): The device or the ID of the device which want to get stream from. + device(paddle.CUDAPlace()|int, optional): The device or the ID of the device which want to get stream from. If device is None, the device is the current device. Default: None. - + Returns: CUDAStream: the stream to the device. - + Examples: .. code-block:: python @@ -82,7 +82,7 @@ def synchronize(device=None): Parameters: device(paddle.CUDAPlace()|int, optional): The device or the ID of the device. 
If device is None, the device is the current device. Default: None. - + Examples: .. code-block:: python @@ -111,7 +111,7 @@ def synchronize(device=None): def device_count(): ''' Return the number of GPUs available. - + Returns: int: the number of GPUs available. @@ -124,8 +124,11 @@ def device_count(): ''' - num_gpus = core.get_cuda_device_count() if hasattr( - core, 'get_cuda_device_count') else 0 + num_gpus = ( + core.get_cuda_device_count() + if hasattr(core, 'get_cuda_device_count') + else 0 + ) return num_gpus @@ -158,14 +161,14 @@ def extract_cuda_device_id(device, op_name): Return the id of the given cuda device. It is just a utility that will not be exposed to users. Args: - device(paddle.CUDAPlace or int or str): The device, the id of the device or + device(paddle.CUDAPlace or int or str): The device, the id of the device or the string name of device like 'gpu:x'. Default: None. Return: int: The id of the given device. If device is None, return the id of current device. ''' - if (device is None): + if device is None: return core.get_cuda_current_device_id() if isinstance(device, int): @@ -178,15 +181,19 @@ def extract_cuda_device_id(device, op_name): else: raise ValueError( "The current string {} is not expected. Because {} only support string which is like 'gpu:x'. " - "Please input appropriate string again!".format( - device, op_name)) + "Please input appropriate string again!".format(device, op_name) + ) else: raise ValueError( "The device type {} is not expected. Because {} only support int, str or paddle.CUDAPlace. " - "Please input appropriate device again!".format(device, op_name)) + "Please input appropriate device again!".format(device, op_name) + ) - assert device_id >= 0, f"The device id must be not less than 0, but got id = {device_id}." - assert device_id < device_count( + assert ( + device_id >= 0 + ), f"The device id must be not less than 0, but got id = {device_id}." + assert ( + device_id < device_count() ), f"The device id {device_id} exceeds gpu card number {device_count()}" return device_id @@ -196,13 +203,13 @@ def max_memory_allocated(device=None): ''' Return the peak size of gpu memory that is allocated to tensor of the given device. - .. note:: - The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may larger than the memory size that tensor actually need. + Note: + The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may larger than the memory size that tensor actually need. For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes. Args: - device(paddle.CUDAPlace or int or str): The device, the id of the device or - the string name of device like 'gpu:x'. If device is None, the device is the current device. + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. Default: None. Return: @@ -232,8 +239,8 @@ def max_memory_reserved(device=None): Return the peak size of GPU memory that is held by the allocator of the given device. Args: - device(paddle.CUDAPlace or int or str): The device, the id of the device or - the string name of device like 'gpu:x'. If device is None, the device is the current device. + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. Default: None. 
Return: @@ -262,13 +269,13 @@ def memory_allocated(device=None): ''' Return the current size of gpu memory that is allocated to tensor of the given device. - .. note:: - The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need. - For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes. + Note: + The size of GPU memory allocated to tensor is 256-byte aligned in Paddle, which may be larger than the memory size that tensor actually need. + For instance, a float32 tensor with shape [1] in GPU will take up 256 bytes memory, even though storing a float32 data requires only 4 bytes. Args: - device(paddle.CUDAPlace or int or str): The device, the id of the device or - the string name of device like 'gpu:x'. If device is None, the device is the current device. + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. Default: None. Return: @@ -298,14 +305,14 @@ def memory_reserved(device=None): Return the current size of GPU memory that is held by the allocator of the given device. Args: - device(paddle.CUDAPlace or int or str): The device, the id of the device or - the string name of device like 'gpu:x'. If device is None, the device is the current device. + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x'. If device is None, the device is the current device. Default: None. Return: int: The current size of GPU memory that is held by the allocator of the given device, in bytes. - Examples: + Examples: .. code-block:: python # required: gpu @@ -389,18 +396,18 @@ def get_device_properties(device=None): Return the properties of given device. Args: - device(paddle.CUDAPlace or int or str): The device, the id of the device or - the string name of device like 'gpu:x' which to get the properties of the - device from. If device is None, the device is the current device. + device(paddle.CUDAPlace or int or str): The device, the id of the device or + the string name of device like 'gpu:x' which to get the properties of the + device from. If device is None, the device is the current device. Default: None. Returns: - _gpuDeviceProperties: The properties of the device which include ASCII string - identifying device, major compute capability, minor compute capability, global + _gpuDeviceProperties: The properties of the device which include ASCII string + identifying device, major compute capability, minor compute capability, global memory available and the number of multiprocessors on the device. Examples: - + .. code-block:: python # required: gpu @@ -424,7 +431,8 @@ def get_device_properties(device=None): raise ValueError( "The API paddle.device.cuda.get_device_properties is not supported in " "CPU-only PaddlePaddle. Please reinstall PaddlePaddle with GPU support " - "to call this API.") + "to call this API." + ) if device is not None: if isinstance(device, int): @@ -438,12 +446,14 @@ def get_device_properties(device=None): raise ValueError( "The current string {} is not expected. Because paddle.device." "cuda.get_device_properties only support string which is like 'gpu:x'. " - "Please input appropriate string again!".format(device)) + "Please input appropriate string again!".format(device) + ) else: raise ValueError( "The device type {} is not expected. 
Because paddle.device.cuda." "get_device_properties only support int, str or paddle.CUDAPlace. " - "Please input appropriate device again!".format(device)) + "Please input appropriate device again!".format(device) + ) else: device_id = -1 @@ -484,7 +494,7 @@ def get_device_capability(device=None): Return the major and minor revision numbers defining the device's compute capability which are got from CUDA function `cudaDeviceProp `_. Parameters: - device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device. + device(paddle.CUDAPlace|int, optional): The device or the ID of the device. If device is None (default), the device is the current device. Returns: tuple(int,int): the major and minor revision numbers defining the device's compute capability. diff --git a/python/paddle/distributed/__init__.py b/python/paddle/distributed/__init__.py index bf9773ad9409fe..43f819dd770a13 100644 --- a/python/paddle/distributed/__init__.py +++ b/python/paddle/distributed/__init__.py @@ -51,7 +51,7 @@ from .collective import P2POp # noqa: F401 from .collective import reduce_scatter # noqa: F401 -from .communication import * # noqa: F401 +from .communication import stream from .auto_parallel import shard_op # noqa: F401 from .auto_parallel import shard_tensor # noqa: F401 @@ -65,17 +65,44 @@ from paddle.fluid.dygraph.parallel import ParallelEnv # noqa: F401 from . import cloud_utils # noqa: F401 -from . import utils # noqa: F401 -from .sharding import * # noqa: F401 +from .sharding import group_sharded_parallel, save_group_sharded_model __all__ = [ # noqa - "spawn", "launch", "scatter", "broadcast", "ParallelEnv", "new_group", - "init_parallel_env", "gloo_init_parallel_env", "gloo_barrier", - "gloo_release", "QueueDataset", "split", "CountFilterEntry", - "ShowClickEntry", "get_world_size", "get_group", "all_gather", - "all_gather_object", "InMemoryDataset", "barrier", "all_reduce", "alltoall", - "send", "reduce", "recv", "ReduceOp", "wait", "get_rank", - "ProbabilityEntry", "ParallelMode", "is_initialized", "isend", "irecv", - "reduce_scatter" + "spawn", + "launch", + "scatter", + "broadcast", + "ParallelEnv", + "new_group", + "init_parallel_env", + "gloo_init_parallel_env", + "gloo_barrier", + "gloo_release", + "QueueDataset", + "split", + "CountFilterEntry", + "ShowClickEntry", + "get_world_size", + "get_group", + "all_gather", + "all_gather_object", + "InMemoryDataset", + "barrier", + "all_reduce", + "alltoall", + "alltoall_single", + "send", + "reduce", + "recv", + "ReduceOp", + "wait", + "get_rank", + "ProbabilityEntry", + "ParallelMode", + "is_initialized", + "destroy_process_group", + "isend", + "irecv", + "reduce_scatter", ] diff --git a/python/paddle/distributed/auto_parallel/__init__.py b/python/paddle/distributed/auto_parallel/__init__.py index 4dc68edfe2d553..269a0ec644dbd2 100644 --- a/python/paddle/distributed/auto_parallel/__init__.py +++ b/python/paddle/distributed/auto_parallel/__init__.py @@ -12,10 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .interface import shard_tensor # noqa: F401 -from .interface import shard_op # noqa: F401 +from .strategy import Strategy from .process_mesh import ProcessMesh -from .reshard import Resharder # noqa: F401 -from .cost_model import estimate_cost +from .engine import Engine +from .interface import shard_tensor +from .interface import shard_op +from .interface import recompute +from .interface import fetch __all__ = [] diff --git a/python/paddle/distributed/auto_parallel/callbacks.py b/python/paddle/distributed/auto_parallel/callbacks.py new file mode 100644 index 00000000000000..17ce5bd71b8168 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/callbacks.py @@ -0,0 +1,226 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import time + +import paddle +from paddle.hapi.callbacks import ProgBarLogger, ModelCheckpoint, LRScheduler, CallbackList, Callback +from .interface import CollectionNames, get_collection + + +def config_callbacks(callbacks=None, + engine=None, + batch_size=None, + epochs=None, + steps=None, + log_freq=2, + verbose=2, + save_freq=1, + save_dir=None, + metrics=None, + acc_step=1, + mode='train'): + cbks = callbacks or [] + cbks = cbks if isinstance(cbks, (list, tuple)) else [cbks] + + if not any(isinstance(k, ProgBarLogger) for k in cbks) and verbose: + cbks = [ProgBarLoggerAuto(log_freq, verbose=verbose)] + cbks + + if not any(isinstance(k, LRScheduler) for k in cbks): + cbks = [LRSchedulerAuto()] + cbks + + if not any(isinstance(k, ModelCheckpoint) for k in cbks): + cbks = cbks + [ModelCheckpointAuto(save_freq, save_dir)] + + if not any(isinstance(k, Profiler) for k in cbks) and verbose == 3: + cbks = cbks + [Profiler(timer_only=True)] + + if not any(isinstance(k, History) for k in cbks): + cbks = cbks + [History()] + + for i, k in enumerate(cbks): + if isinstance(k, ProgBarLogger): + cbks[i] = ProgBarLoggerAuto(k.log_freq, k.verbose) + if isinstance(k, LRScheduler): + cbks[i] = LRSchedulerAuto(k.by_step, k.by_epoch) + if isinstance(k, ModelCheckpoint): + cbks[i] = ModelCheckpointAuto(k.save_freq, k.save_dir) + + cbk_list = CallbackList(cbks) + cbk_list.set_model(engine) + metrics = metrics or [] if mode != 'test' else [] + params = { + 'batch_size': batch_size, + 'epochs': epochs, + 'steps': steps, + 'verbose': verbose, + 'metrics': metrics, + 'acc_step': acc_step, + } + cbk_list.set_params(params) + return cbk_list + + +class ProgBarLoggerAuto(ProgBarLogger): + + def __init__(self, log_freq=1, verbose=2): + super(ProgBarLoggerAuto, self).__init__(log_freq, verbose) + + def _is_print(self): + return True + + def _updates(self, logs, mode): + values = [] + metrics = getattr(self, '%s_metrics' % (mode)) + progbar = getattr(self, '%s_progbar' % (mode)) + steps = getattr(self, '%s_step' % (mode)) + + for k in metrics: + if k in logs: + values.append((k, logs[k])) + + if 'lr' in logs: + values.append(('lr', logs['lr'])) + + fetches_logs = logs.get('fetches', {}) + collect_logging = 
get_collection(CollectionNames.LOGGING) + for name, var in collect_logging: + k = name or var.name + if k in fetches_logs: + values.append((k, fetches_logs[k])) + + out_logs = logs.get('outputs', {}) + for k in out_logs: + values.append((k, out_logs[k])) + + if self.verbose == 3 and hasattr(self, '_%s_timer' % (mode)): + timer = getattr(self, '_%s_timer' % (mode)) + cnt = timer['count'] if timer['count'] > 0 else 1.0 + samples = timer['samples'] if timer['samples'] > 0 else 1.0 + values.append( + ('avg_reader_cost', "%.5f sec" % (timer['data_time'] / cnt))) + values.append( + ('avg_batch_cost', "%.5f sec" % (timer['batch_time'] / cnt))) + values.append( + ('ips', "%.5f samples/sec" % + (samples / (timer['data_time'] + timer['batch_time'])))) + timer['count'] = 0 + timer['samples'] = 0 + timer['data_time'] = 0. + timer['batch_time'] = 0. + + progbar.update(steps, values) + + def on_eval_batch_end(self, step, logs=None): + logs = logs or {} + self.eval_step += 1 + samples = self.params['batch_size'] + self.evaled_samples += samples + + self._eval_timer['batch_time'] += ( + time.time() - self._eval_timer['batch_data_end_time']) + self._eval_timer['count'] += 1 + samples = self.params['batch_size'] + self._eval_timer['samples'] += samples + + if self._is_print() and self.eval_step % self.log_freq == 0: + if self.eval_steps is None or self.eval_step < self.eval_steps: + self._updates(logs, 'eval') + + self._eval_timer['batch_start_time'] = time.time() + + +class LRSchedulerAuto(LRScheduler): + + def __init__(self, by_step=True, by_epoch=False): + super(LRSchedulerAuto, self).__init__(by_step, by_epoch) + + def on_epoch_begin(self, epoch=None, logs=None): + self.acc_step = self.params["acc_step"] + self.epoch = epoch + self.train_step = 0 + + def on_train_batch_end(self, step, logs=None): + self.train_step += 1 + + if self.by_step and self.train_step % self.acc_step == 0: + if self.model._optimizer and \ + hasattr(self.model._optimizer, '_learning_rate') and \ + isinstance(self.model._optimizer._learning_rate, + paddle.optimizer.lr.LRScheduler): + self.model._optimizer._learning_rate.step() + + +class History(Callback): + + def __init__(self): + self.history = {} + + def on_train_begin(self, logs=None): + self.epoch = [] + + def on_epoch_end(self, epoch, logs=None): + logs = logs or {} + self.epoch.append(epoch) + for k, v in logs.items(): + self.history.setdefault(k, []).append(v) + + self.model.history = self + + +class Profiler(Callback): + + def __init__(self, *args, **kwargs): + self.prof = paddle.profiler.Profiler(*args, **kwargs) + + def on_epoch_begin(self, epoch=None, logs=None): + self.epoch = epoch + self.train_step = 0 + self.batch_size = self.params["batch_size"] + self.steps = self.params['steps'] + + def on_train_begin(self, logs=None): + self.prof.start() + + def on_train_batch_end(self, step, logs=None): + self.train_step += 1 + self.prof.step(num_samples=self.batch_size) + print("step {}:{}".format(self.train_step, + self.prof.step_info(unit='samples'))) + + def on_train_end(self, logs=None): + self.prof.stop() + self.prof.summary() + + +class ModelCheckpointAuto(ModelCheckpoint): + + def __init__(self, *args, **kwargs): + super(ModelCheckpointAuto, self).__init__(*args, **kwargs) + + def _is_save(self): + return self.model and self.save_dir + + def on_epoch_end(self, epoch, logs=None): + if self._is_save() and (self.epoch + 1) % self.save_freq == 0: + path = '{}/epoch{}'.format(self.save_dir, epoch) + print('save checkpoint at {}'.format(os.path.abspath(path))) + 
self.model.save(path) + + def on_train_end(self, logs=None): + if self._is_save(): + path = '{}/final'.format(self.save_dir) + print('save checkpoint at {}'.format(os.path.abspath(path))) + self.model.save(path) diff --git a/python/paddle/distributed/auto_parallel/completion.py b/python/paddle/distributed/auto_parallel/completion.py index 1775a823c57b6f..1717f3e011c62b 100644 --- a/python/paddle/distributed/auto_parallel/completion.py +++ b/python/paddle/distributed/auto_parallel/completion.py @@ -15,11 +15,12 @@ import copy from copy import deepcopy import time +from numpy import sort from paddle.fluid import core from paddle.fluid import framework -from .utils import print_program_with_dist_attr, is_gradient_clip_op +from .utils import is_gradient_clip_op, __not_shape_var_type__ from .operators import find_compatible_distributed_operator_impls from .dist_context import get_default_distributed_context, _node_id from .dist_tensor import DistributedTensor @@ -142,6 +143,7 @@ class Completer: def __init__(self, dist_context): assert dist_context is not None self._dist_context = dist_context + self._has_prepared = False def _update_tensor_node_dims_mapping(self, tensor_node, fwd=True): changed = False @@ -366,7 +368,14 @@ def _update_dims_mapping_between_graphs(self): def _update_dims_mapping_for_special(self): # Set the dims_mapping of a tensor to the dims_mapping inside the op which produces it op_nodes = self._dist_context._serial_ordered_op_nodes + # NOTE: this list may be changed if Paddle changes the existing rules. + related_reader_ops = [ + "create_py_reader", "create_double_buffer_reader", "read" + ] for op_node in op_nodes: + if op_node.op() is not None \ + and op_node.op().type() in related_reader_ops: + continue op_dist_attr = self._dist_context.get_dist_attr_for_graph(op_node) for tensor_node in op_node.outputs: if tensor_node.is_var() and tensor_node.var() is not None: @@ -406,6 +415,7 @@ def _update_dims_mapping(self): reach_fix_point = False else: reach_fix_point = True + # NOTE: this will be removed after changing the reshard rule self._update_dims_mapping_for_special() def _update_process_mesh_by_nearest(self, op_node, nearest_op_node): @@ -494,14 +504,14 @@ def _find_nodes_related_to_cond(source_node): for tensor_node in node.inputs: if tensor_node.is_var() and tensor_node.var( ) is not None: - if tensor_node.var().type() == core.VarDesc.VarType.READER \ + if tensor_node.var().type() in __not_shape_var_type__ \ or len(tensor_node.var().shape()) != 1: flag = False break for tensor_node in node.outputs: if tensor_node.is_var() and tensor_node.var( ) is not None: - if tensor_node.var().type() == core.VarDesc.VarType.READER \ + if tensor_node.var().type() in __not_shape_var_type__ \ or len(tensor_node.var().shape()) != 1: flag = False break @@ -719,6 +729,20 @@ def _update_process_mesh(self): self._update_process_mesh_between_graphs() def _prepare(self): + def _find_nearest_parent_nodes(sorted_parent_nodes, child_idx): + before_node = None + after_node = None + pos = -1 + for pos, (parent_idx, parent_node) in enumerate(sorted_parent_nodes): + if parent_idx > child_idx: + after_node = parent_node + break + if pos > 0: + _, before_node = sorted_parent_nodes[pos - 1] + return before_node, after_node + + if self._has_prepared: + return self._while_op_nodes = {} self._array_nodes = {} self._node_pairs_between_graphs = [] @@ -732,26 +756,32 @@ def _prepare(self): if self._array_nodes.get(array_var_name, None) is None: self._array_nodes[array_var_name] = [] 
self._array_nodes[array_var_name].append(node) + # Add the array input node + self._array_nodes[array_var_name].append(node.inputs[0]) if node.op().type() == "write_to_array": array_var_name = node.op().output("Out")[0] if self._array_nodes.get(array_var_name, None) is None: self._array_nodes[array_var_name] = [] self._array_nodes[array_var_name].append(node) self._array_nodes[array_var_name].append(node.outputs[0]) + # TODO: Use dict and name as the key to store the nodes, + # and use the id comparsion to deal with the before or after position if node.is_var() and node.var() is not None: if node.node.graph_id() != 0: - for before_node in reversed(all_nodes[:idx]): - if before_node.is_var() and before_node.var() is not None \ - and before_node.node.graph_id() == node.node.graph_id() - 1 \ - and before_node.var().name() == node.var().name(): - self._node_pairs_between_graphs.append( - (before_node, node)) - for after_node in all_nodes[idx + 1:]: - if after_node.is_var() and after_node.var() is not None \ - and after_node.node.graph_id() == node.node.graph_id() - 1 \ - and after_node.var().name() == node.var().name(): + parent_nodes = self._dist_context._tensor_nodes_with_same_name[node.node.graph_id() - 1].get(node.var().name(), None) + if parent_nodes is not None: + sorted_parent_nodes = sorted(parent_nodes, key=lambda x: x[0]) + # before_node, after_node = _find_nearest_parent_nodes(sorted_parent_nodes, idx) + # if before_node is not None: + # self._node_pairs_between_graphs.append( + # (before_node, node)) + # if after_node is not None: + # self._node_pairs_between_graphs.append( + # (after_node, node)) + for _, parent_node in sorted_parent_nodes: self._node_pairs_between_graphs.append( - (after_node, node)) + (parent_node, node)) + self._has_prepared = True def complete_forward_annotation(self, serial_main_program=None): """ Complete annotation for the partial annotated serial_main_program. 
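The _prepare() hunk above replaces the old linear scan over all_nodes with a name-indexed lookup when pairing a sub-graph tensor node with its same-named counterparts in the parent graph. Below is a minimal standalone sketch of that pairing step, assuming (as the diff implies) that _tensor_nodes_with_same_name is a list indexed by graph id whose entries map a variable name to (position, node) pairs; the helper name and the sample data are hypothetical, not part of the Paddle API.

    # Sketch only: illustrates the sorted, name-indexed pairing used in _prepare().
    def pair_with_parent_graph(tensor_nodes_with_same_name, child_graph_id, var_name, child_node):
        # Look up nodes with the same variable name in the parent graph (graph_id - 1).
        parent_entries = tensor_nodes_with_same_name[child_graph_id - 1].get(var_name)
        if parent_entries is None:
            return []
        # Sort by position in the ordered node list so pairs follow program order,
        # then pair every parent-graph node with the child-graph node.
        pairs = []
        for _, parent_node in sorted(parent_entries, key=lambda entry: entry[0]):
            pairs.append((parent_node, child_node))
        return pairs

    # Hypothetical usage: graph 1 reads variable "x", which appears in graph 0 at positions 3 and 9.
    lookup = [{"x": [(9, "x@graph0#9"), (3, "x@graph0#3")]}, {}]
    print(pair_with_parent_graph(lookup, 1, "x", "x@graph1#5"))
    # -> [('x@graph0#3', 'x@graph1#5'), ('x@graph0#9', 'x@graph1#5')]

Compared with the removed code, which re-walked all_nodes before and after every sub-graph variable, the dictionary lookup avoids the quadratic scan while producing the same (parent_node, child_node) pairs.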
@@ -773,14 +803,22 @@ def complete_forward_annotation(self, serial_main_program=None): # self._dist_context.validate_dist_attr_for_program() + start_time = time.time() self._prepare() + print("bot-completion-prepare: ", time.time() - start_time, flush=True) + start_time = time.time() self._update_process_mesh() + print("bot-completion-mesh: ", time.time() - start_time, flush=True) + start_time = time.time() self._update_dims_mapping() + print("bot-graph-dims: ", time.time() - start_time, flush=True) + start_time = time.time() # Copy the corresponding distributed attribute from graph to serial_main_program self._dist_context.copy_dist_attr_from_graph_to_program() + print("bot-completion-copy: ", time.time() - start_time, flush=True) else: self._dist_context.initialize(with_graph=False) @@ -899,6 +937,72 @@ def _update_dist_attr_for_dp(self): else: dist_op.dist_attr = original_op_dist_attr + def _complete_tensor_dist_attr_by_op(self, serial_main_program=None): + if serial_main_program is None: + serial_main_program = self._dist_context.serial_main_program + else: + self._dist_context._serial_main_program = serial_main_program + + self._dist_context.initialize() + + self._prepare() + + has_set_dist_attr = set() + + all_nodes = self._dist_context.serial_ordered_nodes + for node in all_nodes: + if node.is_op(): + if node.op().type() in ["while"]: + continue + dist_op = self._dist_context.get_dist_op_for_graph(node) + op_dist_attr = dist_op.dist_attr + for tensor_node in node.inputs: + if tensor_node.is_var() and tensor_node.var() is not None: + # Skip the non-leaf var node + if len(tensor_node.inputs) != 0: + continue + tensor_desc = tensor_node.var() + tensor_name = tensor_desc.name() + tensor = dist_op.get_serial_input(tensor_name) + # Use the first op to set the tensor dist attr + if tensor_name in has_set_dist_attr: + continue + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + tensor_dist_attr.process_mesh = op_dist_attr.process_mesh + tensor_dist_attr.dims_mapping = op_dist_attr.get_input_dims_mapping( + tensor_name) if tensor.is_parameter else [ + -1 for i in tensor_desc.shape() + ] + has_set_dist_attr.add(tensor_name) + for tensor_node in node.outputs: + if tensor_node.is_var() and tensor_node.var() is not None: + tensor_name = tensor_node.var().name() + if tensor_name in has_set_dist_attr: + continue + tensor_dist_attr = self._dist_context.get_tensor_dist_attr_for_graph( + tensor_node) + tensor_dist_attr.process_mesh = op_dist_attr.process_mesh + tensor_dist_attr.dims_mapping = op_dist_attr.get_output_dims_mapping( + tensor_name) + has_set_dist_attr.add(tensor_name) + + self._update_process_mesh_for_specials() + + self._update_process_mesh_between_graphs() + + self._update_dims_mapping_for_special() + + self._update_dims_mapping_between_graphs() + + # Copy the corresponding distributed attribute from graph to serial_main_program + self._dist_context.copy_dist_attr_from_graph_to_program() + + # Do the validation check and amend some completion + self._dist_context.amend_dist_attr_for_program() + + self._dist_context.validate_dist_attr_for_program() + def _complete_high_order_grad_annotation(self, serial_main_program=None): """ NOTE: @@ -1037,7 +1141,7 @@ def _get_op_by_id(ops, id): grad_op_dist_attr.set_output_dims_mapping( output_name, ref_fwd_dims_mapping) - elif grad_op.type == 'fill_zeros_like': + elif grad_op.type == 'fill_any_like': ref_var_name = grad_op.input_arg_names[0] ref_var = vars[ref_var_name] ref_dist_attr = 
self._dist_context.get_tensor_dist_attr_for_program( @@ -1274,7 +1378,7 @@ def _get_op_by_id(ops, id): grad_op_dist_attr.impl_type = "default" grad_op_dist_attr.impl_idx = 0 - elif grad_op.type == 'fill_zeros_like': + elif grad_op.type == 'fill_any_like': ref_var_name = grad_op.input_arg_names[0] ref_var = vars[ref_var_name] ref_dist_attr = self._dist_context.get_tensor_dist_attr_for_program( diff --git a/python/paddle/distributed/auto_parallel/constants.py b/python/paddle/distributed/auto_parallel/constants.py new file mode 100644 index 00000000000000..44d804a481683d --- /dev/null +++ b/python/paddle/distributed/auto_parallel/constants.py @@ -0,0 +1,135 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +from collections import defaultdict + +# _g_default_config[category][field] = default_value +_g_default_config = defaultdict(dict) + + +def get_category_default_config(category): + return _g_default_config[category] + + +def set_category_default_config(category, default_value): + _g_default_config[category] = default_value + + +def get_field_default_config(category, field): + return _g_default_config[category][field] + + +def set_field_default_config(category, field, default_value): + _g_default_config[category][field] = default_value + + +NOT_FOUND = "not_found" + +######################################### +# base configuration +######################################### +BASE = "base" +set_field_default_config(BASE, "auto_mode", "semi") +set_field_default_config(BASE, "gradient_scale", True) +set_field_default_config(BASE, "use_cache", True) +set_field_default_config(BASE, "return_numpy", True) +set_field_default_config(BASE, "all_ranks", False) +set_field_default_config(BASE, "split_data", True) +set_field_default_config(BASE, "seed", None) +set_field_default_config(BASE, "reinit", False) # Only for debug + +######################################### +# recompute configuration +######################################### +RECOMPUTE = "recompute" +set_field_default_config(RECOMPUTE, "enable", False) +set_field_default_config(RECOMPUTE, "checkpoints", None) +set_field_default_config(RECOMPUTE, "enable_tuning", False) + +######################################### +# AMP configuration +######################################### +AMP = "amp" +set_field_default_config(AMP, "enable", False) +set_field_default_config(AMP, "init_loss_scaling", 32768.0) +set_field_default_config(AMP, "incr_every_n_steps", 1000) +set_field_default_config(AMP, "decr_every_n_nan_or_inf", 2) +set_field_default_config(AMP, "incr_ratio", 2.0) +set_field_default_config(AMP, "decr_ratio", 0.8) +set_field_default_config(AMP, "use_dynamic_loss_scaling", True) +set_field_default_config(AMP, "custom_white_list", []) +set_field_default_config(AMP, "custom_black_list", []) +set_field_default_config(AMP, "custom_black_varnames", []) +set_field_default_config(AMP, "use_pure_fp16", False) +set_field_default_config(AMP, "use_fp16_guard", True) +set_field_default_config(AMP, 
"use_optimizer_fp16", False) + +######################################### +# sharding configuration +######################################### +SHARDING = "sharding" +set_field_default_config(SHARDING, "enable", False) +set_field_default_config(SHARDING, "stage", 1) +set_field_default_config(SHARDING, "degree", 8) +set_field_default_config(SHARDING, "segment_broadcast_MB", 32.0) +set_field_default_config(SHARDING, "enable_tuning", False) +set_field_default_config(SHARDING, "tuning_range", []) + +######################################### +# gradient merge configuration +######################################### +GRADIENT_MERGE = "gradient_merge" +set_field_default_config(GRADIENT_MERGE, "enable", False) +set_field_default_config(GRADIENT_MERGE, "k_steps", 1) +set_field_default_config(GRADIENT_MERGE, "avg", True) + +######################################### +# pipeline configuration +######################################### +PIPELINE = "pipeline" +set_field_default_config(PIPELINE, "enable", False) +set_field_default_config(PIPELINE, "schedule_mode", "1F1B") +set_field_default_config(PIPELINE, "micro_batch_size", 1) +set_field_default_config(PIPELINE, "accumulate_steps", 1) +set_field_default_config(PIPELINE, "generation_batch_size", 1) + +######################################### +# quantization configuration +######################################### +QAT = "qat" +set_field_default_config(QAT, "enable", False) +set_field_default_config(QAT, "channel_wise_abs_max", True) +set_field_default_config(QAT, "weight_bits", 8) +set_field_default_config(QAT, "activation_bits", 8) +set_field_default_config(QAT, "not_quant_pattern", ['skip_quant']) +set_field_default_config(QAT, "algo", None) + +# ######################################### +# auto tuning configuration +# ######################################### +TUNING = "tuning" +set_field_default_config(TUNING, "enable", False) +set_field_default_config(TUNING, "batch_size", 1) +set_field_default_config(TUNING, "dataset", None) +set_field_default_config(TUNING, "profile_start_step", 1) +set_field_default_config(TUNING, "profile_end_step", 1) +set_field_default_config(TUNING, "run_after_tuning", True) +set_field_default_config(TUNING, "verbose", True) + +######################################### +# dataset configuration +######################################### +DATASET = "dataset" +set_field_default_config(DATASET, "enable", False) +set_field_default_config(DATASET, "num_shards", 1) diff --git a/python/paddle/distributed/auto_parallel/converter.py b/python/paddle/distributed/auto_parallel/converter.py index 69292ab1827e81..35674be05b0d2f 100644 --- a/python/paddle/distributed/auto_parallel/converter.py +++ b/python/paddle/distributed/auto_parallel/converter.py @@ -16,7 +16,7 @@ import warnings import logging import numpy as np -from ..utils import get_logger +from ..utils.log_utils import get_logger class Converter(object): diff --git a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py index b4ac972bcfd29c..1217a0b4d0bf7b 100644 --- a/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/comp_op_cost.py @@ -20,9 +20,28 @@ class AdamOpCost(CompOpCost): OP_TYPE = "adam" def __init__(self, op=None, op_desc=None, cluster=None): - super(AdamOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(AdamOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) + + # For a concrete COMP OP, the 
calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class ArgsortOpCost(CompOpCost): + OP_TYPE = "argsort" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(ArgsortOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -39,9 +58,9 @@ class AssignOpCost(CompOpCost): OP_TYPE = "assign" def __init__(self, op=None, op_desc=None, cluster=None): - super(AssignOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(AssignOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -58,9 +77,9 @@ class AssignValueOpCost(CompOpCost): OP_TYPE = "assign_value" def __init__(self, op=None, op_desc=None, cluster=None): - super(AssignValueOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(AssignValueOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -77,9 +96,9 @@ class BeamSearchOpCost(CompOpCost): OP_TYPE = "beam_search" def __init__(self, op=None, op_desc=None, cluster=None): - super(BeamSearchOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(BeamSearchOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -96,9 +115,9 @@ class BeamSearchDecodeOpCost(CompOpCost): OP_TYPE = "beam_search_decode" def __init__(self, op=None, op_desc=None, cluster=None): - super(BeamSearchDecodeOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(BeamSearchDecodeOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -115,9 +134,9 @@ class CastOpCost(CompOpCost): OP_TYPE = "cast" def __init__(self, op=None, op_desc=None, cluster=None): - super(CastOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(CastOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -134,9 +153,9 @@ class ConcatOpCost(CompOpCost): OP_TYPE = "concat" def __init__(self, op=None, op_desc=None, cluster=None): - super(ConcatOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ConcatOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -153,9 +172,28 @@ class DropoutOpCost(CompOpCost): OP_TYPE = "dropout" def __init__(self, op=None, op_desc=None, cluster=None): - super(DropoutOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(DropoutOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # 
NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class DropoutGradOpCost(CompOpCost): + OP_TYPE = "dropout_grad" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(DropoutGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -172,9 +210,9 @@ class ElementwiseAddOpCost(CompOpCost): OP_TYPE = "elementwise_add" def __init__(self, op=None, op_desc=None, cluster=None): - super(ElementwiseAddOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ElementwiseAddOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -191,9 +229,9 @@ class ElementwiseAddGradOpCost(CompOpCost): OP_TYPE = "elementwise_add_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(ElementwiseAddGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ElementwiseAddGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -210,9 +248,9 @@ class ElementwiseDivOpCost(CompOpCost): OP_TYPE = "elementwise_div" def __init__(self, op=None, op_desc=None, cluster=None): - super(ElementwiseDivOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ElementwiseDivOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -229,9 +267,9 @@ class ElementwiseDivGradOpCost(CompOpCost): OP_TYPE = "elementwise_div_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(ElementwiseDivGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ElementwiseDivGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -248,9 +286,9 @@ class ElementwiseMulOpCost(CompOpCost): OP_TYPE = "elementwise_mul" def __init__(self, op=None, op_desc=None, cluster=None): - super(ElementwiseMulOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ElementwiseMulOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -267,9 +305,9 @@ class ElementwiseMulGradOpCost(CompOpCost): OP_TYPE = "elementwise_mul_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(ElementwiseMulGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ElementwiseMulGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -286,9 +324,9 @@ class ElementwiseSubOpCost(CompOpCost): OP_TYPE = "elementwise_sub" def __init__(self, op=None, op_desc=None, cluster=None): - super(ElementwiseSubOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ElementwiseSubOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -305,9 +343,28 @@ class 
ElementwiseSubGradOpCost(CompOpCost): OP_TYPE = "elementwise_sub_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(ElementwiseSubGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ElementwiseSubGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class EqualOpCost(CompOpCost): + OP_TYPE = "equal" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(EqualOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -324,9 +381,9 @@ class EmbeddingOpCost(CompOpCost): OP_TYPE = "c_embedding" def __init__(self, op=None, op_desc=None, cluster=None): - super(EmbeddingOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(EmbeddingOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -343,9 +400,9 @@ class EmbeddingGradOpCost(CompOpCost): OP_TYPE = "c_embedding_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(EmbeddingGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(EmbeddingGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -362,9 +419,9 @@ class FillConstantOpCost(CompOpCost): OP_TYPE = "fill_constant" def __init__(self, op=None, op_desc=None, cluster=None): - super(FillConstantOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(FillConstantOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -381,9 +438,47 @@ class FillConstantBatchSizeLikeOpCost(CompOpCost): OP_TYPE = "fill_constant_batch_size_like" def __init__(self, op=None, op_desc=None, cluster=None): - super(FillConstantBatchSizeLikeOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(FillConstantBatchSizeLikeOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class FusedSoftmaxMaskUpperTriangleOpCost(CompOpCost): + OP_TYPE = "fused_softmax_mask_upper_triangle" + + def __init__(self, op=None, op_desc=None, cluster=None): + super(FusedSoftmaxMaskUpperTriangleOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) + + # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided + def calc_flops(self): + # NOTE: The actual formula will be filled in the future + return 0 + + def calc_time(self): + # NOTE: The actual formula will be filled in the future + return 0 + + +@register_op_cost +class FusedSoftmaxMaskUpperTriangleGradOpCost(CompOpCost): + OP_TYPE = "fused_softmax_mask_upper_triangle_grad" + + 
def __init__(self, op=None, op_desc=None, cluster=None): + super(FusedSoftmaxMaskUpperTriangleGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -400,9 +495,9 @@ class GatherOpCost(CompOpCost): OP_TYPE = "gather" def __init__(self, op=None, op_desc=None, cluster=None): - super(GatherOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(GatherOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -419,9 +514,9 @@ class GeluOpCost(CompOpCost): OP_TYPE = "gelu" def __init__(self, op=None, op_desc=None, cluster=None): - super(GeluOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(GeluOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -438,9 +533,9 @@ class GeluGradOpCost(CompOpCost): OP_TYPE = "gelu_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(GeluGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(GeluGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -457,9 +552,9 @@ class GreaterEqualOpCost(CompOpCost): OP_TYPE = "greater_equal" def __init__(self, op=None, op_desc=None, cluster=None): - super(GreaterEqualOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(GreaterEqualOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -476,9 +571,9 @@ class IncrementOpCost(CompOpCost): OP_TYPE = "increment" def __init__(self, op=None, op_desc=None, cluster=None): - super(IncrementOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(IncrementOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -491,9 +586,9 @@ class IsEmptyOpCost(CompOpCost): OP_TYPE = "is_empty" def __init__(self, op=None, op_desc=None, cluster=None): - super(IsEmptyOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(IsEmptyOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -506,9 +601,9 @@ class LayerNormOpCost(CompOpCost): OP_TYPE = "layer_norm" def __init__(self, op=None, op_desc=None, cluster=None): - super(LayerNormOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(LayerNormOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -525,9 +620,9 @@ class LayerNormGradOpCost(CompOpCost): OP_TYPE = "layer_norm_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(LayerNormGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(LayerNormGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def 
calc_flops(self): @@ -544,9 +639,9 @@ class LessThanOpCost(CompOpCost): OP_TYPE = "less_than" def __init__(self, op=None, op_desc=None, cluster=None): - super(LessThanOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(LessThanOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -563,9 +658,9 @@ class LogicalNotOpCost(CompOpCost): OP_TYPE = "logical_not" def __init__(self, op=None, op_desc=None, cluster=None): - super(LogicalNotOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(LogicalNotOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -582,9 +677,9 @@ class LogicalAndOpCost(CompOpCost): OP_TYPE = "logical_and" def __init__(self, op=None, op_desc=None, cluster=None): - super(LogicalAndOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(LogicalAndOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -601,9 +696,9 @@ class LodResetOpCost(CompOpCost): OP_TYPE = "lod_reset" def __init__(self, op=None, op_desc=None, cluster=None): - super(LodResetOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(LodResetOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -637,9 +732,9 @@ class LookupTableV2OpCost(CompOpCost): OP_TYPE = "lookup_table_v2" def __init__(self, op=None, op_desc=None, cluster=None): - super(LookupTableV2OpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(LookupTableV2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -656,9 +751,9 @@ class LookupTableV2GradOpCost(CompOpCost): OP_TYPE = "lookup_table_v2_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(LookupTableV2GradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(LookupTableV2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -675,9 +770,9 @@ class MatmulOpCost(CompOpCost): OP_TYPE = "matmul" def __init__(self, op=None, op_desc=None, cluster=None): - super(MatmulOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(MatmulOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -694,9 +789,9 @@ class MatmulGradOpCost(CompOpCost): OP_TYPE = "matmul_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(MatmulGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(MatmulGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -713,9 +808,9 @@ class MatmulV2OpCost(CompOpCost): OP_TYPE = "matmul_v2" def __init__(self, op=None, op_desc=None, cluster=None): - super(MatmulV2OpCost, self).__init__(op=op, - 
op_desc=op_desc, - cluster=cluster) + super(MatmulV2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -732,9 +827,9 @@ class MatmulV2GradOpCost(CompOpCost): OP_TYPE = "matmul_v2_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(MatmulV2GradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(MatmulV2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -751,9 +846,9 @@ class MemcpyOpCost(CompOpCost): OP_TYPE = "memcpy" def __init__(self, op=None, op_desc=None, cluster=None): - super(MemcpyOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(MemcpyOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -787,9 +882,9 @@ class MulGradOpCost(CompOpCost): OP_TYPE = "mul_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(MulGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(MulGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -806,9 +901,9 @@ class OneHotOpCost(CompOpCost): OP_TYPE = "one_hot" def __init__(self, op=None, op_desc=None, cluster=None): - super(OneHotOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(OneHotOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -825,9 +920,9 @@ class ReadFromArrayOpCost(CompOpCost): OP_TYPE = "read_from_array" def __init__(self, op=None, op_desc=None, cluster=None): - super(ReadFromArrayOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ReadFromArrayOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -844,9 +939,9 @@ class ReduceSumOpCost(CompOpCost): OP_TYPE = "reduce_sum" def __init__(self, op=None, op_desc=None, cluster=None): - super(ReduceSumOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ReduceSumOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -863,9 +958,9 @@ class ReduceSumGradOpCost(CompOpCost): OP_TYPE = "reduce_sum_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(ReduceSumGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ReduceSumGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -882,9 +977,9 @@ class Reshape2OpCost(CompOpCost): OP_TYPE = "reshape2" def __init__(self, op=None, op_desc=None, cluster=None): - super(Reshape2OpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(Reshape2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ 
-901,9 +996,9 @@ class Reshape2GradOpCost(CompOpCost): OP_TYPE = "reshape2_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(Reshape2GradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(Reshape2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -920,9 +1015,9 @@ class ReduceMeanOpCost(CompOpCost): OP_TYPE = "reduce_mean" def __init__(self, op=None, op_desc=None, cluster=None): - super(ReduceMeanOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ReduceMeanOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -939,9 +1034,9 @@ class ReduceMeanGradOpCost(CompOpCost): OP_TYPE = "reduce_mean_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(ReduceMeanGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ReduceMeanGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -958,9 +1053,9 @@ class SamplingIdOpCost(CompOpCost): OP_TYPE = "sampling_id" def __init__(self, op=None, op_desc=None, cluster=None): - super(SamplingIdOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(SamplingIdOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -977,9 +1072,9 @@ class ScaleOpCost(CompOpCost): OP_TYPE = "scale" def __init__(self, op=None, op_desc=None, cluster=None): - super(ScaleOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(ScaleOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -996,9 +1091,9 @@ class SliceOpCost(CompOpCost): OP_TYPE = "slice" def __init__(self, op=None, op_desc=None, cluster=None): - super(SliceOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(SliceOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -1015,9 +1110,9 @@ class SoftmaxOpCost(CompOpCost): OP_TYPE = "softmax" def __init__(self, op=None, op_desc=None, cluster=None): - super(SoftmaxOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(SoftmaxOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -1034,9 +1129,9 @@ class SoftmaxGradOpCost(CompOpCost): OP_TYPE = "softmax_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(SoftmaxGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(SoftmaxGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -1053,9 +1148,9 @@ class SoftmaxWithCrossEntropyOpCost(CompOpCost): OP_TYPE = "softmax_with_cross_entropy" def __init__(self, op=None, op_desc=None, cluster=None): - super(SoftmaxWithCrossEntropyOpCost, self).__init__(op=op, - 
op_desc=op_desc, - cluster=cluster) + super(SoftmaxWithCrossEntropyOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -1072,9 +1167,9 @@ class SoftmaxWithCrossEntropyGradOpCost(CompOpCost): OP_TYPE = "softmax_with_cross_entropy_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(SoftmaxWithCrossEntropyGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(SoftmaxWithCrossEntropyGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -1091,9 +1186,9 @@ class SplitOpCost(CompOpCost): OP_TYPE = "split" def __init__(self, op=None, op_desc=None, cluster=None): - super(SplitOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(SplitOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -1110,9 +1205,9 @@ class Squeeze2OpCost(CompOpCost): OP_TYPE = "squeeze2" def __init__(self, op=None, op_desc=None, cluster=None): - super(Squeeze2OpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(Squeeze2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -1129,9 +1224,9 @@ class SquareOpCost(CompOpCost): OP_TYPE = "square" def __init__(self, op=None, op_desc=None, cluster=None): - super(SquareOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(SquareOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -1148,9 +1243,9 @@ class SquareGradOpCost(CompOpCost): OP_TYPE = "square_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(SquareGradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(SquareGradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -1184,9 +1279,9 @@ class TopKOpCost(CompOpCost): OP_TYPE = "top_k" def __init__(self, op=None, op_desc=None, cluster=None): - super(TopKOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(TopKOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -1203,9 +1298,9 @@ class Transpose2OpCost(CompOpCost): OP_TYPE = "transpose2" def __init__(self, op=None, op_desc=None, cluster=None): - super(Transpose2OpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(Transpose2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -1222,9 +1317,9 @@ class Transpose2GradOpCost(CompOpCost): OP_TYPE = "transpose2_grad" def __init__(self, op=None, op_desc=None, cluster=None): - super(Transpose2GradOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(Transpose2GradOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time 
and calc_flops function need to be overrided def calc_flops(self): @@ -1241,9 +1336,9 @@ class Unsqueeze2OpCost(CompOpCost): OP_TYPE = "unsqueeze2" def __init__(self, op=None, op_desc=None, cluster=None): - super(Unsqueeze2OpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(Unsqueeze2OpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): @@ -1260,9 +1355,9 @@ class WriteToArrayOpCost(CompOpCost): OP_TYPE = "write_to_array" def __init__(self, op=None, op_desc=None, cluster=None): - super(WriteToArrayOpCost, self).__init__(op=op, - op_desc=op_desc, - cluster=cluster) + super(WriteToArrayOpCost, self).__init__( + op=op, op_desc=op_desc, cluster=cluster + ) # For a concrete COMP OP, the calc_time and calc_flops function need to be overrided def calc_flops(self): diff --git a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py index 7bdde90b6a7119..a3d737769d01c8 100644 --- a/python/paddle/distributed/auto_parallel/cost/estimate_cost.py +++ b/python/paddle/distributed/auto_parallel/cost/estimate_cost.py @@ -27,12 +27,9 @@ class CostEstimator: _sepical_op_type = ["fused_attention", "fused_feedforward"] - def __init__(self, - program, - cluster, - mode="modeling", - rank=None, - loop_count=10): + def __init__( + self, program, cluster, mode="modeling", rank=None, loop_count=10 + ): self._program = program self._cluster = cluster self._check_mode(mode) @@ -41,10 +38,13 @@ def __init__(self, self._loop_count = loop_count self._global_cost = Cost() self._local_cost_mapping = {} - self._detailed_cost = OrderedDict( + self._detailed_cost = ( + OrderedDict() ) # {`op_id`: {"reshard": [], "dist_op": [], "local_cost": local_cost}}} self._bubble_time_mapping = {} self._ordered_ops = [] + self.max_memories = {} + self.max_memory = None @property def loop_count(self): @@ -104,7 +104,8 @@ def local_bubble_time(self, rank=None): def _check_mode(self, mode): if mode not in ["modeling", "profiling"]: raise ValueError( - "Just support modeling and profiling, but got {}".format(mode)) + "Just support modeling and profiling, but got {}".format(mode) + ) def _is_special_var_name(self, var_name): special_var_name = ["lod_tensor_blocking_queue_0"] @@ -114,6 +115,7 @@ def _is_special_var_name(self, var_name): def _estimate_core(self, dist_context, resharder, block): from ..reshard import get_var_with_recursion + ops = block.ops loop_count = None if block.desc.id != self.program.global_block().desc.id: @@ -123,15 +125,16 @@ def _estimate_core(self, dist_context, resharder, block): for i in range(loop_count): for op in ops: self._detailed_cost[op.desc.id()] = OrderedDict() - # if in the while sub block, the detail of cost is the last cost + # If in the while sub block, the detail of cost is the last cost detail = self._detailed_cost[op.desc.id()] detail["reshard_cost"] = OrderedDict() # detail["dist_op_cost"] = [] if int(op.attr('op_role')) == int(OpRole.Optimize): continue if op.type in [ - "create_py_reader", "create_double_buffer_reader", - "read" + "create_py_reader", + "create_double_buffer_reader", + "read", ]: continue @@ -147,15 +150,15 @@ def _estimate_core(self, dist_context, resharder, block): var = get_var_with_recursion(var_name, block, self.program) reshard_cost = resharder.get_cost(op, var, self.cluster) - # calc reshard cost + # Calc reshard cost if reshard_cost is not None: 
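# ---------------------------------------------------------------------------
# [Editor's note — illustration only, not part of the patch]
# The comp-op-cost hunks above are mechanical Black reformatting: every
# CompOpCost subclass keeps its OP_TYPE, constructor arguments and the
# calc_time/calc_flops hooks to be overridden; only the layout of the
# super().__init__ call changes. A minimal, self-contained sketch of the
# post-reformat shape follows; the stub base class and the zero-returning
# bodies are assumptions, since the diff context does not show them.
class CompOpCost:  # local stand-in stub for the real base class
    def __init__(self, op=None, op_desc=None, cluster=None):
        self.op = op
        self.op_desc = op_desc
        self.cluster = cluster


class ExampleOpCost(CompOpCost):
    OP_TYPE = "example_op"  # hypothetical op type

    def __init__(self, op=None, op_desc=None, cluster=None):
        # Black folds the keyword arguments onto one indented line.
        super(ExampleOpCost, self).__init__(
            op=op, op_desc=op_desc, cluster=cluster
        )

    # For a concrete COMP OP, calc_time and calc_flops need to be overridden.
    def calc_flops(self):
        return 0  # assumed placeholder

    def calc_time(self):
        return 0  # assumed placeholder
# ---------------------------------------------------------------------------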
detail["reshard_cost"][var_name] = reshard_cost comm_costs = reshard_cost[0] local_comp_cost = reshard_cost[1] for comm_cost in comm_costs: - # time is cumulative in global cost and local cost, but memory and flops just are cumulative in global cost. - # comm sync + # Time is cumulative in global cost and local cost, but memory and flops just are cumulative in global cost. + # Comm sync for item in comm_cost: group_ranks, cost = item max_time = None @@ -170,38 +173,44 @@ def _estimate_core(self, dist_context, resharder, block): max_time = rank_cost.time for rank in group_ranks: - self.local_cost( - rank).time = max_time + cost.time + self.local_cost(rank).time = ( + max_time + cost.time + ) if rank not in self._bubble_time_mapping: self._bubble_time_mapping[rank] = 0 self._bubble_time_mapping[rank] += ( - max_time - cost_time[rank]) + max_time - cost_time[rank] + ) for rank in local_comp_cost: for comp_cost in local_comp_cost[rank]: self.local_cost(rank).time += comp_cost.time - # calc dist op cost + # Calc dist op cost dist_op = dist_context.get_dist_op_for_program(op) op_dist_attr = dist_op.dist_attr processes = op_dist_attr.process_mesh.processes container = get_distributed_operator_impl_container( - op_dist_attr.impl_type) + op_dist_attr.impl_type + ) dist_impl = container.impls[op_dist_attr.impl_idx] - dist_op_cost = dist_impl.calc_cost(op.attr('op_role'), dist_op, - dist_context, self.cluster) + dist_op_cost = dist_impl.calc_cost( + op.attr('op_role'), dist_op, dist_context, self.cluster + ) detail["dist_op_cost"] = dist_op_cost if dist_op_cost is None: - assert dist_op.serial_op.type in CostEstimator._sepical_op_type + assert ( + dist_op.serial_op.type in CostEstimator._sepical_op_type + ) continue for item in dist_op_cost: if isinstance(item, list): - # comm sync + # Comm sync for comm_op_cost in item: max_time = None cost_time = {} @@ -215,16 +224,18 @@ def _estimate_core(self, dist_context, resharder, block): if max_time < rank_cost.time: max_time = rank_cost.time for rank in group_ranks: - self.local_cost( - rank).time = max_time + comm_op_cost.time + self.local_cost(rank).time = ( + max_time + comm_op_cost.time + ) if rank not in self._bubble_time_mapping: self._bubble_time_mapping[rank] = 0 self._bubble_time_mapping[rank] += ( - max_time - cost_time[rank]) + max_time - cost_time[rank] + ) elif isinstance(item, dict): - # op just one + # Op just one for rank in processes: - # dp+pp+mp + # DP+PP+MP if rank not in item: continue self.local_cost(rank).time += item[rank].time @@ -245,8 +256,11 @@ def _calculate_bytes(self, sizes, dtype): dtype_factor = 8 elif dtype == paddle.float32 or dtype == paddle.int32: dtype_factor = 4 - elif dtype == paddle.float16 or dtype == paddle.bfloat16 \ - or dtype == paddle.int16: + elif ( + dtype == paddle.float16 + or dtype == paddle.bfloat16 + or dtype == paddle.int16 + ): dtype_factor = 2 elif dtype == paddle.int8 or dtype == paddle.uint8: dtype_factor = 1 @@ -267,9 +281,10 @@ def _convert_pm_and_dm_to_str(process_mesh, dims_mapping): return result memories = {} - max_memories = {} - var_info = { - } # var_name: [[process_mesh, dims_mapping], [id]], [[process_mesh, dims_mapping], [id]]} + self.max_memories = {} + var_info = ( + {} + ) # var_name: [[process_mesh, dims_mapping], [id]], [[process_mesh, dims_mapping], [id]]} for block in self.program.blocks: for op in block.ops: @@ -277,38 +292,52 @@ def _convert_pm_and_dm_to_str(process_mesh, dims_mapping): self._ordered_ops.sort(key=lambda x: x[0]) for op_id, op in self._ordered_ops: + if op.type 
in [ + "create_py_reader", + "create_double_buffer_reader", + "read", + ]: + continue dist_op = dist_context.get_dist_op_for_program(op) process_mesh = dist_op.dist_attr.process_mesh for var_name in op.input_arg_names: input_dims_mapping = dist_op.dist_attr.get_input_dims_mapping( - var_name) + var_name + ) if var_name not in var_info: var_info[var_name] = {} - key = _convert_pm_and_dm_to_str(process_mesh, - input_dims_mapping) + key = _convert_pm_and_dm_to_str( + process_mesh, input_dims_mapping + ) if key not in var_info[var_name]: var_info[var_name][key] = {} - # it is even partition now + # It is even partition now if "memory" not in var_info[var_name][key]: var = dist_op.get_serial_input(var_name) global_sizes = var.shape dtype = var.dtype sizes = DistributedTensor.get_local_sizes( - global_sizes, input_dims_mapping, process_mesh.topology, - process_mesh.processes) + global_sizes, + input_dims_mapping, + process_mesh.topology, + process_mesh.processes, + ) var_info[var_name][key]["memory"] = self._calculate_bytes( - sizes, dtype) + sizes, dtype + ) if "position" not in var_info[var_name][key]: var_info[var_name][key]["position"] = [] var_info[var_name][key]["position"].append(op_id) for var_name in op.output_arg_names: output_dims_mapping = dist_op.dist_attr.get_output_dims_mapping( - var_name) + var_name + ) if var_name not in var_info: var_info[var_name] = {} - key = _convert_pm_and_dm_to_str(process_mesh, - output_dims_mapping) + key = _convert_pm_and_dm_to_str( + process_mesh, output_dims_mapping + ) if key not in var_info[var_name]: var_info[var_name][key] = {} if "memory" not in var_info[var_name][key]: @@ -316,35 +345,47 @@ def _convert_pm_and_dm_to_str(process_mesh, dims_mapping): global_sizes = var.shape dtype = var.dtype sizes = DistributedTensor.get_local_sizes( - global_sizes, output_dims_mapping, - process_mesh.topology, process_mesh.processes) + global_sizes, + output_dims_mapping, + process_mesh.topology, + process_mesh.processes, + ) var_info[var_name][key]["memory"] = self._calculate_bytes( - sizes, dtype) + sizes, dtype + ) if "position" not in var_info[var_name][key]: var_info[var_name][key]["position"] = [] var_info[var_name][key]["position"].append(op_id) has_used_vars = set() for op_id, op in self._ordered_ops: + if op.type in [ + "create_py_reader", + "create_double_buffer_reader", + "read", + ]: + continue can_free_memories = {} can_free_vars = set() dist_op = dist_context.get_dist_op_for_program(op) process_mesh = dist_op.dist_attr.process_mesh for var_name in op.input_arg_names: input_dims_mapping = dist_op.dist_attr.get_input_dims_mapping( - var_name) - key = _convert_pm_and_dm_to_str(process_mesh, - input_dims_mapping) + var_name + ) + key = _convert_pm_and_dm_to_str( + process_mesh, input_dims_mapping + ) has_used_var = var_name + key var = dist_op.get_serial_input(var_name) - # not used + # Not used if var_name + key not in has_used_vars: has_used_vars.add(has_used_var) for process in process_mesh.processes: if process not in memories: memories[process] = 0 memories[process] += var_info[var_name][key]["memory"] - # used + # Used else: if op_id == var_info[var_name][key]["position"][-1]: if has_used_var not in can_free_vars: @@ -354,23 +395,26 @@ def _convert_pm_and_dm_to_str(process_mesh, dims_mapping): if process not in can_free_memories: can_free_memories[process] = 0 can_free_memories[process] += var_info[ - var_name][key]["memory"] + var_name + ][key]["memory"] for var_name in op.output_arg_names: output_dims_mapping = 
dist_op.dist_attr.get_output_dims_mapping( - var_name) - key = _convert_pm_and_dm_to_str(process_mesh, - output_dims_mapping) + var_name + ) + key = _convert_pm_and_dm_to_str( + process_mesh, output_dims_mapping + ) has_used_var = var_name + key var = dist_op.get_serial_output(var_name) - # not used + # Not used if var_name + key not in has_used_vars: has_used_vars.add(has_used_var) for process in process_mesh.processes: if process not in memories: memories[process] = 0 memories[process] += var_info[var_name][key]["memory"] - # used + # Used else: if op_id == var_info[var_name][key]["position"][-1]: if has_used_var not in can_free_vars: @@ -380,33 +424,202 @@ def _convert_pm_and_dm_to_str(process_mesh, dims_mapping): if process not in can_free_memories: can_free_memories[process] = 0 can_free_memories[process] += var_info[ - var_name][key]["memory"] + var_name + ][key]["memory"] - # calc peak memory + # Calc peak memory for process in memories: - if process not in max_memories: - max_memories[process] = memories[process] + if process not in self.max_memories: + self.max_memories[process] = memories[process] else: - if memories[process] > max_memories[process]: - max_memories[process] = memories[process] + if memories[process] > self.max_memories[process]: + self.max_memories[process] = memories[process] - # free memory + # Free memory for process in can_free_memories: if process in memories: memories[process] -= can_free_memories[process] # Calculate the max memory in all ranks - max_memory = max(max_memories.values()) + max_memory = max(self.max_memories.values()) + self.max_memory = max_memory return max_memory def estimate(self, dist_context, resharder=None): self.prepare() from ..reshard import Resharder - resharder = Resharder(self.program, None, self.rank, dist_context, - []) if resharder is None else resharder + + resharder = ( + Resharder(self.program, None, self.rank, dist_context, []) + if resharder is None + else resharder + ) block = self.program.global_block() self._estimate_core(dist_context, resharder, block) return self.global_cost + + def _print_tag(self, max_len, length): + tag = "+" + "-" * max_len + for i in range(length): + print(tag, end="") + if i == length - 1: + print("+") + + def _print_vals(self, vals, max_len): + for idx, val in enumerate(vals): + s = "|" + str(val).center(max_len) + print(s, end="") + if idx == len(vals) - 1: + print("|") + + def _pretty_print_memory_cost(self): + """Print memory of every rank prettily.""" + if not self.max_memories or not self.max_memory: + raise ValueError("Please calculate memory cost before print.") + + # Padding automatically + max_len = 0 + header = ["Rank", "Memory(MiB)"] + memories = [ + int(item // 1e6) for item in list(self.max_memories.values()) + ] + for memory in memories + header: + if len(str(memory)) > max_len: + max_len = len(str(memory)) + max_len += 4 # for pretty print of center + + # Print tag + self._print_tag(max_len, len(header)) + + # Print header + self._print_vals(header, max_len) + + # Print tag + self._print_tag(max_len, len(header)) + + # Print rank and its memory + for i in range(len(self.max_memories)): + memory = memories[i] + vals = [i, memory] + self._print_vals(vals, max_len) + self._print_tag(max_len, len(header)) + + def _pretty_print_global(self): + """Print global execution time and max memory prettily.""" + if not self.max_memories or not self.max_memory: + raise ValueError("Please calculate cost before print.") + + # Padding automatically + max_len = 0 + header = ["Execution Time(ms)", 
"Max Memory(MiB)"] + vals = [round(self.global_cost.time, 3), int(self.max_memory // 1e6)] + for memory in vals + header: + if len(str(memory)) > max_len: + max_len = len(str(memory)) + max_len += 4 # for pretty print of center + + # Print tag + self._print_tag(max_len, len(header)) + + # Print header + self._print_vals(header, max_len) + + # Print tag + self._print_tag(max_len, len(header)) + + # Print exec time and max memory + self._print_vals(vals, max_len) + + # Print tag + self._print_tag(max_len, len(header)) + + def pretty_print_cost(self): + """Print cost prettily.""" + print("The global execution time and max memory are as follows:") + self._pretty_print_global() + print("The memory of every rank is as follows:") + self._pretty_print_memory_cost() + + +def get_cost_from_engine(engine, mode): + from ..utils import to_list + import copy + + # Construct cost estimator by original main program + serial_main_prog = ( + engine._fwd_main_progs[mode].clone() + if mode in engine._fwd_main_progs + else engine._orig_main_prog.clone() + ) + + serial_startup_prog = ( + engine._serial_startup_progs[mode].clone() + if mode in engine._serial_startup_progs + else engine._orig_startup_prog.clone() + ) + losses = ( + to_list(engine._loss) + if ( + not isinstance(engine._loss, paddle.nn.Layer) + and not callable(engine._loss) + ) + else engine._losses + ) + serial_optimizer = copy.deepcopy(engine._orig_optimizer) + if mode in engine._fwd_dist_contexts: + dist_context = copy.deepcopy(engine._fwd_dist_contexts[mode]) + else: + from ..dist_context import DistributedContext + + dist_context = DistributedContext( + serial_main_prog, + serial_startup_prog, + serial_optimizer, + losses, + {}, + {"loss": losses}, + engine._cluster, + engine._strategy, + ) + from ..completion import Completer + + completer = Completer(dist_context) + completer.complete_forward_annotation() + dist_context.block_state.parse_forward_blocks( + dist_context.serial_main_program + ) + + if mode == "eval" or mode == "predict": + cost_estimator = CostEstimator(serial_main_prog, engine._cluster) + elif mode == "train": + from ..parallelizer_v2 import Parallelizer + + # Get serial main program with backward + parallelizer = Parallelizer(mode, completer, dist_context) + # Generate backward + loss_name = dist_context.serial_loss.name + serial_loss = serial_main_prog.global_block()._var_recursive(loss_name) + params_grads = parallelizer._generate_backward( + serial_main_prog, serial_startup_prog, serial_loss + ) + + # Generate optimizer + optimizer_ops = parallelizer._generate_optimizer( + serial_main_prog, + serial_startup_prog, + serial_optimizer, + params_grads, + ) + cost_estimator = CostEstimator(serial_main_prog, engine._cluster) + + # Estimate global_cost and max memory + global_cost = cost_estimator.estimate(dist_context) + max_memory = cost_estimator._estimate_max_memory_by_dist_op(dist_context) + + # Print the cost + cost_estimator.pretty_print_cost() + + return global_cost, max_memory diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py index ff07deb42aad3f..92d0304eaf6138 100644 --- a/python/paddle/distributed/auto_parallel/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/dist_attribute.py @@ -173,6 +173,17 @@ def mark_annotated_as(self, dist_attr): def clear_annotated(self): self._is_annotated.clear() + def __eq__(self, other): + if not isinstance(other, TensorDistributedAttribute): + return False + if self.process_mesh != 
other.process_mesh: + return False + if self.dims_mapping != other.dims_mapping: + return False + if self._is_annotated != other._is_annotated: + return False + return True + def __str__(self): str = "\n\ttensor_dist_attr = {" if self.is_annotated("process_mesh"): @@ -486,6 +497,27 @@ def is_annotated_output_dims_mapping(self, name): else: return False + def __eq__(self, other): + if not isinstance(other, OperatorDistributedAttribute): + return False + if self.process_mesh != other.process_mesh: + return False + if self.op_type != other.op_type: + return False + if self.impl_type != other.impl_type: + return False + if self.impl_idx != other.impl_idx: + return False + if self._is_annotated != other._is_annotated: + return False + if self._is_recompute != other._is_recompute: + return False + if self.inputs_dist_attrs != other.inputs_dist_attrs: + return False + if self.outputs_dist_attrs != other.outputs_dist_attrs: + return False + return True + def __str__(self): str = "\n\top_dist_attr = {" if self.is_annotated("process_mesh"): diff --git a/python/paddle/distributed/auto_parallel/dist_context.py b/python/paddle/distributed/auto_parallel/dist_context.py index 92a503659041eb..ae192605d52915 100644 --- a/python/paddle/distributed/auto_parallel/dist_context.py +++ b/python/paddle/distributed/auto_parallel/dist_context.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License +import time import copy from collections import defaultdict import paddle.fluid @@ -24,7 +25,7 @@ from .dist_tensor import DistributedTensor from .dist_op import DistributedOperator from .process_mesh import ProcessMesh -from .utils import is_loss_grad_op, is_loss_op +from .utils import is_loss_grad_op, is_loss_op, is_valid_list_index # There always exists a default context for user. And user can set it to another one. _g_default_distributed_context = None @@ -53,15 +54,17 @@ class DistributedContext: One auto-parallel run should use its own DistributedContext to avoid interfering other run. 
""" - def __init__(self, - serial_main_prog=None, - serial_startup_prog=None, - serial_optimizer=None, - serial_loss=None, - feed_vars={}, - fetch_vars={}, - cluster=None, - strategy=None): + def __init__( + self, + serial_main_prog=None, + serial_startup_prog=None, + serial_optimizer=None, + serial_loss=None, + feed_vars={}, + fetch_vars={}, + cluster=None, + strategy=None, + ): # Data members related to original programs (unchanged) self._original_serial_main_program = serial_main_prog self._original_serial_startup_program = serial_startup_prog @@ -77,7 +80,6 @@ def __init__(self, self._serial_optimizer = None self._serial_feed_vars = {} self._serial_fetch_vars = {} - self._lr_optimizer = None # record the optimzier holding lr_scheduler # Data members related to the program self._dist_tensors_for_program = {} @@ -111,7 +113,7 @@ def __init__(self, # self._tensor_id_to_tensor_node_ids = {} self._is_initialized = False - #TODO: need a better way to remove the following flag + # TODO: need a better way to remove the following flag self._need_copy_dist_attr_to_graph = False self._backup_pass_context_stack = [] self._backup_block_state_stack = [] @@ -126,8 +128,8 @@ def __init__(self, # A flag indicates whether the used parallelism is data parallel self._data_parallel = False - # flag whether using `to_static` - self._dygraph_mode = False + # record upstream and downstream of cur rank + self._up_down_streams = UpDownStream() @property def serial_main_program(self): @@ -196,7 +198,8 @@ def block_state(self): @property def has_annotation(self): return len(self._dist_tensors_for_program) or len( - self._dist_ops_for_program) + self._dist_ops_for_program + ) @property def gradient_scale(self): @@ -210,24 +213,33 @@ def gradient_scale(self, gs): def data_parallel(self): return self._data_parallel + @property + def up_down_streams(self): + return self._up_down_streams + @data_parallel.setter def data_parallel(self, dp): self._data_parallel = dp def _backup_serial_info(self, mode): self._backup_serial_main_program_stack.append( - self._serial_main_program.clone()) + self._serial_main_program.clone() + ) self._backup_serial_startup_program_stack.append( - self._serial_startup_program.clone()) - self._backup_pass_context_stack.append(copy.deepcopy( - self._pass_context)) + self._serial_startup_program.clone() + ) + self._backup_pass_context_stack.append( + copy.deepcopy(self._pass_context) + ) self._backup_block_state_stack.append(copy.deepcopy(self._block_state)) def _backup_dist_info(self, mode): self._backup_dist_tensors_for_program_stack.append( - copy.deepcopy(self._dist_tensors_for_program)) + copy.deepcopy(self._dist_tensors_for_program) + ) self._backup_dist_ops_for_program_stack.append( - copy.deepcopy(self._dist_ops_for_program)) + copy.deepcopy(self._dist_ops_for_program) + ) def _backup(self, serial=True, serial_mode=None, dist=True, dist_mode=None): # Use this function carefully @@ -244,7 +256,8 @@ def _restore_serial_loss(self): block_idx = loss.block.idx var_name = loss.name var = self._serial_main_program.blocks[ - block_idx]._var_recursive(var_name) + block_idx + ]._var_recursive(var_name) self._serial_loss = var elif len(self._original_serial_loss) == 0: self._serial_loss = [] @@ -254,7 +267,8 @@ def _restore_serial_loss(self): block_idx = self._original_serial_loss.block.idx var_name = self._original_serial_loss.name var = self._serial_main_program.blocks[ - block_idx]._var_recursive(var_name) + block_idx + ]._var_recursive(var_name) self._serial_loss = var def 
_restore_serial_feed_vars(self): @@ -264,33 +278,52 @@ def _restore_serial_feed_vars(self): block_idx = var.block.idx var_name = var.name var = self._serial_main_program.blocks[ - block_idx]._var_recursive(var_name) + block_idx + ]._var_recursive(var_name) new_var_list.append(var) self._serial_feed_vars[key] = new_var_list def _restore_serial_fetch_vars(self): for key, var_list in self._original_serial_fetch_vars.items(): new_var_list = [] - for var in var_list: - block_idx = var.block.idx - var_name = var.name - var = self._serial_main_program.blocks[ - block_idx]._var_recursive(var_name) - new_var_list.append(var) + # metrics is a list of list + if key == "metrics": + for inner_var_list in var_list: + new_inner_var_list = [] + for var in inner_var_list: + block_idx = var.block.idx + var_name = var.name + var = self._serial_main_program.blocks[ + block_idx + ]._var_recursive(var_name) + new_inner_var_list.append(var) + new_var_list.append(new_inner_var_list) + else: + for var in var_list: + block_idx = var.block.idx + var_name = var.name + var = self._serial_main_program.blocks[ + block_idx + ]._var_recursive(var_name) + new_var_list.append(var) self._serial_fetch_vars[key] = new_var_list def _restore_serial_info(self, mode="to_backup"): if mode == "to_backup": - self._serial_main_program = self._backup_serial_main_program_stack.pop( + self._serial_main_program = ( + self._backup_serial_main_program_stack.pop() ) - self._serial_startup_program = self._backup_serial_startup_program_stack.pop( + self._serial_startup_program = ( + self._backup_serial_startup_program_stack.pop() ) elif mode == "to_original": assert self._original_serial_main_program is not None assert self._original_serial_startup_program is not None - self._serial_main_program = self._original_serial_main_program.clone( + self._serial_main_program = ( + self._original_serial_main_program.clone() ) - self._serial_startup_program = self._original_serial_startup_program.clone( + self._serial_startup_program = ( + self._original_serial_startup_program.clone() ) self._restore_serial_loss() @@ -302,21 +335,27 @@ def _restore_serial_info(self, mode="to_backup"): def _restore_dist_info(self, mode="to_backup"): if mode == "to_backup": - self._dist_tensors_for_program = self._backup_dist_tensors_for_program_stack.pop( + self._dist_tensors_for_program = ( + self._backup_dist_tensors_for_program_stack.pop() ) - self._dist_ops_for_program = self._backup_dist_ops_for_program_stack.pop( + self._dist_ops_for_program = ( + self._backup_dist_ops_for_program_stack.pop() ) elif mode == "to_original": assert self._original_dist_tensors_for_program assert self._original_dist_ops_for_program self._dist_tensors_for_program = copy.deepcopy( - self._original_dist_tensors_for_program) + self._original_dist_tensors_for_program + ) self._dist_ops_for_program = copy.deepcopy( - self._original_dist_ops_for_program) + self._original_dist_ops_for_program + ) elif mode == "to_default": new_tensors_ids = [] - for tensor_id, dist_tensor in self._dist_tensors_for_program.items( - ): + for ( + tensor_id, + dist_tensor, + ) in self._dist_tensors_for_program.items(): if tensor_id in self._tensors_ids: dist_tensor.dist_attr.reset() else: @@ -333,8 +372,10 @@ def _restore_dist_info(self, mode="to_backup"): self._dist_ops_for_program.pop(op_id) else: new_tensors_ids = [] - for tensor_id, dist_tensor in self._dist_tensors_for_program.items( - ): + for ( + tensor_id, + dist_tensor, + ) in self._dist_tensors_for_program.items(): new_tensors_ids.append(tensor_id) for 
tensor_id in new_tensors_ids: self._dist_tensors_for_program.pop(tensor_id) @@ -349,11 +390,13 @@ def _restore_dist_info(self, mode="to_backup"): self._need_copy_dist_attr_to_graph = True self._process_meshes = [] - def _restore(self, - serial=True, - serial_mode="to_backup", - dist=True, - dist_mode="to_backup"): + def _restore( + self, + serial=True, + serial_mode="to_backup", + dist=True, + dist_mode="to_backup", + ): # Use this function carefully if serial: self._restore_serial_info(serial_mode) @@ -364,11 +407,13 @@ def initialize(self, with_graph=True): if not self._is_initialized: if not self._serial_main_program: if self._original_serial_main_program: - self._serial_main_program = self._original_serial_main_program.clone( + self._serial_main_program = ( + self._original_serial_main_program.clone() ) if not self._serial_startup_program: if self._original_serial_startup_program: - self._serial_startup_program = self._original_serial_startup_program.clone( + self._serial_startup_program = ( + self._original_serial_startup_program.clone() ) if not self._serial_loss: self._restore_serial_loss() @@ -382,26 +427,35 @@ def initialize(self, with_graph=True): self._init_dist_attr_for_program() # Backup the original distributed information for later restore self._original_dist_tensors_for_program = copy.deepcopy( - self._dist_tensors_for_program) + self._dist_tensors_for_program + ) self._original_dist_ops_for_program = copy.deepcopy( - self._dist_ops_for_program) + self._dist_ops_for_program + ) self._tensors_ids = list(self._dist_tensors_for_program.keys()) self._ops_ids = list(self._dist_ops_for_program.keys()) self._is_initialized = True if with_graph: set_flags({"FLAGS_convert_all_blocks": True}) + start_time = time.time() self._serial_graph = framework.IrGraph( - core.Graph(self._serial_main_program.desc)) + core.Graph(self._serial_main_program.desc) + ) + print("bot-context-graph-build: ", time.time() - start_time, flush=True) self._init_dist_attr_for_graph() + start_time = time.time() self._need_copy_dist_attr_to_graph = False + print("bot-context-graph-dist: ", time.time() - start_time, flush=True) if self._need_copy_dist_attr_to_graph and with_graph: + # print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% here 1234", flush=True) self.copy_dist_attr_from_program_to_graph() def add_process_mesh(self, process_mesh): - assert isinstance(process_mesh, ProcessMesh), \ - 'The type of dim_mapping must be ProcessMesh.' + assert isinstance( + process_mesh, ProcessMesh + ), 'The type of dim_mapping must be ProcessMesh.' 
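# ---------------------------------------------------------------------------
# [Editor's note — editorial suggestion, not part of the patch]
# The initialize() hunk above and the node-ordering hunks that follow add
# repeated `start_time = time.time()` / `print(..., flush=True)` probes
# (the "bot-context-graph-*" lines). If this instrumentation is meant to
# stay, a small context manager keeps the timing logic in one place:
import time
from contextlib import contextmanager


@contextmanager
def log_time(tag):
    start = time.time()
    try:
        yield
    finally:
        print(tag, ": ", time.time() - start, flush=True)


# Hypothetical usage, mirroring the probe above:
# with log_time("bot-context-graph-build"):
#     self._serial_graph = framework.IrGraph(core.Graph(main_program.desc))
# ---------------------------------------------------------------------------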
if process_mesh not in self.process_meshes: self._process_meshes.append(process_mesh) @@ -423,7 +477,8 @@ def get_dist_tensor_for_program(self, serial_tensor): else: serial_tensor_id = serial_tensor.desc.original_id() dist_tensor = self._dist_tensors_for_program.get( - serial_tensor_id, None) + serial_tensor_id, None + ) if dist_tensor: return dist_tensor else: @@ -463,7 +518,8 @@ def get_tensor_dist_attr_for_program(self, serial_tensor): else: serial_tensor_id = serial_tensor.desc.original_id() dist_tensor = self._dist_tensors_for_program.get( - serial_tensor_id, None) + serial_tensor_id, None + ) if dist_tensor: return dist_tensor.dist_attr else: @@ -482,8 +538,9 @@ def set_tensor_dist_attr_for_program(self, serial_tensor, dist_attr): def get_tensor_dist_attr_for_graph(self, serial_tensor_node): serial_tensor_node_id = _node_id(serial_tensor_node) - dist_tensor = self._dist_tensors_for_graph.get(serial_tensor_node_id, - None) + dist_tensor = self._dist_tensors_for_graph.get( + serial_tensor_node_id, None + ) if dist_tensor: return dist_tensor.dist_attr else: @@ -525,7 +582,8 @@ def get_dist_attr_for_graph(self, serial_node): if serial_node.is_var() and serial_node.var() is not None: serial_tensor_node_id = _node_id(serial_node) dist_tensor = self._dist_tensors_for_graph.get( - serial_tensor_node_id, None) + serial_tensor_node_id, None + ) if dist_tensor: return dist_tensor.dist_attr else: @@ -552,7 +610,8 @@ def _init_dist_attr_for_program(self, no_default=False): for tensor in block.vars.values(): # Copy the distributed tensors in the default context default_dist_tensor = default_ctx.get_dist_tensor_for_program( - tensor) + tensor + ) if default_dist_tensor and default_ctx is not self: self.add_dist_tensor_for_program(default_dist_tensor) current_dist_tensor = self.get_dist_tensor_for_program(tensor) @@ -569,70 +628,102 @@ def _init_dist_attr_for_program(self, no_default=False): dist_op = DistributedOperator(op) self.add_dist_op_for_program(dist_op) self._original_dist_tensors_for_program = copy.deepcopy( - self._dist_tensors_for_program) + self._dist_tensors_for_program + ) self._original_dist_ops_for_program = copy.deepcopy( - self._dist_ops_for_program) + self._dist_ops_for_program + ) def _order_nodes_by_program_order(self): - def _contains(nodes, target_node): for node in nodes: if _node_id(node) == _node_id(target_node): return True return False + start_time = time.time() serial_ordered_tensor_nodes = [] serial_ordered_op_nodes = [] all_nodes = [] for idx, graph in enumerate(self._serial_graph.all_sub_graphs()): for node in graph.all_nodes(): all_nodes.append(node) + print("bot-context-graph-dist-ordering-0: ", time.time() - start_time, flush=True) + start_time = time.time() for node in all_nodes: if node.is_var() and node.var() is not None: serial_ordered_tensor_nodes.append(node) if node.is_op() and node.op() is not None: serial_ordered_op_nodes.append(node) + print("bot-context-graph-dist-ordering-1: ", time.time() - start_time, flush=True) + start_time = time.time() serial_ordered_tensor_nodes.sort( - key=lambda node: node.node.original_desc_id()) + key=lambda node: node.node.original_desc_id() + ) + print("bot-context-graph-dist-ordering-2: ", time.time() - start_time, flush=True) + start_time = time.time() serial_ordered_op_nodes.sort( - key=lambda node: node.node.original_desc_id()) + key=lambda node: node.node.original_desc_id() + ) + print("bot-context-graph-dist-ordering-3: ", time.time() - start_time, flush=True) + start_time = time.time() num_nodes_before = 
len(serial_ordered_tensor_nodes) + len( - serial_ordered_op_nodes) + serial_ordered_op_nodes + ) new_serial_ordered_tensor_nodes = [] new_serial_ordered_op_nodes = [] new_serial_ordered_nodes = [] + tmp_time = 0 + # TODO: user a counter for the following sort for op_node in serial_ordered_op_nodes: tensor_nodes = [] for tensor_node in op_node.inputs: - if tensor_node.is_var() \ - and tensor_node.var() is not None \ - and not _contains(new_serial_ordered_nodes, tensor_node): + if ( + tensor_node.is_var() + and tensor_node.var() is not None + and not _contains(new_serial_ordered_nodes, tensor_node) + ): tensor_nodes.append(tensor_node) new_serial_ordered_tensor_nodes.append(tensor_node) + inner_start_time = time.time() tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) + tmp_time += time.time() - inner_start_time new_serial_ordered_nodes.extend(tensor_nodes) new_serial_ordered_nodes.append(op_node) new_serial_ordered_op_nodes.append(op_node) tensor_nodes = [] for tensor_node in op_node.outputs: - if tensor_node.is_var() \ - and tensor_node.var() is not None \ - and not _contains(new_serial_ordered_nodes, tensor_node): + if ( + tensor_node.is_var() + and tensor_node.var() is not None + and not _contains(new_serial_ordered_nodes, tensor_node) + ): tensor_nodes.append(tensor_node) new_serial_ordered_tensor_nodes.append(tensor_node) + inner_start_time = time.time() tensor_nodes.sort(key=lambda node: node.node.original_desc_id()) + tmp_time += time.time() - inner_start_time new_serial_ordered_nodes.extend(tensor_nodes) + print("bot-context-graph-dist-ordering-4: ", tmp_time, flush=True) + print("bot-context-graph-dist-ordering-5: ", time.time() - start_time, flush=True) + start_time = time.time() new_serial_ordered_tensor_nodes.sort( - key=lambda node: node.node.original_desc_id()) + key=lambda node: node.node.original_desc_id() + ) + print("bot-context-graph-dist-ordering-6: ", time.time() - start_time, flush=True) + start_time = time.time() new_serial_ordered_op_nodes.sort( - key=lambda node: node.node.original_desc_id()) + key=lambda node: node.node.original_desc_id() + ) + print("bot-context-graph-dist-ordering-7: ", time.time() - start_time, flush=True) + start_time = time.time() self._serial_ordered_tensor_nodes = new_serial_ordered_tensor_nodes self._serial_ordered_op_nodes = new_serial_ordered_op_nodes self._serial_ordered_nodes = new_serial_ordered_nodes assert len(self._serial_ordered_nodes) == len( - self._serial_ordered_tensor_nodes) + len( - self._serial_ordered_op_nodes) + self._serial_ordered_tensor_nodes + ) + len(self._serial_ordered_op_nodes) self._serial_orphan_tensor_nodes = [] for tensor_node in serial_ordered_tensor_nodes: if not _contains(self._serial_ordered_tensor_nodes, tensor_node): @@ -641,43 +732,84 @@ def _contains(nodes, target_node): print( "WARNING: there are some orphan tensors or ops which are not used in the execution." 
) + print("bot-context-graph-dist-ordering-8: ", time.time() - start_time, flush=True) + # for node in serial_ordered_tensor_nodes: + # print("[before ordering] t: ", _node_id(node), node.var().name(),flush=True) + # for node in serial_ordered_op_nodes: + # print("[before ordering] o: ", _node_id(node), node.op().type(), flush=True) + # for node in new_serial_ordered_tensor_nodes: + # print("[after ordering] t: ", _node_id(node), node.var().name(),flush=True) + # for node in new_serial_ordered_op_nodes: + # print("[after ordering] o: ", _node_id(node), node.op().type(), flush=True) + # for node in self._serial_orphan_tensor_nodes: + # print("[after ordering] a: ", _node_id(node), node.var().name(), flush=True) + # for node in new_serial_ordered_nodes: + # print("[after ordering] o: ", _node_id(node), flush=True) def _init_dist_attr_for_graph(self): # Convert program to graph and initialize the distributed attributes + start_time = time.time() self._order_nodes_by_program_order() + print("bot-context-graph-dist-ordering: ", time.time() - start_time, flush=True) + start_time = time.time() for node in self.serial_ordered_nodes: if node.is_var() and node.var() is not None: dist_tensor = None tensor_id = node.node.original_desc_id() - for cur_tensor_id, cur_dist_tensor in self._dist_tensors_for_program.items( - ): - if tensor_id == cur_tensor_id \ - or tensor_id == cur_dist_tensor.serial_tensor.desc.original_id(): + # TODO: Use dict and (id, original_id) for keys to remove this for loop + for ( + cur_tensor_id, + cur_dist_tensor, + ) in self._dist_tensors_for_program.items(): + if ( + tensor_id == cur_tensor_id + or tensor_id + == cur_dist_tensor.serial_tensor.desc.original_id() + ): dist_tensor = cur_dist_tensor - self._node_id_to_tensor_id[_node_id( - node)] = cur_tensor_id - assert dist_tensor is not None, \ - "Tensor must have a distributed tensor after the initialization for program." + self._node_id_to_tensor_id[ + _node_id(node) + ] = cur_tensor_id + assert ( + dist_tensor is not None + ), "Tensor must have a distributed tensor after the initialization for program." serial_tensor_node_id = _node_id(node) - new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor, - dist_tensor.dist_attr) + new_dist_tensor = DistributedTensor( + dist_tensor.serial_tensor, dist_tensor.dist_attr + ) self._dist_tensors_for_graph[ - serial_tensor_node_id] = new_dist_tensor + serial_tensor_node_id + ] = new_dist_tensor if node.is_op() and node.op() is not None: dist_op = None op_id = node.node.original_desc_id() - for cur_op_id, cur_dist_op in self._dist_ops_for_program.items( - ): - if op_id == cur_op_id \ - or op_id == cur_dist_op.serial_op.desc.original_id(): + for ( + cur_op_id, + cur_dist_op, + ) in self._dist_ops_for_program.items(): + if ( + op_id == cur_op_id + or op_id == cur_dist_op.serial_op.desc.original_id() + ): dist_op = cur_dist_op self._node_id_to_op_id[_node_id(node)] = cur_op_id - assert dist_op is not None, \ - "Operator must have a distributed operator after the initialization for program." + assert ( + dist_op is not None + ), "Operator must have a distributed operator after the initialization for program." 
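# ---------------------------------------------------------------------------
# [Editor's note — illustration only, not part of the patch]
# _order_nodes_by_program_order above still answers "have we seen this node
# yet?" with the linear _contains scan, which the new timing probes
# presumably measure, and the in-line TODOs point at the same cost. Below is
# a hedged sketch of an O(1) alternative keyed on node ids; `node_id_fn`
# stands in for the module's _node_id helper and the wrapper is invented.
def make_seen_tracker(node_id_fn):
    seen = set()

    def contains(node):
        return node_id_fn(node) in seen

    def add(node):
        seen.add(node_id_fn(node))

    return contains, add


# Hypothetical usage inside the ordering loop:
# contains, add = make_seen_tracker(_node_id)
# if tensor_node.is_var() and not contains(tensor_node):
#     add(tensor_node)
#     tensor_nodes.append(tensor_node)
# ---------------------------------------------------------------------------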
serial_op_node_id = _node_id(node) - new_dist_op = DistributedOperator(dist_op.serial_op, - dist_op.dist_attr) + new_dist_op = DistributedOperator( + dist_op.serial_op, dist_op.dist_attr + ) self._dist_ops_for_graph[serial_op_node_id] = new_dist_op + print("bot-context-graph-dist-init: ", time.time() - start_time, flush=True) + # for node_id, dist_tensor in self._dist_tensors_for_graph.items(): + # print("graph dist tensor: ", node_id, dist_tensor.serial_tensor.desc.id(), flush=True) + # for node_id, dist_op in self._dist_ops_for_graph.items(): + # print("graph dist op: ", node_id, dist_op.serial_op.desc.id(), flush=True) + # for node_id, id in self._node_id_to_tensor_id.items(): + # print("graph dist tensor node_id: ", node_id, id, flush=True) + # for node_id, id in self._node_id_to_op_id.items(): + # print("graph dist op node_id: ", node_id, id, flush=True) def clear_dist_info_for_program(self): self._dist_tensors_for_program.clear() @@ -692,36 +824,111 @@ def copy_dist_attr_from_program_to_graph(self): if node.is_var() and node.var() is not None: dist_tensor = None tensor_id = node.node.original_desc_id() - for cur_tensor_id, cur_dist_tensor in self._dist_tensors_for_program.items( - ): - if tensor_id == cur_tensor_id \ - or tensor_id == cur_dist_tensor.serial_tensor.desc.original_id(): + for ( + cur_tensor_id, + cur_dist_tensor, + ) in self._dist_tensors_for_program.items(): + if ( + tensor_id == cur_tensor_id + or tensor_id + == cur_dist_tensor.serial_tensor.desc.original_id() + ): dist_tensor = cur_dist_tensor - assert dist_tensor is not None, \ - "Tensor must have a distributed tensor after the initialization for program." + assert ( + dist_tensor is not None + ), "Tensor must have a distributed tensor after the initialization for program." serial_tensor_node_id = _node_id(node) - new_dist_tensor = DistributedTensor(dist_tensor.serial_tensor, - dist_tensor.dist_attr) + new_dist_tensor = DistributedTensor( + dist_tensor.serial_tensor, dist_tensor.dist_attr + ) self._dist_tensors_for_graph[ - serial_tensor_node_id] = new_dist_tensor + serial_tensor_node_id + ] = new_dist_tensor if node.is_op() and node.op() is not None: dist_op = None op_id = node.node.original_desc_id() - for cur_op_id, cur_dist_op in self._dist_ops_for_program.items( - ): - if op_id == cur_op_id \ - or op_id == cur_dist_op.serial_op.desc.original_id(): + for ( + cur_op_id, + cur_dist_op, + ) in self._dist_ops_for_program.items(): + if ( + op_id == cur_op_id + or op_id == cur_dist_op.serial_op.desc.original_id() + ): dist_op = cur_dist_op - assert dist_op is not None, \ - "Operator must have a distributed operator after the initialization for program." + assert ( + dist_op is not None + ), "Operator must have a distributed operator after the initialization for program." 
serial_op_node_id = _node_id(node) - new_dist_op = DistributedOperator(dist_op.serial_op, - dist_op.dist_attr) + new_dist_op = DistributedOperator( + dist_op.serial_op, dist_op.dist_attr + ) + self._dist_ops_for_graph[serial_op_node_id] = new_dist_op + + def copy_dist_attr_from_program_to_graph_bk(self): + for node in self.serial_ordered_nodes: + if node.is_var() and node.var() is not None: + dist_tensor = None + tensor_id = node.node.original_desc_id() + # for ( + # cur_tensor_id, + # cur_dist_tensor, + # ) in self._dist_tensors_for_program.items(): + # if ( + # tensor_id == cur_tensor_id + # or tensor_id + # == cur_dist_tensor.serial_tensor.desc.original_id() + # ): + # dist_tensor = cur_dist_tensor + cur_dist_tensor = self._dist_tensors_for_program.get(tensor_id, None) + if cur_dist_tensor is not None: + cur_tensor_id = tensor_id + else: + cur_tensor_id = self._tensor_original_id_to_id[tensor_id] + cur_dist_tensor = self._dist_tensors_for_program.get(cur_tensor_id, None) + dist_tensor = cur_dist_tensor + assert ( + dist_tensor is not None + ), "Tensor must have a distributed tensor after the initialization for program." + serial_tensor_node_id = _node_id(node) + new_dist_tensor = DistributedTensor( + dist_tensor.serial_tensor, dist_tensor.dist_attr + ) + self._dist_tensors_for_graph[ + serial_tensor_node_id + ] = new_dist_tensor + if node.is_op() and node.op() is not None: + dist_op = None + op_id = node.node.original_desc_id() + # for ( + # cur_op_id, + # cur_dist_op, + # ) in self._dist_ops_for_program.items(): + # if ( + # op_id == cur_op_id + # or op_id == cur_dist_op.serial_op.desc.original_id() + # ): + # dist_op = cur_dist_op + cur_dist_op = self._dist_ops_for_program.get(op_id, None) + if cur_dist_op is not None: + cur_op_id = op_id + else: + cur_op_id = self._op_original_id_to_id[op_id] + cur_dist_op = self._dist_ops_for_program.get(cur_op_id, None) + dist_op = cur_dist_op + assert ( + dist_op is not None + ), "Operator must have a distributed operator after the initialization for program." + serial_op_node_id = _node_id(node) + new_dist_op = DistributedOperator( + dist_op.serial_op, dist_op.dist_attr + ) self._dist_ops_for_graph[serial_op_node_id] = new_dist_op def copy_dist_attr_from_graph_to_program(self): - assert self._is_initialized, \ - "Both program and graph must be initialized." + assert ( + self._is_initialized + ), "Both program and graph must be initialized." 
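# ---------------------------------------------------------------------------
# [Editor's note — illustration only, not part of the patch]
# The copy_dist_attr_from_program_to_graph_bk variant above sketches the
# dict-based lookup requested by the earlier TODO ("Use dict and
# (id, original_id) for keys"). One way to build such a table directly from
# _dist_tensors_for_program is shown here; the helper name is hypothetical,
# and only the two-key matching rule comes from the hunks above.
def build_tensor_lookup(dist_tensors_for_program):
    lookup = {}
    for tensor_id, dist_tensor in dist_tensors_for_program.items():
        lookup[tensor_id] = dist_tensor
        lookup[dist_tensor.serial_tensor.desc.original_id()] = dist_tensor
    return lookup


# Hypothetical usage when visiting a graph node:
# lookup = build_tensor_lookup(self._dist_tensors_for_program)
# dist_tensor = lookup.get(node.node.original_desc_id())
# ---------------------------------------------------------------------------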
updated_tensors = {} # all_nodes = self._serial_graph.all_nodes() all_nodes = self._serial_ordered_nodes @@ -731,11 +938,15 @@ def copy_dist_attr_from_graph_to_program(self): updated = updated_tensors.get(tensor_id, False) # If a var has multiples var nodes in graph, only use the first one for now if not updated: - tensor_dist_attr_for_graph = self.get_tensor_dist_attr_for_graph( - node) + tensor_dist_attr_for_graph = ( + self.get_tensor_dist_attr_for_graph(node) + ) dist_tensor_for_program = self._dist_tensors_for_program[ - tensor_id] - dist_tensor_for_program.dist_attr = tensor_dist_attr_for_graph + tensor_id + ] + dist_tensor_for_program.dist_attr = ( + tensor_dist_attr_for_graph + ) updated_tensors[tensor_id] = True if node.is_op() and node.op() is not None: op_id = self._node_id_to_op_id[_node_id(node)] @@ -747,22 +958,26 @@ def copy_dist_attr_from_graph_to_program(self): for orphan_node in self._serial_orphan_tensor_nodes: serial_tensor_id = orphan_node.var().id() dist_tensor = self._dist_tensors_for_program.get( - serial_tensor_id, None) + serial_tensor_id, None + ) if dist_tensor: dist_tensor.dist_attr.process_mesh = self._process_meshes[0] else: serial_tensor_id = orphan_node.var().original_id() dist_tensor = self._dist_tensors_for_program.get( - serial_tensor_id, None) + serial_tensor_id, None + ) dist_tensor.dist_attr.process_mesh = self._process_meshes[0] def amend_dist_attr_for_program(self): for dist_tensor in self._dist_tensors_for_program.values(): serial_tensor = dist_tensor.serial_tensor dist_attr = dist_tensor.dist_attr - if serial_tensor.type == core.VarDesc.VarType.READER \ - or serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ - or serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES: + if ( + serial_tensor.type == core.VarDesc.VarType.READER + or serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY + or serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES + ): tensor_shape = [] else: tensor_shape = serial_tensor.shape @@ -772,8 +987,11 @@ def amend_dist_attr_for_program(self): # If the dimension of tensor is less than the sharding dimension of process mesh, # we just amend the dimension mapping to -1. (Is this really OK?) for i in range(len(tensor_shape)): - if dims_mapping[i] != -1 and tensor_shape[i] > 0 \ - and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: + if ( + dims_mapping[i] != -1 + and tensor_shape[i] > 0 + and process_mesh_shape[dims_mapping[i]] > tensor_shape[i] + ): dims_mapping[i] = -1 if dims_mapping[i] != -1 and len(process_mesh_processes) == 1: dims_mapping[i] = -1 @@ -787,9 +1005,13 @@ def amend_dist_attr_for_program(self): if dist_op.get_serial_input(arg_name) is None: tensor_shape = [] else: - if dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.READER \ - or dist_op.get_serial_input(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ - or dist_op.serial_op.type == "create_py_reader": + if ( + dist_op.get_serial_input(arg_name).type + == core.VarDesc.VarType.READER + or dist_op.get_serial_input(arg_name).type + == core.VarDesc.VarType.LOD_TENSOR_ARRAY + or dist_op.serial_op.type == "create_py_reader" + ): tensor_shape = [] else: tensor_shape = dist_op.get_serial_input(arg_name).shape @@ -797,16 +1019,27 @@ def amend_dist_attr_for_program(self): # If the dimension of tensor is less than the sharding dimension of process mesh, # we just amend the dimension mapping to -1. (Is this really OK?) 
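# ---------------------------------------------------------------------------
# [Editor's note — illustration only, not part of the patch]
# The comment above (repeated for tensors, op inputs and op outputs in this
# function) states the amendment rule the reformatted loops implement: a
# sharded axis falls back to replication (-1) when the mesh dimension has
# more processes than the axis has elements, or when the mesh has a single
# process. A standalone sketch of that rule, with invented names and shapes:
def amend_dims_mapping(dims_mapping, tensor_shape, mesh_shape, n_processes):
    amended = list(dims_mapping)
    for i in range(len(tensor_shape)):
        if (
            amended[i] != -1
            and tensor_shape[i] > 0
            and mesh_shape[amended[i]] > tensor_shape[i]
        ):
            amended[i] = -1
        if amended[i] != -1 and n_processes == 1:
            amended[i] = -1
    return amended


# A length-4 axis cannot be split across 8 processes, so it is replicated:
# amend_dims_mapping([0], [4], [8], 8) -> [-1]
# ---------------------------------------------------------------------------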
for i in range(len(tensor_shape)): - if dims_mapping[i] != -1 and tensor_shape[i] > 0 \ - and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: + if ( + dims_mapping[i] != -1 + and tensor_shape[i] > 0 + and process_mesh_shape[dims_mapping[i]] + > tensor_shape[i] + ): dims_mapping[i] = -1 - if dims_mapping[i] != -1 and len( - process_mesh_processes) == 1: + if ( + dims_mapping[i] != -1 + and len(process_mesh_processes) == 1 + ): dims_mapping[i] = -1 for arg_name in serial_op.output_arg_names: - if dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.READER \ - or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ - or dist_op.get_serial_output(arg_name).type == core.VarDesc.VarType.STEP_SCOPES: + if ( + dist_op.get_serial_output(arg_name).type + == core.VarDesc.VarType.READER + or dist_op.get_serial_output(arg_name).type + == core.VarDesc.VarType.LOD_TENSOR_ARRAY + or dist_op.get_serial_output(arg_name).type + == core.VarDesc.VarType.STEP_SCOPES + ): tensor_shape = [] else: tensor_shape = dist_op.get_serial_output(arg_name).shape @@ -814,11 +1047,17 @@ def amend_dist_attr_for_program(self): # If the dimension of tensor is less than the sharding dimension of process mesh, # we just amend the dimension mapping to -1. (Is this really OK?) for i in range(len(tensor_shape)): - if dims_mapping[i] != -1 and tensor_shape[i] > 0 \ - and process_mesh_shape[dims_mapping[i]] > tensor_shape[i]: + if ( + dims_mapping[i] != -1 + and tensor_shape[i] > 0 + and process_mesh_shape[dims_mapping[i]] + > tensor_shape[i] + ): dims_mapping[i] = -1 - if dims_mapping[i] != -1 and len( - process_mesh_processes) == 1: + if ( + dims_mapping[i] != -1 + and len(process_mesh_processes) == 1 + ): dims_mapping[i] = -1 if len(process_mesh_processes) == 1: dist_op.dist_attr.impl_type = "default" @@ -826,30 +1065,44 @@ def amend_dist_attr_for_program(self): def validate_dist_attr_for_program(self): if not self._is_initialized: - assert False, \ - "Program must be initialized before validating its distributed attributes" + assert ( + False + ), "Program must be initialized before validating its distributed attributes" for block in self.serial_main_program.blocks: for tensor in block.vars.values(): dist_tensor = self.get_dist_tensor_for_program(tensor) - assert dist_tensor is not None, \ - "Tensor {} does not have a distributed attribute.".format( - dist_tensor.serial_tensor.name) - if (dist_tensor - is not None) and (not dist_tensor.validate_dist_attr()): - assert False, "Tensor {} (id: {}, original_id: {}) has a wrong distributed attributes {}.".format( + assert ( + dist_tensor is not None + ), "Tensor {} does not have a distributed attribute.".format( + dist_tensor.serial_tensor.name + ) + if (dist_tensor is not None) and ( + not dist_tensor.validate_dist_attr() + ): + assert ( + False + ), "Tensor {} (id: {}, original_id: {}) has a wrong distributed attributes {}.".format( dist_tensor.serial_tensor.name, dist_tensor.serial_tensor.desc.id(), dist_tensor.serial_tensor.desc.original_id(), - dist_tensor.dist_attr) + dist_tensor.dist_attr, + ) for op in block.ops: dist_op = self.get_dist_op_for_program(op) - assert dist_op is not None, \ - "Operator {} does not have a distributed attribute.".format( - dist_op.serial_op.type) + assert ( + dist_op is not None + ), "Operator {} does not have a distributed attribute.".format( + dist_op.serial_op.type + ) if (dist_op is not None) and (not dist_op.validate_dist_attr()): - assert False, "Operator {} (id: {}, original_id: {}) has a wrong 
distributed attributes {} .".format( - dist_op.serial_op.type, dist_op.serial_op.desc.id(), - dist_op.serial_op.desc.original_id(), dist_op.dist_attr) + assert ( + False + ), "Operator {} (id: {}, original_id: {}) has a wrong distributed attributes {} .".format( + dist_op.serial_op.type, + dist_op.serial_op.desc.id(), + dist_op.serial_op.desc.original_id(), + dist_op.dist_attr, + ) return True def __deepcopy__(self, memo): @@ -858,15 +1111,27 @@ def __deepcopy__(self, memo): memo[id(self)] = result for k, v in self.__dict__.items(): if k in [ - "_original_serial_main_program", "_original_serial_startup_program", \ - "_serial_main_program", "_serial_startup_program", "_serial_graph", \ - "_dist_main_programs", "_dist_startup_programs", \ - "_serial_ordered_nodes", "_serial_ordered_tensor_nodes", \ - "_serial_ordered_op_nodes", "_original_serial_loss", \ - "_original_serial_feed_vars", "_original_serial_fetch_vars", \ - "_serial_loss", "_serial_feed_vars", "_serial_fetch_vars", "_lr_optimizer", \ - "_backup_serial_main_program_stack", "_backup_serial_startup_program_stack", \ - "_pass_context"]: + "_original_serial_main_program", + "_original_serial_startup_program", + "_serial_main_program", + "_serial_startup_program", + "_serial_graph", + "_dist_main_programs", + "_dist_startup_programs", + "_serial_ordered_nodes", + "_serial_ordered_tensor_nodes", + "_serial_ordered_op_nodes", + "_original_serial_loss", + "_original_serial_feed_vars", + "_original_serial_fetch_vars", + "_serial_loss", + "_serial_feed_vars", + "_serial_fetch_vars", + "_serial_optimizer", + "_backup_serial_main_program_stack", + "_backup_serial_startup_program_stack", + "_pass_context", + ]: setattr(result, k, v) else: setattr(result, k, copy.deepcopy(v, memo)) @@ -908,8 +1173,12 @@ def __deepcopy__(self, memo): memo[id(self)] = result for k, v in self.__dict__.items(): if k in [ - "_dst_main_program", "_dst_startup_program", "_cur_src_op", - "_work_block", "_main_block", "_startup_block" + "_dst_main_program", + "_dst_startup_program", + "_cur_src_op", + "_work_block", + "_main_block", + "_startup_block", ]: setattr(result, k, v) else: @@ -989,7 +1258,6 @@ def prepare_context(self, src_op): class BlockState(object): - def __init__(self): self.nblock = 0 self.forward_indices = [] @@ -1006,8 +1274,11 @@ def parse_forward_blocks(self, program): for idx, block in enumerate(program.blocks): assert idx == block.idx, "index doesn't match" - assert block.forward_block_idx == -1, "forward_block_idx of forward block [{}] is not [{}]".format( - idx, block.forward_block_idx) + assert ( + block.forward_block_idx == -1 + ), "forward_block_idx of forward block [{}] is not [{}]".format( + idx, block.forward_block_idx + ) self.forward_indices.append(idx) self.nblock += 1 @@ -1016,7 +1287,8 @@ def parse_forward_blocks(self, program): def parse_backward_blocks(self, program): assert 0 in self.forward_indices, "forward block idx are{}".format( - self.forward_indices) + self.forward_indices + ) self.backward_to_forward_index_map[0] = 0 for idx, block in enumerate(program.blocks): @@ -1031,3 +1303,49 @@ def parse_backward_blocks(self, program): self.nblock += 1 assert self.nblock == len(program.blocks) + + +class UpDownStream: + def __init__(self): + self._ups = dict() + self._downs = dict() + + def add_up_stream(self, rank, up_stream): + ups = self._ups.get(rank, None) + if not ups: + self._ups[rank] = [up_stream] + elif up_stream != -1: + ups = list(filter(lambda a: a != -1, ups)) + ups.append(up_stream) + self._ups[rank] = ups + + def 
add_down_stream(self, rank, down_stream): + downs = self._downs.get(rank, None) + if not downs: + self._downs[rank] = [down_stream] + elif down_stream != -1: + downs = list(filter(lambda a: a != -1, downs)) + downs.append(down_stream) + self._downs[rank] = downs + + def add_pair_stream(self, up, down): + self.add_up_stream(up, -1) + self.add_up_stream(down, up) + self.add_down_stream(up, down) + self.add_down_stream(down, -1) + print(up, "'s upstream is ", self.ups(up)) + print(down, "'s upstream is ", self.ups(down)) + print(up, "'s downstream is ", self.downs(up)) + print(down, "'s downstream is ", self.downs(down)) + + def ups(self, rank): + ups = self._ups.get(rank, None) + if not ups: + return None + return list(set(ups)) + + def downs(self, rank): + downs = self._downs.get(rank, None) + if not downs: + return None + return list(set(downs)) diff --git a/python/paddle/distributed/auto_parallel/dist_loader.py b/python/paddle/distributed/auto_parallel/dist_loader.py index 5645235cb71f62..38b537799e546f 100644 --- a/python/paddle/distributed/auto_parallel/dist_loader.py +++ b/python/paddle/distributed/auto_parallel/dist_loader.py @@ -14,59 +14,14 @@ import abc import numpy as np -from functools import wraps import paddle -from .utils import to_list -from paddle.fluid.layers.utils import flatten -from paddle.io import DataLoader, BatchSampler, IterableDataset -from paddle.fluid.dataloader.batch_sampler import _InfiniteIterableSampler +from paddle.io import BatchSampler, IterableDataset +from paddle.fluid.dataloader.batch_sampler import _InfiniteIterableSampler, DistributedBatchSampler from paddle.fluid.dataloader.dataloader_iter import _DatasetKind, default_collate_fn, default_convert_fn -class DistributedDataLoader(metaclass=abc.ABCMeta): - - def __init__(self, - dataset, - batch_size=1, - epochs=1, - data_parallel_world_size=None, - data_parallel_rank=None, - drop_last=False, - split_data=True): - if isinstance(dataset, IterableDataset): - self.dataset_kind = _DatasetKind.ITER - else: - self.dataset_kind = _DatasetKind.MAP - - self.dataset = dataset - self.epochs = epochs - self.drop_lost = drop_last - self.data_parallel_world_size = data_parallel_world_size - self.data_parallel_rank = data_parallel_rank - self.split_data = split_data - - if batch_size is None: - self.batch_size = None - self.batch_sampler = None - else: - if data_parallel_world_size is not None: - for dp_world_size in data_parallel_world_size: - if dp_world_size is not None: - assert batch_size % dp_world_size == 0, \ - "batch_size must be divisible by dp_world_size value {}".format(str(dp_world_size)) - self.batch_size = batch_size - if isinstance(dataset, IterableDataset): - self.batch_sampler = _InfiniteIterableSampler( - dataset, batch_size) - else: - self.batch_sampler = BatchSampler(dataset, - batch_size=batch_size, - shuffle=False, - drop_last=drop_last) - - self.auto_collate_batch = self.batch_sampler is not None - self.sampler_iter = iter(self.index_sampler) +class DistributedDataLoaderBase(metaclass=abc.ABCMeta): @abc.abstractmethod def __iter__(self): @@ -87,36 +42,72 @@ def index_sampler(self): return _InfiniteIterableSampler(self.dataset, 1) -class NonIterableGeneratorLoader(DistributedDataLoader): +class DistributedDataLoaderFromGenerator(DistributedDataLoaderBase): def __init__(self, dataset, - feed_list, - places, + feed_list=None, + capacity=None, + use_double_buffer=True, + iterable=True, + return_list=False, + use_multiprocess=False, + drop_last=True, + places=None, batch_size=1, epochs=1, 
steps_per_epoch=None, collate_fn=None, - data_parallel_world_size=None, - data_parallel_rank=None, - drop_last=False, - split_data=True): + split_data=True, + data_parallel_world_size=[], + data_parallel_rank=[]): + self.dataset = dataset self.feed_list = feed_list + self.capacity = capacity + self.use_double_buffer = use_double_buffer + self.iterable = iterable + self.return_list = return_list + self.use_multiprocess = use_multiprocess + self.drop_last = drop_last self.places = places + self.batch_size = batch_size + self.epochs = epochs self.steps_per_epoch = steps_per_epoch + self.collate_fn = collate_fn + self.split_data = split_data + assert len(data_parallel_world_size) == len(feed_list) + assert len(data_parallel_rank) == len(feed_list) + self.dp_world_sizes = data_parallel_world_size + self.dp_ranks = data_parallel_rank + + if isinstance(dataset, IterableDataset): + self.dataset_kind = _DatasetKind.ITER + else: + self.dataset_kind = _DatasetKind.MAP + + if self.batch_size is None: + self.batch_sampler = None + else: + if isinstance(dataset, IterableDataset): + self.batch_sampler = _InfiniteIterableSampler( + dataset, batch_size) + else: + self.batch_sampler = BatchSampler(dataset, + batch_size=batch_size, + shuffle=False, + drop_last=drop_last) - super(NonIterableGeneratorLoader, - self).__init__(dataset, batch_size, epochs, - data_parallel_world_size, data_parallel_rank, - drop_last, split_data) + self.auto_collate_batch = self.batch_sampler is not None + self.sampler_iter = iter(self.index_sampler) if self.auto_collate_batch: self.collate_fn = collate_fn or default_collate_fn else: self.collate_fn = collate_fn or default_convert_fn + self.dataset_fetcher = _DatasetKind.create_fetcher( self.dataset_kind, self.dataset, self.auto_collate_batch, - self.collate_fn, self.drop_lost) + self.collate_fn, self.drop_last) self._steps = self._infer_steps() self._inner_dataloader = self._create_inner_dataloader() @@ -129,8 +120,10 @@ def __iter__(self): def __next__(self): if not self._steps: self._cur_step += 1 + return None elif self._cur_step < self._steps: self._cur_step += 1 + return None else: self._inner_dataloader.reset() self.sampler_iter = iter(self.index_sampler) @@ -152,70 +145,125 @@ def _infer_steps(self): ) return steps_per_epoch + @property + def index_sampler(self): + if self.auto_collate_batch: + return self.batch_sampler + else: + if self.dataset_kind == _DatasetKind.MAP: + return list(range(len(self.dataset))) + else: + return _InfiniteIterableSampler(self.dataset, 1) + def _create_inner_dataloader(self): - def sample_data_generator(): + def data_generator(): while True: try: indices = next(self.sampler_iter) batch = self.dataset_fetcher.fetch(indices) if batch is None: break - except StopIteration: self.dataset_fetcher = _DatasetKind.create_fetcher( self.dataset_kind, self.dataset, self.auto_collate_batch, self.collate_fn, - self.drop_lost) + self.drop_last) break partial_data = [] - for i, d in enumerate(batch[:len(self.feed_list)]): + for i, d in enumerate(batch): array = np.array(d) if not self.split_data: partial_data.append(array) - elif self.dp_world_sizes[i] is not None: - partial_data.append( - np.split(array, - self.dp_world_sizes[i])[self.dp_ranks[i]]) - else: - partial_data.append(array) + continue + + batch_size = array.shape[0] + assert batch_size % self.dp_world_sizes[i] == 0, \ + "batch_size [{}] is not divisible by dp_world_size [{}]".format(str(batch_size), str(self.dp_world_sizes[i])) + partial_data.append( + np.split(array, + 
self.dp_world_sizes[i])[self.dp_ranks[i]]) + yield partial_data - def batch_data_generator(): - while True: - try: - indices = next(self.sampler_iter) + dataloader = paddle.fluid.io.DataLoader.from_generator( + feed_list=self.feed_list, + capacity=self.capacity, + use_double_buffer=self.use_double_buffer, + # iterable=self.iterable, + iterable=False, + return_list=self.return_list, + use_multiprocess=self.use_multiprocess, + drop_last=self.drop_last) + dataloader.set_batch_generator(data_generator, self.places) - batch = self.dataset_fetcher.fetch(indices) - if batch is None: break - except StopIteration: - break + return dataloader - partial_data = [] - for i, d in enumerate(batch[:len(self.feed_list)]): - array = np.array(d) - if not self.split_data: - partial_data.append(array) - elif self.dp_world_sizes[i] is not None: - partial_data.append( - np.split(array, - self.dp_world_sizes[i])[self.dp_ranks[i]]) - else: - partial_data.append(array) - yield partial_data - self.dp_world_sizes = [ - 1 for _ in range(len(self.feed_list)) - ] if self.data_parallel_world_size is None else self.data_parallel_world_size - self.dp_ranks = [ - 0 for _ in range(len(self.feed_list)) - ] if self.data_parallel_rank is None else self.data_parallel_rank +class DistributedDataLoader(DistributedDataLoaderBase): - dataloader = paddle.fluid.io.DataLoader.from_generator( - feed_list=self.feed_list, capacity=70, iterable=False) - if self.batch_size is not None: - dataloader.set_batch_generator(sample_data_generator, self.places) - else: - dataloader.set_batch_generator(batch_data_generator, self.places) + def __init__(self, + dataset, + feed_list=None, + places=None, + return_list=True, + batch_size=1, + shuffle=False, + drop_last=False, + collate_fn=None, + num_workers=0, + use_buffer_reader=True, + use_shared_memory=True, + timeout=0, + worker_init_fn=None, + epochs=1, + steps_per_epoch=None, + split_data=True, + data_parallel_world_size=[], + data_parallel_rank=[]): + self.dataset = dataset + self.feed_list = feed_list + self.return_list = return_list + self.places = places + self.batch_size = batch_size + self.shuffle = shuffle + self.drop_last = drop_last + self.collate_fn = collate_fn + self.num_workers = num_workers + self.use_buffer_reader = use_buffer_reader + self.use_shared_memory = use_shared_memory + self.timeout = timeout + self.worker_init_fn = worker_init_fn + self.epochs = epochs + self.steps_per_epoch = steps_per_epoch + self.dp_world_sizes = data_parallel_world_size + self.dp_ranks = data_parallel_rank + self.split_data = split_data + # TODO: rank info + self.batch_sampler = DistributedBatchSampler( + self.dataset, self.batch_size, self.dp_world_sizes[0], + self.dp_ranks[0], self.shuffle, self.drop_last) + self._inner_dataloader = self._create_inner_dataloader() + + def __iter__(self): + return self + + def __next__(self): + return next(self.data) + + def _create_inner_dataloader(self): + dataloader = paddle.fluid.io.DataLoader( + self.dataset, + feed_list=self.feed_list, + places=self.places, + return_list=self.return_list, + batch_sampler=self.batch_sampler, + collate_fn=self.collate_fn, + num_workers=self.num_workers, + use_buffer_reader=self.use_buffer_reader, + use_shared_memory=self.use_shared_memory, + timeout=self.timeout, + worker_init_fn=self.worker_init_fn) + self.data = (x for x in dataloader) return dataloader diff --git a/python/paddle/distributed/auto_parallel/dist_op.py b/python/paddle/distributed/auto_parallel/dist_op.py index b6a77b778885f5..1ca49e364734c5 100644 --- 
a/python/paddle/distributed/auto_parallel/dist_op.py +++ b/python/paddle/distributed/auto_parallel/dist_op.py @@ -23,10 +23,10 @@ from .dist_attribute import append_op_output_suffix from .dist_attribute import get_tensor_dist_attr_field_keys from .dist_attribute import get_op_dist_attr_field_keys +from .utils import convert_to_shard_spec, verify_shard_spec class DistributedOperator: - def __init__(self, serial_op, dist_attr=None): self._serial_op = serial_op self._serial_inputs = {} @@ -77,28 +77,34 @@ def _init_default_dist_attr(self): if tensor is None: tensor_shape = [] else: - if tensor.type == core.VarDesc.VarType.READER \ - or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + if ( + tensor.type == core.VarDesc.VarType.READER + or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY + ): tensor_shape = [] else: tensor_shape = tensor.shape if self._dist_attr.get_input_dims_mapping(tensor_name) is None: tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))] - self._dist_attr.set_input_dims_mapping(tensor_name, - tensor_dims_mapping) + self._dist_attr.set_input_dims_mapping( + tensor_name, tensor_dims_mapping + ) for tensor_name in self._serial_op.output_arg_names: tensor = self._serial_op.block._var_recursive(tensor_name) - if tensor.type == core.VarDesc.VarType.READER \ - or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ - or tensor.type == core.VarDesc.VarType.STEP_SCOPES: + if ( + tensor.type == core.VarDesc.VarType.READER + or tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY + or tensor.type == core.VarDesc.VarType.STEP_SCOPES + ): tensor_shape = [] else: tensor_shape = tensor.shape self._serial_outputs[tensor_name] = tensor if self._dist_attr.get_output_dims_mapping(tensor_name) is None: tensor_dims_mapping = [-1 for _ in range(len(tensor_shape))] - self._dist_attr.set_output_dims_mapping(tensor_name, - tensor_dims_mapping) + self._dist_attr.set_output_dims_mapping( + tensor_name, tensor_dims_mapping + ) if self._dist_attr.op_type is None: self._dist_attr.op_type = self.serial_op.type if self._dist_attr.impl_type is None: @@ -116,8 +122,10 @@ def _filter_dist_attr(self, dist_attr): new_dist_attr = {} for key, value in dist_attr.items(): if isinstance(key, Variable): - if key.name in self._serial_op.input_arg_names \ - or key.name in self._serial_op.output_arg_names: + if ( + key.name in self._serial_op.input_arg_names + or key.name in self._serial_op.output_arg_names + ): new_dist_attr[key] = value else: new_dist_attr[key] = value @@ -128,13 +136,15 @@ def _filter_dist_attr(self, dist_attr): for tensor_name in self._serial_op.input_arg_names: tensor_dist_attr = dist_attr.get_input_dist_attr(tensor_name) if tensor_dist_attr: - new_dist_attr.set_input_dist_attr(tensor_name, - tensor_dist_attr) + new_dist_attr.set_input_dist_attr( + tensor_name, tensor_dist_attr + ) for tensor_name in self._serial_op.output_arg_names: tensor_dist_attr = dist_attr.get_output_dist_attr(tensor_name) if tensor_dist_attr: - new_dist_attr.set_output_dist_attr(tensor_name, - tensor_dist_attr) + new_dist_attr.set_output_dist_attr( + tensor_name, tensor_dist_attr + ) else: assert False, "Cannot recognize the {} parameter.".format(dist_attr) return new_dist_attr @@ -145,8 +155,10 @@ def validate_dist_attr(self): for name in self.serial_op.input_arg_names: input_dist_attr = self.dist_attr.get_input_dist_attr(name) dims_mapping = input_dist_attr.dims_mapping - if self.get_serial_input( - name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + if ( + self.get_serial_input(name).type + == 
core.VarDesc.VarType.LOD_TENSOR_ARRAY + ): shape = [] else: shape = self.get_serial_input(name).shape @@ -154,7 +166,8 @@ def validate_dist_attr(self): return False for i in range(len(dims_mapping)): if dims_mapping[i] < -1 or dims_mapping[i] >= len( - self.dist_attr.process_mesh.topology): + self.dist_attr.process_mesh.topology + ): return False for i in range(len(self.dist_attr.process_mesh.topology)): if dims_mapping.count(i) > 1: @@ -165,8 +178,12 @@ def validate_dist_attr(self): for name in self.serial_op.output_arg_names: output_dist_attr = self.dist_attr.get_output_dist_attr(name) dims_mapping = output_dist_attr.dims_mapping - if self.get_serial_output(name).type == core.VarDesc.VarType.LOD_TENSOR_ARRAY\ - or self.get_serial_output(name).type == core.VarDesc.VarType.STEP_SCOPES: + if ( + self.get_serial_output(name).type + == core.VarDesc.VarType.LOD_TENSOR_ARRAY + or self.get_serial_output(name).type + == core.VarDesc.VarType.STEP_SCOPES + ): shape = [] else: shape = self.get_serial_output(name).shape @@ -174,7 +191,8 @@ def validate_dist_attr(self): return False for i in range(len(dims_mapping)): if dims_mapping[i] < -1 or dims_mapping[i] >= len( - self.dist_attr.process_mesh.topology): + self.dist_attr.process_mesh.topology + ): return False for i in range(len(self.dist_attr.process_mesh.topology)): if dims_mapping.count(i) > 1: @@ -184,8 +202,9 @@ def validate_dist_attr(self): return True def __str__(self): - str = "{{op type: {}, op id: {}".format(self.serial_op.desc.type(), - self.serial_op.desc.id()) + str = "{{op type: {}, op id: {}".format( + self.serial_op.desc.type(), self.serial_op.desc.id() + ) # str += ", {}".format(self.dist_attr) # return str @@ -194,8 +213,9 @@ def __str__(self): annotated_str = "annotated" else: annotated_str = "non-annotated" - str += ", process_mesh ({}): {}".format(annotated_str, - self.dist_attr.process_mesh) + str += ", process_mesh ({}): {}".format( + annotated_str, self.dist_attr.process_mesh + ) for arg_name in self.serial_op.desc.input_arg_names(): dims_mapping = self.dist_attr.get_input_dims_mapping(arg_name) @@ -211,7 +231,8 @@ def __str__(self): else: is_parameter_str = "non-parameter" str += ", {}'s dims_mapping (input, {}, {}): {}".format( - arg_name, annotated_str, is_parameter_str, dims_mapping) + arg_name, annotated_str, is_parameter_str, dims_mapping + ) for arg_name in self.serial_op.desc.output_arg_names(): dims_mapping = self.dist_attr.get_output_dims_mapping(arg_name) @@ -227,12 +248,14 @@ def __str__(self): else: is_parameter_str = "non-parameter" str += ", {}'s dims_mapping (output, {}, {}): {}".format( - arg_name, annotated_str, is_parameter_str, dims_mapping) + arg_name, annotated_str, is_parameter_str, dims_mapping + ) str += ", pipeline stage: {}".format(None) str += ", dist_impl idx: {} , dist_impl type {} }}".format( - self.dist_attr._impl_idx, self.dist_attr._impl_type) + self.dist_attr._impl_idx, self.dist_attr._impl_type + ) return str @@ -241,30 +264,139 @@ def __deepcopy__(self, memo): result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): - if k == "_serial_op" or k == "_serial_inputs" or k == "_serial_outputs": + if ( + k == "_serial_op" + or k == "_serial_inputs" + or k == "_serial_outputs" + ): setattr(result, k, v) else: setattr(result, k, copy.deepcopy(v, memo)) return result -class DistributedModule: - - def __init__(self, serial_module, dist_attr=None): - self._serial_module = serial_module - self._dist_attr = dist_attr +class DistributedOperatorHelper: + def __init__( + self, 
serial_op, process_mesh, in_dims_mappings, out_dims_mappings + ): + self._serial_op = serial_op + self._process_mesh = process_mesh + self._in_dims_mappings = in_dims_mappings + self._out_dims_mappings = out_dims_mappings def __call__(self, *args, **kwargs): - from .dist_context import get_default_distributed_context + tensor_to_dims_mapping = {} + index = 0 + if self._in_dims_mappings: + assert len(args) + len(kwargs) == len( + self._in_dims_mappings + ), "The length of dims_mapping {} does not matching the length output {}.".format( + len(self._in_dims_mappings), len(args) + len(kwargs) + ) + for arg in args: + if isinstance(arg, Variable) and self._in_dims_mappings: + tensor_to_dims_mapping[arg.name] = self._in_dims_mappings[index] + index += 1 + for arg in kwargs.values() and self._in_dims_mappings: + if isinstance(arg, Variable): + tensor_to_dims_mapping[arg.name] = self._in_dims_mappings[index] + index += 1 + default_prog = paddle.fluid.default_main_program() cur_block = default_prog.current_block() op_size = len(cur_block.ops) - output = self._serial_module(*args, **kwargs) + output = self._serial_op(*args, **kwargs) new_op_size = len(cur_block.ops) + + if isinstance(output, tuple) or isinstance(output, list): + new_output = list(output) + elif isinstance(output, Variable): + new_output = [output] + else: + raise ValueError("Unrecognized outpout.") + + if self._out_dims_mappings: + assert len(new_output) == len( + self._out_dims_mappings + ), "The length of dims_mapping {} does not matching the length output {}.".format( + len(self._out_dims_mappings), len(new_output) + ) + for i, item in enumerate(new_output): + if isinstance(item, Variable) and self._out_dims_mappings: + tensor_to_dims_mapping[item.name] = self._out_dims_mappings[i] + + from .dist_context import get_default_distributed_context + default_dist_ctx = get_default_distributed_context() for idx in range(op_size, new_op_size): op = cur_block.ops[idx] - dist_op = DistributedOperator(op, self._dist_attr) - dist_op.dist_attr.mark_annotated_as(self._dist_attr) + dist_op = DistributedOperator(op) + for name in dist_op.serial_op.input_arg_names: + if name in tensor_to_dims_mapping.keys(): + tensor = dist_op.get_serial_input(name) + tensor_dist_attr = dist_op.dist_attr.get_input_dist_attr( + name + ) + dims_mapping = tensor_to_dims_mapping[name] + if tensor is None: + tensor_shape = [] + else: + if ( + tensor.type == core.VarDesc.VarType.READER + or tensor.type + == core.VarDesc.VarType.LOD_TENSOR_ARRAY + or tensor.type == core.VarDesc.VarType.STEP_SCOPES + ): + tensor_shape = [] + else: + tensor_shape = tensor.shape + if dims_mapping is not None: + dims_mapping = tensor_to_dims_mapping[name] + shard_spec = convert_to_shard_spec( + dims_mapping, self._process_mesh + ) + assert verify_shard_spec( + shard_spec, tensor_shape, self._process_mesh + ), "For tensor {}, shard_spec {} is invalid with tensor_shape {} and process_mesh {}.".format( + name, shard_spec, tensor_shape, self._process_mesh + ) + tensor_dist_attr.dims_mapping = dims_mapping + tensor_dist_attr.mark_annotated("dims_mapping") + for name in dist_op.serial_op.output_arg_names: + if name in tensor_to_dims_mapping.keys(): + tensor = dist_op.get_serial_output(name) + tensor_dist_attr = dist_op.dist_attr.get_output_dist_attr( + name + ) + dims_mapping = tensor_to_dims_mapping[name] + if tensor is None: + tensor_shape = [] + else: + if ( + tensor.type == core.VarDesc.VarType.READER + or tensor.type + == core.VarDesc.VarType.LOD_TENSOR_ARRAY + or tensor.type == 
core.VarDesc.VarType.STEP_SCOPES + ): + tensor_shape = [] + else: + tensor_shape = tensor.shape + if dims_mapping is not None: + dims_mapping = tensor_to_dims_mapping[name] + shard_spec = convert_to_shard_spec( + dims_mapping, self._process_mesh + ) + assert verify_shard_spec( + shard_spec, tensor_shape, self._process_mesh + ), "For tensor {}, shard_spec {} is invalid with tensor_shape {} and process_mesh {}.".format( + name, shard_spec, tensor_shape, self._process_mesh + ) + tensor_dist_attr.dims_mapping = dims_mapping + tensor_dist_attr.mark_annotated("dims_mapping") + dist_op.dist_attr.process_mesh = self._process_mesh + if self._process_mesh is not None: + dist_op.dist_attr.mark_annotated("process_mesh") default_dist_ctx.add_dist_op_for_program(dist_op) + default_dist_ctx.add_process_mesh(self._process_mesh) + return output diff --git a/python/paddle/distributed/auto_parallel/dist_saver.py b/python/paddle/distributed/auto_parallel/dist_saver.py index c3dad9e2873866..350e5ac44e724d 100644 --- a/python/paddle/distributed/auto_parallel/dist_saver.py +++ b/python/paddle/distributed/auto_parallel/dist_saver.py @@ -27,7 +27,7 @@ from .utils import get_dist_attr from .converter import Converter from .process_group import _g_process_group_map -from ..utils import get_logger +from ..utils.log_utils import get_logger def check_filename(re_exp, filename): @@ -59,6 +59,14 @@ def __init__(self): def save(self, path, serial_program, dist_main_program, dist_context): + def _save_state(program, path, mode="param"): + state = { + k: np.array(v) + for k, v in program.state_dict(mode).items() + } + with open(path, "wb") as f: + pickle.dump(state, f) + dirname, filename = _process_path(path) rank_id = paddle.distributed.get_rank() @@ -76,16 +84,6 @@ def save(self, path, serial_program, dist_main_program, dist_context): with open(dist_model_path, "wb") as f: f.write(dist_main_program.desc.serialize_to_string()) - # save distributed params - dist_param_filename = filename + "_dist" + str(rank_id) + ".pdparams" - dist_param_path = os.path.join(dirname, dist_param_filename) - dist_param = { - k: np.array(v) - for k, v in dist_main_program.state_dict().items() - } - with open(dist_param_path, "wb") as f: - pickle.dump(dist_param, f) - # save distributed attribute dist_attr_filename = filename + "_dist" + str(rank_id) + ".pdattr" dist_attr_path = os.path.join(dirname, dist_attr_filename) @@ -93,65 +91,69 @@ def save(self, path, serial_program, dist_main_program, dist_context): with open(dist_attr_path, "wb") as f: pickle.dump(dist_attrs, f) + # save distributed params + dist_param_filename = filename + "_dist" + str(rank_id) + ".pdparams" + dist_param_path = os.path.join(dirname, dist_param_filename) + _save_state(dist_main_program, dist_param_path) + + # save distributed opt states + dist_opt_filename = filename + "_dist" + str(rank_id) + ".pdopt" + dist_opt_path = os.path.join(dirname, dist_opt_filename) + _save_state(dist_main_program, dist_opt_path, "opt") + # TODO:save cluster.json - def load(self, - path, - program, - dist_context, - strict=True, - load_optimizer=True): + def load(self, path, load_optimizer=True): # TODO: if `program` is None, load `path.pdmodel`. 
+ def _load_file(filename, dirname, suffix="pdparams"): + file_list = [] + for file in os.listdir(dirname): + if check_filename('{}(.*)_dist(.*).{}'.format(filename, suffix), + file): + file_list.append(os.path.join(dirname, file)) + file_list.sort() + return file_list + + def _load_state(filename, dirname, suffix="pdparams"): + file_list = _load_file(filename, dirname, suffix) + state_dict = {} + for file in file_list: + with open(file, 'rb') as f: + state_dict_info = pickle.load(f, encoding='latin1') + for name, value in state_dict_info.items(): + if name in state_dict: + state_dict[name].append(np.array(value)) + else: + state_dict[name] = [np.array(value)] + self._logger.info("Load param file: {}".format(file_list)) + return state_dict + filename = os.path.basename(path) if filename == "": raise ValueError( "path should be of 'dirname/filename' format, but received filename is empty string" ) dirname = os.path.dirname(path) - # load path.pdparam - param_file_list = [] - for param_file in os.listdir(dirname): - if check_filename('{}(.*)_dist(.*).pdparams'.format(filename), - param_file): - param_file_list.append(os.path.join(dirname, param_file)) - param_file_list.sort() - self._logger.info( - "Load distributed attribute file: {}".format(param_file_list)) - param_dict = {} - for param_file in param_file_list: - with open(param_file, 'rb') as f: - state_dict_info = pickle.load(f, encoding='latin1') - for name, value in state_dict_info.items(): - if name in param_dict: - param_dict[name].append(np.array(value)) - else: - param_dict[name] = [np.array(value)] + + # load path.pdparam and path.pdopt + param_state_dict = _load_state(filename, dirname) + opt_state_dict = _load_state(filename, dirname, + "pdopt") if load_optimizer else {} + state_dict = dict(param_state_dict, **opt_state_dict) # load path.pdattr - dist_attr_file_list = [] - for dist_attr_file in os.listdir(dirname): - if check_filename('{}(.*)_dist(.*).pdattr'.format(filename), - dist_attr_file): - dist_attr_file_list.append(os.path.join(dirname, - dist_attr_file)) - dist_attr_file_list.sort() + dist_attr_file_list = _load_file(filename, dirname, "pdattr") self._logger.info( "Load distributed attribute file: {}".format(dist_attr_file_list)) - pre_dist_attr = {} + dist_attr = {} for dist_attr_file in dist_attr_file_list: with open(dist_attr_file, 'rb') as f: - dist_attr = pickle.load(f, encoding='latin1') - for name, attr in dist_attr.items(): - if name not in pre_dist_attr: - pre_dist_attr[name] = attr - - # get current dist_attr - cur_dist_attr = get_dist_attr(program, dist_context) - - # param convert - converter = Converter(param_dict, pre_dist_attr, cur_dist_attr) - param_dict = converter.convert(strict=strict) - program.set_state_dict(param_dict) + dist_attr_info = pickle.load(f, encoding='latin1') + for name, attr in dist_attr_info.items(): + if name not in dist_attr: + dist_attr[name] = attr + + return state_dict, dist_attr def save_inference_model(self, path, feed_vars, fetch_vars, exe, **kwargs): diff --git a/python/paddle/distributed/auto_parallel/engine.py b/python/paddle/distributed/auto_parallel/engine.py index 8d1a1488ac790f..e6faf811fadfc9 100644 --- a/python/paddle/distributed/auto_parallel/engine.py +++ b/python/paddle/distributed/auto_parallel/engine.py @@ -12,85 +12,196 @@ # See the License for the specific language governing permissions and # limitations under the License. 
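The saver changes above leave one `{prefix}_dist{rank}.pdparams`, `.pdopt` and `.pdattr` file per rank on disk; `_load_file` globs them back and `_load_state` merges the pickled values by parameter name into per-rank lists, and the reworked `load` now simply returns that merged state together with the loaded distributed attributes instead of converting and setting it in place. A minimal standalone sketch of the gather-and-merge step (pure Python; the `ckpt` prefix, directory layout and `merge_rank_states` helper are illustrative assumptions, not part of the Paddle API):

import os
import pickle
import re
import tempfile

import numpy as np


def merge_rank_states(dirname, prefix, suffix="pdparams"):
    # Collect "{prefix}*_dist*.{suffix}" files, mirroring _load_file above.
    pattern = re.compile(r"{}(.*)_dist(.*)\.{}".format(prefix, suffix))
    files = sorted(
        os.path.join(dirname, name)
        for name in os.listdir(dirname)
        if pattern.match(name)
    )
    merged = {}
    for path in files:
        with open(path, "rb") as f:
            rank_state = pickle.load(f)  # dict: parameter name -> numpy array
        for name, value in rank_state.items():
            # Keep one entry per rank; a converter can re-slice or concat later.
            merged.setdefault(name, []).append(np.array(value))
    return merged


if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as d:
        for rank in range(2):
            path = os.path.join(d, "ckpt_dist{}.pdparams".format(rank))
            with open(path, "wb") as f:
                pickle.dump({"w": np.full((2, 2), rank)}, f)
        shards = merge_rank_states(d, "ckpt")
        print({name: len(values) for name, values in shards.items()})  # {'w': 2}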
-import time +import os import copy import logging +import random +import numbers +import numpy as np from collections import defaultdict import paddle import paddle.utils as utils from paddle import fluid, static -from paddle.io import Dataset -from paddle.jit import to_static from paddle.metric import Metric from paddle.static import InputSpec from paddle.fluid import core -from paddle.fluid import program_guard +from paddle.fluid import Variable from paddle.fluid.layers.utils import flatten from paddle.fluid.executor import global_scope, _to_name_str -from paddle.fluid.backward import append_backward -from paddle.fluid.framework import Operator, Parameter, _non_static_mode +from paddle.fluid.framework import Operator, _non_static_mode from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.distributed import fleet -from paddle.distributed.passes import new_pass, PassContext -from .hepler import ProgramHelper -from ..collective import _get_global_env +from .callbacks import config_callbacks +from .converter import Converter +from .helper import ProgramHelper from .cluster import Cluster, get_default_cluster from .planner_v2 import Planner from .parallelizer_v2 import Parallelizer from .dist_op import DistributedOperator from .dist_saver import DistributedSaver -from .dist_loader import NonIterableGeneratorLoader -from .utils import make_data_unshard, set_grad_var_shape -from .utils import print_program_with_dist_attr, to_list -from .process_group import new_process_group, get_all_process_groups, get_world_process_group +from .dist_loader import ( + DistributedDataLoaderFromGenerator, + DistributedDataLoader, +) +from .process_group import new_process_group, get_all_process_groups from .dist_context import DistributedContext, get_default_distributed_context +from .strategy import Strategy +from .interface import CollectionNames, get_collection +from .utils import to_list, get_dist_attr, get_lr, validate_opt +from .utils import initialize_pg_in_full_mode, get_input_split_info +from .cost.estimate_cost import get_cost_from_engine + +from ..utils.log_utils import get_logger class Engine: + """ + An Engine object can provide the full power of auto parallel to users. + With the help of it, users can easily obtain the abilities of the + distributed training and inference. It also support the dynamic graph and + static graph at the same time. + + Args: + model (paddle.nn.Layer, optional): The model is an instance of + paddle.nn.Layer. + loss (Loss|Callable|None, optional): The loss can be a `paddle.nn.Layer` + instance or any callable function taken the predicted values and + ground truth values as input. It can be None when there is no loss. + Default: None. + optimizer (Optimizer|None, optional): The optimizer need to be set in training + and should be None in eval and predict mode. Default: None. + metrics (Metric|list[Metric]|None, optional): If metrics is set, all + metrics will be calculated and output in train/eval mode. Default: None. + cluster (Cluster|None, optional): The cluster represents the topology information + about the used physical devices. Default: None. (Unused for now) + strategy (Strategy|None, optional): The strategy is used to configure the + parallelization and optimization behaviors. Default: None. + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.vision.transforms as T + from paddle.distributed.fleet import auto + from paddle.vision.datasets import MNIST + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = MNIST(mode='train', transform=transform) + valid_dataset = MNIST(mode='test', transform=transform) + + model = paddle.vision.models.LeNet() + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + metrics = paddle.metric.Accuracy(topk=(1, 2)) + + engine = auto.Engine(model, loss, optimizer, metrics) + # fit + engine.fit(train_dataset, + epochs=2, + batch_size=64) + # evaluate + engine.evaluate(valid_dataset, + batch_size=64) + # predict + engine.predict(valid_dataset, + batch_size=64) + # save + engine.save("./my_model") + # load + engine.load("./my_model") + + """ + + def __init__( + self, + model=None, + loss=None, + optimizer=None, + metrics=None, + cluster=None, + strategy=None, + ): + + if ( + model + and not isinstance(model, paddle.nn.Layer) + and not callable(model) + ): + raise TypeError( + "'model must be sub classes of `paddle.nn.Layer` or any callable function." + ) + self._model = model + + if ( + loss + and not isinstance(loss, (paddle.nn.Layer, Variable)) + and not callable(loss) + ): + raise TypeError( + "'loss' must be sub classes of `paddle.nn.Layer` or any callable function or a Variable." + ) + self._loss = loss + + if optimizer and not isinstance( + optimizer, + (paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer), + ): + raise TypeError( + "'optimizer' must be object of class `paddle.optimizer.Optimizer`" + " or `paddle.fluid.optimizer.Optimizer`." + ) + self._optimizer = validate_opt(optimizer) + self._orig_optimizer = copy.deepcopy(self._optimizer) + + metrics = metrics or [] + for metric in to_list(metrics): + if metric and not isinstance(metric, Metric): + raise TypeError( + "{} is not sub class of Metric".format( + metric.__class__.__name__ + ) + ) + self._metrics = to_list(metrics) + + if cluster and not isinstance(cluster, Cluster): + raise TypeError( + "'cluster' must be the object or class `paddle.distributed.auto_parallel.Cluster`" + ) + self._cluster = cluster or get_default_cluster() + + if strategy and not isinstance(strategy, Strategy): + raise TypeError( + "'strategy' must be object of class `paddle.distributed.auto_parallel.Strategy`" + ) + self._strategy = strategy or Strategy() - def __init__(self, - model=None, - inputs_spec=None, - labels_spec=None, - cluster=None, - strategy=None, - user_tuning_config=None): - self.model = model - self.inputs_spec = self._validate_spec(inputs_spec) - self.labels_spec = self._validate_spec(labels_spec) - self.cluster = cluster - if self.cluster is None: - self.cluster = get_default_cluster() - self.strategy = strategy - if self.strategy is None: - self.strategy = fleet.DistributedStrategy() - self._user_tuning_config = user_tuning_config + self._logger = get_logger(logging.INFO) + # if os.getenv("POD_NAME"): + # self._logger.info( + # "Distribute training by paddle.distributed.launch" + # ) + # fleet.init(is_collective=True) self._executor = None self._cur_rank = paddle.distributed.get_rank() self._nranks = paddle.distributed.get_world_size() self._saver = DistributedSaver() - # TODO: add logger module - self._logger = logging.getLogger() - self._logger.propagate = False - if not self._logger.handlers: - self._logger.setLevel(logging.INFO) - log_handler = 
logging.StreamHandler() - log_format = logging.Formatter( - '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s' - ) - log_handler.setFormatter(log_format) - self._logger.addHandler(log_handler) - self._orig_main_prog = static.default_main_program() self._orig_startup_prog = static.default_startup_program() self._orig_dist_context = get_default_distributed_context() self._dist_contexts = {} + self._fwd_main_progs = {} + self._fwd_dist_contexts = {} self._serial_main_progs = {} self._serial_startup_progs = {} self._dist_main_progs = defaultdict(dict) # dist main programs @@ -98,62 +209,335 @@ def __init__(self, self._feed_vars = {} self._fetch_vars = {} self._planners = {} - self._mode_init_states = { + self._has_prepared = {"train": False, "eval": False, "predict": False} + self._has_prepared_reader = { "train": False, "eval": False, - "predict": False + "predict": False, } + self._inputs_spec = [] + self._labels_spec = [] + self._inputs = [] + self._labels = [] + self._losses = [] + + self._mode = None + self._skip_build = False + self._outside_dataloader = False + self._planned_mode = None self._dygraph_mode = False + self._tuning = self._strategy.tuning - def prepare(self, - optimizer=None, - loss=None, - gradient_scale=True, - metrics=None, - all_ranks=False): - if optimizer and not isinstance( - optimizer, - (paddle.optimizer.Optimizer, paddle.fluid.optimizer.Optimizer)): - raise TypeError( - "'optimizer' must be object of class `paddle.optimizer.Optimizer`" \ - " or `paddle.fluid.optimizer.Optimizer`." - ) - self._optimizer = optimizer - self._all_ranks = all_ranks + self.history = None - if loss and not isinstance(loss, - paddle.nn.Layer) and not callable(loss): + def _prepare_data_spec(self, data, split, batch_size): + inputs_spec = [] + labels_spec = [] + if isinstance(data, paddle.io.IterableDataset): + if split is None: + inputs, labels = next(iter(data)) + else: + sample = next(iter(data)) + inputs = sample[:split] + labels = sample[split:] + elif isinstance(data, paddle.io.Dataset): + if split is None: + inputs, labels = data[0] + else: + sample = data[0] + inputs = sample[:split] + labels = sample[split:] + else: raise TypeError( - "'loss' must be sub classes of `paddle.nn.Layer` or any callable function." 
+ "Data should be a Dataset or IterableDatset, but received {}.".format( + type(data).__name__ + ) ) - self._loss = loss + inputs = to_list(inputs) + labels = to_list(labels) + + num_shards = self._strategy.dataset.num_shards + + def _adjust_item_spec(num_shards, spec): + if num_shards > 1 and len(spec.shape) > 1: + spec.shape[0] = spec.shape[0] * num_shards + + def _infer_item_spec(item, name, batch_size, specs): + if isinstance(item, np.ndarray): + spec = InputSpec.from_numpy(item, name) + if batch_size is None: + _adjust_item_spec(num_shards, spec) + specs.append(spec) + else: + specs.append(spec.batch(batch_size)) + elif isinstance(item, (Variable, core.VarBase, core.eager.Tensor)): + spec = InputSpec.from_tensor(item, name) + _adjust_item_spec(num_shards, spec) + if batch_size is None: + specs.append(spec) + else: + specs.append(spec.batch(batch_size)) + elif isinstance(item, numbers.Number): + specs.append(InputSpec([batch_size], type(item), name)) + else: + raise TypeError( + "The sample's dtype returned of dataset should be number, np.ndarray or Tensor, but got {}".format( + type(item).__name__ + ) + ) - metrics = metrics or [] - for metric in to_list(metrics): - assert isinstance(metric, Metric), \ - "{} is not sub class of Metric".format( - metric.__class__.__name__) - self._metrics = to_list(metrics) - self._gradient_scale = gradient_scale - self._planned_mode = None - self._prepare_single_mode("train") + if inputs is not None: + for i, item in enumerate(inputs): + assert item is not None, "Receive None input." + name = "input" + str(i) + _infer_item_spec(item, name, batch_size, inputs_spec) + if labels is not None: + for i, item in enumerate(labels): + assert item is not None, "Receive None input." + name = "label" + str(i) + _infer_item_spec(item, name, batch_size, labels_spec) + + inputs_spec = self._validate_spec(inputs_spec) + labels_spec = self._validate_spec(labels_spec) + return inputs_spec, labels_spec + + def _prepare_data_tensor(self, inputs_spec, labels_spec, inputs, labels): + if _non_static_mode() or self._dygraph_mode: + raise ValueError("Only support static graph mode.") - def _prepare_single_mode(self, mode): + if inputs_spec: + assert isinstance( + inputs_spec, list + ), "inputs should be list, but received {}".format( + type(inputs_spec) + ) + assert isinstance( + inputs, list + ), "inputs should be list, but received {}".format(type(inputs)) + assert len(inputs_spec) == len( + inputs + ), "the number of `inputs_spec` should be equal to `inputs`'s." + for input_spec, input in zip(inputs_spec, inputs): + if input_spec.shape != input.shape: + input.desc.set_shape(input_spec.shape) + if labels_spec: + assert isinstance( + labels_spec, list + ), "labels should be list, but received {}".format( + type(labels_spec) + ) + assert isinstance( + labels, list + ), "labels should be list, but received {}".format(type(labels)) + assert len(labels_spec) == len( + labels + ), "the number of `labels_spec` should be equal to `labels`'s." + for label_spec, label in zip(labels_spec, labels): + if label_spec.shape != label.shape: + label.desc.set_shape(label_spec.shape) + + return inputs, labels + + def _prepare_reader(self, feed_list=[]): + dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] + dist_context = self._dist_contexts[self._mode] + dist_main_block = dist_main_prog.global_block() + # NOTE: this list may be changed if Paddle changes the existing rules. 
+ related_reader_ops = [ + "create_py_reader", + "create_double_buffer_reader", + "read", + ] + # remove the first three ops if multiple run fit/evaluate/predict + if dist_main_block.ops[0].type == 'create_py_reader': + for i in range(len(related_reader_ops)): + if dist_main_block.ops[0].type in related_reader_ops: + dist_main_block._remove_op(0, sync=False) + dist_main_block._sync_with_cpp() + # Step 1: find the reader ops + reader_op_indices = [] + for idx, op in enumerate(dist_main_block.ops): + if op.type in related_reader_ops: + reader_op_indices.append(idx) + # Step 2: insert the new reader ops to cpp + # record the read ops' desc to insert to program of forward task_node + read_ops_desc = [] + new_reader_ops = [] + for idx in reversed(reader_op_indices): + new_op_desc = dist_main_block.desc._prepend_op() + new_op_desc.copy_from(dist_main_block.ops[idx].desc) + read_ops_desc.append(new_op_desc) + new_op = Operator( + dist_main_block, new_op_desc, type=new_op_desc.type() + ) + new_reader_ops.append(new_op) + dist_op = DistributedOperator(new_op) + dist_context.add_dist_op_for_program(dist_op) + # Step 3: insert the new reader ops to python + for new_op in new_reader_ops: + dist_main_block.ops.insert(0, new_op) + for i in range(len(reader_op_indices)): + reader_op_indices[i] += len(reader_op_indices) + # Step 4: remove the old reader ops from python and cpp + for idx in reversed(reader_op_indices): + op = dist_main_block.ops.pop(idx) + dist_main_block.desc._remove_op(idx, idx + 1) + dist_main_block._sync_with_cpp() + self._has_prepared_reader[self._mode] = True + + # Insert read op to forward TaskNode if 1F1B pass is setted + if self.main_program._pipeline_opt: + assert "tasks" in self.main_program._pipeline_opt["fleet_opt"] + fleet_opt = self.main_program._pipeline_opt["fleet_opt"] + fwd_task = fleet_opt["tasks"][1] + fwd_prog = fwd_task.get_program() + fwd_block = fwd_prog.global_block() + + for var in feed_list: + if var.name not in fwd_block.vars: + fwd_block._clone_variable(var) + + for op_desc in read_ops_desc: + new_op_desc = fwd_block.desc._prepend_op() + new_op_desc.copy_from(op_desc) + new_op = Operator( + fwd_block, new_op_desc, type=new_op_desc.type() + ) + fwd_block.ops.insert(0, new_op) + + fwd_block._sync_with_cpp() + fwd_task.set_program(fwd_prog) + + def _prepare_feed(self, data, user_feeds, mode): + feeds = {} + if data is not None: + if isinstance(data, (list, tuple)): + if len(data) == 1 and isinstance(data[0], dict): + for name, data in data[0].items(): + feeds[name] = data + else: + raise ValueError("Unsupported data {}".format(data)) + elif isinstance(data, dict): + for name, data in data.items(): + feeds[name] = data + else: + raise ValueError("Unsupported data {}".format(data)) + if user_feeds is not None: + assert isinstance( + user_feeds, dict + ), "user_feeds must be a dict, but receive {}".format( + type(user_feeds).__name__ + ) + for name, data in user_feeds.items(): + feeds[name] = data + return feeds + + def _prepare_fetch(self, user_fetches, mode): + if user_fetches is not None: + assert isinstance( + user_fetches, list + ), "user_fetches must be a list, but receive {}".format( + type(user_fetches).__name__ + ) + fetch_names = [] + fetch_indices = [] + + def _process_fetch_group(group_name, var_list): + group_indices = [] + for var in var_list: + # Remove duplicate var_names + if self._is_local_var(var): + var_name = _to_name_str(var) + if var_name not in fetch_names: + fetch_names.append(var_name) + group_indices.append(fetch_names.index(var_name)) + 
if not group_indices: + fetch_names.append([]) + fetch_indices.append(group_indices) + + if mode != "predict": + _process_fetch_group("loss", self._fetch_vars[mode]["loss"]) + if mode != "predict": + metrics = self._fetch_vars[mode]["metrics"] + for i, var_list in enumerate(metrics): + _process_fetch_group("metrics_" + str(i), var_list) + if mode == "predict": + _process_fetch_group("outputs", self._fetch_vars[mode]["outputs"]) + user_fetches_collection = [ + item[1] for item in get_collection(CollectionNames.FETCHES) + ] + var_list = (user_fetches_collection or []) + (user_fetches or []) + _process_fetch_group("fetches", var_list) + return fetch_names, fetch_indices + + def _prepare_logger( + self, + outs, + epoch=None, + step=None, + lr=None, + fetch_names=None, + fetch_indices=None, + mode=None, + ): + logs = {} + if epoch is not None: + logs["epoch"] = epoch + if step is not None: + logs["step"] = step + 1 + if lr is not None: + logs["lr"] = lr + group_idx = 0 + if mode != "predict": + # logging loss + loss_indices = fetch_indices[group_idx] + assert len(loss_indices) <= 1 + for idx in loss_indices: + logs["loss"] = outs[idx][0] + group_idx += 1 + # logging metrics + metric_vars = self._fetch_vars[mode]["metrics"] + if metric_vars: + for metric in self._metrics: + metrics_indices = fetch_indices[group_idx] + metric_out = [] + for idx in metrics_indices: + metric_out.append(outs[idx]) + if metric_out: + metric.update(*metric_out) + results = metric.accumulate() + for i, res in enumerate(to_list(results)): + logs[metric.name()[i]] = res + group_idx += 1 + # logging outputs + elif mode == "predict": + outputs_indices = fetch_indices[group_idx] + logs_out = {} + for idx in outputs_indices: + logs_out["out%d" % (idx)] = outs[idx] + logs["outputs"] = logs_out + group_idx += 1 + # logging user fetches + collect_fetches = get_collection(CollectionNames.FETCHES) + logs_fetch = {} + for name, var in collect_fetches: + if var.name in fetch_names: + idx = fetch_names.index(var.name) + logs_fetch[name or var.name] = outs[idx] + logs["fetches"] = logs_fetch + return logs + + def _prepare_program(self, mode): + # Do the build process self._build(mode) # Do the planning process self._plan(mode) - - # Do the Optimization tuning - if self._user_tuning_config and mode == "train": - self._optimization_tuning(mode) - # Do the parallel process - self._parallel(mode, self._all_ranks) - + self._parallel(mode) # Init comm and startup program self._initialize(mode) - self._mode_init_states[mode] = True + self._has_prepared[mode] = True def _build(self, mode): if _non_static_mode() or self._dygraph_mode: @@ -161,21 +545,25 @@ def _build(self, mode): self._dygraph_mode = True self._logger.info("Building model with 'to_static' method.") - program_helper = ProgramHelper(self.model, self._loss, - self._metrics, self.inputs_spec, - self.labels_spec) + self.program_helper = ProgramHelper( + self._model, + self._loss, + self._metrics, + self._inputs_spec, + self._labels_spec, + ) # build forward main program - program_helper.build_program(mode) + self.program_helper.build_program(mode) - self.concrete_program = program_helper.concrete_program - serial_main_prog = program_helper.main_program - serial_startup_prog = program_helper.startup_program + self.concrete_program = self.program_helper.concrete_program + serial_main_prog = self.program_helper.main_program + serial_startup_prog = self.program_helper.startup_program - inputs = program_helper.input_vars - outputs = program_helper.output_vars - labels = 
program_helper.label_vars - losses = program_helper.loss_vars - metrics = program_helper.metric_vars + self._inputs = self.program_helper.input_vars + self._labels = self.program_helper.label_vars + outputs = self.program_helper.output_vars + self._losses = self.program_helper.loss_vars + metrics = self.program_helper.metric_vars paddle.enable_static() else: @@ -184,25 +572,46 @@ def _build(self, mode): if serial_main_prog is not None: return - losses = [] + outputs = [] metrics = [] + self._losses = [] serial_main_prog = self._orig_main_prog.clone() serial_startup_prog = self._orig_startup_prog.clone() - # FIXME to support grad clip - with static.program_guard(serial_main_prog, serial_startup_prog), \ - utils.unique_name.guard(): - inputs_spec = self.inputs_spec - labels_spec = self.labels_spec if self.labels_spec else [] - inputs = [s._create_feed_layer() for s in inputs_spec] - labels = [s._create_feed_layer() for s in labels_spec] - outputs = to_list(self.model(*inputs)) - if mode != "predict" and self._loss: - losses = to_list(self._loss(*(outputs + labels))) - - if mode != "predict": - for metric in self._metrics: - metrics.extend( - to_list(metric.compute(*(outputs + labels)))) + if not self._skip_build: + with static.program_guard( + serial_main_prog, serial_startup_prog + ), utils.unique_name.guard(): + self._inputs = [ + s._create_feed_layer() for s in self._inputs_spec + ] + self._labels = [ + s._create_feed_layer() for s in self._labels_spec + ] + + outputs = to_list(self._model(*self._inputs)) + + if mode != "predict" and self._loss: + assert isinstance( + self._loss, paddle.nn.Layer + ) or callable( + self._loss + ), "the type of `loss` of the Engine arguments should be sub classes of `paddle.nn.Layer` or any callable function." + self._losses = to_list( + self._loss(*(outputs + self._labels)) + ) + + if mode != "predict" and (outputs or self._labels): + for metric in self._metrics: + metrics.append( + to_list( + metric.compute(*(outputs + self._labels)) + ) + ) + elif mode == "train": + assert isinstance( + self._loss, Variable + ), "the type of `loss` of the Engine arguments should be Variable." 
+ self._losses = to_list(self._loss) default_ctx = get_default_distributed_context() if not default_ctx.has_annotation: @@ -211,48 +620,73 @@ def _build(self, mode): new_process_group(list(range(self._nranks))) default_ctx.data_parallel = True - feed_vars = {"inputs": inputs, "labels": labels} + feed_vars = {"inputs": self._inputs, "labels": self._labels} fetch_vars = { "outputs": flatten(outputs), - "loss": losses, - "metrics": metrics + "loss": self._losses, + "metrics": metrics, } + if mode != "train": + serial_main_prog = serial_main_prog.clone(for_test=True) + self._set_recompute_ckpts() self._dist_contexts[mode] = DistributedContext( - serial_main_prog, serial_startup_prog, self._optimizer, losses, - feed_vars, fetch_vars, self.cluster, self.strategy) - self._dist_contexts[mode].gradient_scale = self._gradient_scale - self._dist_contexts[mode]._dygraph_mode = self._dygraph_mode - - def _optimization_tuning(self, mode): + serial_main_prog, + serial_startup_prog, + self._optimizer, + self._losses, + feed_vars, + fetch_vars, + self._cluster, + self._strategy, + ) + self._fwd_dist_contexts[mode] = DistributedContext( + serial_main_prog, + serial_startup_prog, + self._optimizer, + self._losses, + feed_vars, + fetch_vars, + self._cluster, + self._strategy, + ) + self._dist_contexts[mode].gradient_scale = self._strategy.gradient_scale + self._fwd_main_progs[mode] = serial_main_prog.clone() + + def _optimization_tuning(self, mode, dataset, batch_size): + if not self._tuning.enable: + raise ValueError("Please set `tuning.enable=True`.") + + assert mode == "train" + # Do the build process + self._build(mode) + # Do the planning process + self._plan(mode) - self.mode = mode - assert "batch_size" in self._user_tuning_config, "Optimization Tuning should provide with batch size." - assert "dataset" in self._user_tuning_config, "Optimization Tuning should provide with dataset." 
- batch_size = self._user_tuning_config["batch_size"] - dataset = self._user_tuning_config["dataset"] - dataset.dp_world_size = self.dp_world_sizes - dataset.dp_rank = self.dp_ranks + dataset.dp_world_size = self._dp_world_sizes + dataset.dp_rank = self._dp_ranks from .tuner.optimization_tuner import OptimizationTuner - self._optimization_tuner = OptimizationTuner(self._user_tuning_config, - self._dist_contexts[mode], - dataset, - self.inputs_spec, - self.labels_spec, - batch_size=batch_size, - rank=self._cur_rank) + + self._optimization_tuner = OptimizationTuner( + self._tuning.to_dict(), + self._dist_contexts[mode], + dataset, + self._inputs_spec, + self._labels_spec, + batch_size=batch_size, + rank=self._cur_rank, + ) self._optimization_tuner.tune() - if self._user_tuning_config["run_after_tuning"]: + if self._tuning.run_after_tuning: # update the strategy self._dist_contexts[ - mode]._strategy = self._optimization_tuner.get_best_config() - else: - return + mode + ]._strategy = self._optimization_tuner.get_best_config() def _plan(self, mode): if self._planned_mode is None: @@ -267,25 +701,28 @@ def _plan(self, mode): inputs_var = self._dist_contexts[mode].serial_feed_vars["inputs"] labels_var = self._dist_contexts[mode].serial_feed_vars["labels"] block = self._dist_contexts[mode].serial_main_program.global_block() + # TODO: check this feed_list feed_list = [] for var in inputs_var + labels_var: if var.name in block.vars: feed_list.append(block.vars[var.name]) - self.dp_world_sizes = [] - self.dp_ranks = [] + self._dp_world_sizes = [] + self._dp_ranks = [] for feed_var in feed_list: - dp_world_size, dp_rank = self._get_input_split_info( - feed_var, self._dist_contexts[mode]) - self.dp_world_sizes.append(dp_world_size) - self.dp_ranks.append(dp_rank) + dp_world_size, dp_rank = get_input_split_info( + self._cur_rank, feed_var, self._dist_contexts[mode] + ) + self._dp_world_sizes.append(dp_world_size) + self._dp_ranks.append(dp_rank) - def _parallel(self, mode, all_ranks): + def _parallel(self, mode, all_ranks=False): # Parallelize program based on the planner's results # For now, the completer has to be passed to the planner, # because we may use it to complete the annotation of the backwarkward and update. - parallelizer = Parallelizer(mode, self._planners[mode].completer, - self._dist_contexts[mode]) + parallelizer = Parallelizer( + mode, self._planners[mode].completer, self._dist_contexts[mode] + ) if not all_ranks: parallelizer.parallel(self._cur_rank) else: @@ -303,492 +740,1082 @@ def _init_dist_context(self, mode): for ib, block in enumerate(origin_main_prog.blocks): for iop, op in enumerate(block.ops): ref_op = ref_blocks[ib].ops[iop] - assert op.type == ref_op.type, \ - "'{}' mode op '{}' is different with '{}' op '{}'. ".format(mode, op.type, ref_mode, ref_op.type) - ref_op_dist_attr = ref_dist_context.get_op_dist_attr_for_program( - ref_op) + assert ( + op.type == ref_op.type + ), "'{}' mode op '{}' is different with '{}' op '{}'. 
".format( + mode, op.type, ref_mode, ref_op.type + ) + ref_op_dist_attr = ( + ref_dist_context.get_op_dist_attr_for_program(ref_op) + ) dist_context.set_op_dist_attr_for_program(op, ref_op_dist_attr) def _initialize(self, mode): # Get the current content from the distributed context self._serial_main_progs[mode] = self._dist_contexts[ - mode].serial_main_program + mode + ].serial_main_program self._serial_startup_progs[mode] = self._dist_contexts[ - mode].serial_startup_program + mode + ].serial_startup_program self._dist_main_progs[mode] = self._dist_contexts[ - mode].dist_main_programs + mode + ].dist_main_programs self._dist_startup_progs[mode] = self._dist_contexts[ - mode].dist_startup_programs + mode + ].dist_startup_programs self._feed_vars[mode] = self._dist_contexts[mode].serial_feed_vars self._fetch_vars[mode] = self._dist_contexts[mode].serial_fetch_vars - self._lr_optimizer = self._dist_contexts[mode]._lr_optimizer + self._optimizer = self._dist_contexts[mode]._serial_optimizer if self._nranks > 1: # Traverse different rank programs and traverse each op of them, # instantiate communication by process_mapping. all_process_groups = get_all_process_groups() - # NOTE: add the comm init control in the future for auto search - for process_group in all_process_groups: - if self._cur_rank not in process_group.ranks: - continue - process_group.instantiate() - - self._place = _get_device() - if isinstance(self._place, fluid.CUDAPlace): - self._place = fluid.CUDAPlace(ParallelEnv().dev_id) - - if self._dygraph_mode: - paddle.disable_static() - main_program = self._dist_main_progs[mode][self._cur_rank] - for param in self.concrete_program.parameters: - # create var in scope and share parameters to scope - if param.name not in main_program.global_block().vars: - continue - # get param_var's dist_attr - var = main_program.global_block().vars[param.name] - var_dist_attr = self._dist_contexts[ - mode].get_tensor_dist_attr_for_program(var) - dist_attr = { - "dims_mapping": var_dist_attr.dims_mapping, - "process_shape": var_dist_attr.process_mesh.topology, - "process_group": var_dist_attr.process_mesh.processes - } - # slice param_value with dist_attr - # share sliced_param_value with param_tensor in global_scope - from .converter import Converter - param_tensor = global_scope().var(param.name).get_tensor() - sliced_param = Converter.slice_with_dist_attr( - param.numpy(), dist_attr) - shared_tensor = paddle.to_tensor(sliced_param, - place=self._place) - param_tensor._share_data_with( - shared_tensor.value().get_tensor()) - paddle.enable_static() - - if self._executor is None: - self._executor = paddle.static.Executor(self._place) - uninitialized = [] - dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank] - for var in dist_startup_prog.list_vars(): - scope_var = global_scope().find_var(var.name) - if scope_var and scope_var.get_tensor()._is_initialized(): - continue - uninitialized.append(var) - if uninitialized: - prune_startup_prog = dist_startup_prog._prune(uninitialized) - self._executor.run(prune_startup_prog) - - if self.strategy.amp and self.strategy.amp_configs['use_pure_fp16']: - # from paddle.fluid.contrib.mixed_precision.fp16_utils import cast_parameters_to_fp16 - def cast_parameters_to_fp16(place, - program, - scope=None, - to_fp16_var_names=None): - """ - Traverse all parameters in the whole model and set them to the FP16 data type. - Whereas, this function will keep parameters of batchnorms in FP32. 
- Args: - place(fluid.CPUPlace|fluid.CUDAPlace): `place` is used to restore the FP16 weight tensors. - program (Program): The used program. - scope(fluid.Scope, optional): `scope` is used to get the FP32 weight tensor values. - Default is None. - to_fp16_var_names(set|list, optional): The data types of vars in `to_fp16_var_names` - will be set to FP16. Usually, it is the returned - value of `cast_model_to_fp16` API. - """ - from paddle.framework import core - import numpy as np - all_parameters = [] - for block in program.blocks: - all_parameters.extend(block.all_parameters()) - - var_scope = scope if scope else paddle.static.global_scope() - for param in all_parameters: - if param.dtype == core.VarDesc.VarType.FP16: - param_t = var_scope.find_var( - param.name).get_tensor() - data = np.array(param_t) - param_t.set(np.float16(data), place) - - cast_parameters_to_fp16(self._place, prune_startup_prog) - - def fit(self, - train_data, - batch_size=1, - epochs=1, - fetches=None, - steps_per_epoch=None, - collate_fn=None, - use_cache=False, - return_numpy=True): - # TODO: callbacks - # TODO: evaluate after training - - if not self._mode_init_states['train']: - raise Exception( - "train program is not initialized yet, please call engine.prepare() before calling fit() funtion." - ) - - self.mode = 'train' - assert self.mode in self._dist_main_progs, \ - "train model is not ready, please call `engine.prepare()` first." - train_dataloader = self._create_dataloader(train_data, batch_size, - epochs, steps_per_epoch, - collate_fn) - - usr_fetch = self._validate_fetches(fetches) - fetch_loss = self._validate_fetches(self.fetch_vars["loss"]) - fetch_list, fetch_map = self._fetch_map(fetch_loss, usr_fetch) - lr_scheduler = self.get_lr_scheduler(self.main_program) - + if self._strategy.auto_mode == "full": + initialize_pg_in_full_mode(all_process_groups, self._cur_rank) + else: + for process_group in all_process_groups: + if self._cur_rank not in process_group.ranks: + continue + print( + "***process_group: id:", + process_group.id, + "rank:", + process_group.ranks, + ) + process_group.instantiate() + + # place = _get_device() + # if isinstance(place, fluid.CUDAPlace): + # place = fluid.CUDAPlace(ParallelEnv().dev_id) + + # if self._strategy.seed: + # paddle.seed(self._strategy.seed + self._dp_ranks[0]) + # np.random.seed(self._strategy.seed + self._dp_ranks[0]) + # random.seed(self._strategy.seed + self._dp_ranks[0]) + + # if self._dygraph_mode: + # dist_context = self._dist_contexts[mode] + # dist_main_program = self._dist_main_progs[mode][self._cur_rank] + # self.program_helper.init(dist_main_program, place, dist_context) + + # if self._executor is None: + # self._executor = paddle.static.Executor(place) + # uninitialized = [] + # dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank] + # for var in dist_startup_prog.list_vars(): + # scope_var = global_scope().find_var(var.name) + # if scope_var and scope_var.get_tensor()._is_initialized(): + # continue + # uninitialized.append(var) + # if uninitialized: + # prune_startup_prog = dist_startup_prog._prune(uninitialized) + # self._executor.run(prune_startup_prog) + + # if hasattr(self, "_state_dict") and hasattr(self, "_dist_attr"): + # self._set_state_dict( + # mode, self._strict, self._state_dict, self._dist_attr + # ) + + # if self._strategy.reinit: + # self._logger.info("NOTE: parameters will be re-initialized.") + # dist_startup_prog = self._dist_startup_progs[mode][self._cur_rank] + # self._executor.run(dist_startup_prog) + + def fit( + 
self,
+ train_data,
+ train_sample_split=None,
+ batch_size=1,
+ epochs=1,
+ steps_per_epoch=None,
+ log_freq=10,
+ save_dir=None,
+ save_freq=1,
+ valid_data=None,
+ valid_sample_split=None,
+ valid_freq=1,
+ valid_steps=None,
+ collate_fn=None,
+ callbacks=None,
+ verbose=2,
+ ):
+ """
+ Trains the model for a fixed number of epochs. If `valid_data` is set,
+ evaluation will be done at the end of each epoch.
+
+ Args:
+ train_data (Dataset): An instance of paddle.io.Dataset. Default: None.
+ train_sample_split (int, optional): Each sample of the train dataset is assumed
+ to be a (input, label) pair by default and has two items. If each sample has
+ more than two items, train_sample_split specifies how to split these items into
+ input and label. The items before it are the input and the rest are the label. Default: None.
+ batch_size (int, optional): The batch size of train_data and valid_data if provided.
+ The user's data will be used directly without batching if set to None. Default: 1.
+ epochs (int, optional): The number of epochs to train the model. Default: 1.
+ steps_per_epoch (int, optional): The total number of steps (batches of samples)
+ executed in one epoch before starting the next one. If None, it is equal to
+ the number of samples in your dataset divided by the batch size. Default: None.
+ valid_data (Dataset, optional): An instance of paddle.io.Dataset used for
+ evaluation at the end of each epoch. No evaluation will be done if set to None.
+ Default: None. (Unsupported for now)
+ valid_freq (int, optional): Only relevant if valid_data is provided. This specifies
+ how many training epochs are run before a new evaluation is performed. Default: 1.
+ valid_sample_split (int, optional): Only relevant if valid_data is provided.
+ Each sample of the valid dataset is assumed to be a (input, label) pair
+ by default and has two items. If each sample has more than two items,
+ valid_sample_split specifies how to split these items into input and label.
+ The items before it are the input and the rest are the label. Default: None.
+ valid_steps (int, optional): Only relevant if valid_data is provided.
+ It is the total number of steps (batches of samples) to draw before
+ stopping validation at the end of every epoch. If None, validation will run until the
+ `valid_data` dataset is exhausted. The validation will start from the
+ beginning of the dataset at each epoch. Default: None.
+ collate_fn (callable, optional): function to generate mini-batch data by merging
+ the sample list; None to only stack each field of the samples along axis 0.
+ Default: None.
+ callbacks (Callback|None, optional): A list of `Callback` instances to apply
+ during training. Default: None. (Unused for now)
+
+ Returns:
+ None
+
+ Examples:
+
+ ..
code-block:: python + + import paddle + import paddle.vision.transforms as T + from paddle.distributed.fleet import auto + from paddle.vision.datasets import MNIST + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = MNIST(mode='train', transform=transform) + + model = paddle.vision.models.LeNet() + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + metrics = paddle.metric.Accuracy(topk=(1, 2)) + + engine = auto.Engine(model, loss, optimizer, metrics) + engine.fit(train_dataset, + epochs=2, + batch_size=64) + """ + self._mode = 'train' + self._inputs_spec, self._labels_spec = self._prepare_data_spec( + train_data, train_sample_split, batch_size + ) + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) + else: + self._switch_mode(self._mode) + + train_dataloader = self._prepare_dataloader_from_generator( + dataset=train_data, + capacity=70, + iterable=False, + batch_size=batch_size, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + collate_fn=collate_fn, + ) + + fetch_names, fetch_indices = self._prepare_fetch(None, mode=self._mode) + + cbks = config_callbacks( + callbacks, + engine=self, + batch_size=batch_size, + epochs=epochs, + steps=train_dataloader._steps, + log_freq=log_freq, + save_freq=save_freq, + save_dir=save_dir, + verbose=verbose, + metrics=self._metrics_name(), + acc_step=self._k_steps, + ) + + cbks.on_begin('train') for epoch in range(epochs): - train_logs = {"epoch: {:d} ": epoch} + logs = {} + cbks.on_epoch_begin(epoch) for step, _ in enumerate(train_dataloader): + cbks.on_batch_begin('train', step, logs) try: - outs = self._executor.run(self.main_program, - fetch_list=fetch_list, - use_program_cache=use_cache, - return_numpy=return_numpy) - except fluid.core.EOFException: + outs = self._executor.run( + self.main_program, + fetch_list=fetch_names, + use_program_cache=self._strategy.use_cache, + return_numpy=self._strategy.return_numpy, + ) + except core.EOFException: break - - train_logs["step: {:d} "] = step - if lr_scheduler is not None: - lr_scheduler.step() - try: - train_logs["lr: {:5e} "] = self._lr_optimizer.get_lr() - except: - train_logs[ - "lr: {:5e} "] = self._lr_optimizer._learning_rate.get_lr( - ) - # inner fetches - if fetch_loss: - train_logs["loss: {:9f} "] = outs[0][0] - # user fetches - user_outs = outs[len(fetch_loss):] - user_fetch_list = fetch_list[len(fetch_loss):] - for i, out in enumerate(user_outs): - train_logs[fetch_map[user_fetch_list[i]] + ": {}"] = out - # logger - string = '[train] ' + ''.join(list(train_logs.keys())) - self._logger.info(string.format(*list(train_logs.values()))) - - def evaluate(self, - eval_data, - batch_size=1, - fetches=None, - collate_fn=None, - use_cache=False, - return_numpy=True): - self.mode = 'eval' - if not self._mode_init_states[self.mode]: - self._prepare_single_mode(self.mode) - - assert self.mode in self._dist_main_progs, \ - "eval model is not ready, please call `engine.prepare()` first." 
- eval_dataloader = self._create_dataloader(eval_data, - batch_size, - collate_fn=collate_fn) - - usr_fetch = self._validate_fetches(fetches) - fetch_loss = self._validate_fetches(self.fetch_vars["loss"]) - fetch_metrics = self._validate_fetches(self.fetch_vars["metrics"]) - inner_fetch = dict(fetch_loss, **fetch_metrics) - fetch_list, fetch_map = self._fetch_map(inner_fetch, usr_fetch) - - for step, _ in enumerate(eval_dataloader): - eval_logs = {"step: {:d} ": step} + lr = get_lr(self._optimizer) + logs = self._prepare_logger( + outs, + epoch, + step, + lr, + fetch_names, + fetch_indices, + self._mode, + ) + cbks.on_batch_end('train', step, logs) + + if valid_data and (epoch + 1) % valid_freq == 0: + val_logs = self.evaluate( + valid_data, + valid_sample_split, + batch_size, + valid_steps, + log_freq, + collate_fn, + callbacks, + verbose, + ) + val_logs = { + "val_" + name: val for name, val in val_logs.items() + } + logs.update(val_logs) + self._switch_mode("train") + else: + self._reset_metrics() + + cbks.on_epoch_end(epoch, logs) + + cbks.on_end('train', logs) + return self.history + + def evaluate( + self, + valid_data, + valid_sample_split=None, + batch_size=1, + steps=None, + log_freq=10, + collate_fn=None, + callbacks=None, + verbose=2, + ): + """ + Evaluate the loss and metrics of the model on evaluation data. + + Args: + valid_data (Dataset): An instance of paddle paddle.io.Dataset. Default: None. + valid_sample_split (int, optional): Each sample of the eval dataset is assumed + to be a (input, label) pair by default and has two items. If each sample has + more than two items, valid_sample_split specifies how to split these items into + input and label. The items before it are input and the left are label. Default: None. + batch_size (int, optional): The batch size of valid_data. The user's data will + be used directly without batching if set to None. Default: 1. + steps (int, optional): It is the total number of steps (batches of samples) to draw before + stopping evaluation. If None, evaluation will run until the `valid_data` dataset is exhausted. + The evaluation will start from the beginning of the dataset in each run. Default: None. + collate_fn(callable, optional): function to generate mini-batch data by merging + the sample list, None for only stack each fields of sample in axis + 0. Default None. + callbacks (Callback|None, optional): A list of `Callback` instances to apply + during evaluating. Default: None. (Unused for now) + + Returns: + None + + Examples: + + .. 
code-block:: python + + import paddle + import paddle.vision.transforms as T + from paddle.distributed.fleet import auto + from paddle.vision.datasets import MNIST + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + valid_dataset = MNIST(mode='test', transform=transform) + + model = paddle.vision.models.LeNet() + loss = paddle.nn.CrossEntropyLoss() + metrics = paddle.metric.Accuracy(topk=(1, 2)) + + engine = auto.Engine(model, loss, metrics=metrics) + engine.evaluate(valid_dataset, batch_size=64) + + """ + self._mode = 'eval' + self._inputs_spec, self._labels_spec = self._prepare_data_spec( + valid_data, valid_sample_split, batch_size + ) + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) + else: + self._switch_mode(self._mode) + + valid_dataloader = self._prepare_dataloader_from_generator( + dataset=valid_data, + capacity=70, + iterable=False, + batch_size=batch_size, + steps_per_epoch=steps, + collate_fn=collate_fn, + ) + + fetch_names, fetch_indices = self._prepare_fetch(None, mode=self._mode) + + cbks = config_callbacks( + callbacks, + engine=self, + batch_size=batch_size, + log_freq=log_freq, + verbose=verbose, + metrics=self._metrics_name(), + ) + + eval_steps = valid_dataloader._steps + cbks.on_begin( + 'eval', {'steps': eval_steps, 'metrics': self._metrics_name()} + ) + logs = {} + for step, _ in enumerate(valid_dataloader): + cbks.on_batch_begin('eval', step, logs) try: - outs = self._executor.run(self.main_program, - fetch_list=fetch_list, - use_program_cache=use_cache, - return_numpy=return_numpy) - except fluid.core.EOFException: + outs = self._executor.run( + self.main_program, + fetch_list=fetch_names, + use_program_cache=self._strategy.use_cache, + return_numpy=self._strategy.return_numpy, + ) + except core.EOFException: break - # inner fetches - if fetch_loss: - eval_logs["loss: {:9f} "] = outs[0][0] - # Metric - if fetch_metrics: - metric_out = outs[len(fetch_loss):len(inner_fetch)] - for metric in self._metrics: - metric.update(*metric_out) - results = metric.accumulate() - for i, res in enumerate(to_list(results)): - eval_logs[metric.name()[i] + ": {:9f} "] = res - # usr fetches - usr_outs = outs[len(inner_fetch):] - usr_fetch_list = fetch_list[len(inner_fetch):] - for i, out in enumerate(usr_outs): - eval_logs[fetch_map[usr_fetch_list[i]] + ": {}"] = out - # logger - string = '[eval] ' + ''.join(list(eval_logs.keys())) - self._logger.info(string.format(*list(eval_logs.values()))) - - def predict(self, - test_data, - batch_size=1, - fetches=None, - collate_fn=None, - use_cache=False, - return_numpy=True): - self.mode = 'predict' - if not self._mode_init_states[self.mode]: - self._prepare_single_mode(self.mode) - - assert self.mode in self._dist_main_progs, \ - "predict model is not ready, please call `engine.prepare()` first." - test_dataloader = self._create_dataloader(test_data, - batch_size, - collate_fn=collate_fn) - - usr_fetch = self._validate_fetches(fetches) - fetch_outputs = self._validate_fetches(self.fetch_vars["outputs"]) - fetch_list, fetch_map = self._fetch_map(fetch_outputs, usr_fetch) + logs = self._prepare_logger( + outs, None, step, None, fetch_names, fetch_indices, self._mode + ) + cbks.on_batch_end('eval', step, logs) + cbks.on_end('eval', logs) + self._reset_metrics() + return logs + + def predict( + self, + test_data, + test_sample_split=None, + batch_size=1, + steps=None, + collate_fn=None, + callbacks=None, + verbose=2, + ): + """ + Compute the output predictions on testing data. 
+ + Args: + test_data (Dataset): An instance of paddle paddle.io.Dataset. Default: None. + test_sample_split (int, optional): Each sample of the test dataset is assumed + to be a (input, label) pair by default and has two items. If each sample has + more than two items, test_sample_split specifies how to split these items into + input and label. The items before it are input and the left are label. Default: None. + batch_size (int, optional): The batch size of test_data. The user's data will + be used directly without batching if set to None. Default: 1. + steps (int, optional): It is the total number of steps (batches of samples) to draw before + stopping predict. If None, predict will run until the `test_data` dataset is exhausted. + The predict will start from the beginning of the dataset in each run. Default: None. + collate_fn(callable, optional): function to generate mini-batch data by merging + the sample list, None for only stack each fields of sample in axis + 0. Default None. + callbacks (Callback|None, optional): A list of `Callback` instances to apply + during testing. Default: None. (Unused for now) + + Returns: + None + + Examples: + + .. code-block:: python + + import paddle + import paddle.vision.transforms as T + from paddle.distributed.fleet import auto + from paddle.vision.datasets import MNIST + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + valid_dataset = MNIST(mode='test', transform=transform) + + model = paddle.vision.models.LeNet() + + engine = auto.Engine(model) + engine.predict(valid_dataset, batch_size=64) + """ + self._mode = 'predict' + self._inputs_spec, self._labels_spec = self._prepare_data_spec( + test_data, test_sample_split, batch_size + ) + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) + else: + self._switch_mode(self._mode) + + test_dataloader = self._prepare_dataloader_from_generator( + dataset=test_data, + capacity=70, + iterable=False, + batch_size=batch_size, + steps_per_epoch=steps, + collate_fn=collate_fn, + ) + + fetch_names, fetch_indices = self._prepare_fetch(None, mode=self._mode) outputs = [] + cbks = config_callbacks(callbacks, engine=self, verbose=verbose) + test_steps = test_dataloader._steps + cbks.on_begin('predict', {'steps': test_steps}) + logs = {} for step, _ in enumerate(test_dataloader): - predict_logs = {"step: {:d} ": step} + cbks.on_batch_begin('predict', step, logs) try: - outs = self._executor.run(self.main_program, - fetch_list=fetch_list, - use_program_cache=use_cache, - return_numpy=return_numpy) - except fluid.core.EOFException: + outs = self._executor.run( + self.main_program, + fetch_list=fetch_names, + use_program_cache=self._strategy.use_cache, + return_numpy=self._strategy.return_numpy, + ) + except core.EOFException: break - outputs.append(outs[:len(fetch_outputs)]) - for i, out in enumerate(outs): - predict_logs[fetch_map[fetch_list[i]] + ": {}"] = out - # logger - string = '[pred] ' + ''.join(list(predict_logs.keys())) - self._logger.info(string.format(*list(predict_logs.values()))) - + logs = self._prepare_logger( + outs, None, step, None, fetch_names, fetch_indices, self._mode + ) + cbks.on_batch_end('predict', step, logs) + outputs.append(list(logs["outputs"].values())) + cbks.on_end('predict', logs) return outputs - def _create_dataloader(self, - dataset, - batch_size, - epochs=1, - steps_per_epoch=None, - collate_fn=None): - dist_main_prog = self._dist_main_progs[self.mode][self._cur_rank] - dist_startup_prog = 
self._dist_startup_progs[self.mode][self._cur_rank] - dist_context = self._dist_contexts[self.mode] + def dataloader( + self, + dataset, + batch_size=1, + shuffle=False, + drop_last=False, + collate_fn=None, + num_workers=0, + use_buffer_reader=True, + use_shared_memory=True, + timeout=0, + worker_init_fn=None, + epochs=1, + steps_per_epoch=None, + sample_split=1, + mode=None, + ): + if mode is not None: + self.to_mode(mode) + self._inputs_spec, self._labels_spec = self._prepare_data_spec( + dataset, sample_split, batch_size + ) + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) + else: + self._switch_mode(self._mode) + + dataloader = self._prepare_dataloader( + dataset, + return_list=False, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + collate_fn=collate_fn, + num_workers=num_workers, + use_buffer_reader=use_buffer_reader, + use_shared_memory=use_shared_memory, + timeout=timeout, + worker_init_fn=worker_init_fn, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + ) + return dataloader + + def dataloader_from_generator( + self, + dataset, + capacity=70, + use_double_buffer=True, + iterable=True, + use_multiprocess=False, + drop_last=True, + batch_size=1, + epochs=1, + steps_per_epoch=None, + collate_fn=None, + sample_split=1, + mode=None, + ): + if mode is not None: + self.to_mode(mode) + self._inputs_spec, self._labels_spec = self._prepare_data_spec( + dataset, sample_split, batch_size + ) + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) + else: + self._switch_mode(self._mode) + + dataloader = self._prepare_dataloader_from_generator( + dataset=dataset, + capacity=capacity, + use_double_buffer=use_double_buffer, + iterable=iterable, + return_list=False, + use_multiprocess=use_multiprocess, + drop_last=drop_last, + batch_size=batch_size, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + collate_fn=collate_fn, + ) + return dataloader + + def prepare( + self, + inputs_spec=None, + labels_spec=None, + inputs=None, + labels=None, + main_program=None, + startup_program=None, + mode=None, + ): + if mode is not None: + self.to_mode(mode) + + if not self._mode: + raise ValueError( + "Please set mode to be prepared with `prepare(mode=...)`" + ) + + if self._has_prepared[self._mode]: + return + + inputs_spec = self._validate_spec(inputs_spec) + labels_spec = self._validate_spec(labels_spec) + inputs = self._validate_vars(inputs) + labels = self._validate_vars(labels) + + self._orig_main_prog = main_program + self._orig_startup_prog = startup_program + if inputs or labels: + self._skip_build = True + inputs, labels = self._prepare_data_tensor( + inputs_spec, labels_spec, inputs, labels + ) + if self._orig_main_prog is None: + self._orig_main_prog = static.default_main_program() + if self._orig_startup_prog is None: + self._orig_startup_prog = static.default_startup_program() + elif inputs_spec or labels_spec: + self._outside_dataloader = True + if self._orig_main_prog is None: + self._orig_main_prog = static.default_main_program() + if self._orig_startup_prog is None: + self._orig_startup_prog = static.default_startup_program() + else: + assert ( + self._inputs_spec and self._labels_spec + ), "Please call the dataloader(...) 
before calling prepare(...)" + + self._inputs_spec, self._labels_spec = inputs_spec, labels_spec + self._inputs, self._labels = inputs, labels + if not self._has_prepared[self._mode]: + self._prepare_program(self._mode) + else: + self._switch_mode(self._mode) + + def run(self, data=None, feed=None, fetch_list=None, mode=None): + if mode is not None: + self.to_mode(mode) + feed_dict = self._prepare_feed(data, feed, self._mode) + fetch_names, fetch_indices = self._prepare_fetch(fetch_list, self._mode) + if ( + self._outside_dataloader + and not self._has_prepared_reader[self._mode] + ): + self._prepare_reader() + outs = self._executor.run( + self.main_program, + feed=feed_dict, + fetch_list=fetch_names, + use_program_cache=self._strategy.use_cache, + return_numpy=self._strategy.return_numpy, + ) + logs = self._prepare_logger( + outs, None, None, None, fetch_names, fetch_indices, self._mode + ) + return logs + + def _prepare_dataloader( + self, + dataset, + return_list=True, + batch_size=1, + shuffle=False, + drop_last=False, + collate_fn=None, + num_workers=0, + use_buffer_reader=True, + use_shared_memory=True, + timeout=0, + worker_init_fn=None, + epochs=1, + steps_per_epoch=None, + ): + + if self._strategy.gradient_merge and batch_size is not None: + assert ( + batch_size % self._k_steps == 0 + ), "Requires batch_size:[{}] to be divisible by k_steps:[{}].".format( + batch_size, self._k_steps + ) + batch_size //= self._k_steps + + dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] + dist_startup_prog = self._dist_startup_progs[self._mode][self._cur_rank] dist_main_block = dist_main_prog.global_block() - # NOTE: Get feed_list from dist_program, then insert dataloader op - # with sharded var shape. Because predict_program does not contain - # labels var, so we will filter dataset's value with length of feed_list. - inputs_var = self._feed_vars[self.mode]["inputs"] - labels_var = self._feed_vars[self.mode]["labels"] + # NOTE: Get feed_list, then insert dataloader op with sharded var shape. + # Cause predict_program does not contain labels var, + # then we will add labels var from serial_program to dist_program, + # that maintains the length of feed_list equal to the length of dataset's values. 
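A minimal sketch (not part of this patch) of driving the new prepare()/run() pair without an Engine-managed dataloader, based on the signatures added above; the InputSpec name/shape and the dict feed format passed to run() are assumptions for illustration:

import numpy as np
import paddle
from paddle.distributed.fleet import auto
from paddle.static import InputSpec

model = paddle.vision.models.LeNet()
engine = auto.Engine(model)

# Declare the input layout explicitly instead of deriving it from a dataset.
engine.prepare(
    inputs_spec=InputSpec([None, 1, 28, 28], 'float32', 'image'),  # assumed spec
    mode="predict",
)

# Feed one hand-made batch; keying the feed by the spec name is an assumption.
batch = {"image": np.random.rand(4, 1, 28, 28).astype("float32")}
logs = engine.run(batch, mode="predict")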
+ inputs_var = self._feed_vars[self._mode]["inputs"] + labels_var = self._feed_vars[self._mode]["labels"] feed_list = [] for var in inputs_var + labels_var: if var.name in dist_main_block.vars: feed_list.append(dist_main_block.vars[var.name]) - - # remove the first three ops if multi run fit/evaluate/predict - op_size = len(dist_main_block.ops) - if dist_main_block.ops[0].type == 'create_py_reader': - op_size -= 3 - for _ in range(3): - dist_main_block._remove_op(0, sync=False) + else: + copy_var = dist_main_block._clone_variable(var, var.persistable) + copy_var.desc.set_original_id(var.desc.original_id()) + feed_list.append(copy_var) # insert read op at the end of program places = paddle.static.cuda_places() with static.program_guard(dist_main_prog, dist_startup_prog): - dataloader = NonIterableGeneratorLoader( + dataloader = DistributedDataLoader( dataset, - feed_list, - places, - batch_size, - epochs, - steps_per_epoch, - collate_fn, - data_parallel_world_size=self.dp_world_sizes, - data_parallel_rank=self.dp_ranks, - split_data=self.strategy.split_data) - - # move read op from the end of program to the start of program - new_op_size = len(dist_main_block.ops) - for _ in range(new_op_size - 1, op_size - 1, -1): - op = dist_main_block.ops[new_op_size - 1] - new_op_desc = dist_main_block.desc._prepend_op() - new_op_desc.copy_from(op.desc) - new_op = Operator(dist_main_block, - new_op_desc, - type=new_op_desc.type()) - dist_main_block.ops.insert(0, new_op) - dist_op = DistributedOperator(new_op) - dist_context.add_dist_op_for_program(dist_op) - for _ in range(new_op_size - op_size): - dist_main_block._remove_op(new_op_size, sync=False) - dist_main_block._sync_with_cpp() + feed_list=feed_list, + places=places, + return_list=return_list, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + collate_fn=collate_fn, + num_workers=num_workers, + use_buffer_reader=use_buffer_reader, + use_shared_memory=use_shared_memory, + timeout=timeout, + worker_init_fn=worker_init_fn, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + split_data=self._strategy.split_data, + data_parallel_world_size=self._dp_world_sizes, + data_parallel_rank=self._dp_ranks, + ) + + return dataloader + + def _prepare_dataloader_from_generator( + self, + dataset, + capacity=None, + use_double_buffer=True, + iterable=True, + return_list=False, + use_multiprocess=False, + drop_last=True, + batch_size=1, + epochs=1, + steps_per_epoch=None, + collate_fn=None, + ): + + if self._strategy.gradient_merge and batch_size is not None: + assert ( + batch_size % self._k_steps == 0 + ), "Requires batch_size:[{}] to be divisible by k_steps:[{}].".format( + batch_size, self._k_steps + ) + batch_size //= self._k_steps + + dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] + dist_startup_prog = self._dist_startup_progs[self._mode][self._cur_rank] + dist_main_block = dist_main_prog.global_block() + + # NOTE: Get feed_list, then insert dataloader op with sharded var shape. + # Cause predict_program does not contain labels var, + # then we will add labels var from serial_program to dist_program, + # that maintains the length of feed_list equal to the length of dataset's values. 
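The gradient-merge guard repeated in both dataloader helpers above shrinks the batch fed per executor step; a small illustration of the arithmetic, with an assumed k_steps value:

# Illustration only: with gradient merge enabled, the user-facing batch size
# must be divisible by k_steps, and each executor step consumes one micro-batch.
k_steps = 4          # assumed value of strategy.gradient_merge.k_steps
batch_size = 64      # user-facing batch size passed to the dataloader helper
assert batch_size % k_steps == 0, "batch_size must be divisible by k_steps"
micro_batch_size = batch_size // k_steps   # 16 samples per program step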
+ inputs_var = self._feed_vars[self._mode]["inputs"] + labels_var = self._feed_vars[self._mode]["labels"] + feed_list = [] + for var in inputs_var + labels_var: + if var.name in dist_main_block.vars: + feed_list.append(dist_main_block.vars[var.name]) + else: + copy_var = dist_main_block._clone_variable(var, var.persistable) + copy_var.desc.set_original_id(var.desc.original_id()) + feed_list.append(copy_var) + + places = paddle.static.cuda_places() + with static.program_guard(dist_main_prog, dist_startup_prog): + dataloader = DistributedDataLoaderFromGenerator( + dataset=dataset, + feed_list=feed_list, + capacity=capacity, + use_double_buffer=use_double_buffer, + iterable=iterable, + return_list=return_list, + use_multiprocess=use_multiprocess, + drop_last=drop_last, + places=places, + batch_size=batch_size, + epochs=epochs, + steps_per_epoch=steps_per_epoch, + collate_fn=collate_fn, + split_data=self._strategy.split_data, + data_parallel_world_size=self._dp_world_sizes, + data_parallel_rank=self._dp_ranks, + ) + self._prepare_reader(feed_list) return dataloader + def _tune(self, tune_data, tune_sample_split=None, batch_size=1): + self._mode = 'train' + self._inputs_spec, self._labels_spec = self._prepare_data_spec( + tune_data, tune_sample_split, batch_size + ) + self._optimization_tuning(self._mode, tune_data, batch_size) + def _validate_spec(self, specs): specs = to_list(specs) + self._k_steps = self._strategy.gradient_merge.k_steps if specs is not None: for i, spec in enumerate(specs): - assert isinstance(spec, InputSpec) + if not isinstance(spec, InputSpec): + raise TypeError( + "'spec' must be object of class `paddle.static.InputSpec`." + ) if spec.name is None: raise ValueError( - "Requires Input[{}].name != None, but receive `None` with {}." - .format(i, spec)) - return specs + "Requires Input[{}].name != None, but receive `None` with {}.".format( + i, spec + ) + ) + if self._k_steps > 1: + shape = list(spec.shape) + assert ( + shape[0] % self._k_steps == 0 + ), "Requires batch_size[{}] to be divisible by k_steps[{}].".format( + spec.shape[0], self._k_steps + ) + shape[0] //= self._k_steps + spec.shape = shape + return specs or [] + + def _validate_vars(self, vars): + vars = to_list(vars) + if vars is not None: + for i, var in enumerate(vars): + if not isinstance(var, Variable): + raise TypeError("'var' must be a `Variable`.") + return vars or [] def _is_local_var(self, var): var_name = _to_name_str(var) return var_name in self.main_program.global_block().vars - def _validate_fetches(self, fetches): - # 1. Check user-defined fetches type - # 2. 
Prepare fetches_dict like {user_defined_name: var_name} - if not fetches: - return {} - if isinstance(fetches, dict): - fetch_var_names = list(map(_to_name_str, fetches.values())) - fetches_dict = dict(zip(fetch_var_names, list(fetches.keys()))) - elif isinstance(fetches, list): - fetch_var_names = list(map(_to_name_str, fetches)) - fetches_dict = dict(zip(fetch_var_names, fetch_var_names)) - else: - raise TypeError("'fetches' only support 'dict' and 'list', " - "but got '{}'".format(str(type(fetches)))) - return dict( - filter(lambda x: self._is_local_var(x[0]), fetches_dict.items())) - - def _fetch_map(self, inner_fetch, usr_fetch): - # replace inner fetch name if usr set for it - for iname in inner_fetch: - if iname in usr_fetch: - inner_fetch[iname] = usr_fetch[iname] - usr_fetch.pop(iname) - fetches = dict(inner_fetch, **usr_fetch) - return list(fetches.keys()), fetches - - def _get_input_split_info(self, var, dist_context): - # deduce how the input data is split among the cluster - from .utils import _get_comm_group, _get_corresponding_rank - - tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program(var) - process_mesh = tensor_dist_attr.process_mesh - dims_mapping = tensor_dist_attr.dims_mapping - - if self._cur_rank not in process_mesh.processes: - rank_id = _get_corresponding_rank(dist_context, process_mesh, - self._cur_rank) - else: - rank_id = self._cur_rank - - batch_size_axis = dims_mapping[0] - if batch_size_axis > -1 and process_mesh.topology[batch_size_axis] > 1: - group_ranks = _get_comm_group(process_mesh.processes, - process_mesh.topology, - batch_size_axis, rank_id) - return len(group_ranks), group_ranks.index(rank_id) - - return None, None - def _set_recompute_ckpts(self): # NOTE hack to enable recompute in engine api for GPT-3 # TODO support more PaddleNLP/CV models here - config = self.strategy.recompute_configs + recompute = self._strategy.recompute # extract ckpts by specific model - if isinstance(self.model, paddle.nn.Layer): + if isinstance(self._model, paddle.nn.Layer): if hasattr( - self.model, "gpt" - ) and self.model.__class__.__name__ == 'GPTForPretraining': - exact_ckpts = self.model.gpt.checkpoints + self._model, "gpt" + ) and self._model.__class__.__name__ in [ + 'GPTForPretraining', + 'GPTForPretrainingAuto', + ]: + exact_ckpts = self._model.gpt.checkpoints else: - exact_ckpts = config["checkpoints"] + exact_ckpts = recompute.checkpoints else: - exact_ckpts = config["checkpoints"] + exact_ckpts = recompute.checkpoints # modify strategy - if self.strategy.recompute: - config["checkpoints"] = exact_ckpts[:] - self.strategy.recompute_configs = config + if recompute.enable: + recompute.checkpoints = exact_ckpts[:] logs = { - 'Model Class': self.model.__class__.__name__, - 'Applied Recompute ckpts': exact_ckpts + 'Model Class': self._model.__class__.__name__, + 'Applied Recompute ckpts': exact_ckpts, } self._logger.info(logs) - def save(self, path, training=True, mode=None): - if not mode: - mode = self.mode + def _reset_metrics(self): + for metric in self._metrics: + metric.reset() + + def _metrics_name(self): + metrics_name = ['loss'] if self._loss else [] + for m in self._metrics: + metrics_name.extend(to_list(m.name())) + return metrics_name + + def _switch_mode(self, mode): + assert ( + mode in self._dist_main_progs + ), "{} model is not ready, please call `prepare()` first.".format(mode) + self.to_mode(mode) + self._optimizer = self._dist_contexts[mode]._serial_optimizer + + def to_mode(self, mode): + assert mode in [ + "train", + "eval", + 
"predict", + ], "mode {} should be one of ['train', 'eval', 'predict']".format(mode) + self._mode = mode + def _set_state_dict(self, mode, strict, state_dict, dist_attr): + program = self._dist_main_progs[mode][self._cur_rank] + dist_context = self._dist_contexts[mode] + cur_dist_attr = get_dist_attr(program, dist_context) + converter = Converter(state_dict, dist_attr, cur_dist_attr) + state_dict = converter.convert(strict=strict) + program.set_state_dict(state_dict) + + def save(self, path, training=True): + """ + Saves the model, parameters, optimizer state to path. + If `training` is set to False, only inference model will be saved. + + Args: + path (str): The file prefix to save model. The format + is 'dirname/file_prefix' or 'file_prefix'. if empty str. + A exception will be raised. + training (bool, optional): Whether to save for training. If not, save + for inference only. If `training` is set to True, the optimizer state + will be saved. Otherwise, only the model and parameters are saved. + This function will silently overwrite existing file at the target + location. Default: True. + + Returns: + None + + Examples: + + .. code-block:: python + import paddle + import paddle.vision.transforms as T + from paddle.distributed.fleet import auto + from paddle.vision.datasets import MNIST + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = MNIST(mode='train', transform=transform) + + model = paddle.vision.models.LeNet() + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + metrics = paddle.metric.Accuracy(topk=(1, 2)) + + engine = auto.Engine(model, loss, optimizer, metrics) + engine.fit(train_dataset, + epochs=1, + batch_size=64) + engine.save("./my_model") + + """ if training: - assert 'train' in self._serial_main_progs, \ - "training model is not ready, please call `engine.prepare()` first." - serial_program = self._serial_main_progs["train"] - dist_main_prog = self._dist_main_progs["train"][self._cur_rank] - dist_context = self._dist_contexts["train"] - self._saver.save(path, - serial_program=serial_program, - dist_main_program=dist_main_prog, - dist_context=dist_context) + assert self._mode in self._serial_main_progs + serial_program = self._serial_main_progs[self._mode] + dist_main_prog = self._dist_main_progs[self._mode][self._cur_rank] + dist_context = self._dist_contexts[self._mode] + self._saver.save( + path, + serial_program=serial_program, + dist_main_program=dist_main_prog, + dist_context=dist_context, + ) else: - assert mode, "Please set the 'mode' you want to save." - feed_vars = self._feed_vars[mode]['inputs'] - fetch_vars = self._fetch_vars[mode]['outputs'] - dist_main_prog = self._dist_main_progs[mode][self._cur_rank] - self._saver.save_inference_model(path, - feed_vars, - fetch_vars, - self._executor, - program=dist_main_prog) - - def load(self, path, strict=True, load_optimizer=True, mode=None): - if not mode: - mode = self.mode - assert mode, "Please set the 'mode' you want to load." 
- - dist_main_prog = self._dist_main_progs[mode][self._cur_rank] - dist_context = self._dist_contexts[mode] - self._saver.load(path, dist_main_prog, dist_context, strict, - load_optimizer) - - @staticmethod - def get_lr_scheduler(program): - lr_sheduler = None - if hasattr(program, 'lr_sheduler'): - from paddle.optimizer.lr import LRScheduler - lr_sheduler = program.lr_sheduler - assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler" - return lr_sheduler + assert "predict" in self._dist_main_progs + feed_vars = self._feed_vars["predict"]['inputs'] + fetch_vars = self._fetch_vars["predict"]['outputs'] + dist_main_prog = self._dist_main_progs["predict"][self._cur_rank] + self._saver.save_inference_model( + path, + feed_vars, + fetch_vars, + self._executor, + program=dist_main_prog, + ) - @property - def mode(self): - return self._mode + def load(self, path, strict=True, load_optimizer=True): + """ + Load the stored model, parameters and optimizer states. + + Args: + path (str): The prefix of files storing the model states and + optimizer states. + strict (bool, optional): Whether to skip the loading of mismatch + parameter or raise an error when mismatch happens (not found + the parameter in file storing model states of or receives a + mismatch shape). Default: True. + load_optimizer (bool, optional): If True, the stored optimizer + states is restored. Otherwise, the optimizer states is initialized + from scratch. Default: True. + + Returns: + None + + Examples: + + .. code-block:: python + import paddle + import paddle.vision.transforms as T + from paddle.distributed.fleet import auto + from paddle.vision.datasets import MNIST + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = MNIST(mode='train', transform=transform) + + model = paddle.vision.models.LeNet() + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.001, parameters=model.parameters()) + metrics = paddle.metric.Accuracy(topk=(1, 2)) + + engine = auto.Engine(model, loss, optimizer, metrics) + engine.fit(train_dataset, + epochs=1, + batch_size=64) + engine.save("./my_model") + engine.load("./my_model") + + """ + self._strict = strict + self._state_dict, self._dist_attr = self._saver.load( + path, load_optimizer + ) + return self._state_dict, self._dist_attr + + def cost(self, inputs_spec=None, labels_spec=None, mode=None): + """ + Get and Print cost, including memory of every rank, + max memory among all ranks, and the global cost of one step based on + communication cost(computation cost is 0 by default). + In the future, the flops information of every rank and global cost including + computation cost will be added. + + Args: + inputs_spec(InputSpec): The specification of inputs. Default: None. + labels_spec(InputSpec): The specification of labels. Default: None. + mode (str): The engine mode must be in ["train", "predict", "eval"]. Default: None. + + Returns: + Return the global execution time (ms) and max memory (B). + + """ + # Check parallel mode + if self._strategy.auto_mode == "full": + self._logger.info( + "The cost will be calcudated in the search process when the auto mode is full." + ) + return - @mode.setter - def mode(self, mode): - self._mode = mode + # Check mode + mode = mode if mode is not None else self._mode + assert mode is not None, "Please set mode." 
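A minimal sketch (not part of this patch) of calling the new cost() API documented above; the InputSpec shapes and names are assumptions for illustration:

from paddle.static import InputSpec

# Returns the estimated global execution time (ms) and max memory (bytes).
global_time_ms, max_memory_bytes = engine.cost(
    inputs_spec=InputSpec([None, 1, 28, 28], 'float32', 'image'),   # assumed
    labels_spec=InputSpec([None, 1], 'int64', 'label'),             # assumed
    mode="train",
)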
+ if mode not in self._has_prepared: + raise ValueError( + "The mode {} is not in accepted modes {}".format( + mode, list(self._has_prepared.keys()) + ) + ) + self.to_mode(mode) + + if inputs_spec is not None and not self._has_prepared[mode]: + self._inputs_spec = self._validate_spec(inputs_spec) + self._labels_spec = self._validate_spec(labels_spec) + self._build(mode) + self._plan(mode) + else: + if _non_static_mode() or self._dygraph_mode: + raise ValueError( + "Please call `prepare()` or `fit()` or `evaluate()` or `predict()` before calling `cost()`." + ) + else: + self._logger.info( + "The program whose cost to be estimated must be static default program. Otherwise, please call `prepare()`before calling `cost()`." + ) + program = paddle.static.default_main_program() + if ( + not program.global_block().ops + or not program.global_block().ops + ) and not self._has_prepared[mode]: + raise ValueError( + "Please call `prepare()` or `fit()` or `evaluate()` or `predict()` before calling `cost()`." + ) + + # Estimate the exec cost and max memory + global_cost, max_memory = get_cost_from_engine(self, mode) + + return global_cost.time, max_memory @property def main_program(self): - return self._dist_main_progs[self.mode][self._cur_rank] + return self._dist_main_progs[self._mode][self._cur_rank] @property def startup_program(self): - return self._dist_startup_progs[self.mode][self._cur_rank] + return self._dist_startup_progs[self._mode][self._cur_rank] @property def dist_context(self): - return self._dist_contexts[self.mode] + return self._dist_contexts[self._mode] @property def serial_main_program(self): - return self._serial_main_progs[self.mode] + return self._serial_main_progs[self._mode] @property def serial_startup_program(self): - return self._serial_startup_progs[self.mode] + return self._serial_startup_progs[self._mode] @property def fetch_vars(self): - return self._fetch_vars[self.mode] + return self._fetch_vars[self._mode] + + @property + def inputs(self): + return self._inputs + + @property + def labels(self): + return self._labels diff --git a/python/paddle/distributed/auto_parallel/hepler.py b/python/paddle/distributed/auto_parallel/helper.py similarity index 83% rename from python/paddle/distributed/auto_parallel/hepler.py rename to python/paddle/distributed/auto_parallel/helper.py index 077b769116060c..7faa426ed3430c 100644 --- a/python/paddle/distributed/auto_parallel/hepler.py +++ b/python/paddle/distributed/auto_parallel/helper.py @@ -15,14 +15,18 @@ import logging from collections import defaultdict +import paddle + from paddle.nn import Layer from paddle.jit import to_static, not_to_static -from paddle.distributed.utils import get_logger from paddle.fluid.framework import Operator, Parameter, _non_static_mode from paddle.fluid.framework import program_guard +from paddle.fluid.executor import global_scope from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticFunction from .utils import to_list +from .utils import get_logger +from .converter import Converter class ProxyLayer(Layer): @@ -89,13 +93,14 @@ def _eval(self, inputs, labels): # step 4. calculate metrics if needed self._metric_vars[mode] = self.call_metrics(new_inputs) - def _predict(self, inputs): + def _predict(self, inputs, labels): """ Predict process of inner_layer with forward logic. """ # step 1. save feed variables of Program mode = 'predict' self._input_vars[mode] = inputs + self._label_vars[mode] = labels # step 2. 
call inner_layer.forward self._output_vars[mode] = self.inner_layer(*inputs) @@ -134,7 +139,7 @@ def call_metrics(self, inputs): """ outs = [] for metric in self.metrics: - outs.extend(metric.compute(*inputs)) + outs.append(to_list(metric.compute(*inputs))) return outs @@ -165,6 +170,10 @@ def loss_vars(self): def metric_vars(self): return self._metric_vars[self.mode] + @property + def startup_program(self): + return self.inner_layer._startup_program() + class BuildInfo: @@ -199,6 +208,7 @@ def __init__(self, layer, loss_func, metrics, inputs_spec, labels_spec): self.build_info = BuildInfo() self._logger = get_logger(logging.INFO) + self.lazy_init = False def reset(self): """ @@ -221,8 +231,7 @@ def build_program(self, mode): return self._logger.info("start to build program for mode = %s." % mode) - input_spec = [self.inputs_spec, self.labels_spec - ] if mode != 'predict' else [self.inputs_spec] + input_spec = [self.inputs_spec, self.labels_spec] static_func = to_static(self.static_func(), input_spec=input_spec) func_name = '_' + mode @@ -238,6 +247,9 @@ def _build_startup_program(self): """ Create and Sync parameters into startup program. """ + if len(self.startup_program.global_block().ops) > 1: + self.lazy_init = True + return for param in self.concrete_program.parameters: Parameter(name=param.name, desc=param, @@ -294,6 +306,28 @@ def static_func(self): func_name = '_' + self.proxy_layer.mode return getattr(self.proxy_layer, func_name) + def init(self, main_program, place, dist_context): + if self.lazy_init: + return + for param in self.concrete_program.parameters: + # create var in scope and share parameters to scope + if param.name not in main_program.global_block().vars: + continue + # get param_var's dist_attr + var = main_program.global_block().vars[param.name] + var_dist_attr = dist_context.get_tensor_dist_attr_for_program(var) + dist_attr = { + "dims_mapping": var_dist_attr.dims_mapping, + "process_shape": var_dist_attr.process_mesh.topology, + "process_group": var_dist_attr.process_mesh.processes + } + # slice param_value with dist_attr + # share sliced_param_value with param_tensor in global_scope + param_tensor = global_scope().var(param.name).get_tensor() + sliced_param = Converter.slice_with_dist_attr( + param.numpy(), dist_attr) + param_tensor.set(sliced_param, place) + @property def concrete_program(self): return self.static_func().concrete_program @@ -304,7 +338,13 @@ def main_program(self): @property def startup_program(self): - return self.concrete_program.startup_program + try: + return self.proxy_layer.startup_program + except Exception as err: + self._logger.warning("`lazy init` failed.") + if isinstance(err, AssertionError): + return self.concrete_program.startup_program + raise err @property def input_vars(self): diff --git a/python/paddle/distributed/auto_parallel/interface.py b/python/paddle/distributed/auto_parallel/interface.py index 588d2b05b7934a..b154209700a313 100644 --- a/python/paddle/distributed/auto_parallel/interface.py +++ b/python/paddle/distributed/auto_parallel/interface.py @@ -12,101 +12,252 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import numpy -import copy +from collections import defaultdict + import paddle -import paddle.fluid.core as core -from paddle.fluid.framework import Variable -from paddle.fluid.framework import _non_static_mode +from paddle.fluid import core +from .process_mesh import ProcessMesh +from .process_mesh import get_current_process_mesh +from .process_mesh import set_current_process_mesh +from .process_mesh import reset_current_process_mesh from .dist_context import get_default_distributed_context from .dist_tensor import DistributedTensor -from .dist_op import DistributedModule -from .dist_attribute import TensorDistributedAttribute -from .dist_attribute import OperatorDistributedAttribute - - -def _static_mode_check(): - if _non_static_mode(): - raise RuntimeError("Auto-parallel only supports static mode for now, " - "please use paddle.enable_static() first.") +from .dist_op import DistributedOperatorHelper +from .utils import verify_shard_spec, convert_to_dims_mapping -def shard_tensor(x, dist_attr=None): +def shard_tensor(x, process_mesh=None, shard_spec=None): """ - Add distributed attributes for a tensors. + Shard a tensor on a process mesh according to the shard specification. Args: x (Tensor): the tensor to be sharded. - dist_attr (dict): the tensor distributed attributes. The accepted attributes are as follow: - "process_mesh": a nested list an to describe the mesh topology of logical processes. - "dims_mapping": a list to describe the mapping between `x` and `process_mesh`, the dimension - `i` of `x` is split across the dimension `dims_mapping[i]` of `process_mesh`, - where -1 means that tensor dimension is not split. - Both process_mesh and dims_mapping are optional and users can specify as need. + process_mesh (ProcessMesh, optional): An instance of ProcessMesh describes a mesh + topology of the used logical processes where the tensor is sharded. If it is None, + the found current process mesh will be used. And an error will be raised if the + current process mesh cannot be found. Default: None. + shard_spec (list, optional): a list to describe the sharding mapping between `x` and `process_mesh`, + which means the dimension `i` of `x` is split across the dimension `shard_spec[i]` of `process_mesh`, + where `None` means that tensor dimension is not split. For example, given a tensor wih + the shape [6, 12] and a process mesh with the shape [2, 3] and the dimension names ["x", "y"]: + If `shard_spec=["x", "y"]`, each shard of the tensor will have a shape [3, 4]; + If `shard_spec=["y", "x"]`, each shard of the tensor will have a shape [2, 6]; + If `shard_spec=["x", None]`, each shard of the tensor will have a shape [3, 12]; + If `shard_spec=[None, "x"]`, each shard of the tensor will have a shape [6, 4]; + If `shard_spec=["y", None]`, each shard of the tensor will have a shape [2, 12]; + If `shard_spec=[None, "y"]`, each shard of the tensor will have a shape [6, 4]; + If `shard_spec=[None, None]`, each shard of the tensor will have a shape [6, 12]; + If the `shard_spec` is None, the tensor will be replicated across all the processes of `process_mesh`. + In the above example, the `shard_spec=None` is same as 'shard_spec=[None, None]'. Defaults: None. Returns: - Tensor: the tensor `x` annotated with distributed attributes. + Tensor: the tensor `x` annotated with sharding information. Examples: .. 
code-block:: python import paddle - import paddle.distributed as dist - - paddle.enable_static() + from paddle.distributed.fleet import auto + mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) x = paddle.ones([4, 6]) - dist.shard_tensor(x, dist_attr={"process_mesh": [[0, 1], [2, 3]], - "dims_mapping": [0, -1]}) + shard_spec = ["x", "y"] + auto.shard_tensor(x, mesh, shard_spec) """ - _static_mode_check() - assert dist_attr is None or isinstance(dist_attr, (dict, TensorDistributedAttribute)), \ - "The type of dist_attr must be None, dict or TensorDistributedAttribute." - dist_tensor = DistributedTensor(x, dist_attr) - dist_tensor.dist_attr.mark_annotated_as(dist_attr) + + if process_mesh is not None: + assert isinstance( + process_mesh, ProcessMesh + ), "Argument process_mesh {} is not an instance of ProcessMesh".format( + process_mesh + ) + else: + process_mesh = get_current_process_mesh() + assert ( + process_mesh is not None + ), "Specify the process mesh argument or use ProcessMesh context manager first." + assert isinstance( + shard_spec, list + ), "Argument shard_spec {} is not an instance of list".format(shard_spec) + if isinstance(x, str): + x = paddle.fluid.default_main_program().global_block()._var_recursive(x) + dist_tensor = DistributedTensor(x) + else: + dist_tensor = DistributedTensor(x) + serial_tensor = dist_tensor.serial_tensor + dist_tensor.dist_attr.process_mesh = process_mesh + if ( + serial_tensor.type == core.VarDesc.VarType.READER + or serial_tensor.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY + or serial_tensor.type == core.VarDesc.VarType.STEP_SCOPES + ): + tensor_shape = [] + else: + tensor_shape = serial_tensor.shape + if shard_spec is not None: + assert verify_shard_spec( + shard_spec, tensor_shape, process_mesh + ), "For tensor {}, shard_spec {} is invalid with tensor_shape {} and process_mesh {}.".format( + serial_tensor.name, shard_spec, tensor_shape, process_mesh + ) + dist_tensor.dist_attr.dims_mapping = convert_to_dims_mapping( + shard_spec, process_mesh + ) + if process_mesh is not None: + dist_tensor.dist_attr.mark_annotated("process_mesh") + if shard_spec is not None: + dist_tensor.dist_attr.mark_annotated("dims_mapping") default_dist_ctx = get_default_distributed_context() default_dist_ctx.add_dist_tensor_for_program(dist_tensor) + dist_tensor = default_dist_ctx.get_dist_tensor_for_program(x) + default_dist_ctx.add_process_mesh(process_mesh) return x -def shard_op(op_fn, dist_attr=None): +def shard_op(op, process_mesh=None, in_shard_specs=None, out_shard_specs=None): """ - Call a functioin and add distributed attributes for ops added by the function. + Shard an operation on a process mesh according to its input and output shard specification. Args: - op_fn (callable): a callable operator or module to be sharded. - dist_attr (dict): the operator distributed attributes. The accepted attributes are classified into - two categories. The first category decsribes the distributed attributes shared by all inputs and - outputs, and only `process_mesh` can be specified now. The second category describes distributed - attributes for inputs or outputs same as the `dist_attr` of `shard_tensor`. All of them are - optional and users can specify them as need. Note that `process_mesh` for operators must be the - same as these process_meshes for inputs and outputs. + op (Callable): a callable operator or module to be sharded. 
+ process_mesh (ProcessMesh, optional): An instance of ProcessMesh that describes the mesh + topology of the logical processes on which the op is sharded. All of its inputs and + outputs are sharded by this process mesh. If it is None, the current process mesh + will be used, and an error will be raised if no current process mesh can be found. + Default: None. + in_shard_specs (list of list, optional): a list of lists describing the sharding specifications + for the inputs. Each item of `in_shard_specs` is a `shard_spec` between the corresponding input + and `process_mesh`. If one item is None, the corresponding input is replicated across all processes. + If it is None, all inputs are replicated across all processes. Note that the length of the + `in_shard_specs` should be equal to the actual number of inputs when calling this operation. + Default: None. + out_shard_specs (list of list, optional): a list of lists describing the sharding specifications + for the outputs. Each item of `out_shard_specs` is a `shard_spec` between the corresponding output + and `process_mesh`. If one item is None, the corresponding output is replicated across all processes. + If it is None, all outputs are replicated across all processes. Note that the length of the + `out_shard_specs` should be equal to the actual number of outputs when calling this operation. + Default: None. Returns: - list: the outputs of the function `op_fn`, which are annotated with distributed attributes. + Outputs of `op`, each of which is annotated with sharding information. Examples: .. code-block:: python import paddle - import paddle.distributed as dist + from paddle.distributed.fleet import auto - paddle.enable_static() - x = paddle.ones([4, 6]) y = paddle.zeros([4, 6]) - dist_add = dist.shard_op(paddle.add, - dist_attr={ - "process_mesh": [[2, 3, 1], [0, 4, 5]], - x: {"dims_mapping": [-1, 0]}, - y: {"dims_mapping": [0, -1]} - }) + mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) + dist_add = auto.shard_op(paddle.add, + in_shard_specs=[["x", "y"], ["y", None]], + out_shard_specs=[[None, "x"]]) dist_add(x, y) """ - _static_mode_check() - assert dist_attr is None or isinstance(dist_attr, (dict, OperatorDistributedAttribute)), \ - "The type of dist_attr must be dict or OperatorDistributedAttribute." - dist_module = DistributedModule(op_fn, dist_attr) - return dist_module + + if process_mesh is not None: + assert isinstance( + process_mesh, ProcessMesh + ), "Argument process_mesh {} is not an instance of ProcessMesh".format( + process_mesh + ) + else: + process_mesh = get_current_process_mesh() + assert ( + process_mesh is not None + ), "Specify the process mesh argument or use ProcessMesh context manager first."
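The fallback above relies on a current process mesh being set; together with the `set_current_process_mesh`/`reset_current_process_mesh` imports earlier in this file, this suggests `ProcessMesh` is usable as a context manager that supplies that mesh. A minimal usage sketch under that assumption (shapes and shard specs are illustrative only):

.. code-block:: python

    import paddle
    from paddle.distributed.fleet import auto

    x = paddle.ones([4, 6])
    y = paddle.zeros([4, 6])
    # Assumed behavior: entering the mesh sets the "current process mesh"
    # that shard_op/shard_tensor fall back to when process_mesh is None.
    with auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]):
        dist_add = auto.shard_op(paddle.add,
                                 in_shard_specs=[["x", None], [None, "y"]],
                                 out_shard_specs=[["x", "y"]])
        out = dist_add(x, y)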
+ in_dims_mappings = [] + if in_shard_specs is not None: + assert all( + (isinstance(shard_spec, list) or shard_spec is None) + for shard_spec in in_shard_specs + ), "in_shard_spec {} is not a list of list or None".format( + in_shard_specs + ) + for shard_spec in in_shard_specs: + if shard_spec is not None: + in_dims_mappings.append( + convert_to_dims_mapping(shard_spec, process_mesh) + ) + else: + in_dims_mappings.append(None) + out_dims_mappings = [] + if out_shard_specs is not None: + assert all( + (isinstance(shard_spec, list) or shard_spec is None) + for shard_spec in out_shard_specs + ), "out_shard_spec {} is not a list of list or None".format( + out_shard_specs + ) + for shard_spec in out_shard_specs: + if shard_spec is not None: + out_dims_mappings.append( + convert_to_dims_mapping(shard_spec, process_mesh) + ) + else: + out_dims_mappings.append(None) + op = DistributedOperatorHelper( + op, process_mesh, in_dims_mappings, out_dims_mappings + ) + return op + + +def recompute(op): + class RecomputeOperator: + def __init__(self, op): + self._op = op + + def __call__(self, *args, **kwargs): + default_prog = paddle.fluid.default_main_program() + cur_block = default_prog.current_block() + op_size = len(cur_block.ops) + output = self._op(*args, **kwargs) + new_op_size = len(cur_block.ops) + + for idx in range(op_size, new_op_size): + op = cur_block.ops[idx] + op._set_attr("is_recompute@auto_parallel", True) + + return output + + return RecomputeOperator(op) + + +_g_collections = {} + + +class CollectionNames(object): + FETCHES = "fetches" + LOGGING = "logging" + + +def get_collection(name): + collection = _g_collections.get(name, None) + if collection is None: + collection = [] + _g_collections[name] = collection + return _g_collections[name] + + +def add_to_collection(collection_name, value, name=None): + if collection_name not in _g_collections: + _g_collections[collection_name] = [] + if name is not None: + for _, v in _g_collections[collection_name]: + if v == value: + return + _g_collections[collection_name].append((name, value)) + else: + for _, v in _g_collections[collection_name]: + if v == value: + return + _g_collections[collection_name].append((None, value)) + + +def fetch(tensor, name=None, logging=False): + add_to_collection(CollectionNames.FETCHES, tensor, name) + if logging: + add_to_collection(CollectionNames.LOGGING, tensor, name) diff --git a/python/paddle/distributed/auto_parallel/operators/__init__.py b/python/paddle/distributed/auto_parallel/operators/__init__.py index 295e3557df27d3..406ec4d8b36da0 100644 --- a/python/paddle/distributed/auto_parallel/operators/__init__.py +++ b/python/paddle/distributed/auto_parallel/operators/__init__.py @@ -32,4 +32,7 @@ from . import dist_slice from . import dist_fused_feedforward from . import dist_fused_attention -from . import dist_reduce_p +from . import dist_reduce_sum_p +from . import dist_shape +from . import dist_assign +from . 
import dist_scale diff --git a/python/paddle/distributed/auto_parallel/operators/common.py b/python/paddle/distributed/auto_parallel/operators/common.py index e7e7ad1e0ea268..9137322cc7171c 100644 --- a/python/paddle/distributed/auto_parallel/operators/common.py +++ b/python/paddle/distributed/auto_parallel/operators/common.py @@ -14,7 +14,11 @@ import abc import paddle -from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY +from paddle.distributed.fleet.meta_optimizers.common import ( + OpRole, + OP_ROLE_KEY, + OP_ROLE_VAR_KEY, +) from ..dist_attribute import OperatorDistributedAttribute from ..utils import _get_comm_group, _get_corresponding_rank, is_optimize_op from ..process_group import new_process_group @@ -22,16 +26,22 @@ _g_distributed_operator_impl_containers = {} _g_elementwise_ops = [ - "elementwise", "gelu", "dropout", "cast", "gather", "concat", - "fused_softmax_mask_upper_triangle" + "elementwise", + "gelu", + "dropout", + "cast", + "gather", + "concat", + "fused_softmax_mask_upper_triangle", ] BACKWARD_ONLY_DIST_OPS = {'check_finite_and_unscale', 'update_loss_scaling'} -class ParallelMode(): +class ParallelMode: """ the parallel mode for communication or auxiliary operator """ + DataParallel = "auto_parallel/data_parallel" ModelParallel = "auto_parallel/model_parallel" PipelineParalel = "auto_parallel/pipeline_paralel" @@ -47,7 +57,6 @@ def is_elementwise_op(op_type): class DistributedOperatorImplContainer: - def __init__(self, op_type): self._type = op_type self._impls = [] @@ -65,8 +74,9 @@ def impls(self): return self._impls def register_impl(self, dist_impl): - assert self.type == dist_impl.type, \ - "Op type of container must be same as that of the implementation." + assert ( + self.type == dist_impl.type + ), "Op type of container must be same as that of the implementation." impl_idx = len(self.impls) dist_impl.idx = impl_idx self._impls.append(dist_impl) @@ -97,7 +107,6 @@ def get_compatible_impls(self, dist_op): class DistributedOperatorImpl(abc.ABC): - def __init__(self, name): self._name = name self._type = None @@ -176,60 +185,75 @@ def register_distributed_operator_impl(op_type, dist_impl): def find_compatible_distributed_operator_impls(dist_op, fwd=True, partial=True): """ - Here just return the first compatible implemention. + Here just return the first compatible implemention. This will be improved by cost model in the future. 
""" op_type = dist_op.serial_op.type dist_op_impl_container = get_distributed_operator_impl_container(op_type) dist_op_eltwise_impl_container = get_distributed_operator_impl_container( - "elementwise") + "elementwise" + ) dist_op_default_impl_container = get_distributed_operator_impl_container( - "default") + "default" + ) compatible_impls = [] if partial: if fwd: # First, find impls in the corresponding container if dist_op_impl_container: compatible_impls.extend( - dist_op_impl_container.get_input_compatible_impls(dist_op)) + dist_op_impl_container.get_input_compatible_impls(dist_op) + ) # Second, find impls in the elementwise container if dist_op_eltwise_impl_container and is_elementwise_op(op_type): compatible_impls.extend( dist_op_eltwise_impl_container.get_input_compatible_impls( - dist_op)) + dist_op + ) + ) # Third, find impls in the default container if dist_op_default_impl_container: compatible_impls.extend( dist_op_default_impl_container.get_input_compatible_impls( - dist_op)) + dist_op + ) + ) else: # First, find impls in the corresponding container if dist_op_impl_container: compatible_impls.extend( - dist_op_impl_container.get_output_compatible_impls(dist_op)) + dist_op_impl_container.get_output_compatible_impls(dist_op) + ) # Second, find impls in the elementwise container if dist_op_eltwise_impl_container and is_elementwise_op(op_type): compatible_impls.extend( dist_op_eltwise_impl_container.get_output_compatible_impls( - dist_op)) + dist_op + ) + ) # Third, find impls in the default container if dist_op_default_impl_container: compatible_impls.extend( dist_op_default_impl_container.get_output_compatible_impls( - dist_op)) + dist_op + ) + ) else: # First, find impls in the corresponding container if dist_op_impl_container: compatible_impls.extend( - dist_op_impl_container.get_compatible_impls(dist_op)) + dist_op_impl_container.get_compatible_impls(dist_op) + ) # Second, find impls in the elementwise container if dist_op_eltwise_impl_container and is_elementwise_op(op_type): compatible_impls.extend( - dist_op_eltwise_impl_container.get_compatible_impls(dist_op)) + dist_op_eltwise_impl_container.get_compatible_impls(dist_op) + ) # Third, find impls in the default container if dist_op_default_impl_container: compatible_impls.extend( - dist_op_default_impl_container.get_compatible_impls(dist_op)) + dist_op_default_impl_container.get_compatible_impls(dist_op) + ) if compatible_impls: # For now, just return the first compatible impl @@ -242,18 +266,18 @@ def find_compatible_distributed_operator_impls(dist_op, fwd=True, partial=True): def is_parameter_related(varname, block): if ".subprog_" in varname: - varname = varname[:varname.index(".subprog_")] + varname = varname[: varname.index(".subprog_")] if ".cast_fp" in varname: - varname = varname[:varname.index(".cast_fp")] + varname = varname[: varname.index(".cast_fp")] if ".quantized" in varname: - varname = varname[:varname.index(".quantized")] + varname = varname[: varname.index(".quantized")] assert block.has_var(varname) var = block.var(varname) return var.is_parameter def infer_shape(block, src_var, src_var_dist_attr, op_input_dist_attr): - var_shape = block.var(src_var.name).shape + var_shape = block._var_recursive(src_var.name).shape var_topoloy = src_var_dist_attr.process_mesh.topology var_dims_mapping = src_var_dist_attr.dims_mapping @@ -278,8 +302,9 @@ def infer_shape(block, src_var, src_var_dist_attr, op_input_dist_attr): return exact_shape -def set_comm_op_dist_attr_for_program(new_op, process_mesh, tensor_dist_attr, 
- ctx): +def set_comm_op_dist_attr_for_program( + new_op, process_mesh, tensor_dist_attr, ctx +): assert process_mesh is not None assert tensor_dist_attr is not None @@ -304,9 +329,11 @@ def naive_copy_op_dist_attr_for_program(new_op, ref_op, ctx): assert len(new_op.input(input_name)) == 1 ref_tensor_dist_attr = ref_dist_attr.get_input_dist_attr( - ref_op.input(input_name)[0]) + ref_op.input(input_name)[0] + ) new_op_dist_attr.set_input_dist_attr( - new_op.input(input_name)[0], ref_tensor_dist_attr) + new_op.input(input_name)[0], ref_tensor_dist_attr + ) for output_name in ref_op.output_names: assert output_name in new_op.output_names @@ -314,9 +341,11 @@ def naive_copy_op_dist_attr_for_program(new_op, ref_op, ctx): assert len(new_op.output(output_name)) == 1 ref_tensor_dist_attr = ref_dist_attr.get_output_dist_attr( - ref_op.output(output_name)[0]) + ref_op.output(output_name)[0] + ) new_op_dist_attr.set_output_dist_attr( - new_op.output(output_name)[0], ref_tensor_dist_attr) + new_op.output(output_name)[0], ref_tensor_dist_attr + ) ctx.set_op_dist_attr_for_program(new_op, new_op_dist_attr) @@ -327,9 +356,9 @@ def get_data_parallel_group(dist_ctx, op, act_grad_names, rank): Args: dist_ctx (DistributedContext): dist context. - op (Operator): the current (backward) operator which might need. - act_grad_names (list): list of input activation grads variable name to the current operator. - out_grad_names (list): list of the output parameter's grads variable name of the current operator. + op (Operator): the current (backward) operator which might need. + act_grad_names (list): list of input activation grads variable name to the current operator. + out_grad_names (list): list of the output parameter's grads variable name of the current operator. rank (int): global ranks index for current process. """ dp_group = None @@ -349,9 +378,12 @@ def get_data_parallel_group(dist_ctx, op, act_grad_names, rank): batch_size_axis = var_dim_mapping[0] if len(var_dim_mapping) > 0 else -1 if batch_size_axis > -1 and mesh_shape[batch_size_axis] > 1: - group_ranks = _get_comm_group(process_mesh.processes, - process_mesh.topology, - batch_size_axis, rank) + group_ranks = _get_comm_group( + process_mesh.processes, + process_mesh.topology, + batch_size_axis, + rank, + ) dp_group = new_process_group(group_ranks) break @@ -360,13 +392,13 @@ def get_data_parallel_group(dist_ctx, op, act_grad_names, rank): def sync_and_scale_gradients(dist_ctx, op, dp_group, allreduce_var_names): """ - insert the allreudce and scale ops for gradients of model + insert the allreudce and scale ops for gradients of model parameters for operator in data parallelism. Args: dist_ctx (DistributedContext): dist context. - op (Operator): the current (backward) operator which might need. - allreduce_var_names (list): list of the parameter's grads variable name in the current operator output. + op (Operator): the current (backward) operator which might need. + allreduce_var_names (list): list of the parameter's grads variable name in the current operator output. 
""" op_dist_attr = dist_ctx.get_op_dist_attr_for_program(op) @@ -378,33 +410,39 @@ def sync_and_scale_gradients(dist_ctx, op, dp_group, allreduce_var_names): for var_name in allreduce_var_names: added_ops = [] grad_var = main_block.var(var_name) - allreduce_op = main_block.append_op(type='c_allreduce_sum', - inputs={'X': [grad_var]}, - outputs={'Out': [grad_var]}, - attrs={ - 'ring_id': dp_group.id, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Backward - }) - allreduce_op._set_attr('op_namescope', - str('/') + ParallelMode.DataParallel) + allreduce_op = main_block.append_op( + type='c_allreduce_sum', + inputs={'X': [grad_var]}, + outputs={'Out': [grad_var]}, + attrs={ + 'ring_id': dp_group.id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Backward, + }, + ) + allreduce_op._set_attr( + 'op_namescope', str('/') + ParallelMode.DataParallel + ) added_ops.append(allreduce_op) if dist_ctx.gradient_scale: - scale_op = main_block.append_op(type='scale', - inputs={'X': grad_var}, - outputs={'Out': grad_var}, - attrs={ - 'scale': 1.0 / dp_degree, - OP_ROLE_KEY: OpRole.Backward - }) - scale_op._set_attr('op_namescope', - str('/') + ParallelMode.DataParallel) + scale_op = main_block.append_op( + type='scale', + inputs={'X': grad_var}, + outputs={'Out': grad_var}, + attrs={'scale': 1.0 / dp_degree, OP_ROLE_KEY: OpRole.Backward}, + ) + scale_op._set_attr( + 'op_namescope', str('/') + ParallelMode.DataParallel + ) added_ops.append(scale_op) dims_mapping = op_dist_attr.get_output_dims_mapping(grad_var.name) - assert dims_mapping is not None, "Unexception: dims_mapping of output [{}] of op [{}] is None".format( - grad_var.name, op_dist_attr.op_type) + assert ( + dims_mapping is not None + ), "Unexception: dims_mapping of output [{}] of op [{}] is None".format( + grad_var.name, op_dist_attr.op_type + ) # NOTE auxiliary op's dist attr should follow dist_op not dist_tensor for new_op in added_ops: new_op_attr = OperatorDistributedAttribute() @@ -414,25 +452,29 @@ def sync_and_scale_gradients(dist_ctx, op, dp_group, allreduce_var_names): dist_ctx.set_op_dist_attr_for_program(new_op, new_op_attr) -def gradient_synchronization(dist_ctx, op, act_grad_names, out_grad_names, - rank): +def gradient_synchronization( + dist_ctx, op, act_grad_names, out_grad_names, rank +): """ - conduct the allreudce and scaling(dp size)for gradients of model + conduct the allreudce and scaling(dp size)for gradients of model parameters for operator in data parallelism. Args: dist_ctx (DistributedContext): dist context. - op (Operator): the current (backward) operator which might need. - act_grad_names (list): list of input activation grads variable name to the current operator. - out_grad_names (list): list of the output parameter's grads variable name of the current operator. + op (Operator): the current (backward) operator which might need. + act_grad_names (list): list of input activation grads variable name to the current operator. + out_grad_names (list): list of the output parameter's grads variable name of the current operator. rank (int): global ranks index for current process. 
""" if not is_in_backward_phase(dist_ctx): return - if is_optimize_op(op) or len(act_grad_names) == 0 or len( - out_grad_names) == 0: + if ( + is_optimize_op(op) + or len(act_grad_names) == 0 + or len(out_grad_names) == 0 + ): return dp_group = get_data_parallel_group(dist_ctx, op, act_grad_names, rank) @@ -444,13 +486,19 @@ def gradient_synchronization(dist_ctx, op, act_grad_names, out_grad_names, def is_data_parallel_scale_op(op): - return op.type == "scale" and op.desc.has_attr("op_namescope") \ - and ParallelMode.DataParallel in op.desc.attr("op_namescope") + return ( + op.type == "scale" + and op.desc.has_attr("op_namescope") + and ParallelMode.DataParallel in op.desc.attr("op_namescope") + ) def is_data_parallel_reduce_op(op): - return op.type in ["c_reduce_sum", "c_allreduce_sum"] and op.desc.has_attr("op_namescope") \ - and ParallelMode.DataParallel in op.desc.attr("op_namescope") + return ( + op.type in ["c_reduce_sum", "c_allreduce_sum"] + and op.desc.has_attr("op_namescope") + and ParallelMode.DataParallel in op.desc.attr("op_namescope") + ) def is_in_backward_phase(dist_ctx): diff --git a/python/paddle/distributed/auto_parallel/operators/dist_assign.py b/python/paddle/distributed/auto_parallel/operators/dist_assign.py new file mode 100644 index 00000000000000..96923f461a73d3 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_assign.py @@ -0,0 +1,88 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from .dist_default import DistributedDefaultImpl0 +from ..utils import compute_compatible_and_update_dim_mapping + + +class DistributedAssign(DistributedOperatorImplContainer): + + def __init__(self, op_type): + super(DistributedAssign, self).__init__(op_type) + + +register_distributed_operator_impl_container(DistributedAssign("assign")) + + +class DistributedAssignImpl(DistributedOperatorImpl): + + def __init__(self, name): + super(DistributedAssignImpl, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + return True + + def is_output_compatible(self, dist_op): + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + if x_dims_mapping != out_dims_mapping: + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + for i in range(len(x_dims_mapping)): + dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [i, i]) + if dim_changed: + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + @staticmethod + def backward(ctx, *args, **kwargs): + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl("assign", DistributedAssignImpl("assign")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_default.py b/python/paddle/distributed/auto_parallel/operators/dist_default.py index 08c81c4a306200..69f0288bcf41cb 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_default.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_default.py @@ -29,7 +29,11 @@ from paddle.fluid.framework import _non_static_mode from paddle.fluid.framework import Program, Parameter, Variable, program_guard from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype -from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY +from paddle.distributed.fleet.meta_optimizers.common import ( + OpRole, + OP_ROLE_KEY, + OP_ROLE_VAR_KEY, +) from ..process_group import new_process_group from ..utils import _get_comm_group, _get_corresponding_rank from ..cost import _g_op_cost_factory @@ -37,6 +41,7 @@ from ..cost import build_comp_costs_from_descs __op_not_need_param_init__ = ["while", "cond"] +__op_has_shape_attr__ = ["fill_constant_batch_size_like", "fill_constant"] def prim_operator_data_parallel_functor(ctx, src_op): @@ -46,35 +51,41 @@ def prim_operator_data_parallel_functor(ctx, src_op): var_name = src_op.output_arg_names[0] if var_name in ctx.grads_params: - assert var_name 
not in ctx.synced_gradient, "in primtive mode, grad is already {} synced".format( - var_name) + assert ( + var_name not in ctx.synced_gradient + ), "in primtive mode, grad is already {} synced".format(var_name) ctx.synced_gradient.add(var_name) sync_group = new_process_group(ctx.data_parallel_group) - allreduce_op = main_block.append_op(type='c_allreduce_sum', - inputs={'X': [var_name]}, - outputs={'Out': [var_name]}, - attrs={ - 'ring_id': sync_group.id, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Backward - }) + allreduce_op = main_block.append_op( + type='c_allreduce_sum', + inputs={'X': [var_name]}, + outputs={'Out': [var_name]}, + attrs={ + 'ring_id': sync_group.id, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Backward, + }, + ) param = ctx.grads_params[var_name] startup_block = dist_op_context.startup_block - new_op = startup_block.append_op(type='c_broadcast', - inputs={'X': [param]}, - outputs={'Out': [param]}, - attrs={ - 'ring_id': sync_group.id, - 'root': 0, - 'use_calc_stream': True, - OP_ROLE_KEY: OpRole.Forward - }) + new_op = startup_block.append_op( + type='c_broadcast', + inputs={'X': [param]}, + outputs={'Out': [param]}, + attrs={ + 'ring_id': sync_group.id, + 'root': 0, + 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward, + }, + ) grad_var = main_block.var(var_name) dims_mapping = ctx.get_tensor_dist_attr_for_program( - grad_var).dims_mapping + grad_var + ).dims_mapping dist_attr = ctx.get_op_dist_attr_for_program(src_op) process_mesh = dist_attr.process_mesh op_attr = OperatorDistributedAttribute() @@ -87,7 +98,6 @@ def prim_operator_data_parallel_functor(ctx, src_op): class DistributedDefault(DistributedOperatorImplContainer): - def __init__(self, op_type): super(DistributedDefault, self).__init__(op_type) @@ -97,7 +107,6 @@ def __init__(self, op_type): # Replicated Default class DistributedDefaultImpl0(DistributedOperatorImpl): - def __init__(self, name): super(DistributedDefaultImpl0, self).__init__(name) self._forward_implemented = True @@ -115,13 +124,14 @@ def calc_cost(self, op_role, dist_op, ctx, cluster): def calc_fwd_cost(self, dist_op, ctx, cluster): # calc comp op cost - desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, - dist_context=ctx) + desc_mapping = build_comp_desc_from_dist_op( + dist_op=dist_op, dist_context=ctx + ) processes = dist_op.dist_attr.process_mesh.processes op_type = dist_op.serial_op.type - cost_mapping = build_comp_costs_from_descs(_g_op_cost_factory[op_type], - ctx, processes, desc_mapping, - cluster) + cost_mapping = build_comp_costs_from_descs( + _g_op_cost_factory[op_type], ctx, processes, desc_mapping, cluster + ) res_cost = [cost_mapping] return res_cost @@ -129,16 +139,17 @@ def calc_fwd_cost(self, dist_op, ctx, cluster): def calc_bwd_cost(self, dist_op, ctx, cluster): # calc comp op cost res = [] - desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, - dist_context=ctx) + desc_mapping = build_comp_desc_from_dist_op( + dist_op=dist_op, dist_context=ctx + ) dist_attr = dist_op.dist_attr process_mesh = dist_attr.process_mesh processes = process_mesh.processes backward_op = dist_op.serial_op op_type = backward_op.type - cost_mapping = build_comp_costs_from_descs(_g_op_cost_factory[op_type], - ctx, processes, desc_mapping, - cluster) + cost_mapping = build_comp_costs_from_descs( + _g_op_cost_factory[op_type], ctx, processes, desc_mapping, cluster + ) res.append(cost_mapping) main_block = backward_op.block @@ -147,7 +158,8 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): for input_name in 
backward_op.desc.input_names(): for varname in backward_op.desc.input(input_name): if "@GRAD" not in varname and not is_parameter_related( - varname, main_block): + varname, main_block + ): var_dim_mapping = dist_attr.get_input_dims_mapping(varname) mesh_shape = process_mesh.topology batch_size_axis = var_dim_mapping[0] @@ -159,16 +171,25 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): for input_name in backward_op.desc.input_names(): for varname in backward_op.desc.input(input_name): if "@GRAD" not in varname and is_parameter_related( - varname, main_block): + varname, main_block + ): var_dim_mapping = dist_attr.get_input_dims_mapping( - varname) + varname + ) mesh_shape = process_mesh.topology batch_size_axis = var_dim_mapping[0] parallel_axis = batch_size_axis attrs = {"use_calc_stream": True} var_names = [varname + "@GRAD"] - build_dp_costs(res, dist_op, ctx, var_names, attrs, - parallel_axis, cluster) + build_dp_costs( + res, + dist_op, + ctx, + var_names, + attrs, + parallel_axis, + cluster, + ) return res def is_input_compatible(self, dist_op): @@ -312,8 +333,10 @@ def is_auto_compatible(self, dist_op): batch_dim_mappings.append(dims_mapping[1]) # Check batch dim mapping compatibility - if not all(batch_dim_mappings[0] == dim_mapping - for dim_mapping in batch_dim_mappings): + if not all( + batch_dim_mappings[0] == dim_mapping + for dim_mapping in batch_dim_mappings + ): return False return True @@ -348,9 +371,10 @@ def update_dims_mapping(self, dist_op): else: batch_dim_mappings.append(dims_mapping[1]) for arg_name in op_desc.output_arg_names(): - if op_desc.type() == "fill_zeros_like": + if op_desc.type() == 'fill_any_like': input_tensor = dist_op.get_serial_input( - op_desc.input_arg_names()[0]) + op_desc.input_arg_names()[0] + ) if input_tensor.is_parameter: continue serial_tensor = dist_op.get_serial_output(arg_name) @@ -367,7 +391,8 @@ def update_dims_mapping(self, dist_op): return changed compatible_dim_mapping = compute_compatible_dim_mapping( - batch_dim_mappings) + batch_dim_mappings + ) if compatible_dim_mapping is None: return False @@ -377,19 +402,24 @@ def update_dims_mapping(self, dist_op): continue dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) if arg_name not in input_xshape_arg_names: - if len(dims_mapping) >= 1 and \ - compatible_dim_mapping != dims_mapping[0]: + if ( + len(dims_mapping) >= 1 + and compatible_dim_mapping != dims_mapping[0] + ): dims_mapping[0] = compatible_dim_mapping changed = True else: - if len(dims_mapping) >= 2 and \ - compatible_dim_mapping != dims_mapping[1]: + if ( + len(dims_mapping) >= 2 + and compatible_dim_mapping != dims_mapping[1] + ): dims_mapping[1] = compatible_dim_mapping changed = True for arg_name in op_desc.output_arg_names(): - if op_desc.type() == "fill_zeros_like": + if op_desc.type() == 'fill_any_like': input_tensor = dist_op.get_serial_input( - op_desc.input_arg_names()[0]) + op_desc.input_arg_names()[0] + ) if input_tensor.is_parameter: continue if op_desc.type() in ["shape", "slice"]: @@ -399,13 +429,17 @@ def update_dims_mapping(self, dist_op): continue dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) if arg_name not in output_xshape_arg_names: - if len(dims_mapping - ) >= 1 and compatible_dim_mapping != dims_mapping[0]: + if ( + len(dims_mapping) >= 1 + and compatible_dim_mapping != dims_mapping[0] + ): dims_mapping[0] = compatible_dim_mapping changed = True else: - if len(dims_mapping - ) >= 2 and compatible_dim_mapping != dims_mapping[1]: + if ( + len(dims_mapping) >= 2 + and 
compatible_dim_mapping != dims_mapping[1] + ): dims_mapping[1] = compatible_dim_mapping changed = True @@ -422,17 +456,20 @@ def forward(ctx, *args, **kwargs): # check validation of inputs / outputs for input_name in src_op.desc.input_names(): assert input_name in kwargs, "input [{}] is not given".format( - input_name) + input_name + ) assert len(kwargs[input_name]) == len( src_op.desc.input(input_name) ), "number of tensor for input [{}] is not match".format(input_name) for output_name in src_op.desc.output_names(): assert output_name in kwargs, "input [{}] is not given".format( - output_name) + output_name + ) assert len(kwargs[output_name]) == len( src_op.desc.output(output_name) ), "number of tensor for input [{}] is not match".format( - output_name) + output_name + ) # replicate op in dist program dist_op_desc = main_block.append_op(type='nop').desc @@ -443,8 +480,29 @@ def forward(ctx, *args, **kwargs): for output_name in src_op.desc.output_names(): dist_op_desc.set_output(output_name, kwargs[output_name]) + if ( + src_op.has_attr('shape') + and src_op.attr('shape') + and src_op.type in __op_has_shape_attr__ + ): + shape_list = src_op.attr('shape') + Out_var = main_block._var_recursive(kwargs['Out'][0]) + op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) + dim_mapping = op_dist_attr.get_output_dims_mapping(Out_var.name) + process_mesh_shape = op_dist_attr.process_mesh.shape + assert len(shape_list) == len(dim_mapping) + # modify target shape + for idx, axis in enumerate(dim_mapping): + if axis >= 0: + if len(shape_list) > idx: + shape_list[idx] = ( + shape_list[idx] // process_mesh_shape[axis] + ) + dist_op_desc._set_attr('shape', shape_list) + # data parallel synchronization for primtive operators from paddle.incubate.autograd import prim_enabled + if prim_enabled(): assert is_prim_op(src_op) prim_operator_data_parallel_functor(ctx, src_op) @@ -455,9 +513,11 @@ def forward(ctx, *args, **kwargs): return for varname in dist_op_desc.input_arg_names(): - if startup_block.has_var(varname) and startup_block.var( - varname - ).is_parameter and varname not in dist_op_context.already_init_sync_vars: + if ( + startup_block.has_var(varname) + and startup_block.var(varname).is_parameter + and varname not in dist_op_context.already_init_sync_vars + ): dist_op_context.already_init_sync_vars.add(varname) param = startup_block.var(varname) param_dist_attr = ctx.get_tensor_dist_attr_for_program(param) @@ -466,38 +526,41 @@ def forward(ctx, *args, **kwargs): # FIXME (JZ-LIANG) Remove this hack to support any op mesh group for Pipeline Parallelism if rank_id not in process_mesh.processes: - rank_id = _get_corresponding_rank(ctx, process_mesh, - rank_id) + rank_id = _get_corresponding_rank( + ctx, process_mesh, rank_id + ) # NOTE all not splited axis should be presented in mesh for axis, size in enumerate(process_mesh.topology): if size <= 1 or axis in dims_mapping: pass else: - group_ranks = _get_comm_group(process_mesh.processes, - process_mesh.topology, - axis, rank_id) + group_ranks = _get_comm_group( + process_mesh.processes, + process_mesh.topology, + axis, + rank_id, + ) sync_group = new_process_group(group_ranks) - new_op = startup_block.append_op(type='c_broadcast', - inputs={'X': param}, - outputs={'Out': param}, - attrs={ - 'ring_id': - sync_group.id, - 'root': - 0, - 'use_calc_stream': - True, - OP_ROLE_KEY: - OpRole.Forward - }) + new_op = startup_block.append_op( + type='c_broadcast', + inputs={'X': param}, + outputs={'Out': param}, + attrs={ + 'ring_id': sync_group.id, + 'root': 0, 
+ 'use_calc_stream': True, + OP_ROLE_KEY: OpRole.Forward, + }, + ) # set distributed attribute op_attr = OperatorDistributedAttribute() op_attr.process_mesh = process_mesh - op_attr.set_output_dims_mapping(param.name, - dims_mapping) + op_attr.set_output_dims_mapping( + param.name, dims_mapping + ) op_attr.set_input_dims_mapping(param.name, dims_mapping) ctx.set_op_dist_attr_for_program(new_op, op_attr) @@ -509,24 +572,30 @@ def backward(ctx, *args, **kwargs): main_block = dist_op_context.work_block backward_op = dist_op_context.cur_src_op dist_attr = ctx.get_op_dist_attr_for_program(backward_op) - assert dist_attr is not None, "backward op [{}] don't have dist attribute !".format( - str(backward_op)) + assert ( + dist_attr is not None + ), "backward op [{}] don't have dist attribute !".format( + str(backward_op) + ) rank_id = dist_op_context.rank_id # check validation of inputs / outputs for input_name in backward_op.desc.input_names(): assert input_name in kwargs, "input [{}] is not given".format( - input_name) + input_name + ) assert len(kwargs[input_name]) == len( backward_op.desc.input(input_name) ), "number of tensor for input [{}] is not match".format(input_name) for output_name in backward_op.desc.output_names(): assert output_name in kwargs, "input [{}] is not given".format( - output_name) + output_name + ) assert len(kwargs[output_name]) == len( backward_op.desc.output(output_name) ), "number of tensor for input [{}] is not match".format( - output_name) + output_name + ) # replicate op in dist program dist_op_desc = main_block.append_op(type='nop').desc @@ -543,7 +612,8 @@ def backward(ctx, *args, **kwargs): for input_name in backward_op.desc.input_names(): for varname in backward_op.desc.input(input_name): if "@GRAD" not in varname and not is_parameter_related( - varname, main_block): + varname, main_block + ): act_grad_names.append(varname) out_grad_names = [] @@ -556,9 +626,11 @@ def backward(ctx, *args, **kwargs): if is_parameter_related(fwd_name, main_block): out_grad_names.append(varname) - gradient_synchronization(ctx, backward_op, act_grad_names, - out_grad_names, rank_id) + gradient_synchronization( + ctx, backward_op, act_grad_names, out_grad_names, rank_id + ) register_distributed_operator_impl( - "default", DistributedDefaultImpl0("replicate_parallel")) + "default", DistributedDefaultImpl0("replicate_parallel") +) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py index 3b519c2cc5b16f..68f28a87630362 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_fill_constant_batch_size_like.py @@ -32,21 +32,22 @@ from ..cost import FillConstantBatchSizeLikeOpCost from ..cost import build_comp_desc_from_dist_op, build_dp_costs from ..cost import build_comp_costs_from_descs -from paddle.distributed.auto_parallel.cost.comm_op_cost import AllreduceSumOpCost +from paddle.distributed.auto_parallel.cost.comm_op_cost import ( + AllreduceSumOpCost, +) class DistributedFillConstantBatchSizeLike(DistributedOperatorImplContainer): - def __init__(self, op_type): super(DistributedFillConstantBatchSizeLike, self).__init__(op_type) register_distributed_operator_impl_container( - DistributedFillConstantBatchSizeLike("fill_constant_batch_size_like")) + DistributedFillConstantBatchSizeLike("fill_constant_batch_size_like") +) class 
DistributedFillConstantBatchSizeLikeImpl0(DistributedOperatorImpl): - def __init__(self, name): super(DistributedFillConstantBatchSizeLikeImpl0, self).__init__(name) self._forward_implemented = True @@ -56,7 +57,8 @@ def calc_cost(self, op_role, dist_op, ctx, cluster): cost = None if int(op_role) == int(OpRole.Backward): raise ValueError( - "The fill_constant_batch_size_like has no grad op.") + "The fill_constant_batch_size_like has no grad op." + ) else: cost = self.calc_fwd_cost(dist_op, ctx, cluster) assert cost is not None @@ -64,13 +66,18 @@ def calc_cost(self, op_role, dist_op, ctx, cluster): def calc_fwd_cost(self, dist_op, ctx, cluster): # calc comp op cost - desc_mapping = build_comp_desc_from_dist_op(dist_op=dist_op, - dist_context=ctx) + desc_mapping = build_comp_desc_from_dist_op( + dist_op=dist_op, dist_context=ctx + ) processes = dist_op.dist_attr.process_mesh.processes op_type = dist_op.serial_op.type cost_mapping = build_comp_costs_from_descs( - FillConstantBatchSizeLikeOpCost, ctx, processes, desc_mapping, - cluster) + FillConstantBatchSizeLikeOpCost, + ctx, + processes, + desc_mapping, + cluster, + ) res_cost = [cost_mapping] return res_cost @@ -92,8 +99,9 @@ def is_output_compatible(self, dist_op): return True def is_auto_compatible(self, dist_op): - if (not self.is_input_compatible(dist_op)) or \ - (not self.is_output_compatible(dist_op)): + if (not self.is_input_compatible(dist_op)) or ( + not self.is_output_compatible(dist_op) + ): return False op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr @@ -116,7 +124,8 @@ def update_dims_mapping(self, dist_op): # only the batch size dimemsion of input and output are relative. dim_changed = compute_compatible_and_update_dim_mapping( - [x_dims_mapping, out_dims_mapping], [0, 0]) + [x_dims_mapping, out_dims_mapping], [0, 0] + ) if dim_changed: changed = True @@ -128,24 +137,6 @@ def forward(ctx, *args, **kwargs): kwargs: inputname_mapping & outputname_mapping """ DistributedDefaultImpl0.forward(ctx, *args, **kwargs) - dist_op_context = ctx.dist_op_context - src_op = dist_op_context.cur_src_op - op_dist_attr = ctx.get_op_dist_attr_for_program(src_op) - main_block = dist_op_context.work_block - op = main_block.ops[-1] - assert op.type == "fill_constant_batch_size_like" - - # modify shape attr according to how output are partitioned - out_name = op.output('Out')[0] - dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) - process_mesh_shape = op_dist_attr.process_mesh.topology - shape_list = op.attr("shape") - # modify target shape - for idx, axis in enumerate(dims_mapping): - if axis >= 0: - shape_list[idx] = shape_list[idx] // process_mesh_shape[axis] - - op._set_attr("shape", shape_list) @staticmethod def backward(ctx, *args, **kwargs): @@ -154,4 +145,5 @@ def backward(ctx, *args, **kwargs): register_distributed_operator_impl( "fill_constant_batch_size_like", - DistributedFillConstantBatchSizeLikeImpl0("fill_by_shape")) + DistributedFillConstantBatchSizeLikeImpl0("fill_by_shape"), +) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 3be84c55126bff..8f2db1a3b2637e 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -1308,6 +1308,8 @@ def calc_bwd_cost(self, dist_op, ctx, cluster): process_mesh = dist_attr.process_mesh processes = process_mesh.processes # col parallel: matmul + allreduce + if 
backward_op.attr("trans_y"): + Y_var_dim_mapping.reverse() assert Y_var_dim_mapping[0] < 0 parallel_axis = Y_var_dim_mapping[1] diff --git a/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py b/python/paddle/distributed/auto_parallel/operators/dist_reduce_sum_p.py similarity index 92% rename from python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py rename to python/paddle/distributed/auto_parallel/operators/dist_reduce_sum_p.py index bdd105ef64c303..6b53b2eed7ad00 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_reduce_p.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_reduce_sum_p.py @@ -33,21 +33,21 @@ from ..utils import _get_comm_group, _get_corresponding_rank -class DistributedReducePrimtive(DistributedOperatorImplContainer): +class DistributedReduceSumPrimtive(DistributedOperatorImplContainer): def __init__(self, op_type): - super(DistributedReducePrimtive, self).__init__(op_type) + super(DistributedReduceSumPrimtive, self).__init__(op_type) register_distributed_operator_impl_container( - DistributedReducePrimtive("reduce_p")) + DistributedReduceSumPrimtive("reduce_sum_p")) -# Batch Dimension Reduce Primitive -class DistributedReducePrimtiveImpl0(DistributedOperatorImpl): +# Batch Dimension ReduceSum Primitive +class DistributedReduceSumPrimtiveImpl0(DistributedOperatorImpl): def __init__(self, name): - super(DistributedReducePrimtiveImpl0, self).__init__(name) + super(DistributedReduceSumPrimtiveImpl0, self).__init__(name) self._forward_implemented = True self._backward_implemented = True @@ -149,4 +149,5 @@ def backward(ctx, *args, **kwargs): register_distributed_operator_impl( - "reduce_p", DistributedReducePrimtiveImpl0("batch_dimension_reduce_p")) + "reduce_sum_p", + DistributedReduceSumPrimtiveImpl0("batch_dimension_reduce_sum_p")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_scale.py b/python/paddle/distributed/auto_parallel/operators/dist_scale.py new file mode 100644 index 00000000000000..9fc28d05a20775 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_scale.py @@ -0,0 +1,90 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
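The `dist_matmul.py` change above reverses `Y_var_dim_mapping` when `trans_y` is set, so the backward cost logic reads the column-parallel mesh axis from the transposed view of `Y`. A toy illustration of that bookkeeping (values are made up; this is not Paddle API):

.. code-block:: python

    # With trans_y=True the op computes X @ Y^T, so Y is stored as [N, K] and
    # its recorded dims_mapping follows that stored layout; reversing it
    # recovers the (K, N) view expected by the column-parallel branch.
    Y_var_dim_mapping = [0, -1]   # stored dim 0 (N) sharded on mesh axis 0
    trans_y = True
    if trans_y:
        Y_var_dim_mapping.reverse()
    assert Y_var_dim_mapping[0] < 0        # K axis must remain unsharded
    parallel_axis = Y_var_dim_mapping[1]   # mesh axis used for column parallelism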
+ +from ..utils import compute_compatible_and_update_dim_mapping +from .common import ( + DistributedOperatorImpl, + DistributedOperatorImplContainer, + register_distributed_operator_impl, + register_distributed_operator_impl_container, +) +from .dist_default import DistributedDefaultImpl0 + + +class DistributedScale(DistributedOperatorImplContainer): + def __init__(self, op_type): + super().__init__(op_type) + + +register_distributed_operator_impl_container(DistributedScale("scale")) + + +class DistributedScaleImpl(DistributedOperatorImpl): + def __init__(self, name): + super().__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + return True + + def is_output_compatible(self, dist_op): + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or ( + not self.is_output_compatible(dist_op) + ): + return False + + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + if x_dims_mapping != out_dims_mapping: + return False + + return True + + def update_dims_mapping(self, dist_op): + changed = False + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + x_name = op_desc.input('X')[0] + out_name = op_desc.output('Out')[0] + x_dims_mapping = op_dist_attr.get_input_dims_mapping(x_name) + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + for i in range(len(x_dims_mapping)): + dim_changed = compute_compatible_and_update_dim_mapping( + [x_dims_mapping, out_dims_mapping], [i, i] + ) + if dim_changed: + changed = True + + return changed + + @staticmethod + def forward(ctx, *args, **kwargs): + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + @staticmethod + def backward(ctx, *args, **kwargs): + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl("scale", DistributedScaleImpl("scale")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_shape.py b/python/paddle/distributed/auto_parallel/operators/dist_shape.py new file mode 100644 index 00000000000000..313f296ab96246 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/operators/dist_shape.py @@ -0,0 +1,73 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
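The new `DistributedScaleImpl` above (like `DistributedAssignImpl` earlier) defers to `compute_compatible_and_update_dim_mapping` to reconcile input and output shardings dimension by dimension. That helper lives in `auto_parallel/utils.py` and is not part of this diff; the following is only a rough sketch of its assumed behavior:

.. code-block:: python

    # Hypothetical sketch only; the real helper is defined in auto_parallel/utils.py.
    def compute_compatible_and_update_dim_mapping(dims_mapping_list, index_list):
        # Collect the shardings proposed for the paired dimensions
        # (-1 means "not sharded" and is compatible with anything).
        proposed = {
            mapping[idx]
            for mapping, idx in zip(dims_mapping_list, index_list)
            if mapping[idx] != -1
        }
        if len(proposed) > 1:
            return False  # conflicting mesh axes; leave the mappings untouched
        target = proposed.pop() if proposed else -1
        changed = False
        for mapping, idx in zip(dims_mapping_list, index_list):
            if mapping[idx] != target:
                mapping[idx] = target  # update in place, as the callers expect
                changed = True
        return changed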
+ +from .common import DistributedOperatorImplContainer +from .common import DistributedOperatorImpl +from .common import register_distributed_operator_impl_container +from .common import register_distributed_operator_impl +from .dist_default import DistributedDefaultImpl0 +from ..utils import is_dim_shard + + +class DistributedShape(DistributedOperatorImplContainer): + + def __init__(self, op_type): + super(DistributedShape, self).__init__(op_type) + + +register_distributed_operator_impl_container(DistributedShape("shape")) + + +class DistributedShapeImpl(DistributedOperatorImpl): + + def __init__(self, name): + super(DistributedShapeImpl, self).__init__(name) + self._forward_implemented = True + self._backward_implemented = True + + def is_input_compatible(self, dist_op): + return True + + def is_output_compatible(self, dist_op): + op_desc = dist_op.serial_op.desc + op_dist_attr = dist_op.dist_attr + out_name = op_desc.output('Out')[0] + out_dims_mapping = op_dist_attr.get_output_dims_mapping(out_name) + + assert len(out_dims_mapping) == 1 + if is_dim_shard(out_dims_mapping[0]): + return False + + return True + + def is_auto_compatible(self, dist_op): + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)): + return False + + return True + + def update_dims_mapping(self, dist_op): + return False + + @staticmethod + def forward(ctx, *args, **kwargs): + DistributedDefaultImpl0.forward(ctx, *args, **kwargs) + + @staticmethod + def backward(ctx, *args, **kwargs): + DistributedDefaultImpl0.backward(ctx, *args, **kwargs) + + +register_distributed_operator_impl("shape", DistributedShapeImpl("shape")) diff --git a/python/paddle/distributed/auto_parallel/operators/dist_split.py b/python/paddle/distributed/auto_parallel/operators/dist_split.py index 8f89020b53ca4a..9b7c680d7921d3 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_split.py +++ b/python/paddle/distributed/auto_parallel/operators/dist_split.py @@ -101,8 +101,12 @@ def update_dims_mapping(self, dist_op): return changed def is_auto_compatible(self, dist_op): - raise NotImplementedError( - "Auto Search is not supported by dist split yet.") + if (not self.is_input_compatible(dist_op)) or \ + (not self.is_output_compatible(dist_op)) or \ + (not self.is_compatible(dist_op)): + return False + + return True @staticmethod def forward(ctx, *args, **kwargs): diff --git a/python/paddle/distributed/auto_parallel/parallelizer.py b/python/paddle/distributed/auto_parallel/parallelizer.py index 4b538431bb072b..620246fe522298 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer.py +++ b/python/paddle/distributed/auto_parallel/parallelizer.py @@ -23,14 +23,12 @@ import pickle import time import paddle -from paddle.fluid.backward import append_backward -from paddle.distributed.utils import get_logger -from paddle.distributed.fleet import cloud_utils import paddle.fluid.core as core from paddle.fluid import program_guard +from paddle.fluid.backward import append_backward +from paddle.distributed.utils.log_utils import get_logger from paddle.distributed.passes import new_pass, PassContext from .dist_context import DistributedContext -from .dist_context import get_default_distributed_context from .dist_context import set_default_distributed_context from .completion import Completer from .partitioner import Partitioner @@ -40,7 +38,6 @@ from .process_group import _g_process_group_map, ProcessGroup from .utils import make_data_unshard from .utils import set_grad_var_shape -from .utils import 
print_program_with_dist_attr from .utils import SerialProgramInfo from .reshard import Resharder from .cluster import Cluster @@ -57,9 +54,9 @@ class AutoParallelizer: AutoParallelizer is the main controller class to do the auto parallel process. And the auto parallel process will be triggered in the wrapped parallelize function. To facilitate the auto parallelization, it will contain information about program, cluster and the - related context. In this basic version, the program information will be retrevied from + related context. In this basic version, the program information will be retrevied from Fleet object, and the cluster information can be retrevied in the new created Cluster object, - and the context information can be retrevied in the new created DistributedContext. + and the context information can be retrevied in the new created DistributedContext. """ def __init__(self, fleet): @@ -82,9 +79,13 @@ def __init__(self, fleet): self._pass_context = PassContext() self._need_rank_mapping = os.getenv("PADDLE_NEED_RANK_MAPPING") - self._need_rank_mapping = True if self._need_rank_mapping and \ - self._need_rank_mapping.lower() == 'true' else False - self._pass_context = None + self._need_rank_mapping = ( + True + if self._need_rank_mapping + and self._need_rank_mapping.lower() == 'true' + else False + ) + # self._pass_context = None def _remove_distributed_attrs(self, main_program): suffix = core.kAutoParallelSuffix() @@ -96,8 +97,9 @@ def _remove_distributed_attrs(self, main_program): if suffix in attr_name: op._remove_attr(attr_name) - def _apply_pre_optimization_passes(self, main_program, startup_program, - loss, params_grads, no_grad_set): + def _apply_pre_optimization_passes( + self, main_program, startup_program, loss, params_grads, no_grad_set + ): # apply amp pass if self._dist_strategy.amp: config = copy.deepcopy(self._dist_strategy.amp_configs) @@ -107,12 +109,14 @@ def _apply_pre_optimization_passes(self, main_program, startup_program, if config["use_pure_fp16"]: config["base_opt"] = self._optimizer auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config) - auto_parallel_fp16_pass.apply([main_program], [startup_program], - self._pass_context) + auto_parallel_fp16_pass.apply( + [main_program], [startup_program], self._pass_context + ) else: auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) - auto_parallel_amp_pass.apply([main_program], [startup_program], - self._pass_context) + auto_parallel_amp_pass.apply( + [main_program], [startup_program], self._pass_context + ) # apply recompute pass if self._dist_strategy.recompute: @@ -120,14 +124,22 @@ def _apply_pre_optimization_passes(self, main_program, startup_program, config["dist_context"] = self._dist_context config["no_grad_set"] = copy.deepcopy(no_grad_set) config["loss"] = loss - auto_parallel_recompute_pass = new_pass("auto_parallel_recompute", - config) - auto_parallel_recompute_pass.apply([main_program], - [startup_program], - self._pass_context) - - def _generate_backward(self, main_program, startup_program, loss, - parameter_list, no_grad_set, callbacks): + auto_parallel_recompute_pass = new_pass( + "auto_parallel_recompute", config + ) + auto_parallel_recompute_pass.apply( + [main_program], [startup_program], self._pass_context + ) + + def _generate_backward( + self, + main_program, + startup_program, + loss, + parameter_list, + no_grad_set, + callbacks, + ): with program_guard(main_program, startup_program): params_grads = append_backward( @@ -135,7 +147,8 @@ def _generate_backward(self, 
main_program, startup_program, loss, parameter_list, no_grad_set, callbacks, - distop_context=self._dist_context.dist_op_context) + distop_context=self._dist_context.dist_op_context, + ) self._completer = Completer(self._dist_context) self._completer.complete_backward_annotation(main_program) self._dist_context.block_state.parse_backward_blocks(main_program) @@ -143,38 +156,53 @@ def _generate_backward(self, main_program, startup_program, loss, def _apply_optimize(self, main_program, startup_program, params_grads): + optimizer = copy.deepcopy(self._optimizer) with program_guard(main_program, startup_program): - optimize_ops = copy.deepcopy( - self._optimizer).apply_gradients(params_grads) + optimize_ops = optimizer.apply_gradients(params_grads) + self._dist_context._serial_optimizer = optimizer # update completion self._completer = Completer(self._dist_context) self._completer.complete_update_annotation(main_program) return optimize_ops - def _apply_post_optimization_passes(self, main_program, startup_program, - rank, params_grads): + def _apply_post_optimization_passes( + self, main_program, startup_program, rank, params_grads + ): if self._dist_strategy.sharding: config = copy.deepcopy(self._dist_strategy.sharding_configs) config["dist_context"] = self._dist_context config["params_grads"] = params_grads config["global_rank"] = rank - auto_parallel_sharding_pass = new_pass("auto_parallel_sharding", - config) - auto_parallel_sharding_pass.apply([main_program], [startup_program], - self._pass_context) + auto_parallel_sharding_pass = new_pass( + "auto_parallel_sharding", config + ) + auto_parallel_sharding_pass.apply( + [main_program], [startup_program], self._pass_context + ) + params_grads = self._pass_context.get_attr("params_grads") + + config = copy.deepcopy(self._dist_strategy.sharding_configs) + config["dist_context"] = self._dist_context + config["params_grads"] = params_grads + config["rank_id"] = rank + auto_parallel_clip_pass = new_pass("auto_parallel_grad_clip", config) + auto_parallel_clip_pass.apply( + [main_program], [startup_program], self._pass_context + ) if self._dist_strategy.gradient_merge: config = copy.deepcopy(self._dist_strategy.gradient_merge_configs) config["dist_context"] = self._dist_context config["params_grads"] = params_grads auto_parallel_gradient_merge_pass = new_pass( - "auto_parallel_gradient_merge_pass", config) - auto_parallel_gradient_merge_pass.apply([main_program], - [startup_program], - self._pass_context) + "auto_parallel_gradient_merge_pass", config + ) + auto_parallel_gradient_merge_pass.apply( + [main_program], [startup_program], self._pass_context + ) def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False): completed_main_program = None @@ -188,8 +216,9 @@ def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False): self._dist_context = DistributedContext() _logger.info("Start annotation dist attr.") self._completer = Completer(self._dist_context) - completed_main_program = self._completer.complete_forward_annotation( - serial_main_program) + completed_main_program = ( + self._completer.complete_forward_annotation(serial_main_program) + ) else: completed_main_program = serial_main_program self._dist_context = copy.deepcopy(dist_context) @@ -199,49 +228,77 @@ def _get_dist_program(self, rank, dist_context=None, relaunch_phase=False): # serial backward pass params_grads = self._generate_backward( - completed_main_program, serial_startup_program, serial_loss, - self._parameter_list, self._no_grad_set, 
self._callbacks) + completed_main_program, + serial_startup_program, + serial_loss, + self._parameter_list, + self._no_grad_set, + self._callbacks, + ) # serial forward pass - self._apply_pre_optimization_passes(completed_main_program, - serial_startup_program, serial_loss, - params_grads, self._no_grad_set) + self._apply_pre_optimization_passes( + completed_main_program, + serial_startup_program, + serial_loss, + params_grads, + self._no_grad_set, + ) # Logical partition partitioner = Partitioner(self._dist_context, rank) - dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( - completed_main_program, serial_startup_program, params_grads) + ( + dist_main_prog, + dist_startup_prog, + dist_params_grads, + ) = partitioner.partition( + completed_main_program, serial_startup_program, params_grads + ) # TODO refactor the placement of optimizer # generate optimize program - dist_optimize_ops = self._apply_optimize(dist_main_prog, - dist_startup_prog, - dist_params_grads) + dist_optimize_ops = self._apply_optimize( + dist_main_prog, dist_startup_prog, dist_params_grads + ) set_grad_var_shape(dist_main_prog, self._dist_context) make_data_unshard(dist_main_prog, dist_startup_prog, self._dist_context) - resharder = Resharder(dist_main_prog, dist_startup_prog, rank, - self._dist_context, dist_params_grads) + resharder = Resharder( + dist_main_prog, + dist_startup_prog, + rank, + self._dist_context, + dist_params_grads, + ) resharder.reshard() - self._apply_post_optimization_passes(dist_main_prog, dist_startup_prog, - rank, dist_params_grads) + self._apply_post_optimization_passes( + dist_main_prog, dist_startup_prog, rank, dist_params_grads + ) g_process_group_map = None if not relaunch_phase: g_process_group_map = copy.deepcopy(_g_process_group_map) _g_process_group_map.clear() - _g_process_group_map[0] = ProcessGroup(0, []) + _g_process_group_map[0] = ProcessGroup(1000, []) for process_mesh in self._dist_context._process_meshes: _g_process_group_map[0].add_ranks(process_mesh.processes) - return dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog, g_process_group_map - - def parallelize(self, - loss, - startup_program, - parameter_list=None, - no_grad_set=None, - callbacks=None): + return ( + dist_optimize_ops, + dist_params_grads, + dist_startup_prog, + dist_main_prog, + g_process_group_map, + ) + + def parallelize( + self, + loss, + startup_program, + parameter_list=None, + no_grad_set=None, + callbacks=None, + ): assert startup_program is not None self._loss = loss self._startup_program = startup_program @@ -252,25 +309,27 @@ def parallelize(self, if self._enable_auto_mapping and self._need_rank_mapping: # Do the mapping pass before parallelization - assert self._cluster is not None, \ - "The cluster must not be none when using auto mapping." + assert ( + self._cluster is not None + ), "The cluster must not be none when using auto mapping." 
dist_programs = {} world_process_group = get_world_process_group() dist_context = None # auto search if self._dist_strategy.auto_search: logging.info("Start searching dist attr.") - serial_program_info = SerialProgramInfo(self._main_program, - self._startup_program, - self._loss, - self._optimizer, - self._cluster) - planner = Planner(serial_program_info, - self, - algorithm_config={ - "name": "mcmc", - "max_search_times": 5 - }) + serial_program_info = SerialProgramInfo( + self._main_program, + self._startup_program, + self._loss, + self._optimizer, + self._cluster, + ) + planner = Planner( + serial_program_info, + self, + algorithm_config={"name": "mcmc", "max_search_times": 5}, + ) dist_context, _ = planner.search() logging.info("End searching dist attr.") @@ -279,31 +338,42 @@ def parallelize(self, logging.info("Start serialize searched dist attr") cwd = pathlib.Path().resolve() searched_dist_context_path = os.path.join( - cwd, f"searched_dist_context_{time.time()}.pkl") + cwd, f"searched_dist_context_{time.time()}.pkl" + ) saved_dist_context = {} ops_dist_attr = {} tensors_dist_attr = {} for key, dist_op in dist_context._dist_ops_for_program.items(): ops_dist_attr[key] = dist_op.dist_attr - for key, dist_tensor in dist_context._dist_tensors_for_program.items( - ): + for ( + key, + dist_tensor, + ) in dist_context._dist_tensors_for_program.items(): tensors_dist_attr[key] = dist_tensor.dist_attr saved_dist_context["ops_dist_attr"] = ops_dist_attr saved_dist_context["tensors_dist_attr"] = tensors_dist_attr saved_dist_context[ - "process_meshes"] = dist_context._process_meshes - with open(searched_dist_context_path, - "wb") as dist_context_file: + "process_meshes" + ] = dist_context._process_meshes + with open( + searched_dist_context_path, "wb" + ) as dist_context_file: pickle.dump(saved_dist_context, dist_context_file) os.environ[ - 'PADDLE_SEARCHED_DIST_CONTEXT_PATH'] = searched_dist_context_path + 'PADDLE_SEARCHED_DIST_CONTEXT_PATH' + ] = searched_dist_context_path logging.info( f"End serialize searched dist attr to {searched_dist_context_path}" ) for rank in world_process_group.ranks: - dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog, g_process_group_map = self._get_dist_program( - rank, dist_context) + ( + dist_optimize_ops, + dist_params_grads, + dist_startup_prog, + dist_main_prog, + g_process_group_map, + ) = self._get_dist_program(rank, dist_context) dist_programs[rank] = [dist_main_prog, g_process_group_map] # Do the mapping between the distributed program graph and the cluster graph @@ -315,27 +385,42 @@ def parallelize(self, json.dump(rank_mapping, rank_mapping_file) enable_elastic = os.getenv("PADDLE_ENABLE_ELASTIC") - enable_elastic = True if enable_elastic and enable_elastic.lower( - ) == 'true' else False + enable_elastic = ( + True + if enable_elastic and enable_elastic.lower() == 'true' + else False + ) if enable_elastic: print("Auto mapping finished, now do elastic re-launch") - sys.exit(paddle.distributed.fleet.elastic.manager. 
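The auto-search branch pickles the searched distributed context to a timestamped file and hands its path to the relaunched process through PADDLE_SEARCHED_DIST_CONTEXT_PATH. A stdlib-only sketch of that hand-off, where save_searched_context is an illustrative helper and the payload is a placeholder dictionary:

import os
import pathlib
import pickle
import time

def save_searched_context(payload):
    """Write the payload next to the current working directory and export its path,
    mirroring the PADDLE_SEARCHED_DIST_CONTEXT_PATH hand-off used before re-launch."""
    cwd = pathlib.Path().resolve()
    path = os.path.join(cwd, f"searched_dist_context_{time.time()}.pkl")
    with open(path, "wb") as f:
        pickle.dump(payload, f)
    os.environ["PADDLE_SEARCHED_DIST_CONTEXT_PATH"] = path
    return path

# Plain dictionaries stand in for the dist-attr objects here.
saved = {"ops_dist_attr": {}, "tensors_dist_attr": {}, "process_meshes": []}
print(save_searched_context(saved))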
- ELASTIC_AUTO_PARALLEL_EXIT_CODE) + sys.exit( + paddle.distributed.fleet.elastic.manager.ELASTIC_AUTO_PARALLEL_EXIT_CODE + ) original_cmd_args = os.getenv("PADDLE_ORIGINAL_CMD_ARGS") rank_mapping_args = " ".join( - ["--rank_mapping_path", self._rank_mapping_path]) + ["--rank_mapping_path", self._rank_mapping_path] + ) if os.environ.get("WITH_COVERAGE", "OFF") == "ON": coverage_args = ["-m", "coverage", "run", "--branch", "-p"] else: coverage_args = [] - new_cmd_args = "-m paddle.distributed.fleet.launch" + " " + rank_mapping_args + " " + original_cmd_args - new_cmd = [sys.executable, "-u" - ] + coverage_args + shlex.split(new_cmd_args) + new_cmd_args = ( + "-m paddle.distributed.fleet.launch" + + " " + + rank_mapping_args + + " " + + original_cmd_args + ) + new_cmd = ( + [sys.executable, "-u"] + + coverage_args + + shlex.split(new_cmd_args) + ) new_process = subprocess.Popen(new_cmd) new_process.wait() - assert new_process.returncode == 0, \ - "Launch failed with rank mapping" + assert ( + new_process.returncode == 0 + ), "Launch failed with rank mapping" print("Successfully do the second launch for auto mapping!") sys.exit(0) else: @@ -343,27 +428,32 @@ def parallelize(self, rank = paddle.distributed.get_rank() dist_context = None searched_dist_context_path = os.getenv( - "PADDLE_SEARCHED_DIST_CONTEXT_PATH", None) + "PADDLE_SEARCHED_DIST_CONTEXT_PATH", None + ) if searched_dist_context_path is not None: - with open(searched_dist_context_path, - "rb") as dist_context_file: + with open( + searched_dist_context_path, "rb" + ) as dist_context_file: saved_dist_context = pickle.load(dist_context_file) dist_context = DistributedContext() for op in self._main_program.global_block().ops: dist_attr = saved_dist_context["ops_dist_attr"][ - op.desc.id()] + op.desc.id() + ] dist_op = DistributedOperator(op, dist_attr) dist_context.add_dist_op_for_program(dist_op) vars = self._main_program.global_block().vars for var in vars.values(): dist_attr = saved_dist_context["tensors_dist_attr"][ - var.desc.id()] + var.desc.id() + ] dist_tensor = DistributedTensor(var, dist_attr) dist_context.add_dist_tensor_for_program(dist_tensor) dist_context._process_meshes = saved_dist_context[ - "process_meshes"] + "process_meshes" + ] else: if self._dist_strategy.auto_search: @@ -372,13 +462,16 @@ def parallelize(self, self._startup_program, self._loss, self._optimizer, - cluster=self._cluster) - planner = Planner(serial_program_info, - self, - algorithm_config={ - "name": "mcmc", - "max_search_times": 5 - }) + cluster=self._cluster, + ) + planner = Planner( + serial_program_info, + self, + algorithm_config={ + "name": "mcmc", + "max_search_times": 5, + }, + ) dist_context, _ = planner.search() # rebuild g_process_group @@ -386,8 +479,13 @@ def parallelize(self, pg0 = get_process_group(0) for process_mesh in dist_context._process_meshes: pg0.add_ranks(process_mesh.processes) - dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog, _ = self._get_dist_program( - rank, dist_context, relaunch_phase=True) + ( + dist_optimize_ops, + dist_params_grads, + dist_startup_prog, + dist_main_prog, + _, + ) = self._get_dist_program(rank, dist_context, relaunch_phase=True) # NOTE: This is a trick to fix hang in pipeline mode when dist context is searched by planner if self._dist_strategy.auto_search: @@ -404,8 +502,13 @@ def parallelize(self, # instantiate communication by process_mapping. 
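After auto mapping, parallelize() rebuilds the launch command from the original arguments, optionally wraps it in coverage, and requires the relaunched process to exit cleanly. A sketch of that command assembly, with a hypothetical user script in the example arguments and a harmless stand-in Popen call so it runs without Paddle installed:

import os
import shlex
import subprocess
import sys

def build_relaunch_cmd(rank_mapping_path, original_cmd_args):
    """Assemble the second-launch command the way parallelize() does after auto mapping."""
    rank_mapping_args = " ".join(["--rank_mapping_path", rank_mapping_path])
    if os.environ.get("WITH_COVERAGE", "OFF") == "ON":
        coverage_args = ["-m", "coverage", "run", "--branch", "-p"]
    else:
        coverage_args = []
    new_cmd_args = (
        "-m paddle.distributed.fleet.launch"
        + " " + rank_mapping_args
        + " " + original_cmd_args
    )
    return [sys.executable, "-u"] + coverage_args + shlex.split(new_cmd_args)

print(build_relaunch_cmd("./rank_mapping.json", "train.py --use_auto_parallel"))

# The real code then runs the command and insists on a clean exit:
proc = subprocess.Popen([sys.executable, "-c", "pass"])  # harmless stand-in command
proc.wait()
assert proc.returncode == 0, "Launch failed with rank mapping"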
all_process_groups = get_all_process_groups() for process_group in all_process_groups: + if len(_g_process_group_map) > 0: + tmp = paddle.to_tensor([1], dtype="int32") + paddle.distributed.all_reduce(tmp, sync_op=True, group=_g_process_group_map[0]) + paddle.device.cuda.synchronize() + if rank not in process_group.ranks: - continue + continue process_group.instantiate() # Copy distributed info to the default context @@ -415,14 +518,25 @@ def parallelize(self, # with inference. self._remove_distributed_attrs(dist_main_prog) - return dist_optimize_ops, dist_params_grads, dist_startup_prog, dist_main_prog + return ( + dist_optimize_ops, + dist_params_grads, + dist_startup_prog, + dist_main_prog, + ) def __deepcopy__(self, memo): cls = self.__class__ result = cls.__new__(cls) memo[id(self)] = result for k, v in self.__dict__.items(): - if k == "_main_program" or k == "_startup_program" or k == "_dist_context" or k == "_fleet" or k == "_loss": + if ( + k == "_main_program" + or k == "_startup_program" + or k == "_dist_context" + or k == "_fleet" + or k == "_loss" + ): setattr(result, k, v) else: setattr(result, k, copy.deepcopy(v, memo)) diff --git a/python/paddle/distributed/auto_parallel/parallelizer_v2.py b/python/paddle/distributed/auto_parallel/parallelizer_v2.py index 51eede57638ff8..6f77dbd4e07c87 100644 --- a/python/paddle/distributed/auto_parallel/parallelizer_v2.py +++ b/python/paddle/distributed/auto_parallel/parallelizer_v2.py @@ -15,28 +15,20 @@ import copy import time import logging -from collections import defaultdict -import paddle from paddle.fluid import program_guard from paddle.fluid.backward import append_backward -from paddle.fluid.framework import _non_static_mode, unique_name +from paddle.fluid.framework import unique_name from paddle.distributed.passes import new_pass -from paddle.distributed.utils import get_logger from .reshard import Resharder from .partitioner import Partitioner -from .dist_op import DistributedOperator -from .dist_saver import DistributedSaver -from .dist_loader import NonIterableGeneratorLoader -from .utils import make_data_unshard, set_grad_var_shape -from .utils import print_program_with_dist_attr, to_list -from .process_group import get_all_process_groups, get_world_process_group -from .dist_context import DistributedContext, get_default_distributed_context +from .utils import set_grad_var_shape +from .process_group import get_world_process_group +from ..utils.log_utils import get_logger class Parallelizer: - def __init__(self, mode, completer, dist_context): self._mode = mode self._completer = completer @@ -61,73 +53,139 @@ def parallel(self, rank): if self._mode == "train" and serial_optimizer: # Generate backward serial_loss = self._dist_context.serial_loss - params_grads = self._generate_backward(serial_main_program, - serial_startup_program, - serial_loss) + params_grads = self._generate_backward( + serial_main_program, serial_startup_program, serial_loss + ) # Apply pre optimization passes time0 = time.time() - serial_main_program, serial_startup_program, params_grads = self._apply_pre_optimization( - serial_main_program, serial_startup_program, serial_loss, - serial_optimizer, params_grads) - self._logger.info( - "within parallel apply_pre_optimization time: {}, mode {}". 
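__deepcopy__ deliberately shares a handful of heavyweight members (programs, dist context, fleet, loss) by reference while deep-copying everything else. A minimal sketch of that selective-copy protocol with a toy Holder class:

import copy

class Holder:
    """Minimal sketch of the selective __deepcopy__ used by AutoParallelizer:
    a few heavyweight members are shared by reference, the rest is copied."""

    _shared = {"_main_program", "_startup_program", "_dist_context", "_fleet", "_loss"}

    def __init__(self):
        self._main_program = object()    # stand-in for a Program
        self._settings = {"amp": True}   # ordinary state gets a real copy

    def __deepcopy__(self, memo):
        cls = self.__class__
        result = cls.__new__(cls)
        memo[id(self)] = result
        for k, v in self.__dict__.items():
            if k in self._shared:
                setattr(result, k, v)
            else:
                setattr(result, k, copy.deepcopy(v, memo))
        return result

a = Holder()
b = copy.deepcopy(a)
assert b._main_program is a._main_program   # shared by reference
assert b._settings is not a._settings       # independent copy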
- format(time.time() - time0, self._mode)) + ( + serial_main_program, + serial_startup_program, + params_grads, + ) = self._apply_pre_optimization( + serial_main_program, + serial_startup_program, + serial_loss, + serial_optimizer, + params_grads, + ) + self._logger.debug( + "within parallel apply_pre_optimization time: {}, mode {}".format( + time.time() - time0, self._mode + ) + ) # Do logical partition time0 = time.time() partitioner = Partitioner(self._dist_context, rank) - dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( - serial_main_program, serial_startup_program, params_grads) - self._logger.info( + ( + dist_main_prog, + dist_startup_prog, + dist_params_grads, + ) = partitioner.partition( + serial_main_program, serial_startup_program, params_grads + ) + self._logger.debug( "within parallel partitioner time: {}, mode {}".format( - time.time() - time0, self._mode)) + time.time() - time0, self._mode + ) + ) # Generate optimizer time0 = time.time() - self._generate_optimizer(dist_main_prog, dist_startup_prog, - serial_optimizer, dist_params_grads) - self._logger.info( + self._generate_optimizer( + dist_main_prog, + dist_startup_prog, + serial_optimizer, + dist_params_grads, + ) + self._logger.debug( "within parallel optimizer time: {}, mode {}".format( - time.time() - time0, self._mode)) + time.time() - time0, self._mode + ) + ) # Do reshard process time0 = time.time() set_grad_var_shape(dist_main_prog, self._dist_context) - resharder = Resharder(dist_main_prog, dist_startup_prog, rank, - self._dist_context, dist_params_grads) + resharder = Resharder( + dist_main_prog, + dist_startup_prog, + rank, + self._dist_context, + dist_params_grads, + ) resharder.reshard() - self._logger.info( + self._logger.debug( "within parallel reshard time: {}, mode {}".format( - time.time() - time0, self._mode)) + time.time() - time0, self._mode + ) + ) # Apply post optimization passes time0 = time.time() - self._apply_post_optimization(dist_main_prog, dist_startup_prog, - rank, dist_params_grads) - self._logger.info( - "within parallel apply_post_optimization time: {}, mode {}". 
- format(time.time() - time0, self._mode)) + self._apply_post_optimization( + dist_main_prog, dist_startup_prog, rank, dist_params_grads + ) + self._logger.debug( + "within parallel apply_post_optimization time: {}, mode {}".format( + time.time() - time0, self._mode + ) + ) else: # Apply pre optimization passes - # self._apply_pre_optimization(serial_main_program, - # serial_startup_program, None, None, - # None) + time0 = time.time() + self._apply_pre_optimization( + serial_main_program, serial_startup_program, None, None, None + ) + self._logger.debug( + "within parallel apply_pre_optimization time: {}, mode {}".format( + time.time() - time0, self._mode + ) + ) # Do logical partition time0 = time.time() partitioner = Partitioner(self._dist_context, rank) - dist_main_prog, dist_startup_prog, dist_params_grads = partitioner.partition( - serial_main_program, serial_startup_program, []) + ( + dist_main_prog, + dist_startup_prog, + dist_params_grads, + ) = partitioner.partition( + serial_main_program, serial_startup_program, [] + ) # Do reshard process - self._logger.info( + self._logger.debug( "within parallel partitioner time: {}, mode {}".format( - time.time() - time0, self._mode)) + time.time() - time0, self._mode + ) + ) time0 = time.time() - resharder = Resharder(dist_main_prog, dist_startup_prog, rank, - self._dist_context, [], 1) + resharder = Resharder( + dist_main_prog, + dist_startup_prog, + rank, + self._dist_context, + [], + 1, + ) resharder.reshard() - self._logger.info( + self._logger.debug( "within parallel reshard time: {}, mode {}".format( - time.time() - time0, self._mode)) + time.time() - time0, self._mode + ) + ) + # Apply post optimization passes + time0 = time.time() + self._apply_post_optimization( + dist_main_prog, dist_startup_prog, rank, dist_params_grads + ) + self._logger.debug( + "within parallel apply_post_optimization time: {}, mode {}".format( + time.time() - time0, self._mode + ) + ) # Clone program for test if self._mode != 'train': + pipeline_opt = dist_main_prog._pipeline_opt dist_main_prog = dist_main_prog.clone(for_test=True) dist_startup_prog = dist_startup_prog.clone(for_test=True) + dist_main_prog._pipeline_opt = pipeline_opt # Store the distributed programs for further usages self._dist_context.dist_main_programs[rank] = dist_main_prog @@ -136,85 +194,88 @@ def parallel(self, rank): def _generate_backward(self, main_program, startup_program, loss): with program_guard(main_program, startup_program): params_grads = append_backward( - loss, distop_context=self._dist_context.dist_op_context) + loss, distop_context=self._dist_context.dist_op_context + ) self._completer.complete_backward_annotation(main_program) self._dist_context.block_state.parse_backward_blocks(main_program) return params_grads - def _generate_optimizer(self, main_program, startup_program, optimizer, - params_grads): + def _generate_optimizer( + self, main_program, startup_program, optimizer, params_grads + ): # NOTE: `apply_gradients` will add an Accumulator for a parameter only once, # but optimizer will be called repeatedly in re-launch, so optimizer need to be copied. 
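parallelizer_v2 now reports per-phase wall time at DEBUG level instead of INFO. The timing pattern, extracted into an illustrative timed_phase helper (the real code inlines it around each phase):

import logging
import time

logger = logging.getLogger("auto_parallel")
logging.basicConfig(level=logging.DEBUG)

def timed_phase(name, mode, fn, *args, **kwargs):
    """Run one parallelization phase and report its wall time at DEBUG level,
    matching the "within parallel <phase> time: ..., mode ..." messages."""
    time0 = time.time()
    result = fn(*args, **kwargs)
    logger.debug(
        "within parallel {} time: {}, mode {}".format(name, time.time() - time0, mode)
    )
    return result

# Example with a stand-in phase.
timed_phase("partitioner", "train", lambda: sum(range(1000)))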
- if self._dist_context._dygraph_mode: - paddle.disable_static() - optimizer = copy.deepcopy(optimizer) - paddle.enable_static() - else: - optimizer = copy.deepcopy(optimizer) - self._dist_context._lr_optimizer = optimizer + optimizer = copy.deepcopy(optimizer) + self._dist_context._serial_optimizer = optimizer with program_guard(main_program, startup_program): with unique_name.guard("opt_"): optimizer_ops = optimizer.apply_gradients(params_grads) self._completer.complete_update_annotation(main_program) return optimizer_ops - def _apply_pre_optimization(self, main_program, startup_program, loss, - optimizer, params_grads): + def _apply_pre_optimization( + self, main_program, startup_program, loss, optimizer, params_grads + ): if self._strategy is None: return # apply quantization pass # The pass can be applied when mode must be 'train' - if self._mode == 'train' and self._strategy.qat: - config = copy.deepcopy(self._strategy.qat_configs) + if self._mode == 'train' and self._strategy.qat.enable: + config = copy.deepcopy(self._strategy.qat.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads auto_parallel_quantization_pass = new_pass( - "auto_parallel_quantization", config) - auto_parallel_quantization_pass.apply([main_program], - [startup_program], - self._pass_context) + "auto_parallel_quantization", config + ) + auto_parallel_quantization_pass.apply( + [main_program], [startup_program], self._pass_context + ) main_program = self._pass_context.get_attr("main_program") startup_program = self._pass_context.get_attr("startup_program") params_grads = self._pass_context.get_attr("params_grads") - # apply amp pass - # FIXME we disenable amp for eval since it has a little bug with - # eval program and which will be fixed in future - if self._mode == 'train' and self._strategy.amp: - config = copy.deepcopy(self._strategy.amp_configs) + # apply amp pass on train/eval/predict + if self._strategy.amp.enable: + config = copy.deepcopy(self._strategy.amp.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads config["loss"] = loss - config["input_data"] = self._dist_context.serial_feed_vars["inputs"] \ + config["input_data"] = ( + self._dist_context.serial_feed_vars["inputs"] + self._dist_context.serial_feed_vars["labels"] + ) if config["use_pure_fp16"]: config["base_opt"] = optimizer auto_parallel_fp16_pass = new_pass("auto_parallel_fp16", config) - auto_parallel_fp16_pass.apply([main_program], [startup_program], - self._pass_context) + auto_parallel_fp16_pass.apply( + [main_program], [startup_program], self._pass_context + ) else: auto_parallel_amp_pass = new_pass("auto_parallel_amp", config) - auto_parallel_amp_pass.apply([main_program], [startup_program], - self._pass_context) + auto_parallel_amp_pass.apply( + [main_program], [startup_program], self._pass_context + ) # apply recompute pass # recompute is then train-only optimization - if self._mode == "train" and self._strategy.recompute: - config = copy.deepcopy(self._strategy.recompute_configs) + if self._mode == "train" and self._strategy.recompute.enable: + config = copy.deepcopy(self._strategy.recompute.to_dict()) config["dist_context"] = self._dist_context config["no_grad_set"] = None config["loss"] = loss - auto_parallel_recompute_pass = new_pass("auto_parallel_recompute", - config) - auto_parallel_recompute_pass.apply([main_program], - [startup_program], - self._pass_context) + auto_parallel_recompute_pass = new_pass( + "auto_parallel_recompute", config + ) + 
auto_parallel_recompute_pass.apply( + [main_program], [startup_program], self._pass_context + ) return main_program, startup_program, params_grads - def _apply_post_optimization(self, main_program, startup_program, rank, - params_grads): + def _apply_post_optimization( + self, main_program, startup_program, rank, params_grads + ): if self._strategy is None: return @@ -222,38 +283,61 @@ def _apply_post_optimization(self, main_program, startup_program, rank, config = {} config["dist_context"] = self._dist_context config["global_rank"] = rank + config["use_sharding"] = self._strategy.sharding.enable dp_pass = new_pass("auto_parallel_data_parallel_optimization", config) dp_pass.apply([main_program], [startup_program], self._pass_context) - if self._strategy.sharding: - config = copy.deepcopy(self._strategy.sharding_configs) + if self._strategy.sharding.enable: + config = copy.deepcopy(self._strategy.sharding.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads config["global_rank"] = rank - auto_parallel_sharding_pass = new_pass("auto_parallel_sharding", - config) - auto_parallel_sharding_pass.apply([main_program], [startup_program], - self._pass_context) + auto_parallel_sharding_pass = new_pass( + "auto_parallel_sharding", config + ) + auto_parallel_sharding_pass.apply( + [main_program], [startup_program], self._pass_context + ) + params_grads = self._pass_context.get_attr("params_grads") # GradClip is train-only optimization - if self._mode == "train": - config = copy.deepcopy(self._strategy.sharding_configs) + config = copy.deepcopy(self._strategy.sharding.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads config["rank_id"] = rank - auto_parallel_clip_pass = new_pass("auto_parallel_grad_clip", - config) - auto_parallel_clip_pass.apply([main_program], [startup_program], - self._pass_context) + auto_parallel_clip_pass = new_pass( + "auto_parallel_grad_clip", config + ) + auto_parallel_clip_pass.apply( + [main_program], [startup_program], self._pass_context + ) + + if self._strategy.pipeline.enable: + self._strategy.gradient_merge.enable = True + self._strategy.gradient_merge.k_steps = ( + self._strategy.pipeline.accumulate_steps + ) + self._strategy.gradient_merge.avg = True # gradient_merge is then train-only optimization - if self._mode == "train" and self._strategy.gradient_merge: - config = copy.deepcopy(self._strategy.gradient_merge_configs) + if self._mode == "train" and self._strategy.gradient_merge.enable: + config = copy.deepcopy(self._strategy.gradient_merge.to_dict()) config["dist_context"] = self._dist_context config["params_grads"] = params_grads auto_parallel_gradient_merge_pass = new_pass( - "auto_parallel_gradient_merge_pass", config) - auto_parallel_gradient_merge_pass.apply([main_program], - [startup_program], - self._pass_context) + "auto_parallel_gradient_merge_pass", config + ) + auto_parallel_gradient_merge_pass.apply( + [main_program], [startup_program], self._pass_context + ) + + if self._strategy.pipeline.enable: + config = copy.deepcopy(self._strategy.pipeline.to_dict()) + config["dist_context"] = self._dist_context + auto_parallel_pipeline_pass = new_pass( + "auto_parallel_pipeline", config + ) + auto_parallel_pipeline_pass.apply( + [main_program], [startup_program], self._pass_context + ) diff --git a/python/paddle/distributed/auto_parallel/partitioner.py b/python/paddle/distributed/auto_parallel/partitioner.py index 3262505416b1d0..e12a111dd2a61e 100644 --- 
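In partitioner.py, __varname_not_in_block__ now holds a name fragment rather than an exact name, and the new assert loops over that list; with a single entry this amounts to a substring membership test, which an any()-based check states directly. A sketch of that check under this reading, where check_varname_not_in_block is an illustrative helper:

__varname_not_in_block__ = ["lod_tensor_blocking_queue"]

def check_varname_not_in_block(serial_input_varname):
    """Accept any variable whose name contains one of the known queue-name fragments."""
    assert any(
        fragment in serial_input_varname
        for fragment in __varname_not_in_block__
    ), "{} is not found".format(serial_input_varname)

check_varname_not_in_block("lod_tensor_blocking_queue_0")   # passes
check_varname_not_in_block("lod_tensor_blocking_queue_12")  # passes after the prefix change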
a/python/paddle/distributed/auto_parallel/partitioner.py +++ b/python/paddle/distributed/auto_parallel/partitioner.py @@ -28,7 +28,7 @@ from .utils import print_program_with_dist_attr, is_forward_op, is_backward_op, is_loss_op, is_optimize_op from .operators.common import BACKWARD_ONLY_DIST_OPS -__varname_not_in_block__ = ["lod_tensor_blocking_queue_0"] +__varname_not_in_block__ = ["lod_tensor_blocking_queue"] __not_shape_var_type__ = [ core.VarDesc.VarType.READER, core.VarDesc.VarType.STEP_SCOPES ] @@ -243,7 +243,9 @@ def partition_block(self, ref_block, target_block): target_block, serial_input_varname, new_varname) else: - assert serial_input_varname in __varname_not_in_block__ + for varname_not_in_block in __varname_not_in_block__: + assert varname_not_in_block in serial_input_varname, \ + "{} is not found".format(serial_input_varname) self._serial2dist_varname_mapping[ serial_input_varname] = new_varname diff --git a/python/paddle/distributed/auto_parallel/planner.py b/python/paddle/distributed/auto_parallel/planner.py index 701fd78a7e8b9b..0425424b0d7ae3 100755 --- a/python/paddle/distributed/auto_parallel/planner.py +++ b/python/paddle/distributed/auto_parallel/planner.py @@ -22,7 +22,7 @@ import numpy as np import paddle -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from .cost_model import estimate_cost from .dist_op import DistributedOperator from .process_group import _g_process_group_map diff --git a/python/paddle/distributed/auto_parallel/planner_v2.py b/python/paddle/distributed/auto_parallel/planner_v2.py index 90b840c5943bce..8e2c0c4617b0f8 100755 --- a/python/paddle/distributed/auto_parallel/planner_v2.py +++ b/python/paddle/distributed/auto_parallel/planner_v2.py @@ -14,9 +14,7 @@ from .completion import Completer from .dist_context import get_default_distributed_context -from .utils import print_program_with_dist_attr - -# from .tuner.parallel_tuner import ParallelTuner +from .tuner.parallel_tuner import ParallelTuner class Planner: @@ -39,20 +37,20 @@ def __init__(self, mode, dist_context): self._completer = Completer(self._dist_context) self._strategy = dist_context.strategy - # if self._strategy.auto_search: - # self._parallel_tuner = ParallelTuner( - # self._dist_context, mode=self._mode) + # set parallel tuner for auto search + if self._strategy.auto_mode == "full": + self._parallel_tuner = ParallelTuner(self._dist_context, + mode=self._mode) @property def completer(self): return self._completer def plan(self): - self._completer.complete_forward_annotation() - # if self._strategy.auto_search: - # self._parallel_tuner.tune() - # else: - # self._completer.complete_forward_annotation() + if self._strategy.auto_mode == "full": + self._parallel_tuner.tune() + else: + self._completer.complete_forward_annotation() # parse forward sub block self._dist_context.block_state.parse_forward_blocks( self._dist_context.serial_main_program) diff --git a/python/paddle/distributed/auto_parallel/process_group.py b/python/paddle/distributed/auto_parallel/process_group.py index 5b0d5e286ff779..3cb0f5c181d070 100644 --- a/python/paddle/distributed/auto_parallel/process_group.py +++ b/python/paddle/distributed/auto_parallel/process_group.py @@ -31,10 +31,11 @@ def get_all_process_groups(): def get_process_group(group_id, g_process_group_map=None): global _g_process_group_map - return _g_process_group_map.get( - group_id, - None) if g_process_group_map is None else g_process_group_map.get( - group_id, None) + return ( + 
_g_process_group_map.get(group_id, None) + if g_process_group_map is None + else g_process_group_map.get(group_id, None) + ) def get_world_process_group(): @@ -45,23 +46,23 @@ def get_world_process_group(): def clear_all_process_groups(): global _g_process_group_map _g_process_group_map = {} - _g_process_group_map[0] = ProcessGroup(0, []) + _g_process_group_map[0] = ProcessGroup(1000, []) def new_process_group(ranks, group_id=None): global _g_process_group_map # A key constructed from ranks is used for avoiding duplication - new_key = ''.join(map(str, sorted(ranks))) + new_key = ''.join(map(str, ranks)) for pg_id, pg in _g_process_group_map.items(): - cur_key = ''.join(map(str, sorted(pg.ranks))) + cur_key = ''.join(map(str, pg.ranks)) if pg_id != 0 and new_key == cur_key: return pg # If not matching the existing one, construt a new process group num_groups = len(_g_process_group_map) # Note: our process group may interfere with the original implementation # so the created group id should start from the original _new_ring_id() - if group_id == None: - group_id = _new_ring_id() + num_groups + 1 + if group_id is None: + group_id = _new_ring_id() + num_groups + 1000 new_pg = ProcessGroup(group_id, ranks) _g_process_group_map[group_id] = new_pg @@ -75,14 +76,15 @@ def new_process_group(ranks, group_id=None): # the instantiation process in a more general way. In the future, the process group may # handle the communication implementation choice. class ProcessGroup: - def __init__(self, group_id, ranks): - if group_id == 0 and get_process_group(0) is not None: - assert group_id != 0, "Process group id 0 is reserved for all ranks." + if group_id == 1000 and get_process_group(0) is not None: + assert ( + group_id != 1000 + ), "Process group id 1000 is reserved for all ranks." 
self._group_id = group_id - self._ranks = sorted(ranks) + self._ranks = ranks # Add the current ranks into group 0 - if group_id != 0: + if group_id != 1000: global _g_process_group_map _g_process_group_map[0].add_ranks(ranks) self._is_instantiate = False @@ -103,17 +105,19 @@ def add_ranks(self, new_ranks): if set(new_ranks) <= set(self.ranks): return else: - assert self.is_instantiate() == False, \ - "Cannot add new ranks after instantiating the process group" + assert ( + self.is_instantiate() == False + ), "Cannot add new ranks after instantiating the process group" self._ranks.extend(new_ranks) - self._ranks = sorted(list(set(self.ranks))) + self._ranks = list(set(self.ranks)) def local_rank(self, global_rank): if global_rank in self.ranks: return self.ranks.index(global_rank) else: - assert False, \ - "Rank {} doesn't belong to this group".format(global_rank) + assert False, "Rank {} doesn't belong to this group".format( + global_rank + ) def is_instantiate(self): return self._is_instantiate @@ -137,24 +141,34 @@ def instantiate(self): if core.is_compiled_with_cuda(): place = core.CUDAPlace(genv.device_id) - core.NCCLParallelContext(strategy, - place).init_with_ring_id(ring_id) + core.NCCLParallelContext(strategy, place).init_with_ring_id( + ring_id + ) else: - assert False, ("No CUDA device found") + assert False, "No CUDA device found" # TODO(shenliang03): This is a temporary solution to solve the problem of # hang caused by cross-creation of new_group paddle.disable_static() _enable_legacy_dygraph() - paddle.set_device('gpu:%d' % - paddle.distributed.ParallelEnv().dev_id) - tmp = paddle.to_tensor( - [1], dtype="int32") if _non_static_mode() else fill_constant( - [0], dtype="int32", value="1") - paddle.distributed.all_reduce(tmp, use_calc_stream=True, group=self) + paddle.set_device( + 'gpu:%d' % paddle.distributed.ParallelEnv().dev_id + ) + tmp = ( + paddle.to_tensor([1], dtype="int32") + if _non_static_mode() + else fill_constant([0], dtype="int32", value="1") + ) + paddle.distributed.all_reduce(tmp, sync_op=True, group=self) paddle.distributed.wait(tmp, group=self) + + # TODO(shenliang03) AlltoAll create communicator + alltoall_tmp = paddle.empty(shape=[self.nranks, self.nranks], dtype="int32") + out = paddle._legacy_C_ops.alltoall(alltoall_tmp, + 'use_calc_stream', True, + 'ring_id', ring_id) + paddle.device.cuda.synchronize() paddle.enable_static() - self._is_instantiate = True def is_member(self): @@ -172,7 +186,8 @@ def __ne__(self, other): def __str__(self): string = "id: {}, nranks: {}, ranks: {}.".format( - self.id, self.nranks, ", ".join(map(str, self.ranks))) + self.id, self.nranks, ", ".join(map(str, self.ranks)) + ) return string def __hash__(self): @@ -182,4 +197,4 @@ def __hash__(self): # Note that Process group 0 is reserved for representing all ranks. # At the beginning, group 0 is empty and new ranks will be added automatically. _g_process_group_map = OrderedDict() -_g_process_group_map[0] = ProcessGroup(0, []) +_g_process_group_map[0] = ProcessGroup(1000, []) diff --git a/python/paddle/distributed/auto_parallel/process_mesh.py b/python/paddle/distributed/auto_parallel/process_mesh.py index f751087e29eb0d..72dc9043cabd6a 100644 --- a/python/paddle/distributed/auto_parallel/process_mesh.py +++ b/python/paddle/distributed/auto_parallel/process_mesh.py @@ -12,122 +12,226 @@ # See the License for the specific language governing permissions and # limitations under the License. 
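The world process group now reserves id 1000 instead of 0, new group ids are offset by 1000, and groups are deduplicated by their rank sequence (which, without the former sorting, is order-sensitive). A toy, stdlib-only registry mirroring those rules; it is a sketch, not Paddle's ProcessGroup:

from collections import OrderedDict

WORLD_GROUP_ID = 1000  # id now reserved for the "all ranks" group (was 0)

class ToyProcessGroup:
    def __init__(self, group_id, ranks):
        self.id = group_id
        self.ranks = list(ranks)

    def add_ranks(self, new_ranks):
        # Only ranks not already present are appended; duplicates are dropped.
        if set(new_ranks) <= set(self.ranks):
            return
        self.ranks = list(dict.fromkeys(self.ranks + list(new_ranks)))

_group_map = OrderedDict()
_group_map[0] = ToyProcessGroup(WORLD_GROUP_ID, [])

def toy_new_process_group(ranks, next_ring_id=0):
    # Groups are deduplicated by the rank sequence; after the change above,
    # rank order is part of the key because the ranks are no longer sorted.
    key = ''.join(map(str, ranks))
    for gid, pg in _group_map.items():
        if gid != 0 and key == ''.join(map(str, pg.ranks)):
            return pg
    group_id = next_ring_id + len(_group_map) + 1000
    pg = ToyProcessGroup(group_id, ranks)
    _group_map[group_id] = pg
    _group_map[0].add_ranks(ranks)   # every new group's ranks also join the world group
    return pg

pg = toy_new_process_group([1, 0])
assert toy_new_process_group([1, 0]) is pg
assert _group_map[0].ranks == [1, 0]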
-import numpy +import numpy as np import copy +import paddle +# Use to store the previous and current process mesh +_g_previous_process_mesh = None +_g_current_process_mesh = None -def _get_nested_list_shape(nested_list): - """ - Get the shape of a nested_list. - """ - result = [] - while isinstance(nested_list, list): - result.append(len(nested_list)) - nested_list = nested_list[0] - return result +def get_current_process_mesh(): + global _g_current_process_mesh + return _g_current_process_mesh -def _flatten_nested_list(nested_list): - """ - Get a list of all items in a nested_list. - Ref: https://stackoverflow.com/questions/952914/how-to-make-a-flat-list-out-of-a-list-of-lists - """ - result = numpy.array(nested_list).flatten().tolist() - return result + +def set_current_process_mesh(process_mesh): + global _g_previous_process_mesh + global _g_current_process_mesh + _g_previous_process_mesh = _g_current_process_mesh + _g_current_process_mesh = process_mesh + + +def reset_current_process_mesh(): + global _g_previous_process_mesh + global _g_current_process_mesh + _g_current_process_mesh = _g_previous_process_mesh class ProcessMesh(object): - r""" - The class `Processmesh` describes the topology of logical processes. - A mesh is an N-dimensional array. The shape of the N-dimensional - array represents the topology of logical processes and every - element of the N-dimensional array represent a logical process. For - example, the 2-dimensional array [[2, 4, 5], [0, 1, 3]] - illustrates six logical processes organized as the topology [2, 3], - i.e., the shape of the 2-dimensional array. With the above topology, - there are two parallel groups, where the first parallel group has a - parallel degree of 2 and the second one has a parallel degree of 3. - And the first logical process is the one with id=2. + """ + The `Processmesh` object describes the topology of the used processes. Args: - mesh (list): an N-dimensional array (nested list) describes the toplogy - of logical processes. The shape of the N-dimensional array - represents the topology of logical processes and every - element of the N-dimensional array represents a logical process. - - Returns: - None - - Raises: - ValueError: If `mesh` is not an instance of list. + mesh (list|numpy.array): an n-dimensional array describes the toplogy + of the processes. + dim_names (list, optional): the i-th element of this list gives the name of the + i-th dimension of the mesh. Examples: .. 
code-block:: python import paddle - import paddle.distributed as dist - - paddle.enable_static() - - mesh = dist.ProcessMesh([[2, 4, 5], [0, 1, 3]]) - assert mesh.topology == [2, 3] - assert mesh.processes == [2, 4, 5, 0, 1, 3] - """ + mesh = auto.ProcessMesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"]) + assert mesh.shape == [2, 3] + assert mesh.processe_ids == [2, 4, 5, 0, 1, 3] - def __init__(self, mesh): - if mesh is None or not isinstance(mesh, list): - raise ValueError('mesh must be an instance of list.') - - processes = _flatten_nested_list(mesh) - - assert all(isinstance(p, int) for p in processes), \ - ("All elements of mesh must be integer") - - assert min(processes) >= 0, ('All elements of mesh must be >= 0.') - - unique_processes = set(processes) - assert len(unique_processes) == len(processes), ( - 'All elements of mesh must be unique.') + """ - self._topology = _get_nested_list_shape(mesh) - self._processes = processes + def __init__(self, mesh=None, dim_names=None, shape=None, process_ids=None): + # Use shape and process_ids just for compatibility + # Users should not use these directly + if mesh is None: + assert shape is not None + assert process_ids is not None + mesh = np.array(process_ids).reshape(shape) + + if not isinstance(mesh, list) and not isinstance(mesh, np.ndarray): + raise ValueError( + 'The mesh must be an instance of list or np.ndarray.' + ) + if isinstance(mesh, list): + mesh = np.array(mesh) + + self._mesh = mesh + self._shape = list(self._mesh.shape) + self._process_ids = self._mesh.flatten().tolist() + + assert all( + isinstance(p, int) for p in self._process_ids + ), "All elements of the mesh must be integer" + assert ( + min(self._process_ids) >= 0 + ), 'All elements of the mesh must be >= 0.' + unique_process_ids = set(self._process_ids) + assert len(unique_process_ids) == len( + self._process_ids + ), 'All elements of the mesh must be unique.' + + if dim_names is not None: + assert len(dim_names) == len( + self._shape + ), "The length of dims_names must be same as the shape of the mesh." + self._dim_names = copy.deepcopy(dim_names) + else: + self._dim_names = ["d" + str(i) for i in range(len(self._shape))] + unique_dim_names = set(self._dim_names) + assert len(unique_dim_names) == len( + self._dim_names + ), 'All dim_names {} must be unique.'.format(dim_names) + + # # Store all process meshes + # from .dist_context import get_default_distributed_context + # default_dist_cxt = get_default_distributed_context() + # default_dist_cxt.add_process_mesh(self) - # Store all process meshes - from .dist_context import get_default_distributed_context - default_dist_cxt = get_default_distributed_context() - default_dist_cxt.add_process_mesh(self) # Add new processes to process group 0 from .process_group import get_process_group + pg0 = get_process_group(0) pg0.add_ranks(self.processes) @property - def topology(self): - r""" - Get the topology of logical processes belonging to this ProcessMesh. - This is the shape of `mesh` used to initialized this ProcessMesh. + def shape(self): + """ + Get the shape of this ProcessMesh. """ - return self._topology + return self._shape @property - def processes(self): - r""" - Get a list of all processes belonging to this ProcessMesh. + def process_ids(self): + """ + Get the process ids belonging to this ProcessMesh. + """ + return self._process_ids + + @property + def dim_names(self): """ - return self._processes + Get the dimension names of this ProcessMesh. 
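ProcessMesh now accepts either a nested list / ndarray or a shape plus process_ids, flattens the mesh, validates that the ids are unique non-negative integers, and defaults dim_names to d0, d1, ... A NumPy sketch of that input handling, where normalize_mesh is an illustrative helper rather than a method of the class:

import numpy as np

def normalize_mesh(mesh=None, dim_names=None, shape=None, process_ids=None):
    """Reproduce the ProcessMesh input handling: build the ndarray, derive
    shape/process_ids, validate them, and default dim_names to d0, d1, ..."""
    if mesh is None:
        assert shape is not None and process_ids is not None
        mesh = np.array(process_ids).reshape(shape)
    if isinstance(mesh, list):
        mesh = np.array(mesh)
    shape = list(mesh.shape)
    process_ids = mesh.flatten().tolist()
    assert all(isinstance(p, int) for p in process_ids)
    assert min(process_ids) >= 0
    assert len(set(process_ids)) == len(process_ids), "process ids must be unique"
    if dim_names is None:
        dim_names = ["d" + str(i) for i in range(len(shape))]
    assert len(dim_names) == len(shape)
    assert len(set(dim_names)) == len(dim_names)
    return mesh, shape, process_ids, dim_names

mesh, shape, ids, names = normalize_mesh([[2, 4, 5], [0, 1, 3]], dim_names=["x", "y"])
assert shape == [2, 3] and ids == [2, 4, 5, 0, 1, 3] and names == ["x", "y"]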
+ """ + return self._dim_names @property def ndim(self): - r""" - Get the number of dimension of ProcessMesh. """ - return len(self._topology) + Get the number of dimension of this ProcessMesh. + """ + return len(self._shape) + + @property + def mesh(self): + """ + Get the underlying mesh of ProcessMesh. + """ + return self._mesh + + @property + def topology(self): + return self._shape + + @property + def processes(self): + return self._process_ids + + def __getitem__(self, index): + if isinstance(index, tuple): + new_dim_names = [] + for i, item in enumerate(index): + if isinstance(item, slice): + new_dim_names.append(self._dim_names[i]) + new_mesh = self._mesh[index] + if new_mesh.shape: + return ProcessMesh(new_mesh, new_dim_names) + else: + # Wrap a scalar into a list but without dim_names + return ProcessMesh([new_mesh]) + elif isinstance(index, slice): + new_mesh = self._mesh[index] + new_dim_names = self._dim_names + return ProcessMesh(new_mesh, new_dim_names) + else: + new_mesh = self._mesh[index] + new_dim_names = self._dim_names[1:] + if new_mesh.shape: + return ProcessMesh(new_mesh, new_dim_names) + else: + return ProcessMesh([new_mesh]) + + def __enter__(self): + set_current_process_mesh(self) + default_prog = paddle.fluid.default_main_program() + cur_block = default_prog.current_block() + self._old_var_names = list(cur_block.vars.keys()) + self._old_op_size = len(cur_block.ops) + + def __exit__(self, exc_type, exc_value, exc_traceback): + from .dist_tensor import DistributedTensor + from .dist_op import DistributedOperator + + default_prog = paddle.fluid.default_main_program() + cur_block = default_prog.current_block() + new_var_names = list(cur_block.vars.keys()) + new_op_size = len(cur_block.ops) + from .dist_context import get_default_distributed_context + + default_dist_ctx = get_default_distributed_context() + for name in new_var_names: + if name not in self._old_var_names: + tensor = cur_block.vars[name] + dist_tensor = default_dist_ctx.get_dist_tensor_for_program( + tensor + ) + if dist_tensor is None: + dist_tensor = DistributedTensor( + cur_block.vars[name], {"process_mesh": self} + ) + dist_tensor.dist_attr.mark_annotated("process_mesh") + default_dist_ctx.add_dist_tensor_for_program(dist_tensor) + else: + if dist_tensor.dist_attr.process_mesh is None: + dist_tensor.dist_attr.process_mesh = self + dist_tensor.dist_attr.mark_annotated("process_mesh") + + for idx in range(self._old_op_size, new_op_size): + op = cur_block.ops[idx] + dist_op = default_dist_ctx.get_dist_op_for_program(op) + if dist_op is None: + dist_op = DistributedOperator(op, {"process_mesh": self}) + dist_op.dist_attr.mark_annotated("process_mesh") + default_dist_ctx.add_dist_op_for_program(dist_op) + else: + if dist_op.dist_attr.process_mesh is None: + dist_op.dist_attr.process_mesh = self + dist_op.dist_attr.mark_annotated("process_mesh") + reset_current_process_mesh() def __eq__(self, other): if not isinstance(other, ProcessMesh): return False - if self.topology != other.topology or self.processes != other.processes: + if self.shape != other.shape or self.process_ids != other.process_ids: return False return True @@ -135,6 +239,7 @@ def __ne__(self, other): return not self.__eq__(other) def __str__(self): - str = "shape {} and process group {}".format(self.topology, - self.processes) + str = "shape {}, process_ids {}, dim_nams {}".format( + self.shape, self.process_ids, self.dim_names + ) return str diff --git a/python/paddle/distributed/auto_parallel/process_mesh_v2.py 
b/python/paddle/distributed/auto_parallel/process_mesh_v2.py index 08a391e51eb9ea..f3ce83e8bc457d 100644 --- a/python/paddle/distributed/auto_parallel/process_mesh_v2.py +++ b/python/paddle/distributed/auto_parallel/process_mesh_v2.py @@ -81,54 +81,57 @@ def mesh(self): return self._mesh -# def compute_compatible_process_meshes(process_meshes): -# """Compute the compatible process mesh given a list of process meshes.""" -# if not process_meshes: -# return None - -# def _compute_compatible_two_process_meshes(pm1, pm2): -# if pm1 is None: -# return True, pm2 -# if pm2 is None: -# return True, pm1 -# if pm1 == pm2: -# return True, pm1 -# if pm1.device_mesh != pm2.device_mesh: -# return False, None -# if pm1.process_ids == pm2.process_ids: -# if len(pm1.shape) >= len(pm2.shape): -# return True, pm1 -# else: -# return True, pm2 -# process_set1 = set(pm1.process_ids) -# process_set2 = set(pm2.process_ids) -# if process_set1.issubset(process_set2): -# return True, pm2 -# if process_set2.issubset(process_set1): -# return True, pm1 -# return False, None - -# compatible_result = None -# for process_mesh in process_meshes: -# compatible, compatible_result = _compute_compatible_two_process_meshes( -# compatible_result, process_mesh) -# if not compatible: -# return None -# return ProcessMesh(compatible_result.mesh, compatible_result.dim_names) - -# def merge_process_meshes(process_meshes): -# """Merge a list of process meshes.""" -# merged_process_mesh = None -# merged_process_ids = set() -# device_type = "" -# for process_mesh in process_meshes: -# if process_mesh is not None: -# process_ids = set(process_mesh.process_ids) -# if not device_type: -# device_type = process_mesh.device_type -# assert device_type != process_mesh.device_type, \ -# "All process meshes must have the same device_type." 
-# merged_process_ids.union(process_ids) -# if len(merged_process_ids) != 0: -# merged_process_mesh = ProcessMesh(list(merged_process_ids)) -# return merged_process_mesh +def compute_compatible_process_mesh(process_meshes): + """Compute the compatible process mesh given a list of process meshes.""" + if not process_meshes: + return None + + def _compute_compatible_of_two_process_meshes(pm1, pm2): + if pm1 is None: + return True, pm2 + if pm2 is None: + return True, pm1 + if pm1 == pm2: + return True, pm1 + if pm1.process_ids == pm2.process_ids: + if len(pm1.shape) >= len(pm2.shape): + return True, pm1 + else: + return True, pm2 + process_set1 = set(pm1.process_ids) + process_set2 = set(pm2.process_ids) + if process_set1.issubset(process_set2): + return True, pm2 + if process_set2.issubset(process_set1): + return True, pm1 + return False, None + + compatible_result = None + for process_mesh in process_meshes: + compatible, compatible_result = _compute_compatible_of_two_process_meshes( + compatible_result, process_mesh) + if not compatible: + return None + if compatible_result.empty(): + return None + if isinstance(compatible_result, core.ProcessMesh): + mesh = np.array(compatible_result.process_ids).reshape( + compatible_result.shape) + return ProcessMesh(mesh, compatible_result.dim_names) + elif isinstance(compatible_result, ProcessMesh): + return ProcessMesh(compatible_result.mesh, compatible_result.dim_names) + else: + raise ValueError("Unrecognized ProcessMesh.") + + +def merge_process_mesh(process_meshes): + """Merge a list of process meshes.""" + merged_process_mesh = None + merged_process_ids = set() + for process_mesh in process_meshes: + if process_mesh is not None: + process_ids = set(process_mesh.process_ids) + merged_process_ids = merged_process_ids.union(process_ids) + if len(merged_process_ids) != 0: + merged_process_mesh = ProcessMesh(list(merged_process_ids)) + return merged_process_mesh diff --git a/python/paddle/distributed/auto_parallel/reshard.py b/python/paddle/distributed/auto_parallel/reshard.py index 6da39b063efa77..5cd7018205774a 100644 --- a/python/paddle/distributed/auto_parallel/reshard.py +++ b/python/paddle/distributed/auto_parallel/reshard.py @@ -24,7 +24,10 @@ import paddle.fluid.layers.utils as utils from ..collective import _get_global_env from .dist_context import DistributedContext -from .dist_attribute import OperatorDistributedAttribute, TensorDistributedAttribute +from .dist_attribute import ( + OperatorDistributedAttribute, + TensorDistributedAttribute, +) from .process_group import new_process_group, ProcessGroup, _g_process_group_map from .cost import build_comm_desc, CommContext from .cost import AllgatherOpCost, SendOpCost @@ -35,8 +38,13 @@ # NOTE: If op in _g_special_ops or _g_gradient_clip_ops, it will not be resharded. 
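compute_compatible_process_mesh resolves two meshes by equality or subset of their process ids (the shape tie-break for identical id lists is omitted here), and merge_process_mesh keeps the union of all ids. A simplified sketch over plain id lists rather than ProcessMesh objects:

def compatible_of_two(ids1, ids2):
    """Pairwise rule from compute_compatible_process_mesh, restricted to the
    process-id sets: equal or subset meshes are compatible, otherwise not."""
    if ids1 is None:
        return True, ids2
    if ids2 is None:
        return True, ids1
    if ids1 == ids2:
        return True, ids1
    if set(ids1) <= set(ids2):
        return True, ids2
    if set(ids2) <= set(ids1):
        return True, ids1
    return False, None

def merge_ids(all_ids):
    """merge_process_mesh keeps the union of every mesh's process ids."""
    merged = set()
    for ids in all_ids:
        if ids is not None:
            merged = merged.union(ids)
    return sorted(merged) if merged else None

assert compatible_of_two([0, 1], [0, 1, 2, 3]) == (True, [0, 1, 2, 3])
assert compatible_of_two([0, 1], [2, 3]) == (False, None)
assert merge_ids([[0, 1], None, [2, 3]]) == [0, 1, 2, 3]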
_g_special_ops = ['check_finite_and_unscale', 'update_loss_scaling'] _g_gradient_clip_ops = [ - "sum", "sqrt", "fill_constant", "elementwise_max", "elementwise_div" + "sum", + "sqrt", + "fill_constant", + "elementwise_max", + "elementwise_div", ] +_g_subblock_ops = ["while", "conditional_block"] def get_var_with_recursion(var_name, block, program): @@ -45,10 +53,11 @@ def get_var_with_recursion(var_name, block, program): if var_name in block.vars: var = block.vars[var_name] else: - parent_block = program.blocks[block.parent_idx] - if var_name in parent_block.vars: - var = parent_block.vars[var_name] - assert var is not None + var = block._var_recursive(var_name) + # parent_block = program.blocks[block.parent_idx] + # if var_name in parent_block.vars: + # var = parent_block.vars[var_name] + assert var is not None, "{} is not found".format(var.name) return var @@ -265,20 +274,26 @@ class Inserter: def insert_cast_op(block, idx, tensor, op_role, tensor_type): # to avoid name conflict with framework new_var_name = paddle.fluid.unique_name.generate_with_ignorable_key( - ".".join(["cast@RESHARD", 'tmp'])) - out = block.create_var(name=new_var_name, - dtype=tensor_type, - type=tensor.type, - lod_level=tensor.lod_level) - block._insert_op(idx, - type='cast', - inputs={'X': [tensor]}, - outputs={'Out': [out]}, - attrs={ - 'in_dtype': tensor.dtype, - 'out_dtype': out.dtype, - 'op_role': op_role - }) + ".".join(["cast@RESHARD", 'tmp']) + ) + out = block.create_var( + name=new_var_name, + dtype=tensor_type, + type=tensor.type, + lod_level=tensor.lod_level, + ) + cast_op = block._insert_op( + idx, + type='cast', + inputs={'X': [tensor]}, + outputs={'Out': [out]}, + attrs={ + 'in_dtype': tensor.dtype, + 'out_dtype': out.dtype, + 'op_role': op_role, + }, + ) + cast_op._set_attr('op_namescope', "/auto_parallel/reshard") return out @staticmethod @@ -287,16 +302,19 @@ def insert_send_op(block, idx, tensor, src, dst, op_role): op_type = 'send_v2' # use pair comm group process_group = new_process_group([src, dst]) - block._insert_op(idx, - type=op_type, - inputs={'X': [tensor]}, - attrs={ - 'ring_id': process_group.id, - 'peer': process_group.ranks.index(dst), - 'use_calc_stream': True, - 'op_role': op_role, - 'dynamic_shape': True - }) + send_op = block._insert_op( + idx, + type=op_type, + inputs={'X': [tensor]}, + attrs={ + 'ring_id': process_group.id, + 'peer': process_group.ranks.index(dst), + 'use_calc_stream': True, + 'op_role': op_role, + 'dynamic_shape': False, + }, + ) + send_op._set_attr('op_namescope', "/auto_parallel/reshard") @staticmethod def insert_recv_op(block, idx, tensor, src, dst, op_role): @@ -304,40 +322,46 @@ def insert_recv_op(block, idx, tensor, src, dst, op_role): op_type = 'recv_v2' # use pair group process_group = new_process_group([src, dst]) - block._insert_op(idx, - type=op_type, - inputs={'X': [tensor]}, - outputs={'Out': [tensor]}, - attrs={ - 'ring_id': process_group.id, - 'peer': process_group.ranks.index(src), - 'out_shape': tensor.shape, - 'dtype': tensor.dtype, - 'use_calc_stream': True, - 'op_role': op_role, - 'dynamic_shape': True - }) + recv_op = block._insert_op( + idx, + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + attrs={ + 'ring_id': process_group.id, + 'peer': process_group.ranks.index(src), + 'out_shape': tensor.shape, + 'dtype': tensor.dtype, + 'use_calc_stream': True, + 'op_role': op_role, + 'dynamic_shape': False, + }, + ) + recv_op._set_attr('op_namescope', "/auto_parallel/reshard") @staticmethod def insert_reset_lod_op(block, idx, 
X, Y, op_role): """Insert reset_lod op into block at the given index.""" new_var_name = paddle.fluid.unique_name.generate_with_ignorable_key( - ".".join(["reset_lod@RESHARD", 'tmp'])) - reset_lod_out = block.create_var(name=new_var_name, - shape=X.shape, - type=X.type, - dtype=X.dtype, - lod_level=X.lod_level) - - block._insert_op(idx, - type="lod_reset", - inputs={ - 'X': X, - 'Y': Y - }, - outputs={'Out': reset_lod_out}, - attrs={'op_role': op_role}) + ".".join(["reset_lod@RESHARD", 'tmp']) + ) + reset_lod_out = block.create_var( + name=new_var_name, + shape=X.shape, + type=X.type, + dtype=X.dtype, + lod_level=X.lod_level, + ) + + reset_op = block._insert_op( + idx, + type="lod_reset", + inputs={'X': X, 'Y': Y}, + outputs={'Out': reset_lod_out}, + attrs={'op_role': op_role}, + ) + reset_op._set_attr('op_namescope', "/auto_parallel/reshard") return reset_lod_out @staticmethod @@ -352,23 +376,29 @@ def insert_concat_op(block, idx, tensors, axis, op_role): with paddle.static.program_guard(block.program): out = block.create_var( name=paddle.fluid.unique_name.generate_with_ignorable_key( - ".".join([helper.name, 'tmp'])), + ".".join([helper.name, 'tmp']) + ), dtype=tensors[0].dtype, shape=None, lod_level=tensors[0].lod_level, type=tensors[0].type, persistable=False, - stop_gradient=False) - block._insert_op(idx, - type='concat', - inputs=inputs, - outputs={'Out': [out]}, - attrs=attrs) + stop_gradient=False, + ) + concat_op = block._insert_op( + idx, + type='concat', + inputs=inputs, + outputs={'Out': [out]}, + attrs=attrs, + ) + concat_op._set_attr('op_namescope', "/auto_parallel/reshard") return out @staticmethod - def insert_slice_op(block, idx, tensor, starts, ends, axes, new_var_name, - op_role): + def insert_slice_op( + block, idx, tensor, starts, ends, axes, new_var_name, op_role + ): """Insert slice op into block at the given block.""" # This is a hack to insert split op to get slice tensor # 1. [128, 128] => [64, 128]: split @@ -383,19 +413,20 @@ def insert_slice_op(block, idx, tensor, starts, ends, axes, new_var_name, # use assign if len(diff_dims) == 0: - out = block.create_var(name=new_var_name, - dtype=tensor.dtype, - type=tensor.type, - shape=slice_shape, - lod_level=tensor.lod_level) + out = block.create_var( + name=new_var_name, + dtype=tensor.dtype, + type=tensor.type, + shape=slice_shape, + lod_level=tensor.lod_level, + ) inputs = {'X': [tensor]} outputs = {"Out": [out]} attrs = {"in_place": False} - block._insert_op(idx, - type="assign", - inputs=inputs, - outputs=outputs, - attrs=attrs) + slice_op = block._insert_op( + idx, type="assign", inputs=inputs, outputs=outputs, attrs=attrs + ) + slice_op._set_attr('op_namescope', "/auto_parallel/reshard") return out # use split once @@ -415,23 +446,28 @@ def insert_slice_op(block, idx, tensor, starts, ends, axes, new_var_name, new_shape.append(item // num_or_sections) with paddle.static.program_guard(block.program): outs = [ - block.create_var(name=paddle.fluid.unique_name. 
- generate_with_ignorable_key(".".join( - ['split@RESHARD', 'tmp'])), - dtype=tensor.dtype, - shape=None, - type=tensor.type, - persistable=False, - lod_level=tensor.lod_level, - stop_gradient=False) + block.create_var( + name=paddle.fluid.unique_name.generate_with_ignorable_key( + ".".join(['split@RESHARD', 'tmp']) + ), + dtype=tensor.dtype, + shape=None, + type=tensor.type, + persistable=False, + lod_level=tensor.lod_level, + stop_gradient=False, + ) for i in range(num_or_sections) ] out = outs[cur_idx] - op = block._insert_op(idx, - type="split", - inputs=inputs, - outputs={'Out': outs}, - attrs=attrs) + split_op = block._insert_op( + idx, + type="split", + inputs=inputs, + outputs={'Out': outs}, + attrs=attrs, + ) + split_op._set_attr('op_namescope', "/auto_parallel/reshard") return out # use slice @@ -443,18 +479,22 @@ def insert_slice_op(block, idx, tensor, starts, ends, axes, new_var_name, "starts": starts, "ends": ends, "infer_flags": infer_flags, - 'op_role': op_role + 'op_role': op_role, } - out = block.create_var(name=new_var_name, - dtype=tensor.dtype, - type=tensor.type, - lod_level=tensor.lod_level) - block._insert_op(idx, - type="slice", - inputs=inputs, - outputs={'Out': [out]}, - attrs=attrs) - + out = block.create_var( + name=new_var_name, + dtype=tensor.dtype, + type=tensor.type, + lod_level=tensor.lod_level, + ) + slice_op = block._insert_op( + idx, + type="slice", + inputs=inputs, + outputs={'Out': [out]}, + attrs=attrs, + ) + slice_op._set_attr('op_namescope', "/auto_parallel/reshard") return out @staticmethod @@ -474,19 +514,21 @@ def insert_split_op(block, idx, tensor, num_or_sections, op_role, axis=0): outs = [ block.create_var( name=paddle.fluid.unique_name.generate_with_ignorable_key( - ".".join([helper.name, 'tmp'])), + ".".join([helper.name, 'tmp']) + ), dtype=tensor.dtype, shape=None, lod_level=tensor.lod_level, type=tensor.type, persistable=False, - stop_gradient=False) for i in range(num_or_sections) + stop_gradient=False, + ) + for i in range(num_or_sections) ] - block._insert_op(idx, - type="split", - inputs=inputs, - outputs={'Out': outs}, - attrs=attrs) + split_op = block._insert_op( + idx, type="split", inputs=inputs, outputs={'Out': outs}, attrs=attrs + ) + split_op._set_attr('op_namescope', "/auto_parallel/reshard") return outs @staticmethod @@ -498,28 +540,32 @@ def insert_fill_constant_op(block, idx, op_role): with paddle.static.program_guard(block.program): out = block.create_var( name=paddle.fluid.unique_name.generate_with_ignorable_key( - ".".join([helper.name, 'tmp'])), + ".".join([helper.name, 'tmp']) + ), dtype=paddle.int64, shape=None, type=core.VarDesc.VarType.LOD_TENSOR, persistable=False, - stop_gradient=False) + stop_gradient=False, + ) inputs = {} attrs = {'force_cpu': False} attrs['str_value'] = str(int("1")) attrs['value'] = int("1") attrs['dtype'] = out.dtype attrs['op_role'] = op_role - utils.get_shape_tensor_inputs(inputs=inputs, - attrs=attrs, - shape=[0], - op_type='fill_constant') - block._insert_op(idx, - type='fill_constant', - inputs=inputs, - outputs={'Out': [out]}, - attrs=attrs) + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=[0], op_type='fill_constant' + ) + fillconstant_op = block._insert_op( + idx, + type='fill_constant', + inputs=inputs, + outputs={'Out': [out]}, + attrs=attrs, + ) out.stop_gradient = True + fillconstant_op._set_attr('op_namescope', "/auto_parallel/reshard") return out @staticmethod @@ -533,26 +579,32 @@ def insert_allgather_op(block, idx, tensor, ranks, op_role): if not 
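insert_slice_op picks one of three lowerings: an assign when the requested slice covers the whole tensor, a split when exactly one axis is cut into equal pieces, and a generic slice otherwise. The numerical effect of the three cases, sketched with NumPy instead of static-graph ops:

import numpy as np

x = np.arange(128 * 128).reshape(128, 128)

# Case handled with `assign`: the requested slice covers the whole tensor.
assert np.array_equal(x[0:128, 0:128], x)

# Case handled with `split`: one axis is cut into equal pieces and one piece is kept,
# e.g. [128, 128] -> [64, 128] as the comment in insert_slice_op describes.
halves = np.split(x, 2, axis=0)
assert np.array_equal(halves[1], x[64:128, :])

# General case handled with `slice`: arbitrary starts/ends on the chosen axes.
assert np.array_equal(x[10:20, :], np.take(x, range(10, 20), axis=0))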
group.is_instantiate(): # insert fill_constant op fill_constant_out = Inserter.insert_fill_constant_op( - block, idx, op_role) + block, idx, op_role + ) fill_constant_out.stop_gradient = True # insert c_allreduce_sum op - block._insert_op(idx + 1, - type="c_allreduce_sum", - inputs={'X': [fill_constant_out]}, - outputs={'Out': [fill_constant_out]}, - attrs={ - 'ring_id': 0, - 'use_calc_stream': True, - 'op_role': op_role - }) - + allreduce_op = block._insert_op( + idx + 1, + type="c_allreduce_sum", + inputs={'X': [fill_constant_out]}, + outputs={'Out': [fill_constant_out]}, + attrs={ + 'ring_id': 0, + 'use_calc_stream': True, + 'op_role': op_role, + }, + ) + allreduce_op._set_attr('op_namescope', "/auto_parallel/reshard") # insert c_sync_calc_stream op - block._insert_op(idx + 2, - type="c_sync_calc_stream", - inputs={'X': [fill_constant_out]}, - outputs={'Out': [fill_constant_out]}, - attrs={'op_role': op_role}) + sync_calc_op = block._insert_op( + idx + 2, + type="c_sync_calc_stream", + inputs={'X': [fill_constant_out]}, + outputs={'Out': [fill_constant_out]}, + attrs={'op_role': op_role}, + ) + sync_calc_op._set_attr('op_namescope', "/auto_parallel/reshard") idx_offset = 3 # insert c_allgather op @@ -562,36 +614,42 @@ def insert_allgather_op(block, idx, tensor, ranks, op_role): with paddle.static.program_guard(block.program): allgather_out = block.create_var( name=paddle.fluid.unique_name.generate_with_ignorable_key( - ".".join([helper.name, 'tmp'])), + ".".join([helper.name, 'tmp']) + ), dtype=tensor.dtype, shape=None, lod_level=tensor.lod_level, type=tensor.type, persistable=False, - stop_gradient=False) - block._insert_op(idx + idx_offset, - type=op_type, - inputs={'X': [tensor]}, - outputs={'Out': [allgather_out]}, - attrs={ - 'ring_id': group.id, - 'use_calc_stream': True, - 'nranks': group.nranks, - 'op_role': op_role - }) + stop_gradient=False, + ) + allgather_op = block._insert_op( + idx + idx_offset, + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [allgather_out]}, + attrs={ + 'ring_id': group.id, + 'use_calc_stream': True, + 'nranks': group.nranks, + 'op_role': op_role, + }, + ) + allgather_op._set_attr('op_namescope', "/auto_parallel/reshard") idx_offset += 1 # insert split op - split_out = Inserter.insert_split_op(block, idx + idx_offset, - allgather_out, group.nranks, - op_role) + split_out = Inserter.insert_split_op( + block, idx + idx_offset, allgather_out, group.nranks, op_role + ) idx_offset += 1 tensor_list.extend(split_out) return tensor_list, idx_offset @staticmethod - def concat_partitions_with_op(partition_tensor_list, tensor, - partition_index, block, idx, op_role): + def concat_partitions_with_op( + partition_tensor_list, tensor, partition_index, block, idx, op_role + ): """Concat the tensors and insert concat op.""" if not partition_tensor_list: partition_tensor_list.append((tensor, partition_index)) @@ -599,18 +657,42 @@ def concat_partitions_with_op(partition_tensor_list, tensor, i = 0 has_concat = False while i < len(partition_tensor_list): - concat_axis, first_order, new_partition = Resharder.compute_concat_info( - partition_tensor_list[i][1], partition_index) + ( + concat_axis, + first_order, + new_partition, + ) = Resharder.compute_concat_info( + partition_tensor_list[i][1], partition_index + ) if concat_axis != -1: has_concat = True - _ = Inserter.insert_concat_op(block, idx[0], [partition_tensor_list[i][0], tensor], concat_axis, op_role) \ - if first_order == 0 else \ - Inserter.insert_concat_op(block, idx[0], [tensor, 
partition_tensor_list[i][0]], concat_axis, op_role) + _ = ( + Inserter.insert_concat_op( + block, + idx[0], + [partition_tensor_list[i][0], tensor], + concat_axis, + op_role, + ) + if first_order == 0 + else Inserter.insert_concat_op( + block, + idx[0], + [tensor, partition_tensor_list[i][0]], + concat_axis, + op_role, + ) + ) partition_tensor_list.pop(i) idx[0] += 1 - Inserter.concat_partitions_with_op(partition_tensor_list, _, - new_partition, block, - idx, op_role) + Inserter.concat_partitions_with_op( + partition_tensor_list, + _, + new_partition, + block, + idx, + op_role, + ) break i += 1 if not has_concat: @@ -624,7 +706,9 @@ class Remover: def remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id): """Remove no need ops in the main program""" not_remove_op_ref = [ - "create_py_reader", "create_double_buffer_reader", "read" + "create_py_reader", + "create_double_buffer_reader", + "read", ] # NOTE: The nested sub block is not be supported now. @@ -648,7 +732,9 @@ def remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id): for var_name in op.output_arg_names: dim_list.extend( get_var_with_recursion( - var_name, block, auto_parallel_main_prog).shape) + var_name, block, auto_parallel_main_prog + ).shape + ) for i in range(idx, -1, -1): if ops[i].type == "create_py_reader": ops[i]._set_attr("shape_concat", dim_list) @@ -659,10 +745,13 @@ def remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id): if op.type == "c_sync_comm_stream": need_save = [] for var_name in op.input_arg_names: - process_mesh = dist_context.get_tensor_dist_attr_for_program( - get_var_with_recursion( - var_name, block, - auto_parallel_main_prog)).process_mesh + process_mesh = ( + dist_context.get_tensor_dist_attr_for_program( + get_var_with_recursion( + var_name, block, auto_parallel_main_prog + ) + ).process_mesh + ) if rank_id in process_mesh.processes: need_save.append(var_name) if not need_save: @@ -678,15 +767,20 @@ def remove_no_need_ops(auto_parallel_main_prog, dist_context, rank_id): op_dist_attr = dist_context.get_op_dist_attr_for_program(op) if op_dist_attr is not None: op_process_mesh = op_dist_attr.process_mesh - if rank_id not in op_process_mesh.processes and op.type not in not_remove_op_ref: + if ( + rank_id not in op_process_mesh.processes + and op.type not in not_remove_op_ref + ): remove_op_idx.append(idx) for idx in remove_op_idx[::-1]: - block._remove_op(idx) + block._remove_op(idx, sync=False) + block._sync_with_cpp() @staticmethod - def remove_no_need_vars(auto_parallel_main_prog, dist_params_grads, - feed_var_names): + def remove_no_need_vars( + auto_parallel_main_prog, dist_params_grads, feed_var_names + ): """Remove no need vars in the main program""" for block_idx, block in enumerate(auto_parallel_main_prog.blocks): remove_vars = set() @@ -709,7 +803,10 @@ def remove_no_need_vars(auto_parallel_main_prog, dist_params_grads, param_grad_map = {} for op in ops: if int(op.attr('op_role')) == int(OpRole.Optimize): - if "Param" in op.input_names and "Grad" in op.input_names: + if ( + "Param" in op.input_names + and "Grad" in op.input_names + ): param_name = op.input("Param")[0] grad_name = op.input("Grad")[0] param_grad_map[param_name] = grad_name @@ -728,7 +825,9 @@ def remove_no_need_vars(auto_parallel_main_prog, dist_params_grads, grad_name = dist_params_grads[idx][1].name if grad_name != param_grad_map[param_name]: dist_params_grads[idx] = ( - vars[param_name], vars[param_grad_map[param_name]]) + vars[param_name], + vars[param_grad_map[param_name]], + ) idx += 
1 for var in remove_vars: @@ -737,23 +836,28 @@ def remove_no_need_vars(auto_parallel_main_prog, dist_params_grads, block._remove_var(var) @staticmethod - def remove_no_need_in_main(auto_parallel_main_prog, dist_context, rank_id, - dist_params_grads): + def remove_no_need_in_main( + auto_parallel_main_prog, dist_context, rank_id, dist_params_grads + ): """Remove no need vars and ops in the main program.""" - Remover.remove_no_need_ops(auto_parallel_main_prog, dist_context, - rank_id) - Resharder.change_while_op_input_and_output(auto_parallel_main_prog, - dist_context) + Remover.remove_no_need_ops( + auto_parallel_main_prog, dist_context, rank_id + ) + Resharder.change_while_op_input_and_output( + auto_parallel_main_prog, dist_context + ) # 'feed_var_names' cannot be removed from auto_parallel_main_prog feed_var_names = [] for var in sum(list(dist_context.serial_feed_vars.values()), []): feed_var_names.append(var.name) - Remover.remove_no_need_vars(auto_parallel_main_prog, dist_params_grads, - feed_var_names) + Remover.remove_no_need_vars( + auto_parallel_main_prog, dist_params_grads, feed_var_names + ) @staticmethod - def remove_no_need_in_startup(auto_parallel_main_prog, - auto_parallel_startup_prog): + def remove_no_need_in_startup( + auto_parallel_main_prog, auto_parallel_startup_prog + ): """Remove no need vars and ops in the startup program.""" main_input_vars = set() main_ops = auto_parallel_main_prog.global_block().ops @@ -823,7 +927,8 @@ def remove_no_need_in_startup(auto_parallel_main_prog, if is_no_need_op: remove_op_idx.append(idx) for idx in remove_op_idx[::-1]: - startup_block._remove_op(idx) + startup_block._remove_op(idx, sync=False) + startup_block._sync_with_cpp() class Resharder: @@ -838,28 +943,43 @@ class Resharder: dist_params_grads (list): The list contains the tuple of param and grad. batch_size (int): The batch size. Default: None. 
""" + while_block_info = {} - def __init__(self, - auto_parallel_main_prog, - auto_parallel_startup_prog, - rank_id, - dist_context, - dist_params_grads, - batch_size=None): - assert isinstance(auto_parallel_main_prog, Program), "The type of auto_parallel_main_prog should be Program, " \ - "but got {}.".format(type(auto_parallel_main_prog)) + def __init__( + self, + auto_parallel_main_prog, + auto_parallel_startup_prog, + rank_id, + dist_context, + dist_params_grads, + batch_size=None, + ): + assert isinstance(auto_parallel_main_prog, Program), ( + "The type of auto_parallel_main_prog should be Program, " + "but got {}.".format(type(auto_parallel_main_prog)) + ) if auto_parallel_startup_prog is not None: - assert isinstance(auto_parallel_main_prog, Program), "The type of auto_parallel_startup_prog should be Program or None, " \ - "but got {}.".format(type(auto_parallel_startup_prog)) - assert isinstance(rank_id, int), "The type of rank_id should be int, " \ - "but got {}.".format(type(rank_id)) - assert isinstance(dist_context, DistributedContext), "The type of dist_context should be DistributedContext, " \ - "but got {}.".format(type(dist_context)) + assert isinstance(auto_parallel_main_prog, Program), ( + "The type of auto_parallel_startup_prog should be Program or None, " + "but got {}.".format(type(auto_parallel_startup_prog)) + ) + assert isinstance( + rank_id, int + ), "The type of rank_id should be int, " "but got {}.".format( + type(rank_id) + ) + assert isinstance(dist_context, DistributedContext), ( + "The type of dist_context should be DistributedContext, " + "but got {}.".format(type(dist_context)) + ) if batch_size is not None: - assert isinstance(batch_size, int), "The type of batch_size should be int, " \ - "but got {}.".format(type(batch_size)) + assert isinstance( + batch_size, int + ), "The type of batch_size should be int, " "but got {}.".format( + type(batch_size) + ) self._auto_parallel_main_prog = auto_parallel_main_prog self._auto_parallel_startup_prog = auto_parallel_startup_prog @@ -931,29 +1051,37 @@ def compute_process_index(process, process_group, process_shape): for i in range(len(process_shape)): idx = relative_process // (product // process_shape[i]) product = product // process_shape[i] - relative_process = relative_process - relative_process // product * product + relative_process = ( + relative_process - relative_process // product * product + ) process_index.append(idx) return process_index @staticmethod - def compute_partition_index(process, complete_shape, dims_mapping, - process_shape, process_group): + def compute_partition_index( + process, complete_shape, dims_mapping, process_shape, process_group + ): """Compute the partition index in complete tensor.""" partition_shape = Resharder.compute_partition_shape( - complete_shape, dims_mapping, process_shape) - process_index = Resharder.compute_process_index(process, process_group, - process_shape) + complete_shape, dims_mapping, process_shape + ) + process_index = Resharder.compute_process_index( + process, process_group, process_shape + ) partition_index = [] for i in range(len(complete_shape)): if dims_mapping[i] == -1: partition_index.append([0, partition_shape[i]]) else: - partition_index.append([ - process_index[dims_mapping[i]] * partition_shape[i], - (process_index[dims_mapping[i]] + 1) * partition_shape[i] - ]) + partition_index.append( + [ + process_index[dims_mapping[i]] * partition_shape[i], + (process_index[dims_mapping[i]] + 1) + * partition_shape[i], + ] + ) return partition_index @@ 
-968,12 +1096,16 @@ def compute_concat_info(partition_index_x, partition_index_y): for idx, item in enumerate(partition_index_x): if item != partition_index_y[idx]: differ_count += 1 - if item[1] == partition_index_y[idx][ - 0] and item[0] < partition_index_y[idx][1]: + if ( + item[1] == partition_index_y[idx][0] + and item[0] < partition_index_y[idx][1] + ): concat_axis = idx new_partition.append([item[0], partition_index_y[idx][1]]) - elif item[0] == partition_index_y[idx][ - 1] and item[1] > partition_index_y[idx][0]: + elif ( + item[0] == partition_index_y[idx][1] + and item[1] > partition_index_y[idx][0] + ): first_order = 1 concat_axis = idx new_partition.append([partition_index_y[idx][0], item[1]]) @@ -1006,12 +1138,14 @@ def concat_partitions(partition_index_list, partition_index): has_concat = False while i < len(partition_index_list): concat_axis, _, new_partition = Resharder.compute_concat_info( - partition_index_list[i], partition_index) + partition_index_list[i], partition_index + ) if concat_axis != -1: has_concat = True partition_index_list.pop(i) - Resharder.concat_partitions(partition_index_list, - new_partition) + Resharder.concat_partitions( + partition_index_list, new_partition + ) break i += 1 if not has_concat: @@ -1023,7 +1157,8 @@ def change_while_op_input_and_output(auto_parallel_main_prog, dist_context): for sub_block_idx in Resharder.while_block_info: sub_block = auto_parallel_main_prog.blocks[sub_block_idx] parent_while_op_id = Resharder.while_block_info[sub_block_idx][ - "op_id"] + "op_id" + ] parent_block = auto_parallel_main_prog.blocks[sub_block.parent_idx] sub_block_op_inputs = set() @@ -1031,10 +1166,12 @@ def change_while_op_input_and_output(auto_parallel_main_prog, dist_context): for op in sub_block.ops: # skip the input and output of operators inserted in the reshard phase dist_op = dist_context.get_dist_op_for_program(op) - if dist_op or (op.type == "slice" and not dist_op) or ( - op.type == "split" - and not dist_op) or (op.type == "assign" - and not dist_op): + if ( + dist_op + or (op.type == "slice" and not dist_op) + or (op.type == "split" and not dist_op) + or (op.type == "assign" and not dist_op) + ): for var_name in op.output_arg_names: if var_name not in sub_block_op_outputs: sub_block_op_outputs.append(var_name) @@ -1064,7 +1201,10 @@ def change_while_op_input_and_output(auto_parallel_main_prog, dist_context): new_Out = [] for var_name in while_op.output("Out"): for output_name in sub_block_op_outputs[::-1]: - if output_name.find(var_name) != -1: + if output_name.find(var_name) != -1 and ( + len(var_name) == len(output_name) + or "@RESHARD" in output_name + ): if output_name not in new_Out: new_Out.append(output_name) assert new_Out @@ -1073,8 +1213,9 @@ def change_while_op_input_and_output(auto_parallel_main_prog, dist_context): def is_overlapped(self, shape_x, shape_y): """Judge whether two partitions intersect on the specified dimension.""" overlapped = False - if (shape_y[0] <= shape_x[0] < shape_y[1]) or (shape_x[0] <= shape_y[0] - < shape_x[1]): + if (shape_y[0] <= shape_x[0] < shape_y[1]) or ( + shape_x[0] <= shape_y[0] < shape_x[1] + ): overlapped = True return overlapped @@ -1093,15 +1234,18 @@ def is_special_op(self, op): return False def is_condition_replicative(self, op): - assert op.type == "while" sub_block = self.auto_parallel_main_prog.blocks[op.attr("sub_block").id] - dist_op = self.dist_context.get_dist_op_for_program(op) - op_dist_attr = dist_op.dist_attr + + if op.type == "while": + input_cond = op.input("Condition") + elif 
op.type == "conditional_block": + input_cond = op.input("Cond") # the dims mapping of condition tensor should be replicative - for var_name in op.input("Condition"): - var = get_var_with_recursion(var_name, sub_block, - self.auto_parallel_main_prog) + for var_name in input_cond: + var = get_var_with_recursion( + var_name, sub_block, self.auto_parallel_main_prog + ) dist_tensor = self.dist_context.get_dist_tensor_for_program(var) tensor_dist_attr = dist_tensor.dist_attr var_dims_mapping = tensor_dist_attr.dims_mapping @@ -1124,13 +1268,22 @@ def need_reshard(self, dist_tensor, dist_attr, op_input=True, dist_op=None): if op_input: op_input_dims_mapping = dist_attr[1] if all( - map(lambda x: x, [ - tensor_dims_mapping, tensor_process_mesh, - op_input_dims_mapping, op_process_mesh - ])): + map( + lambda x: x, + [ + tensor_dims_mapping, + tensor_process_mesh, + op_input_dims_mapping, + op_process_mesh, + ], + ) + ): # judge whether need reshard by dims_mapping if tensor_dims_mapping != op_input_dims_mapping: - if tensor_process_mesh not in self.dist_context.process_meshes: + if ( + tensor_process_mesh + not in self.dist_context.process_meshes + ): # assert whether -1 when union. for item in tensor_dims_mapping: if item != -1: @@ -1154,10 +1307,16 @@ def need_reshard(self, dist_tensor, dist_attr, op_input=True, dist_op=None): else: op_output_dims_mapping = dist_attr[1] if all( - map(lambda x: x, [ - tensor_dims_mapping, tensor_process_mesh, - op_output_dims_mapping, op_process_mesh - ])): + map( + lambda x: x, + [ + tensor_dims_mapping, + tensor_process_mesh, + op_output_dims_mapping, + op_process_mesh, + ], + ) + ): if tensor_dims_mapping != op_output_dims_mapping: raise ValueError( "It is not supported that tensor dims mapping is different from op output dims mapping." 
@@ -1174,10 +1333,9 @@ def get_op_process_meshes(self, op): op_process_mesh = dist_op.dist_attr.process_mesh for process_mesh in self.dist_context.process_meshes: - if set(process_mesh.processes) & (set( - op_process_mesh.processes)) and len( - process_mesh.processes) < len( - op_process_mesh.processes): + if set(process_mesh.processes) & ( + set(op_process_mesh.processes) + ) and len(process_mesh.processes) < len(op_process_mesh.processes): process_meshes.append(process_mesh) # it means the process mesh is not a union when process meshes is null @@ -1213,40 +1371,55 @@ def find_op_desc_seq(self, dist_tensor, dist_attr, serial=False): target_process_group = target_process_mesh.processes target_process_shape = target_process_mesh.topology + op_role = dist_attr[2] + if source_tensor.shape[0] < 0: assert source_tensor.shape[0] == -1 new_shape = list(source_tensor.shape) new_shape[0] = self.batch_size source_tensor.desc.set_shape(new_shape) - complete_shape = Resharder.compute_complete_shape( - source_tensor.shape, source_process_shape, - source_dims_mapping) if not serial else source_tensor.shape + complete_shape = ( + Resharder.compute_complete_shape( + source_tensor.shape, source_process_shape, source_dims_mapping + ) + if not serial + else source_tensor.shape + ) op_desc_seq = {} # TODO: if the target process group has the same process with source process group - if set(target_process_group).intersection(set( - source_process_group)) and set(target_process_group).difference( - set(source_process_group)): + if set(target_process_group).intersection( + set(source_process_group) + ) and set(target_process_group).difference(set(source_process_group)): pass elif target_process_group != source_process_group: partition_process_mapping_list = [] for source_process in source_process_group: # get partition index of source process - source_partition_index = Resharder.compute_partition_index(source_process, complete_shape, source_dims_mapping, \ - source_process_shape, source_process_group) + source_partition_index = Resharder.compute_partition_index( + source_process, + complete_shape, + source_dims_mapping, + source_process_shape, + source_process_group, + ) if not partition_process_mapping_list: # the item in partition_process_mapping_list is source_partition_index, which processes and whether has been used partition_process_mapping_list.append( - [source_partition_index, [source_process], [False]]) + [source_partition_index, [source_process], [False]] + ) else: partition_list = list( - [item[0] for item in partition_process_mapping_list]) + [item[0] for item in partition_process_mapping_list] + ) process_list = list( - [item[1] for item in partition_process_mapping_list]) + [item[1] for item in partition_process_mapping_list] + ) has_used = list( - [item[2] for item in partition_process_mapping_list]) + [item[2] for item in partition_process_mapping_list] + ) if partition_list.count(source_partition_index) == 1: index = partition_list.index(source_partition_index) @@ -1254,32 +1427,52 @@ def find_op_desc_seq(self, dist_tensor, dist_attr, serial=False): has_used[index].append(False) else: partition_process_mapping_list.append( - [source_partition_index, [source_process], [False]]) + [source_partition_index, [source_process], [False]] + ) for target_process in target_process_group: # has_sent means the source_partition_index has been sent to target_process has_sent = [] target_partition_index = Resharder.compute_partition_index( - target_process, complete_shape, target_dims_mapping, - 
target_process_shape, target_process_group) + target_process, + complete_shape, + target_dims_mapping, + target_process_shape, + target_process_group, + ) partition_index_list = [] all_partition_index_list = [] for source_process in source_process_group: source_partition_index = Resharder.compute_partition_index( - source_process, complete_shape, source_dims_mapping, - source_process_shape, source_process_group) + source_process, + complete_shape, + source_dims_mapping, + source_process_shape, + source_process_group, + ) to_send_process = None - if all(_ for _ in list(map(self.is_overlapped, source_partition_index, target_partition_index))) \ - and source_partition_index not in has_sent: - idx = list([ - item[0] for item in partition_process_mapping_list - ]).index(source_partition_index) - has_used = list([ - item[2] for item in partition_process_mapping_list - ])[idx] - process_list = list([ - item[1] for item in partition_process_mapping_list - ])[idx] + if ( + all( + _ + for _ in list( + map( + self.is_overlapped, + source_partition_index, + target_partition_index, + ) + ) + ) + and source_partition_index not in has_sent + ): + idx = list( + [item[0] for item in partition_process_mapping_list] + ).index(source_partition_index) + has_used = list( + [item[2] for item in partition_process_mapping_list] + )[idx] + process_list = list( + [item[1] for item in partition_process_mapping_list] + )[idx] i = 0 while i < len(has_used): if not has_used[i]: @@ -1292,7 +1485,9 @@ def find_op_desc_seq(self, dist_tensor, dist_attr, serial=False): has_used = list(map(lambda x: False, has_used)) to_send_process = process_list[0] has_used[0] = True - assert to_send_process is not None, "Failed to find the send process." + assert ( + to_send_process is not None + ), "Failed to find the send process." 
if to_send_process not in op_desc_seq.keys(): op_desc_seq[to_send_process] = [] @@ -1301,25 +1496,34 @@ def find_op_desc_seq(self, dist_tensor, dist_attr, serial=False): all_partition_index_list.append(source_partition_index) # append send and recv op desc - is_bool = ( - dist_tensor.serial_tensor.dtype == paddle.bool) - send_op_desc = SendOpDesc(source_partition_index, - to_send_process, - target_process, - is_bool=is_bool) - recv_op_desc = RecvOpDesc(source_partition_index, - to_send_process, - target_process, - is_bool=is_bool) + is_bool = dist_tensor.serial_tensor.dtype == paddle.bool + send_op_desc = SendOpDesc( + source_partition_index, + to_send_process, + target_process, + is_bool=is_bool, + ) + recv_op_desc = RecvOpDesc( + source_partition_index, + to_send_process, + target_process, + is_bool=is_bool, + ) op_desc_seq[to_send_process].append(send_op_desc) op_desc_seq[target_process].append(recv_op_desc) has_sent.append(source_partition_index) - Resharder.concat_partitions(partition_index_list, - source_partition_index) + Resharder.concat_partitions( + partition_index_list, source_partition_index + ) + if int(op_role) == int(OpRole.Forward): + self.dist_context.up_down_streams.add_pair_stream( + to_send_process, target_process + ) # append concat op desc op_desc_seq[target_process].append( - ConcatOpDesc(all_partition_index_list)) + ConcatOpDesc(all_partition_index_list) + ) # append slice op desc slice_starts = [] @@ -1329,17 +1533,21 @@ def find_op_desc_seq(self, dist_tensor, dist_attr, serial=False): to_slice_tensor_shape = [] for idx, item in enumerate(concatenated_partition_index): - slice_starts.append(target_partition_index[idx][0] - - item[0]) + slice_starts.append( + target_partition_index[idx][0] - item[0] + ) slice_ends.append(target_partition_index[idx][1] - item[0]) slices_axes.append(idx) to_slice_tensor_shape.append(item[1] - item[0]) op_desc_seq[target_process].append( - SliceOpDesc(slice_starts, - slice_ends, - slices_axes, - shape=to_slice_tensor_shape)) + SliceOpDesc( + slice_starts, + slice_ends, + slices_axes, + shape=to_slice_tensor_shape, + ) + ) # in the same process group, it will use allgahther and slice op. 
else: @@ -1349,16 +1557,26 @@ def find_op_desc_seq(self, dist_tensor, dist_attr, serial=False): process_index = [] for source_process in source_process_group: source_partition_index = Resharder.compute_partition_index( - source_process, complete_shape, source_dims_mapping, - source_process_shape, source_process_group) + source_process, + complete_shape, + source_dims_mapping, + source_process_shape, + source_process_group, + ) if source_partition_index not in partition_index_list: partition_index_list.append(source_partition_index) - process_index.append([[ - source_process, - ], source_partition_index]) + process_index.append( + [ + [ + source_process, + ], + source_partition_index, + ] + ) else: - process_index[partition_index_list.index( - source_partition_index)][0].append(source_process) + process_index[ + partition_index_list.index(source_partition_index) + ][0].append(source_process) for i in range(len(process_index[0][0])): group = [] @@ -1372,28 +1590,50 @@ def find_op_desc_seq(self, dist_tensor, dist_attr, serial=False): slice_ends = [] slices_axes = [] target_partition_index = Resharder.compute_partition_index( - process, complete_shape, target_dims_mapping, - target_process_shape, target_process_group) + process, + complete_shape, + target_dims_mapping, + target_process_shape, + target_process_group, + ) for idx, item in enumerate(target_partition_index): slice_starts.append(item[0]) slice_ends.append(item[1]) slices_axes.append(idx) to_slice_tensor_shape = dist_tensor.global_sizes() - slice_op_desc = SliceOpDesc(starts=slice_starts, - ends=slice_ends, - axes=slices_axes, - shape=to_slice_tensor_shape) - allgather_shape = None if not serial else dist_tensor.local_sizes( - rank=process) - op_desc_seq[process] = [AllGatherOpDesc(group=group, shape=allgather_shape, is_bool=(source_tensor.dtype == paddle.bool)), - ConcatOpDesc(partition_index_list=all_partition_index_list), slice_op_desc] \ - if len(group) > 1 else [slice_op_desc] + slice_op_desc = SliceOpDesc( + starts=slice_starts, + ends=slice_ends, + axes=slices_axes, + shape=to_slice_tensor_shape, + ) + allgather_shape = ( + None + if not serial + else dist_tensor.local_sizes(rank=process) + ) + op_desc_seq[process] = ( + [ + AllGatherOpDesc( + group=group, + shape=allgather_shape, + is_bool=(source_tensor.dtype == paddle.bool), + ), + ConcatOpDesc( + partition_index_list=all_partition_index_list + ), + slice_op_desc, + ] + if len(group) > 1 + else [slice_op_desc] + ) return op_desc_seq - def parse_op_desc(self, block, op_desc_seq, var_name, reshard_op, - dist_attr): + def parse_op_desc( + self, block, op_desc_seq, var_name, reshard_op, dist_attr + ): """Parse op desc sequence and insert op in the block""" tensor_list = [] partition_tensor_list = [] @@ -1406,55 +1646,84 @@ def parse_op_desc(self, block, op_desc_seq, var_name, reshard_op, if op.desc.id == reshard_op.desc.id: idx = index break - assert idx is not None, "The op for reshard cannot be found in the rank {} program.".format( - self.rank_id) + assert ( + idx is not None + ), "The op for reshard cannot be found in the rank {} program.".format( + self.rank_id + ) matched_op = block.ops[idx] - source_tensor = get_var_with_recursion(var_name, block, - self.auto_parallel_main_prog) + source_tensor = get_var_with_recursion( + var_name, block, self.auto_parallel_main_prog + ) for op_desc in op_desc_list: if isinstance(op_desc, AllGatherOpDesc): # noqa: F401 if var_name not in self.has_allgather.keys(): self.has_allgather[var_name] = [] - if not self.has_allgather[var_name] 
or op_desc.group not in list( - map(lambda x: x[0], self.has_allgather[var_name])): + if not self.has_allgather[ + var_name + ] or op_desc.group not in list( + map(lambda x: x[0], self.has_allgather[var_name]) + ): if op_desc.is_bool: # for bool data allgather, cast to int64 -> allgather -> cast bool out_cast = Inserter.insert_cast_op( - block, idx, source_tensor, - reshard_op.attr('op_role'), paddle.int64) + block, + idx, + source_tensor, + reshard_op.attr('op_role'), + paddle.int64, + ) tensor_list, idx_offset = Inserter.insert_allgather_op( - block, idx + 1, out_cast, op_desc.group, - reshard_op.attr('op_role')) + block, + idx + 1, + out_cast, + op_desc.group, + reshard_op.attr('op_role'), + ) idx += idx_offset tensor_name_list = [] for var in tensor_list: out_cast = Inserter.insert_cast_op( - block, idx, var, reshard_op.attr('op_role'), - paddle.bool) + block, + idx, + var, + reshard_op.attr('op_role'), + paddle.bool, + ) tensor_name_list.append(out_cast.name) idx += 1 self.has_allgather[var_name].append( - [op_desc.group, tensor_name_list]) + [op_desc.group, tensor_name_list] + ) else: tensor_list, idx_offset = Inserter.insert_allgather_op( - block, idx, source_tensor, op_desc.group, - reshard_op.attr('op_role')) + block, + idx, + source_tensor, + op_desc.group, + reshard_op.attr('op_role'), + ) idx += idx_offset tensor_name_list = [var.name for var in tensor_list] self.has_allgather[var_name].append( - [op_desc.group, tensor_name_list]) + [op_desc.group, tensor_name_list] + ) else: for item in self.has_allgather[var_name]: if op_desc.group == item[0]: tensor_list = [ get_var_with_recursion( - var_name, block, - self.auto_parallel_main_prog) + var_name, + block, + self.auto_parallel_main_prog, + ) for var_name in item[1] ] break - assert tensor_list, "The result of parsing allgather op should not be None." + assert ( + tensor_list + ), "The result of parsing allgather op should not be None." 
elif isinstance(op_desc, SendOpDesc): if var_name not in self.has_sent.keys(): @@ -1462,16 +1731,30 @@ def parse_op_desc(self, block, op_desc_seq, var_name, reshard_op, if op_desc.dst not in self.has_sent[var_name]: if op_desc.is_bool: out_cast = Inserter.insert_cast_op( - block, idx, source_tensor, - reshard_op.attr('op_role'), paddle.int64) - Inserter.insert_send_op(block, idx + 1, out_cast, - op_desc.src, op_desc.dst, - reshard_op.attr('op_role')) + block, + idx, + source_tensor, + reshard_op.attr('op_role'), + paddle.int64, + ) + Inserter.insert_send_op( + block, + idx + 1, + out_cast, + op_desc.src, + op_desc.dst, + reshard_op.attr('op_role'), + ) idx += 2 else: - Inserter.insert_send_op(block, idx, source_tensor, - op_desc.src, op_desc.dst, - reshard_op.attr('op_role')) + Inserter.insert_send_op( + block, + idx, + source_tensor, + op_desc.src, + op_desc.dst, + reshard_op.attr('op_role'), + ) idx += 1 self.has_sent[var_name].append(op_desc.dst) @@ -1490,13 +1773,23 @@ def parse_op_desc(self, block, op_desc_seq, var_name, reshard_op, shape=shape, lod_level=source_tensor.lod_level, dtype=paddle.int64, - type=source_tensor.type) - Inserter.insert_recv_op(block, idx, recv_tensor, - op_desc.src, op_desc.dst, - reshard_op.attr('op_role')) + type=source_tensor.type, + ) + Inserter.insert_recv_op( + block, + idx, + recv_tensor, + op_desc.src, + op_desc.dst, + reshard_op.attr('op_role'), + ) out_cast = Inserter.insert_cast_op( - block, idx + 1, recv_tensor, - reshard_op.attr('op_role'), paddle.bool) + block, + idx + 1, + recv_tensor, + reshard_op.attr('op_role'), + paddle.bool, + ) tensor_list.append(out_cast) idx += 2 self.has_recv[var_name][op_desc.src] = out_cast @@ -1506,26 +1799,45 @@ def parse_op_desc(self, block, op_desc_seq, var_name, reshard_op, shape=shape, lod_level=source_tensor.lod_level, dtype=source_tensor.dtype, - type=source_tensor.type) - Inserter.insert_recv_op(block, idx, recv_tensor, - op_desc.src, op_desc.dst, - reshard_op.attr('op_role')) + type=source_tensor.type, + ) + Inserter.insert_recv_op( + block, + idx, + recv_tensor, + op_desc.src, + op_desc.dst, + reshard_op.attr('op_role'), + ) # for lod tensor, need reset lod after received if recv_tensor.lod_level != 0: set_lod = False # use data lod to reset tensor lod - for tmp_block in self.auto_parallel_main_prog.blocks: + for ( + tmp_block + ) in self.auto_parallel_main_prog.blocks: for tmp_var_name in tmp_block.vars: tmp_var = tmp_block.vars[tmp_var_name] - if tmp_var.is_data and tmp_var.lod_level == recv_tensor.lod_level: - reset_lod_out = Inserter.insert_reset_lod_op( - block, idx + 1, recv_tensor, - tmp_var, reshard_op.attr('op_role')) + if ( + tmp_var.is_data + and tmp_var.lod_level + == recv_tensor.lod_level + ): + reset_lod_out = ( + Inserter.insert_reset_lod_op( + block, + idx + 1, + recv_tensor, + tmp_var, + reshard_op.attr('op_role'), + ) + ) tensor_list.append(reset_lod_out) idx += 2 self.has_recv[var_name][ - op_desc.src] = reset_lod_out + op_desc.src + ] = reset_lod_out set_lod = True break if set_lod: @@ -1543,16 +1855,24 @@ def parse_op_desc(self, block, op_desc_seq, var_name, reshard_op, idx_list = [idx] for index, tensor in enumerate(tensor_list): Inserter.concat_partitions_with_op( - partition_tensor_list, tensor, - partition_index_list[index], block, idx_list, - reshard_op.attr('op_role')) + partition_tensor_list, + tensor, + partition_index_list[index], + block, + idx_list, + reshard_op.attr('op_role'), + ) idx = idx_list[0] elif isinstance(op_desc, SliceOpDesc): - assert len( - 
partition_tensor_list) == 1 or not partition_tensor_list - to_slice_tensor = partition_tensor_list[0][0] if len( - partition_tensor_list) == 1 else source_tensor + assert ( + len(partition_tensor_list) == 1 or not partition_tensor_list + ) + to_slice_tensor = ( + partition_tensor_list[0][0] + if len(partition_tensor_list) == 1 + else source_tensor + ) new_name = unique_name.generate(var_name + "@RESHARD") target_tensor = Inserter.insert_slice_op( block, @@ -1562,7 +1882,8 @@ def parse_op_desc(self, block, op_desc_seq, var_name, reshard_op, ends=op_desc.ends, axes=op_desc.axes, new_var_name=new_name, - op_role=reshard_op.attr('op_role')) + op_role=reshard_op.attr('op_role'), + ) process_mesh = dist_attr[0] dims_mapping = dist_attr[1] @@ -1571,87 +1892,123 @@ def parse_op_desc(self, block, op_desc_seq, var_name, reshard_op, tensor_attr.dims_mapping = dims_mapping tensor_attr.process_mesh = process_mesh self.dist_context.set_tensor_dist_attr_for_program( - target_tensor, tensor_attr) + target_tensor, tensor_attr + ) if matched_op.type == "while": # var_reshard_mapping means the while op input need be changed to - if "var_reshard_mapping" not in Resharder.while_block_info[ - op.attr("sub_block").id].keys(): - Resharder.while_block_info[op.attr( - "sub_block").id]["var_reshard_mapping"] = {} - if var_name not in Resharder.while_block_info[op.attr( - "sub_block").id]["var_reshard_mapping"].keys(): + if ( + "var_reshard_mapping" + not in Resharder.while_block_info[ + op.attr("sub_block").id + ].keys() + ): Resharder.while_block_info[op.attr("sub_block").id][ - "var_reshard_mapping"][var_name] = [] + "var_reshard_mapping" + ] = {} + if ( + var_name + not in Resharder.while_block_info[ + op.attr("sub_block").id + ]["var_reshard_mapping"].keys() + ): + Resharder.while_block_info[op.attr("sub_block").id][ + "var_reshard_mapping" + ][var_name] = [] Resharder.while_block_info[op.attr("sub_block").id][ - "var_reshard_mapping"][var_name].append( - [dist_attr, target_tensor.name]) + "var_reshard_mapping" + ][var_name].append([dist_attr, target_tensor.name]) # rename op input name according to new name for op in block.ops: # just for while op while_op_X_append = [] for name in op.input_arg_names: - op_dist_attr = self.dist_context.get_op_dist_attr_for_program( - op) + op_dist_attr = ( + self.dist_context.get_op_dist_attr_for_program(op) + ) if name == var_name and op_dist_attr is not None: if op.desc.id() == matched_op.desc.id(): if matched_op.type == "while": old_name = name new_name = target_tensor.name assert old_name != new_name - op_input_dist_attr = op_dist_attr.get_input_dist_attr( - old_name) + op_input_dist_attr = ( + op_dist_attr.get_input_dist_attr( + old_name + ) + ) op_dist_attr.set_input_dist_attr( - new_name, op_input_dist_attr) + new_name, op_input_dist_attr + ) op_dist_attr.set_input_dims_mapping( - new_name, dims_mapping) - if old_name in op_dist_attr._inputs_dist_attrs: + new_name, dims_mapping + ) + if ( + old_name + in op_dist_attr._inputs_dist_attrs + ): op_dist_attr.del_input_dist_attr( - old_name) + old_name + ) while_op_X_append.append(new_name) continue else: op.desc._rename_input( - name, target_tensor.name) + name, target_tensor.name + ) old_name = name new_name = target_tensor.name assert old_name != new_name - op_input_dist_attr = op_dist_attr.get_input_dist_attr( - old_name) + op_input_dist_attr = ( + op_dist_attr.get_input_dist_attr( + old_name + ) + ) op_dist_attr.set_input_dist_attr( - new_name, op_input_dist_attr) + new_name, op_input_dist_attr + ) 
op_dist_attr.set_input_dims_mapping( - new_name, dims_mapping) + new_name, dims_mapping + ) op_dist_attr.del_input_dist_attr(old_name) continue op_process_mesh = op_dist_attr.process_mesh - op_input_dims_mapping = op_dist_attr.get_input_dims_mapping( - var_name) + op_input_dims_mapping = ( + op_dist_attr.get_input_dims_mapping(var_name) + ) # NOTE: For op whose process mesh is a union, its input will not be renamed by other op reshard result now which means that it will have more reshard operation. - if op_process_mesh == process_mesh and op_input_dims_mapping == dims_mapping: + if ( + op_process_mesh == process_mesh + and op_input_dims_mapping == dims_mapping + ): op.desc._rename_input(name, target_tensor.name) old_name = name new_name = target_tensor.name assert old_name != new_name - op_input_dist_attr = op_dist_attr.get_input_dist_attr( - old_name) + op_input_dist_attr = ( + op_dist_attr.get_input_dist_attr(old_name) + ) op_dist_attr.set_input_dist_attr( - new_name, op_input_dist_attr) + new_name, op_input_dist_attr + ) op_dist_attr.set_input_dims_mapping( - new_name, dims_mapping) + new_name, dims_mapping + ) op_dist_attr.del_input_dist_attr(old_name) # for while op, the input X should reset if while_op_X_append: proto = OpProtoHolder.instance().get_op_proto(op.type) - op.desc.set_input(proto.inputs[0].name, - op.input("X") + while_op_X_append) + op.desc.set_input( + proto.inputs[0].name, + op.input("X") + while_op_X_append, + ) - def _get_while_op_input_attrs(self, op, var_name): + def _get_subblock_input_attrs(self, op, var_name): # NOTE: Multi while loop is not supported - assert op.type == "while" + assert op.type in _g_subblock_ops sub_block = self.auto_parallel_main_prog.blocks[op.attr("sub_block").id] ops = sub_block.ops input_attrs = [] @@ -1665,27 +2022,70 @@ def _get_while_op_input_attrs(self, op, var_name): if name == var_name: process_mesh = dist_attr.process_mesh input_dims_mapping = dist_attr.get_input_dims_mapping( - var_name) + var_name + ) has_exist = False for input_attr in input_attrs: - if process_mesh == input_attr[ - 0] and input_dims_mapping == input_attr[1]: + if ( + process_mesh == input_attr[0] + and input_dims_mapping == input_attr[1] + ): has_exist = True break if not has_exist: - input_attrs.append([process_mesh, input_dims_mapping]) + input_attrs.append( + [ + process_mesh, + input_dims_mapping, + op.attr('op_role'), + ] + ) return input_attrs + def _get_subblock_output_attrs(self, op, var_name): + assert op.type in _g_subblock_ops + sub_block = self.auto_parallel_main_prog.blocks[op.attr("sub_block").id] + ops = sub_block.ops + output_attrs = [] + + for op in ops: + dist_op = self.dist_context.get_dist_op_for_program(op) + if not dist_op: + continue + dist_attr = dist_op.dist_attr + for name in op.output_arg_names: + if name == var_name: + process_mesh = dist_attr.process_mesh + output_dims_mapping = dist_attr.get_output_dims_mapping( + var_name + ) + has_exist = False + for output_attr in output_attrs: + if ( + process_mesh == output_attr[0] + and output_dims_mapping == output_attr[1] + ): + has_exist = True + break + if not has_exist: + output_attrs.append( + [ + process_mesh, + output_dims_mapping, + op.attr('op_role'), + ] + ) + return output_attrs + def _get_common_op_input_attrs(self, op, var_name): process_meshes = [] dist_op = self.dist_context.get_dist_op_for_program(op) dist_attr = dist_op.dist_attr op_process_mesh = dist_attr.process_mesh for process_mesh in self.dist_context.process_meshes: - if set(process_mesh.processes) & (set( - 
op_process_mesh.processes)) and len( - process_mesh.processes) < len( - op_process_mesh.processes): + if set(process_mesh.processes) & ( + set(op_process_mesh.processes) + ) and len(process_mesh.processes) < len(op_process_mesh.processes): process_meshes.append(process_mesh) # it means that the process mesh is not a union when process meshes is none @@ -1695,15 +2095,19 @@ def _get_common_op_input_attrs(self, op, var_name): input_dims_mapping = dist_attr.get_input_dims_mapping(var_name) input_attrs = [] for process_mesh in process_meshes: - input_attrs.append([process_mesh, input_dims_mapping]) + input_attrs.append( + [process_mesh, input_dims_mapping, op.attr('op_role')] + ) return input_attrs def get_op_input_attrs(self, op, var_name): op_input_attrs = [] - if op.type == "while": - op_input_attrs = self._get_while_op_input_attrs(op, var_name) + if op.type in _g_subblock_ops: + op_input_attrs = self._get_subblock_input_attrs(op, var_name) + if not op_input_attrs: + op_input_attrs = self._get_subblock_output_attrs(op, var_name) else: op_input_attrs = self._get_common_op_input_attrs(op, var_name) @@ -1716,22 +2120,28 @@ def _remove_global_process_mesh(self): processes = set() process_mesh_count = len(self.dist_context.process_meshes) if process_mesh_count > 1: - global_process_mesh_idx = None + global_process_mesh_idx = [] + has_sub_process_mesh = False for process_mesh in self.dist_context.process_meshes: for process in process_mesh.processes: processes.add(process) for idx, process_mesh in enumerate( - self.dist_context.process_meshes): + self.dist_context.process_meshes + ): if len(set(process_mesh.processes)) == len(processes): - global_process_mesh_idx = idx - break - if global_process_mesh_idx is not None: - self.dist_context.process_meshes.pop(idx) + global_process_mesh_idx.append(idx) + elif set(process_mesh.processes) < processes: + has_sub_process_mesh = True + + if has_sub_process_mesh: + for idx in reversed(global_process_mesh_idx): + self.dist_context.process_meshes.pop(idx) def _change_subblock_op_input_and_output(self, block_idx, block): if "var_reshard_mapping" in Resharder.while_block_info[block_idx]: var_reshard_mapping = Resharder.while_block_info[block_idx][ - "var_reshard_mapping"] + "var_reshard_mapping" + ] for op in block.ops: for var_name in op.input_arg_names: if var_name in var_reshard_mapping: @@ -1740,9 +2150,11 @@ def _change_subblock_op_input_and_output(self, block_idx, block): dist_attr = dist_op.dist_attr target_name = None for item in var_reshard_mapping[var_name]: - if dist_attr.process_mesh == item[0][ - 0] and dist_attr.get_input_dims_mapping( - var_name) == item[0][1]: + if ( + dist_attr.process_mesh == item[0][0] + and dist_attr.get_input_dims_mapping(var_name) + == item[0][1] + ): target_name = item[1] break if target_name is None: @@ -1750,15 +2162,18 @@ def _change_subblock_op_input_and_output(self, block_idx, block): else: op.desc._rename_input(var_name, target_name) dist_op = self.dist_context.get_dist_op_for_program( - op) + op + ) op_dist_attr = dist_op.dist_attr old_name = var_name new_name = target_name assert old_name != new_name - op_input_dist_attr = op_dist_attr.get_input_dist_attr( - old_name) + op_input_dist_attr = ( + op_dist_attr.get_input_dist_attr(old_name) + ) op_dist_attr.set_input_dist_attr( - new_name, op_input_dist_attr) + new_name, op_input_dist_attr + ) op_dist_attr.del_input_dist_attr(old_name) # the outputs also need to be renamed when the output name is the same with input name in inplace op @@ -1778,9 +2193,11 @@ def 
_change_subblock_op_input_and_output(self, block_idx, block): new_name = target_name assert old_name != new_name op_output_dist_attr = op_dist_attr.get_output_dist_attr( - old_name) + old_name + ) op_dist_attr.set_output_dist_attr( - new_name, op_output_dist_attr) + new_name, op_output_dist_attr + ) op_dist_attr.del_output_dist_attr(old_name) def _reshard_input(self, block): @@ -1795,22 +2212,28 @@ def _reshard_input(self, block): dist_op = self.dist_context.get_dist_op_for_program(op) if dist_op is not None: - op_input_dist_attrs = [ - ] # [(op_process_mesh, op_input_dims_mapping), (op_process_mesh, op_input_dims_mapping)] - if op.type == "while": + op_input_dist_attrs = ( + [] + ) # [(op_process_mesh, op_input_dims_mapping), (op_process_mesh, op_input_dims_mapping)] + if op.type in _g_subblock_ops: if not self.is_condition_replicative(op): raise ValueError( "Please check the condition due to the dims mapping is not replicative." ) - if op.attr( - "sub_block").id not in Resharder.while_block_info: + if ( + op.attr("sub_block").id + not in Resharder.while_block_info + ): Resharder.while_block_info[op.attr("sub_block").id] = {} - Resharder.while_block_info[op.attr( - "sub_block").id]["op_id"] = op.desc.id() + Resharder.while_block_info[op.attr("sub_block").id][ + "op_id" + ] = op.desc.id() if op.type == "while": # condition var process mesh is the same with op and dims_mapping is replicative, so it do not need reshard input_var_names = op.input("X") + elif op.type == "conditional_block": + input_var_names = op.input("Input") else: input_var_names = op.input_arg_names # to avoid while op X order different @@ -1818,20 +2241,27 @@ def _reshard_input(self, block): idx_offset = 0 for var_name in input_var_names: - # skip lod_tensor_blocking_queue_0 - if var_name == "lod_tensor_blocking_queue_0": + # skip lod_tensor_blocking_queue_? 
name + if "lod_tensor_blocking_queue" in var_name: continue - var = get_var_with_recursion(var_name, block, - self.auto_parallel_main_prog) + var = get_var_with_recursion( + var_name, block, self.auto_parallel_main_prog + ) dist_tensor = self.dist_context.get_dist_tensor_for_program( - var) + var + ) # judge whether union tensor dims_mapping all -1 is_union_process_mesh_tensor = False - if dist_tensor.dist_attr.process_mesh not in self.dist_context.process_meshes and self.dist_context.process_meshes: + if ( + dist_tensor.dist_attr.process_mesh + not in self.dist_context.process_meshes + and self.dist_context.process_meshes + ): is_union_process_mesh_tensor = True assert dist_tensor.dist_attr.dims_mapping.count( - -1) == len(dist_tensor.dist_attr.dims_mapping) + -1 + ) == len(dist_tensor.dist_attr.dims_mapping) op_input_attrs = self.get_op_input_attrs(op, var_name) for input_attr in op_input_attrs: @@ -1841,18 +2271,23 @@ def _reshard_input(self, block): if is_union_process_mesh_tensor: # if op process mesh is subset of union tensor process mesh, need no reshard if set(input_attr[0].processes) <= set( - dist_tensor.dist_attr.process_mesh.processes + dist_tensor.dist_attr.process_mesh.processes ): continue if dist_tensor is not None and self.need_reshard( - dist_tensor, input_attr): + dist_tensor, input_attr + ): reshard_op_desc = self.find_op_desc_seq( - dist_tensor, input_attr) - self.parse_op_desc(block, reshard_op_desc, var_name, - op, input_attr) + dist_tensor, input_attr + ) + self.parse_op_desc( + block, reshard_op_desc, var_name, op, input_attr + ) cur_op_count = len(block.ops) - idx_offset = idx_offset + cur_op_count - pre_op_count + idx_offset = ( + idx_offset + cur_op_count - pre_op_count + ) pre_op_count = cur_op_count idx = idx + idx_offset + 1 else: @@ -1867,34 +2302,43 @@ def _hadnle_recv(self, block, idx, var, op, send_rank, recv_rank): shape=var.shape, lod_level=var.lod_level, dtype=paddle.int64, - type=var.type) - Inserter.insert_recv_op(block, idx + 1, - recv_cast_out, send_rank, recv_rank, - op.attr('op_role')) + type=var.type, + ) + Inserter.insert_recv_op( + block, + idx + 1, + recv_cast_out, + send_rank, + recv_rank, + op.attr('op_role'), + ) reset_lod_out = None if var.lod_level != 0: set_lod = False for tmp_block in self.auto_parallel_main_prog.blocks: for tmp_var_name in tmp_block.vars: tmp_var = tmp_block.vars[tmp_var_name] - if tmp_var.is_data and tmp_var.lod_level == var.lod_level: + if ( + tmp_var.is_data + and tmp_var.lod_level == var.lod_level + ): reset_lod_out = block.create_var( - name=unique_name.generate(var.name + - "@RESETLOD"), + name=unique_name.generate( + var.name + "@RESETLOD" + ), shape=recv_cast_out.shape, type=recv_cast_out.type, dtype=recv_cast_out.dtype, - lod_level=recv_cast_out.lod_level) + lod_level=recv_cast_out.lod_level, + ) idx += 1 block._insert_op( idx, type="lod_reset", - inputs={ - 'X': recv_cast_out, - 'Y': tmp_var - }, + inputs={'X': recv_cast_out, 'Y': tmp_var}, outputs={'Out': reset_lod_out}, - attrs={'op_role': op.attr("op_role")}) + attrs={'op_role': op.attr("op_role")}, + ) set_lod = True break if set_lod: @@ -1902,18 +2346,22 @@ def _hadnle_recv(self, block, idx, var, op, send_rank, recv_rank): assert set_lod is True # cast int64 to bool - block._insert_op(idx + 2, - type='cast', - inputs={ - 'X': [recv_cast_out] if - reset_lod_out is None else [reset_lod_out] - }, - outputs={'Out': [var]}, - attrs={ - 'in_dtype': recv_cast_out.dtype, - 'out_dtype': var.dtype, - 'op_role': op.attr('op_role') - }) + cast_op = 
block._insert_op( + idx + 2, + type='cast', + inputs={ + 'X': [recv_cast_out] + if reset_lod_out is None + else [reset_lod_out] + }, + outputs={'Out': [var]}, + attrs={ + 'in_dtype': recv_cast_out.dtype, + 'out_dtype': var.dtype, + 'op_role': op.attr('op_role'), + }, + ) + cast_op._set_attr('op_namescope', "/auto_parallel/reshard") else: if var.lod_level != 0: recv_out = block.create_var( @@ -1921,53 +2369,79 @@ def _hadnle_recv(self, block, idx, var, op, send_rank, recv_rank): shape=var.shape, lod_level=var.lod_level, dtype=var.int64, - type=var.type) - Inserter.insert_recv_op(block, idx + 1, recv_out, send_rank, - recv_rank, op.attr('op_role')) + type=var.type, + ) + Inserter.insert_recv_op( + block, + idx + 1, + recv_out, + send_rank, + recv_rank, + op.attr('op_role'), + ) set_lod = False for tmp_block in self.auto_parallel_main_prog.blocks: for tmp_var_name in tmp_block.vars: tmp_var = tmp_block.vars[tmp_var_name] - if tmp_var.is_data and tmp_var.lod_level == var.lod_level: + if ( + tmp_var.is_data + and tmp_var.lod_level == var.lod_level + ): idx += 1 block._insert_op( idx, type="lod_reset", - inputs={ - 'X': recv_out, - 'Y': tmp_var - }, + inputs={'X': recv_out, 'Y': tmp_var}, outputs={'Out': var}, - attrs={'op_role': op.attr("op_role")}) + attrs={'op_role': op.attr("op_role")}, + ) set_lod = True break if set_lod: break assert set_lod is True else: - Inserter.insert_recv_op(block, idx + 1, var, send_rank, - recv_rank, op.attr('op_role')) + Inserter.insert_recv_op( + block, + idx + 1, + var, + send_rank, + recv_rank, + op.attr('op_role'), + ) def _handle_send(self, block, idx, var, op, send_rank, recv_rank): if var.dtype == paddle.bool: - cast_out = Inserter.insert_cast_op(block, idx + 1, var, - op.attr('op_role'), paddle.int64) - Inserter.insert_send_op(block, idx + 2, cast_out, send_rank, - recv_rank, op.attr('op_role')) + cast_out = Inserter.insert_cast_op( + block, idx + 1, var, op.attr('op_role'), paddle.int64 + ) + Inserter.insert_send_op( + block, + idx + 2, + cast_out, + send_rank, + recv_rank, + op.attr('op_role'), + ) else: - Inserter.insert_send_op(block, idx + 1, var, send_rank, recv_rank, - op.attr('op_role')) + Inserter.insert_send_op( + block, idx + 1, var, send_rank, recv_rank, op.attr('op_role') + ) def _reshard_output(self, block): # insert send and recv op if output process mesh is different from tensor process mesh idx = 0 # skip reader and ops whose process mesh is union skip_ops = [ - "create_py_reader", "create_double_buffer_reader", "read", "while", - "write_to_array", "read_from_array" + "create_py_reader", + "create_double_buffer_reader", + "read", + "write_to_array", + "read_from_array", ] global _g_special_ops skip_ops += _g_special_ops + skip_ops += _g_subblock_ops while idx < len(block.ops): pre_op_count = len(block.ops) op = block.ops[idx] @@ -1975,76 +2449,113 @@ def _reshard_output(self, block): if dist_op is not None and op.type not in skip_ops: idx_offset = 0 for var_name in op.output_arg_names: - var = get_var_with_recursion(var_name, block, - self.auto_parallel_main_prog) + var = get_var_with_recursion( + var_name, block, self.auto_parallel_main_prog + ) dist_tensor = self.dist_context.get_dist_tensor_for_program( - var) + var + ) tensor_process_mesh = dist_tensor.dist_attr.process_mesh output_attr = [ dist_op.dist_attr.process_mesh, - dist_op.dist_attr.get_output_dims_mapping(var_name) + dist_op.dist_attr.get_output_dims_mapping(var_name), ] if dist_tensor is not None and self.need_reshard( - dist_tensor, output_attr, False): + dist_tensor, 
output_attr, False + ): tensor_processes = set( - tensor_process_mesh.processes) - ( - set(tensor_process_mesh.processes) - & set(output_attr[0].processes)) + tensor_process_mesh.processes + ) - ( + set(tensor_process_mesh.processes) + & set(output_attr[0].processes) + ) if tensor_processes: if len(tensor_processes) != len( - output_attr[0].processes): + output_attr[0].processes + ): if dist_tensor.dist_attr.dims_mapping.count( - -1) != len( - dist_tensor.dist_attr.dims_mapping - ) or output_attr[1].count(-1) != len( - output_attr[1]): + -1 + ) != len( + dist_tensor.dist_attr.dims_mapping + ) or output_attr[ + 1 + ].count( + -1 + ) != len( + output_attr[1] + ): raise ValueError( - "The dims_mapping must be -1") + "The dims_mapping must be -1" + ) else: for index, tensor_process in enumerate( - tensor_processes): + tensor_processes + ): recv_rank = tensor_process actual_index = index if index >= len( - output_attr[0].processes): + output_attr[0].processes + ): actual_index = ( - index - - len(output_attr[0].processes) + index + - len(output_attr[0].processes) ) % len(output_attr[0].processes) item = output_attr[0].processes[ - actual_index] + actual_index + ] if recv_rank == item: continue + if var.shape[0] == -1: + new_shape = list(var.shape) + new_shape[0] = self.batch_size + var.desc.set_shape(new_shape) if self.rank_id == item: # if send bool data, cast then send self._handle_send( - block, idx, var, op, item, - recv_rank) + block, + idx, + var, + op, + item, + recv_rank, + ) if self.rank_id == recv_rank: # if recv bool data, recv then cast self._hadnle_recv( - block, idx, var, op, item, - recv_rank) + block, + idx, + var, + op, + item, + recv_rank, + ) else: for index, tensor_process in enumerate( - tensor_processes): + tensor_processes + ): recv_rank = tensor_process item = output_attr[0].processes[index] if recv_rank == item: continue + if var.shape[0] == -1: + new_shape = list(var.shape) + new_shape[0] = self.batch_size + var.desc.set_shape(new_shape) if self.rank_id == item: # if send bool data, cast then send self._handle_send( - block, idx, var, op, item, - recv_rank) + block, idx, var, op, item, recv_rank + ) if self.rank_id == recv_rank: # if recv bool data, recv then cast self._hadnle_recv( - block, idx, var, op, item, - recv_rank) + block, idx, var, op, item, recv_rank + ) cur_op_count = len(block.ops) - idx_offset = idx_offset + cur_op_count - pre_op_count + idx_offset = ( + idx_offset + cur_op_count - pre_op_count + ) pre_op_count = cur_op_count idx = idx + idx_offset + 1 @@ -2066,13 +2577,17 @@ def reshard(self): self._reshard_output(block) # remove no need vars and ops in the main program - Remover.remove_no_need_in_main(self.auto_parallel_main_prog, - self.dist_context, self.rank_id, - self.dist_params_grads) + Remover.remove_no_need_in_main( + self.auto_parallel_main_prog, + self.dist_context, + self.rank_id, + self.dist_params_grads, + ) # remove no need vars and ops in the startip program - Remover.remove_no_need_in_startup(self.auto_parallel_main_prog, - self.auto_parallel_startup_prog) + Remover.remove_no_need_in_startup( + self.auto_parallel_main_prog, self.auto_parallel_startup_prog + ) # reset some variable when remove operation ended Resharder.while_block_info = {} @@ -2090,47 +2605,68 @@ def get_cost(self, op, tensor, cluster): return reshard_op_cost else: dist_tensor = self.dist_context.get_dist_tensor_for_program( - tensor) + tensor + ) # simplified processing: ignore union process mesh and output reshard dist_op = 
self.dist_context.get_dist_op_for_program(op) dims_mapping = dist_op.dist_attr.get_input_dims_mapping( - tensor.name) + tensor.name + ) process_mesh = dist_op.dist_attr.process_mesh dist_attr = [process_mesh, dims_mapping] if dist_tensor is not None and self.need_reshard( - dist_tensor, dist_attr): + dist_tensor, dist_attr + ): if tensor_name not in self._has_resharded: self._has_resharded[tensor_name] = [dist_op] else: for item in self._has_resharded[tensor_name]: item_dist_attr = item.dist_attr - item_dims_mapping = item_dist_attr.get_input_dims_mapping( - tensor_name) + item_dims_mapping = ( + item_dist_attr.get_input_dims_mapping( + tensor_name + ) + ) item_process_mesh = item_dist_attr.process_mesh - if dims_mapping == item_dims_mapping and item_process_mesh == process_mesh: + if ( + dims_mapping == item_dims_mapping + and item_process_mesh == process_mesh + ): return reshard_op_cost self._has_resharded[tensor_name].append(dist_op) - reshard_op_desc = self.find_op_desc_seq(dist_tensor, - dist_attr, - serial=True) + reshard_op_desc = self.find_op_desc_seq( + dist_tensor, dist_attr, serial=True + ) dtype = dist_tensor.serial_tensor.dtype reshard_op_cost = self.parse_op_desc_for_cost( - reshard_op_desc, dtype, cluster) + reshard_op_desc, dtype, cluster + ) return reshard_op_cost - def _concat_partitions_for_cost(self, partition_tensor_list, - partition_index, dtype, rank_id, - local_rank_comp_cost, cluster): + def _concat_partitions_for_cost( + self, + partition_tensor_list, + partition_index, + dtype, + rank_id, + local_rank_comp_cost, + cluster, + ): if not partition_tensor_list: partition_tensor_list.append(partition_index) else: i = 0 has_concat = False while i < len(partition_tensor_list): - concat_axis, first_order, new_partition = Resharder.compute_concat_info( - partition_tensor_list[i], partition_index) + ( + concat_axis, + first_order, + new_partition, + ) = Resharder.compute_concat_info( + partition_tensor_list[i], partition_index + ) if concat_axis != -1: has_concat = True concat_desc = {} @@ -2138,31 +2674,38 @@ def _concat_partitions_for_cost(self, partition_tensor_list, concat_desc["attrs"] = {"axis": concat_axis} if first_order == 0: concat_desc["inputs"] = { - "X": [(dtype, partition_tensor_list[i]), - (dtype, partition_index)] + "X": [ + (dtype, partition_tensor_list[i]), + (dtype, partition_index), + ] } else: concat_desc["inputs"] = { - "X": [(dtype, partition_index), - (dtype, partition_tensor_list[i])] + "X": [ + (dtype, partition_index), + (dtype, partition_tensor_list[i]), + ] } partition_tensor_list.pop(i) if rank_id not in local_rank_comp_cost: local_rank_comp_cost[rank_id] = [] local_rank_comp_cost[rank_id].append( - ConcatOpCost(op_desc=concat_desc, cluster=cluster)) - self._concat_partitions_for_cost(partition_tensor_list, - new_partition, dtype, - rank_id, - local_rank_comp_cost, - cluster) + ConcatOpCost(op_desc=concat_desc, cluster=cluster) + ) + self._concat_partitions_for_cost( + partition_tensor_list, + new_partition, + dtype, + rank_id, + local_rank_comp_cost, + cluster, + ) break i += 1 if not has_concat: partition_tensor_list.append(partition_index) def parse_op_desc_for_cost(self, reshard_op_desc, dtype, cluster): - def _get_idx(comm_ranks, group_ranks): res, is_the_same = None, False idx = 0 @@ -2193,28 +2736,41 @@ def _get_idx(comm_ranks, group_ranks): if isinstance(op_desc, SendOpDesc): group_ranks = [key, op_desc.dst] shape = op_desc.shape - send_desc = build_comm_desc("send_v2", group_ranks, dtype, - shape) + send_desc = build_comm_desc( + 
"send_v2", group_ranks, dtype, shape + ) idx, is_the_same = _get_idx(comm_ranks, group_ranks) if idx is None: - comm_costs.append([ - (group_ranks, - SendOpCost(op_desc=send_desc, - comm_context=comm_context)) - ]) + comm_costs.append( + [ + ( + group_ranks, + SendOpCost( + op_desc=send_desc, + comm_context=comm_context, + ), + ) + ] + ) comm_ranks.append(set(group_ranks)) else: if not is_the_same: comm_costs[idx].append( - (group_ranks, - SendOpCost(op_desc=send_desc, - comm_context=comm_context))) + ( + group_ranks, + SendOpCost( + op_desc=send_desc, + comm_context=comm_context, + ), + ) + ) elif isinstance(op_desc, AllGatherOpDesc): # NOTE: fill_const and other unnecessary op is not calculated because those cost is very small group_ranks = op_desc.group shape = op_desc.shape - allgather_desc = build_comm_desc("c_allgather", group_ranks, - dtype, shape) + allgather_desc = build_comm_desc( + "c_allgather", group_ranks, dtype, shape + ) split_inputs_shape = [] for idx, dim in enumerate(shape): if idx == 0: @@ -2223,18 +2779,29 @@ def _get_idx(comm_ranks, group_ranks): split_inputs_shape.append(dim) idx, is_the_same = _get_idx(comm_ranks, group_ranks) if idx is None: - comm_costs.append([ - (group_ranks, - AllgatherOpCost(op_desc=allgather_desc, - comm_context=comm_context)) - ]) + comm_costs.append( + [ + ( + group_ranks, + AllgatherOpCost( + op_desc=allgather_desc, + comm_context=comm_context, + ), + ) + ] + ) comm_ranks.append(set(group_ranks)) else: if not is_the_same: comm_costs[idx].append( - (group_ranks, - AllgatherOpCost(op_desc=allgather_desc, - comm_context=comm_context))) + ( + group_ranks, + AllgatherOpCost( + op_desc=allgather_desc, + comm_context=comm_context, + ), + ) + ) # calc the split op cost if key not in local_rank_comp_cost: local_rank_comp_cost[key] = [] @@ -2245,19 +2812,27 @@ def _get_idx(comm_ranks, group_ranks): } split_desc["attrs"] = {"num": len(group_ranks), "axis": 0} local_rank_comp_cost[key].append( - SplitOpCost(op_desc=split_desc, cluster=cluster)) + SplitOpCost(op_desc=split_desc, cluster=cluster) + ) elif isinstance(op_desc, ConcatOpDesc): partition_index_list = op_desc._partition_index_list for idx, partion_idex in enumerate(partition_index_list): self._concat_partitions_for_cost( - partition_tensor_list, partion_idex, dtype, key, - local_rank_comp_cost, cluster) + partition_tensor_list, + partion_idex, + dtype, + key, + local_rank_comp_cost, + cluster, + ) elif isinstance(op_desc, SliceOpDesc): if key not in local_rank_comp_cost: local_rank_comp_cost[key] = [] - assert len( - partition_tensor_list) == 1 or not partition_tensor_list + assert ( + len(partition_tensor_list) == 1 + or not partition_tensor_list + ) to_slice_tensor_shape = [] if len(partition_tensor_list) == 1: for item in partition_tensor_list[0]: @@ -2271,13 +2846,14 @@ def _get_idx(comm_ranks, group_ranks): "axes": op_desc.axes, "starts": op_desc.starts, "ends": op_desc.ends, - "infer_flags": infer_flags + "infer_flags": infer_flags, } slice_desc["inputs"] = { "Input": [(dtype, to_slice_tensor_shape)] } local_rank_comp_cost[key].append( - SliceOpCost(op_desc=slice_desc, cluster=cluster)) + SliceOpCost(op_desc=slice_desc, cluster=cluster) + ) res = (comm_costs, local_rank_comp_cost) diff --git a/python/paddle/distributed/auto_parallel/strategy.py b/python/paddle/distributed/auto_parallel/strategy.py new file mode 100644 index 00000000000000..f7dd7e6697b889 --- /dev/null +++ b/python/paddle/distributed/auto_parallel/strategy.py @@ -0,0 +1,196 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import os +import copy +import argparse +from . import constants + + +class BaseConfig(object): + def __init__(self, category, config_dict=None): + self._category = category + self._config_dict = None + if config_dict is not None: + if isinstance(config_dict, dict): + self._config_dict = config_dict + else: + raise ValueError( + "Expected a dictionary. But received: {}".format( + config_dict + ) + ) + # Initialize attributes by the default config + config = constants.get_category_default_config(self._category) + for field, default_value in config.items(): + setattr(self, field, default_value) + + # Overide attributes by the config_dict + if self._config_dict: + self.from_dict(self._config_dict) + + def from_dict(self, config_dict): + config = constants.get_category_default_config(self._category) + for field in config.keys(): + value = config_dict.get(field, constants.NOT_FOUND) + # Use the default value if we cannot found the value + if value != constants.NOT_FOUND: + setattr(self, field, value) + + def to_dict(self): + result_dict = {} + config = constants.get_category_default_config(self._category) + for field in config.keys(): + value = getattr(self, field) + result_dict[field] = value + for field, value in self.__dict__.items(): + if isinstance(value, BaseConfig): + result_dict[field] = value.to_dict() + return result_dict + + def __repr__(self): + result_dict = self.to_dict() + string = "{" + for k, v in result_dict.items(): + string += "\"%s\":\"%s\"," % (k, v) + return string + "}" + + def __deepcopy__(self, memo): + cls = self.__class__ + result = cls.__new__(cls) + memo[id(self)] = result + for k, v in self.__dict__.items(): + setattr(result, k, copy.deepcopy(v, memo)) + return result + + +class RecomputeConfig(BaseConfig): + def __init__(self, config_dict=None): + category = constants.RECOMPUTE + super(RecomputeConfig, self).__init__(category, config_dict) + + +class AMPConfig(BaseConfig): + def __init__(self, config_dict=None): + category = constants.AMP + super(AMPConfig, self).__init__(category, config_dict) + + +class ShardingConfig(BaseConfig): + def __init__(self, config_dict=None): + category = constants.SHARDING + super(ShardingConfig, self).__init__(category, config_dict) + + +class GradientMergeConfig(BaseConfig): + def __init__(self, config_dict=None): + category = constants.GRADIENT_MERGE + super(GradientMergeConfig, self).__init__(category, config_dict) + + +class PipelineConfig(BaseConfig): + def __init__(self, config_dict=None): + category = constants.PIPELINE + super(PipelineConfig, self).__init__(category, config_dict) + + +class QATConfig(BaseConfig): + def __init__(self, config_dict=None): + category = constants.QAT + super(QATConfig, self).__init__(category, config_dict) + + +class TuningConfig(BaseConfig): + def __init__(self, config_dict=None): + category = constants.TUNING + super(TuningConfig, self).__init__(category, config_dict) + + +class DatasetConfig(BaseConfig): + def __init__(self, 
config_dict=None): + category = constants.DATASET + super(DatasetConfig, self).__init__(category, config_dict) + + +class Strategy(BaseConfig): + """ + The `Strategy` object is used to configure the paralleization and optimization beheviors. + + Args: + config (dict|string, optional): If this is None, the default configurations will used. + If this is a dictionary, the recognized key-value of it will be used to override the default + configurations while other default configurations are left unchanged. If this is a string, + it is interpreted as the path to a YAML configuration and will be loaded to override the + corresponding default configurations. + + Examples: + .. code-block:: python + + import paddle + from paddle.distributed.fleet import auto + + strategy = auto.Strategy() + sharding = strategy.sharding + self.assertEqual(sharding.enabled, False) + self.assertEqual(sharding.stage, 1) + self.assertEqual(sharding.degree, 8) + sharding.enabled = True + sharding.stage = 2 + sharding.degree = 2 + self.assertEqual(sharding.enabled, True) + self.assertEqual(sharding.stage, 2) + self.assertEqual(sharding.degree, 2) + + """ + + def __init__(self, config=None): + if config is not None: + if isinstance(config, dict): + self._config_dict = copy.deepcopy(config) + # elif os.path.exists(config): + # with open(config, "rb") as yaml_file: + # self._config_dict = yaml.load(yaml_file, Loader=yaml.Loader) + else: + raise ValueError( + "Expected a dictionary. But received: {}".format(config) + ) + else: + self._config_dict = {} + + category = constants.BASE + super(Strategy, self).__init__(category, self._config_dict) + + config_dict = self._config_dict.get(constants.RECOMPUTE, None) + self.recompute = RecomputeConfig(config_dict) + + config_dict = self._config_dict.get(constants.AMP, None) + self.amp = AMPConfig(config_dict) + + config_dict = self._config_dict.get(constants.SHARDING, None) + self.sharding = ShardingConfig(config_dict) + + config_dict = self._config_dict.get(constants.GRADIENT_MERGE, None) + self.gradient_merge = GradientMergeConfig(config_dict) + + config_dict = self._config_dict.get(constants.PIPELINE, None) + self.pipeline = PipelineConfig(config_dict) + + config_dict = self._config_dict.get(constants.QAT, None) + self.qat = QATConfig(config_dict) + + config_dict = self._config_dict.get(constants.TUNING, None) + self.tuning = TuningConfig(config_dict) + + config_dict = self._config_dict.get(constants.DATASET, None) + self.dataset = DatasetConfig(config_dict) diff --git a/python/paddle/distributed/auto_parallel/tuner/algorithms.py b/python/paddle/distributed/auto_parallel/tuner/algorithms.py index 8440ab91a811a0..16b0cea342dfb6 100644 --- a/python/paddle/distributed/auto_parallel/tuner/algorithms.py +++ b/python/paddle/distributed/auto_parallel/tuner/algorithms.py @@ -16,7 +16,7 @@ from abc import ABC, abstractmethod import logging -from paddle.distributed.utils import get_logger +from ..utils import get_logger from .trial import TrialStatus from .trial import OptimizationTunerTrial as Trial @@ -110,13 +110,13 @@ class ShardingStageAlgorithm(AlgorithmBase): # TODO import trial class & copy strategy def __init__(self, config): super().__init__(config) - self._changed_configs = ["sharding_configs"] + self._changed_configs = ["sharding"] def _init_spaces(self): self._max_stage = 3 self._trial_idx = 0 - stage_range = self._config.sharding_configs.get("stage_range", None) + stage_range = self._config.sharding.to_dict().get("tuning_range", None) if stage_range: assert 
set(stage_range).issubset( set([0, 1, 2, 3]) @@ -136,9 +136,8 @@ def next_trial(self): stage = self._stage_range[self._trial_idx] new_strategy = copy.deepcopy(self._config.dist_strategy) - config_dict = new_strategy.sharding_configs - config_dict["stage"] = stage - new_strategy.sharding_configs = config_dict + sharding = new_strategy.sharding + sharding.stage = stage name = "trial-sharding-stage{}".format(stage) trial = Trial(new_strategy, name, self.changed_configs) diff --git a/python/paddle/distributed/auto_parallel/tuner/config.py b/python/paddle/distributed/auto_parallel/tuner/config.py index 19818a3a655703..b1eedbe04f0eb4 100644 --- a/python/paddle/distributed/auto_parallel/tuner/config.py +++ b/python/paddle/distributed/auto_parallel/tuner/config.py @@ -17,15 +17,13 @@ import pathlib import paddle -from paddle.distributed import fleet +from ..strategy import Strategy _tuning_supported_passes = ["sharding", "recompute"] -_strategy_config_suffiex = "_configs" def _get_pass_config(strategy, pass_name): - config_name = pass_name + _strategy_config_suffiex - config = getattr(strategy, config_name) + config = getattr(strategy, pass_name) return config @@ -38,10 +36,8 @@ class TuningConfig(object): def __init__(self, user_config, strategy): - if not isinstance(strategy, fleet.DistributedStrategy): - raise TypeError( - "'strategy' must be object of class `fleet.DistributedStrategy`." - ) + if not isinstance(strategy, Strategy): + raise TypeError("'strategy' must be object of class `Strategy`.") if not user_config: user_config = {} @@ -116,11 +112,11 @@ def _initialize(self, user_config): for p in _tuning_supported_passes: if getattr(self._dist_strategy, p) and _get_pass_config( - self._dist_strategy, p)["enable_tuning"]: + self._dist_strategy, p).enable_tuning: # TODO distinguish different args of each passes self._tuning_passes_name.add(p) - config_name = p + _strategy_config_suffiex + config_name = p p_dict = getattr(self._dist_strategy, config_name) self.__dict__[config_name] = p_dict diff --git a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py b/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py index ec50371c7ec0e6..4b3c53ef30b43e 100644 --- a/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py +++ b/python/paddle/distributed/auto_parallel/tuner/optimization_tuner.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
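# [Editor's note] The following sketch is not part of the patch. It is a
# minimal, standalone illustration of the default-plus-override pattern that
# the new BaseConfig/Strategy classes in strategy.py implement: each category
# starts from its default field values and only the keys present in the user
# dict are overridden, which is also what lets tuner/config.py read a pass
# config directly via getattr(strategy, pass_name).enable_tuning. The field
# names and defaults below (enable, stage, degree, enable_tuning) are
# illustrative assumptions, not the actual contents of constants.py.
class _ShardingConfigSketch:
    _DEFAULTS = {"enable": False, "stage": 1, "degree": 8, "enable_tuning": False}

    def __init__(self, config_dict=None):
        # Start from the category defaults, then override recognized keys only.
        for field, default_value in self._DEFAULTS.items():
            setattr(self, field, default_value)
        for field, value in (config_dict or {}).items():
            if field in self._DEFAULTS:
                setattr(self, field, value)

    def to_dict(self):
        return {field: getattr(self, field) for field in self._DEFAULTS}


if __name__ == "__main__":
    # Only "stage" is overridden; every other field keeps its default, which
    # mirrors how Strategy({"sharding": {"stage": 2}}).sharding behaves above.
    sharding = _ShardingConfigSketch({"stage": 2, "unknown_key": 1})
    assert sharding.stage == 2 and sharding.enable is False
    assert not hasattr(sharding, "unknown_key")  # unrecognized keys are ignored
    print(sharding.to_dict())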
+# import yaml import os import sys import copy @@ -29,7 +30,6 @@ from paddle.fluid import program_guard from paddle.fluid.backward import append_backward from paddle.distributed.passes import new_pass, PassContext -from paddle.distributed.utils import get_logger from paddle.distributed.auto_parallel.dist_context import DistributedContext, get_default_distributed_context from paddle.distributed.auto_parallel.completion import Completer @@ -39,6 +39,7 @@ from paddle.distributed.auto_parallel.utils import debug_program from paddle.distributed.auto_parallel.utils import make_data_unshard, set_grad_var_shape +from ..utils import get_logger from .config import TuningConfig from .algorithms import new_algorithm from .trial import TrialStatus @@ -135,12 +136,24 @@ def _copy_context(ref_dist_context): for key, var_list in ref_dist_context._serial_fetch_vars.items(): new_var_list = [] - for var in var_list: - block_idx = var.block.idx - var_name = var.name - var = new_dist_context._serial_main_program.blocks[ - block_idx]._var_recursive(var_name) - new_var_list.append(var) + # metrics is a list of list + if key == "metrics": + for inner_var_list in var_list: + new_inner_var_list = [] + for var in inner_var_list: + block_idx = var.block.idx + var_name = var.name + var = new_dist_context._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + new_inner_var_list.append(var) + new_var_list.append(new_inner_var_list) + else: + for var in var_list: + block_idx = var.block.idx + var_name = var.name + var = new_dist_context._serial_main_program.blocks[ + block_idx]._var_recursive(var_name) + new_var_list.append(var) new_dist_context._serial_fetch_vars[key] = new_var_list # copy information in forward and backward @@ -256,8 +269,8 @@ def _apply_optimization(self, trial): startup_program = dist_context.serial_startup_program # applying optimization pass - if new_strategy.amp: - config = copy.deepcopy(new_strategy.amp_configs) + if new_strategy.amp.enable: + config = copy.deepcopy(new_strategy.amp.to_dict()) config["dist_context"] = dist_context config["params_grads"] = dist_context._params_grads @@ -275,8 +288,8 @@ def _apply_optimization(self, trial): auto_parallel_amp_pass.apply([main_program], [startup_program], pass_context) - if new_strategy.recompute: - config = copy.deepcopy(new_strategy.recompute_configs) + if new_strategy.recompute.enable: + config = copy.deepcopy(new_strategy.recompute.to_dict()) config["dist_context"] = dist_context config["no_grad_set"] = None config["loss"] = dist_context.serial_loss @@ -303,8 +316,8 @@ def _apply_optimization(self, trial): dist_context, dist_params_grads) resharder.reshard() - if new_strategy.sharding: - config = copy.deepcopy(new_strategy.sharding_configs) + if new_strategy.sharding.enable: + config = copy.deepcopy(new_strategy.sharding.to_dict()) config["dist_context"] = dist_context config["params_grads"] = dist_params_grads config["global_rank"] = self.rank @@ -313,8 +326,8 @@ def _apply_optimization(self, trial): auto_parallel_sharding_pass.apply([dist_main_prog], [dist_startup_prog], pass_context) - if new_strategy.gradient_merge: - config = copy.deepcopy(new_strategy.gradient_merge_configs) + if new_strategy.gradient_merge.enable: + config = copy.deepcopy(new_strategy.gradient_merge.to_dict()) config["dist_context"] = dist_context config["params_grads"] = dist_params_grads auto_parallel_gradient_merge_pass = new_pass( @@ -492,9 +505,10 @@ def summary(self): for line in summary_.split("\n"): fw.write(line + "\n") - full_strategy = 
self.get_best_config() - full_strategy.save_to_prototxt( - os.path.join(self.project_dir, "tuned_dist_strategy.prototxt")) + # full_strategy = self.get_best_config() + # path = os.path.join(self.project_dir, "tuned_dist_strategy.yaml") + # with open(path, 'w') as outfile: + # yaml.dump(full_strategy, outfile, default_flow_style=False) def clear(self): """ diff --git a/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py b/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py new file mode 100644 index 00000000000000..24ee382f7f75aa --- /dev/null +++ b/python/paddle/distributed/auto_parallel/tuner/parallel_tuner.py @@ -0,0 +1,968 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import time +import math +import copy +import hashlib +import itertools +from collections import defaultdict +import numpy as np +from ..process_mesh import ProcessMesh +from ..completion import Completer +from ..parallelizer_v2 import Parallelizer +from ..dist_context import _node_id +from ..dist_op import DistributedOperator +from ..operators.common import find_compatible_distributed_operator_impls +from .trial import Trial, TrialStatus +from .tunable_space import TunableSpace +from .tunable_variable import Boolean, IntRange +from ..cost import CostEstimator +from .tunable_variable import Boolean, IntRange + + +class ParallelTuner: + + def __init__(self, + dist_context, + mode="train", + max_trials=25, + tuner_id=None, + seed=None, + logger=None, + loop_count=10): + self._loop_count = loop_count + self._estimator = None + self._dist_context = dist_context + assert self._dist_context._is_initialized + self._mode = mode + self._cluster = self._dist_context.cluster + self._num_machines = self._cluster.get_num_machines() + self._num_devices_per_machine = self._cluster.get_num_devices_per_machine( + ) + self._space = TunableSpace() + self._objective = "time" + self._direction = "min" + self._max_trials = max_trials + self._tuner_id = tuner_id + self._seed = seed if seed is not None else 9999 + + print("seed", + self._seed, + "mode", + self._mode, + "num_machies", + self._num_machines, + "num_devices_per_machine", + self._num_devices_per_machine, + flush=True) + self._seed_state = self._seed + self._logger = logger + self._max_collisions = 3 + self._tried_values = set() + self._num_trials = 0 + self._rng = np.random.default_rng(self._seed) + + # Search the op types in the include_op_types, + # and will search all op types if it is empty. + # Exclude the op types in the exclude_op_types + # from the search list. + self._exclude_op_types = [] + self._include_op_types = [] + # The final dist ops will be searched after considering + # the include_op_types and exclude_op_types. 
+ self._concerned_dist_ops = {} + + self._op_id_to_dist_attr_candidates = defaultdict(list) + self._cached_dims_mapping_candidates = {} + self._cached_candidates_info = defaultdict(list) + + self._special_ops = [ + "create_py_reader", "create_double_buffer_reader", "read", "while", + "read_from_array", "write_to_array" + ] + + # Each parallel strategy has two elements. The First one is for distributed tensors, + # the second element is for distributed tensors, the third element is for process meshes. + self._init_parallel_strategy = [None, None, None] + self._best_parallel_strategy = [None, None, None] + + self._completer = Completer(self._dist_context) + + self._parallelizer = Parallelizer(self._mode, self._completer, + self._dist_context) + + def _generate_combination(self, + elements, + target, + idx, + partial_candidate, + candidates, + num_candidates=None): + if target == 0: + candidates.append(copy.deepcopy(partial_candidate)) + return + + if target < 0 or idx == len(elements) \ + or len(candidates) > num_candidates: + return + + # Use + partial_candidate.append(elements[idx]) + self._generate_combination(elements, target - elements[idx], idx, + partial_candidate, candidates, + num_candidates) + # Not use + partial_candidate.pop() + self._generate_combination(elements, target, idx + 1, partial_candidate, + candidates, num_candidates) + + def _permute_combination(self, + combination, + target, + check, + partial_candidate, + candidates, + num_candidates=None, + skip_prob=None): + if num_candidates is not None \ + and len(candidates) == num_candidates: + return + + if len(partial_candidate) == len(combination): + candidates.append(partial_candidate) + return + + for i in range(len(combination)): + if check[i] == 1: + continue + if self._rng.choice([True, False], p=[skip_prob, 1 - skip_prob]): + continue + if i > 0 and combination[i] == combination[i - 1] \ + and check[i -1] == 0: + continue + check[i] = 1 + self._permute_combination(combination, target, check, + partial_candidate + [combination[i]], + candidates, num_candidates, skip_prob) + check[i] = 0 + + def _partition_number(self, target): + log2_target = int(math.log2(target)) + elements = [pow(2, i) for i in range(log2_target)] + if pow(2, log2_target) == target: + elements.append(target) + seed_candidates = [] + num_seed_candidates = 1000 + partial_results = [] + self._generate_combination(elements, target, 0, partial_results, + seed_candidates, num_seed_candidates) + + candidates = [] + for seed_candidate in seed_candidates: + cur_candidates = [] + num_cur_candidates = 16 + seed_candidate.sort() + check = [0 for i in range(len(seed_candidate))] + if target <= 8: + skip_prob = 0.0 + else: + skip_prob = (len(seed_candidate) / target) + self._permute_combination(seed_candidate, target, check, [], + cur_candidates, num_cur_candidates, + skip_prob) + candidates.extend(cur_candidates) + return candidates + + def _partition_devices(self, num_machines, num_devices_per_machine): + inter_node_partitions = self._partition_number(num_machines) + intra_node_partitions = self._partition_number(num_devices_per_machine) + return inter_node_partitions, intra_node_partitions + + def _generate_process_mesh_list(self, inter_node_partition, + intra_node_partition): + process_mesh_list = [] + start_row = 0 + start_col = 0 + for m in inter_node_partition: + start_col = 0 + for n in intra_node_partition: + process_mesh = [] + for p in range(m): + start = (start_row + + p) * self._num_devices_per_machine + start_col + tmp = [] + for q in range(n): + 
tmp.append(start + q) + process_mesh.append(tmp) + process_mesh_list.append(copy.deepcopy(process_mesh)) + start_col += n + start_row += m + return process_mesh_list + + def _generate_dims_mapping_candidates_helper(self, dims_mapping, dims_list, + start, visited, candidates): + if start == len(dims_mapping) or all(visited): + candidates.append(copy.deepcopy(dims_mapping)) + return + + for idx, dim in enumerate(dims_list): + if visited[idx] == False: + dims_mapping[start] = dim + visited[idx] = True + self._generate_dims_mapping_candidates_helper( + dims_mapping, dims_list, start + 1, visited, candidates) + visited[idx] = False + dims_mapping[start] = -1 + self._generate_dims_mapping_candidates_helper(dims_mapping, dims_list, + start + 1, visited, + candidates) + + def _generate_dims_mapping_candidates(self, dims_mapping_len, + process_mesh_len): + assert dims_mapping_len >= 1 and process_mesh_len >= 1 + key = (dims_mapping_len, process_mesh_len) + if key in self._cached_dims_mapping_candidates: + return self._cached_dims_mapping_candidates[key] + candidates = [] + dims_mapping = [-1 for i in range(dims_mapping_len)] + dims_list = [i for i in range(process_mesh_len)] + visited = [False for i in range(process_mesh_len)] + self._generate_dims_mapping_candidates_helper(dims_mapping, dims_list, + 0, visited, candidates) + self._cached_dims_mapping_candidates[key] = candidates + return candidates + + def _generate_dist_attr_candidates(self, op_id, dist_op): + # For now, only allow the process meshes have two dimensions + process_mesh_len = 2 + serial_op = dist_op.serial_op + op_dist_attr = dist_op.dist_attr + if serial_op.type in self._special_ops: + return [copy.deepcopy(op_dist_attr)] + key = [] + key.append(serial_op.type) + for input_name in serial_op.input_names: + key.append(input_name) + for input_arg_name in serial_op.input(input_name): + key.append( + len(op_dist_attr.get_input_dims_mapping(input_arg_name))) + for output_name in serial_op.output_names: + key.append(output_name) + for output_arg_name in serial_op.output(output_name): + key.append( + len(op_dist_attr.get_output_dims_mapping(output_arg_name))) + key = tuple(key) + + if key in self._cached_candidates_info: + cached_dist_attr_candidates = [] + cached_input_arg_names = self._cached_candidates_info[key][0] + cached_output_arg_names = self._cached_candidates_info[key][1] + for cached_dist_attr in self._cached_candidates_info[key][2]: + new_op_dist_attr = copy.deepcopy(dist_op.dist_attr) + i = 0 + for input_name in serial_op.input_names: + for input_arg_name in serial_op.input(input_name): + cached_dims_mapping = cached_dist_attr.get_input_dims_mapping( + cached_input_arg_names[i]) + new_op_dist_attr.set_input_dims_mapping( + input_arg_name, cached_dims_mapping) + i += 1 + i = 0 + for output_name in serial_op.output_names: + for output_arg_name in serial_op.output(output_name): + cached_dims_mapping = cached_dist_attr.get_output_dims_mapping( + cached_output_arg_names[i]) + new_op_dist_attr.set_output_dims_mapping( + output_arg_name, cached_dims_mapping) + i += 1 + cached_dist_attr_candidates.append(new_op_dist_attr) + return cached_dist_attr_candidates + + # cached_candidates_info = [] + input_arg_names = [] + for input_name in serial_op.input_names: + for input_arg_name in serial_op.input(input_name): + input_arg_names.append(input_arg_name) + self._cached_candidates_info[key].append(input_arg_names) + # cached_candidates_info.append(input_arg_names) + output_arg_names = [] + for output_name in serial_op.output_names: + for 
output_arg_name in serial_op.output(output_name): + output_arg_names.append(output_arg_name) + self._cached_candidates_info[key].append(output_arg_names) + # cached_candidates_info.append(output_arg_names) + + new_op_dist_attr = copy.deepcopy(dist_op.dist_attr) + # Find valid dims_mapping candidates for inputs + input_names = [] + dims_mapping_generated = [] + inputs_dist_attrs = op_dist_attr.inputs_dist_attrs + for tensor_name, tensor_dist_attr in inputs_dist_attrs.items(): + original_dims_mapping = tensor_dist_attr.dims_mapping + dims_mapping_len = len(original_dims_mapping) + input_names.append(tensor_name) + if dims_mapping_len < 1: + dims_mapping_generated.append( + [copy.deepcopy(original_dims_mapping)]) + else: + dims_mapping_generated.append( + self._generate_dims_mapping_candidates( + dims_mapping_len, process_mesh_len)) + input_dims_mapping_candidates = [] + for dims_mapping_list in itertools.product(*dims_mapping_generated): + dims_mapping_list = list(dims_mapping_list) + assert len(dims_mapping_list) == len(input_names) + for i, dims_mapping in enumerate(dims_mapping_list): + new_op_dist_attr.set_input_dims_mapping(input_names[i], + dims_mapping) + new_dist_op = DistributedOperator(dist_op.serial_op, + new_op_dist_attr) + dist_op_impls = find_compatible_distributed_operator_impls( + new_dist_op, fwd=True) + if dist_op_impls is not None: + input_dims_mapping_candidates.append(dims_mapping_list) + + # Find valid dims_mapping candidates for outputs + output_names = [] + dims_mapping_generated = [] + outputs_dist_attrs = op_dist_attr.outputs_dist_attrs + for tensor_name, tensor_dist_attr in outputs_dist_attrs.items(): + original_dims_mapping = tensor_dist_attr.dims_mapping + dims_mapping_len = len(original_dims_mapping) + output_names.append(tensor_name) + if dims_mapping_len < 1: + dims_mapping_generated.append( + [copy.deepcopy(original_dims_mapping)]) + else: + dims_mapping_generated.append( + self._generate_dims_mapping_candidates( + dims_mapping_len, process_mesh_len)) + output_dims_mapping_candidates = [] + for dims_mapping_list in itertools.product(*dims_mapping_generated): + dims_mapping_list = list(dims_mapping_list) + assert len(dims_mapping_list) == len(output_names) + for i, dims_mapping in enumerate(dims_mapping_list): + new_op_dist_attr.set_output_dims_mapping( + output_names[i], dims_mapping) + new_dist_op = DistributedOperator(dist_op.serial_op, + new_op_dist_attr) + dist_op_impls = find_compatible_distributed_operator_impls( + new_dist_op, fwd=False) + if dist_op_impls is not None: + output_dims_mapping_candidates.append(dims_mapping_list) + + if not input_dims_mapping_candidates and output_dims_mapping_candidates: + inout_dims_mapping_generated = [[[[-2]]], + output_dims_mapping_candidates] + elif input_dims_mapping_candidates and not output_dims_mapping_candidates: + inout_dims_mapping_generated = [ + input_dims_mapping_candidates, [[[-2]]] + ] + elif not input_dims_mapping_candidates and not output_dims_mapping_candidates: + inout_dims_mapping_generated = [[[[-2]]], [[[-2]]]] + else: + inout_dims_mapping_generated = [ + input_dims_mapping_candidates, output_dims_mapping_candidates + ] + # Find valid dims_mapping generated for both inputs and outputs + cached_dist_attr_candidates = [] + for inout_dims_mapping_list in itertools.product( + *inout_dims_mapping_generated): + assert len(inout_dims_mapping_list) == 2 + if input_dims_mapping_candidates: + assert len(inout_dims_mapping_list[0]) == len(input_names) + if output_dims_mapping_candidates: + assert 
len(inout_dims_mapping_list[1]) == len(output_names) + # set the dims_mappings for inputs + for i, dims_mapping in enumerate(inout_dims_mapping_list[0]): + if dims_mapping != [-2]: + new_op_dist_attr.set_input_dims_mapping( + input_names[i], dims_mapping) + # set the dims_mappings for outputs + for i, dims_mapping in enumerate(inout_dims_mapping_list[1]): + if dims_mapping != [-2]: + new_op_dist_attr.set_output_dims_mapping( + output_names[i], dims_mapping) + new_dist_op = DistributedOperator(dist_op.serial_op, + new_op_dist_attr) + dist_op_impls = find_compatible_distributed_operator_impls( + new_dist_op, partial=False) + if dist_op_impls is None: + continue + for dist_op_impl in dist_op_impls: + new_op_dist_attr.impl_type = dist_op_impl.type + new_op_dist_attr.impl_idx = dist_op_impl.idx + cached_dist_attr_candidates.append( + copy.deepcopy(new_op_dist_attr)) + self._cached_candidates_info[key].append(cached_dist_attr_candidates) + return self._cached_candidates_info[key][2] + + def construct_space(self): + inter_node_partitions, intra_node_partitions = self._partition_devices( + self._num_machines, self._num_devices_per_machine) + self._space.choice("inter_node_partitions", + inter_node_partitions, + default=inter_node_partitions[0]) + self._space.choice("intra_node_partitions", + intra_node_partitions, + default=intra_node_partitions[0]) + + dist_ops = self._dist_context._dist_ops_for_program + for op_id, dist_op in dist_ops.items(): + op_type = dist_op.serial_op.type + if self._include_op_types: + if op_type in self._include_op_types: + self._concerned_dist_ops[op_id] = dist_op + else: + self._concerned_dist_ops[op_id] = dist_op + + for op_id, dist_op in self._concerned_dist_ops.items(): + op_type = dist_op.serial_op.type + if op_type in self._exclude_op_types: + del self._concerned_dist_ops[op_id] + + print("Number of the concered dist ops", + len(self._concerned_dist_ops), + flush=True) + search_space = 1 + for op_id, dist_op in self._concerned_dist_ops.items(): + op_dist_attr_candidates = self._generate_dist_attr_candidates( + op_id, dist_op) + search_space *= len(op_dist_attr_candidates) + self._space.choice(str(op_id), + op_dist_attr_candidates, + default=op_dist_attr_candidates[0]) + + def _compute_values_hash(self, values): + keys = sorted(values.keys()) + s = "".join(str(k) + "=" + str(values[k]) for k in keys) + return hashlib.sha256(s.encode("utf-8")).hexdigest()[:32] + + def _random_values(self): + space = TunableSpace() + collisions = 0 + while True: + for v in self._space.variables.values(): + space._register(v) + space.values[v.name] = v.random(self._seed_state) + self._seed_state += 1 + values = space.values + values_hash = self._compute_values_hash(values) + if values_hash in self._tried_values: + collisions += 1 + if collisions > self._max_collisions: + return None + continue + self._tried_values.add(values_hash) + break + return values + + def _populate_space(self): + values = self._random_values() + if values is None: + return {"status": TrialStatus.STOPPED, "values": None} + return {"status": TrialStatus.RUNNING, "values": values} + + def _create_trial(self): + trial_id = "{{:0{}d}}".format(len(str(self._max_trials))) + trial_id = trial_id.format(self._num_trials) + + if self._max_trials and self._num_trials >= self._max_trials: + status = TrialStatus.STOPPED + values = None + else: + results = self._populate_space() + status = results["status"] + values = results["values"] + + space = TunableSpace() + space.variables = self._space.variables + space.values = values 
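# [Editor's note] Not part of the patch: a self-contained sketch of the
# de-duplication scheme used by _compute_values_hash and _random_values above.
# Each sampled assignment of tunable values is reduced to an order-independent
# hash (sorted "key=value" pairs, truncated sha256), and a sample is rejected
# if that hash has been tried before; after a bounded number of collisions the
# sampler gives up, which is what turns into TrialStatus.STOPPED. The space
# and seed arguments below are simplified stand-ins for TunableSpace and the
# tuner's own RNG state.
import hashlib
import random


def _values_hash(values):
    # Same scheme as _compute_values_hash: sorted keys, truncated digest.
    s = "".join(str(k) + "=" + str(values[k]) for k in sorted(values.keys()))
    return hashlib.sha256(s.encode("utf-8")).hexdigest()[:32]


def sample_unique(space, tried, max_collisions=3, seed=0):
    rng = random.Random(seed)
    collisions = 0
    while True:
        values = {name: rng.choice(choices) for name, choices in space.items()}
        values_hash = _values_hash(values)
        if values_hash in tried:
            collisions += 1
            if collisions > max_collisions:
                return None  # exhausted: the caller stops creating trials
            continue
        tried.add(values_hash)
        return values

# Example usage (values already tried are skipped):
# tried = set()
# first = sample_unique({"stage": [1, 2, 3]}, tried)
# second = sample_unique({"stage": [1, 2, 3]}, tried)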
+ trial = Trial(tunable_space=space, trial_id=trial_id, status=status) + self._num_trials += 1 + return trial + + def _generate_pipeline_starts(self, process_mesh_list): + total_ops = len(self._dist_context._dist_ops_for_program) + total_stages = len(process_mesh_list) + ops_per_stage = total_ops // total_stages + if ops_per_stage == 0: + return None + # Compute the initial pipeline starts + pipeline_starts = [] + start = 0 + pipeline_starts.append(0) + # The pipeline_starts have total_stages+1 items, and + # at least have 2 items. + for _ in process_mesh_list: + start += ops_per_stage + pipeline_starts.append(start) + pipeline_starts[-1] = total_ops + # Adjust the pipeline starts by random selection + directions = [] + sizes = [] + half_ops_per_stage = ops_per_stage // 2 + if half_ops_per_stage > 0 and total_stages > 1: + new_pipeline_starts = [] + # Don't change the first start + new_pipeline_starts.append(0) + # Consider the starts except the first and the last one + for _ in pipeline_starts[1:-1]: + directions.append(Boolean("direction")) + sizes.append( + IntRange("size", + start=0, + stop=half_ops_per_stage, + endpoint=True)) + for i, start in enumerate(pipeline_starts[1:-1]): + direction = directions[i].random(self._seed) + size = sizes[i].random(self._seed) + if direction: + # Substract 1 from size to avoid the overlapping of new starts + new_start = start - (size - 1) + else: + new_start = start + size + new_pipeline_starts.append(new_start) + # Don't change the last start + new_pipeline_starts.append(pipeline_starts[-1]) + # Validate the new starts + print("Adjusted pipeline starts", + new_pipeline_starts, + half_ops_per_stage, + pipeline_starts, + flush=True) + for i, new_start in enumerate(new_pipeline_starts[1:]): + assert new_start > new_pipeline_starts[i] + return new_pipeline_starts + else: + print("Non-adjusted pipeline starts", + pipeline_starts, + half_ops_per_stage, + flush=True) + return pipeline_starts + + def _apply_pipeline_partition(self, process_mesh_list): + op_id_to_process_mesh = {} + total_ops = len(self._dist_context._dist_ops_for_program) + total_stages = len(process_mesh_list) + ops_per_stage = total_ops // total_stages + if ops_per_stage == 0: + return None + pipeline_starts = self._generate_pipeline_starts(process_mesh_list) + start_idx = 1 + sorted_op_ids = sorted(self._dist_context._dist_ops_for_program.keys()) + for idx, op_id in enumerate(sorted_op_ids): + if idx < pipeline_starts[start_idx]: + op_id_to_process_mesh[op_id] = process_mesh_list[start_idx - 1] + else: + start_idx += 1 + op_id_to_process_mesh[op_id] = process_mesh_list[start_idx - 1] + return op_id_to_process_mesh + + def _amend_dist_attr(self): + # 1) Reshape the process mesh of [1, x] to [x] or [x, 1] to [x], + # and amend the corresponding dims_mapping. + # 2) Set the dim_mapping to -1 when the shape cannot be divided + # by the corresponding processes. 
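# [Editor's note] Illustration only, not part of the patch. The divisibility
# rule applied in the loop below can be stated as: a tensor axis may stay
# mapped to a mesh dimension only if its size divides evenly by that mesh
# dimension's process count and the mesh dimension has more than one process;
# otherwise the mapping falls back to -1 (replicated). The helper name and the
# shapes used here are made up for the example.
def _amend_mapping_sketch(tensor_shape, dims_mapping, process_shape):
    amended = []
    for size, dim in zip(tensor_shape, dims_mapping):
        if dim != -1 and (size % process_shape[dim] != 0 or process_shape[dim] == 1):
            amended.append(-1)  # cannot shard evenly, replicate instead
        else:
            amended.append(dim)
    return amended

# _amend_mapping_sketch([512, 768], [0, 1], [2, 4]) -> [0, 1]   (both divide)
# _amend_mapping_sketch([511, 768], [0, 1], [2, 4]) -> [-1, 1]  (511 % 2 != 0)
# _amend_mapping_sketch([512, 768], [0, 1], [2, 1]) -> [0, -1]  (mesh dim of 1)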
+ for dist_op in self._dist_context._dist_ops_for_program.values(): + dist_attr = dist_op.dist_attr + process_mesh = dist_attr.process_mesh + if process_mesh is None: + continue + assert process_mesh.ndim == 2 + dim_of_one = None + dim_of_other = None + if process_mesh.topology[0] == 1: + dim_of_one = 0 + dim_of_other = 1 + elif process_mesh.topology[1] == 1: + dim_of_one = 1 + dim_of_other = 0 + + if dim_of_one is not None: + dist_attr.process_mesh = ProcessMesh(process_mesh.processes) + self._dist_context.add_process_mesh(dist_attr.process_mesh) + + for arg_name in dist_attr.inputs_dist_attrs.keys(): + new_dims_mapping = [] + dims_mapping = dist_attr.get_input_dims_mapping(arg_name) + for dim_mapping in dims_mapping: + if dim_mapping == dim_of_one: + new_dims_mapping.append(-1) + elif dim_mapping == dim_of_other: + new_dims_mapping.append(0) + else: + new_dims_mapping.append(dim_mapping) + dist_attr.set_input_dims_mapping(arg_name, new_dims_mapping) + + dims_mapping = dist_attr.get_input_dims_mapping(arg_name) + # dynamic_dims = dist_attr.get_input_dynamic_dims(arg_name) + process_mesh = dist_attr.process_mesh + process_shape = process_mesh.topology + tensor = dist_op.get_serial_input(arg_name) + if dims_mapping: + tensor_shape = tensor.shape + else: + continue + for i, dim_mapping in enumerate(dims_mapping): + # if dim_mapping != -1 \ + # and (tensor_shape[i] % process_shape[dim_mapping] != 0 \ + # or dynamic_dims[i] == 1): + if dim_mapping != -1 \ + and (tensor_shape[i] % process_shape[dim_mapping] != 0): + dims_mapping[i] = -1 + # it is a fix-bug + if dim_mapping != -1 \ + and process_shape[dim_mapping] == 1: + dims_mapping[i] = -1 + + for arg_name in dist_attr.outputs_dist_attrs.keys(): + new_dims_mapping = [] + dims_mapping = dist_attr.get_output_dims_mapping(arg_name) + for dim_mapping in dims_mapping: + if dim_mapping == dim_of_one: + new_dims_mapping.append(-1) + elif dim_mapping == dim_of_other: + new_dims_mapping.append(0) + else: + new_dims_mapping.append(dim_mapping) + dist_attr.set_output_dims_mapping(arg_name, new_dims_mapping) + + dims_mapping = dist_attr.get_output_dims_mapping(arg_name) + # dynamic_dims = dist_attr.get_output_dynamic_dims(arg_name) + process_mesh = dist_attr.process_mesh + process_shape = process_mesh.topology + + tensor = dist_op.get_serial_output(arg_name) + if dims_mapping: + tensor_shape = tensor.shape + else: + continue + for i, dim_mapping in enumerate(dims_mapping): + if dim_mapping != -1 \ + and (tensor_shape[i] % process_shape[dim_mapping] != 0): + dims_mapping[i] = -1 + # it is a fix-bug + if dim_mapping != -1 \ + and process_shape[dim_mapping] == 1: + dims_mapping[i] = -1 + dist_op_impls = find_compatible_distributed_operator_impls( + dist_op, partial=False) + serial_op_type = dist_op.serial_op.type + + if dist_op_impls is not None and ( + serial_op_type != "fused_softmax_mask_upper_triangle" + or self._check_fused_softmax_mask_upper_triangle(dist_op)): + dist_op.dist_attr.impl_type = dist_op_impls[0].type + dist_op.dist_attr.impl_idx = dist_op_impls[0].idx + else: + # Use the default dist op impl + for arg_name in dist_attr.inputs_dist_attrs.keys(): + dims_mapping = dist_attr.get_input_dims_mapping(arg_name) + for i, _ in enumerate(dims_mapping): + dims_mapping[i] = -1 + for arg_name in dist_attr.outputs_dist_attrs.keys(): + dims_mapping = dist_attr.get_output_dims_mapping(arg_name) + for i, _ in enumerate(dims_mapping): + dims_mapping[i] = -1 + dist_op.dist_attr.impl_type = "default" + dist_op.dist_attr.impl_idx = 0 + + def 
_check_fused_softmax_mask_upper_triangle(self, dist_op): + """The last_but_one dim shoule be equal to last dim.""" + input_name = dist_op.serial_op.input_arg_names[0] + input_dims_mapping = dist_op.dist_attr.get_input_dims_mapping( + input_name) + topology = dist_op.dist_attr.process_mesh.topology + input_tensor = dist_op.get_serial_input(input_name) + last_but_one_dim = input_tensor.shape[-2] // topology[ + input_dims_mapping[-2]] if input_dims_mapping[ + -2] != -1 else input_tensor.shape[-2] + last_dim = input_tensor.shape[-1] // topology[input_dims_mapping[ + -1]] if input_dims_mapping[-1] != -1 else input_tensor.shape[-1] + if last_but_one_dim == last_dim: + return True + return False + + def _eval_trial(self, trial): + if self._num_trials == 0: + num_prev_trials = 0 + else: + num_prev_trials = self._num_trials - 1 + + results = None + + start_time = time.time() + + inter_node_partition = trial.space.values["inter_node_partitions"] + intra_node_partition = trial.space.values["intra_node_partitions"] + process_mesh_list = self._generate_process_mesh_list( + inter_node_partition, intra_node_partition) + print("\tprocess_mesh list", process_mesh_list, flush=True) + op_id_to_process_mesh = self._apply_pipeline_partition( + process_mesh_list) + if op_id_to_process_mesh is None: + print("Operators are less than pipeline stages", flush=True) + return results + + op_id_to_dist_attr = {} + for name, value in trial.space.values.items(): + if name != "inter_node_partitions" \ + and name !="intra_node_partitions": + op_id_to_dist_attr[int(name)] = value + + end_time = time.time() + cur_sample_time = end_time - start_time + self._sample_time = (num_prev_trials * self._sample_time + + cur_sample_time) / self._num_trials + print("\tsample_time", + num_prev_trials, + self._num_trials, + self._sample_time, + cur_sample_time, + flush=True) + + assert len(op_id_to_process_mesh) == len(op_id_to_dist_attr) + + start_time = time.time() + for op_id, process_mesh in op_id_to_process_mesh.items(): + dist_op = self._dist_context._dist_ops_for_program[op_id] + dist_op.dist_attr = copy.deepcopy(op_id_to_dist_attr[op_id]) + assert dist_op.dist_attr.impl_type == op_id_to_dist_attr[ + op_id].impl_type + assert dist_op.dist_attr.impl_idx == op_id_to_dist_attr[ + op_id].impl_idx + dist_op.dist_attr.process_mesh = process_mesh + self._amend_dist_attr() + + self._completer._complete_tensor_dist_attr_by_op() + + self._dist_context.block_state.parse_forward_blocks( + self._dist_context.serial_main_program) + + end_time = time.time() + cur_complete_time = end_time - start_time + self._complete_time = (num_prev_trials * self._complete_time + + cur_complete_time) / self._num_trials + print("\tcomplete_time", + num_prev_trials, + self._num_trials, + self._complete_time, + cur_complete_time, + flush=True) + + start_time = time.time() + estimate_time = self._estimate_trial() + end_time = time.time() + cur_estimate_time = end_time - start_time + self._estimate_time = (num_prev_trials * self._estimate_time + + cur_estimate_time) / self._num_trials + print("\testimate_time", + num_prev_trials, + self._num_trials, + self._estimate_time, + cur_estimate_time, + estimate_time, + flush=True) + + results = {"estimate_time": estimate_time} + return results + + def _update_trail(self, trial, metrics, step=0): + for metric_name, metric_value in metrics.items(): + trial.recorder.update(metric_name, metric_value, step=step) + return trial.status + + def _estimate_trial(self): + assert self._cluster is not None + if self._mode == "eval": + 
self._estimator = CostEstimator( + self._dist_context.serial_main_program, + self._cluster, + loop_count=self._loop_count) + elif self._mode == "predict": + self._estimator = CostEstimator( + self._dist_context.serial_main_program, + self._cluster, + loop_count=self._loop_count) + elif self._mode == "train": + # get serial main program with backward + serial_main_program = self._dist_context.serial_main_program + serial_startup_program = self._dist_context.serial_startup_program + serial_optimizer = self._dist_context.serial_optimizer + + # Generate backward + serial_loss = self._dist_context.serial_fetch_vars["loss"][0] + params_grads = self._parallelizer._generate_backward( + serial_main_program, serial_startup_program, serial_loss) + + # Generate optimizer + optimizer_ops = self._parallelizer._generate_optimizer( + serial_main_program, serial_startup_program, serial_optimizer, + params_grads) + self._estimator = CostEstimator(serial_main_program, + self._cluster, + loop_count=self._loop_count) + + max_memory = self._estimator._estimate_max_memory_by_dist_op( + self._dist_context) + print("\tmax_memory", "{:,}".format(max_memory), flush=True) + # The max memory must be less than 80% 32GB (hard code) + if max_memory > 32 * 0.8 * 1024 * 1024 * 1024: + return math.inf + else: + global_cost = self._estimator.estimate(self._dist_context) + return global_cost.time + + def _store_init_parallel_strategy(self): + # If there is no annotation information, use the dp as the initial parallel strategy. + # TODO: we should need a better way to set up the initial parallel strategy. + if not self._dist_context.has_annotation \ + or not self._dist_context.process_meshes: + ranks = self._num_machines * self._num_devices_per_machine + tensor_node = self._dist_context._serial_ordered_tensor_nodes[0] + tensor_node_id = _node_id(tensor_node) + tensor = self._dist_context._dist_tensors_for_graph[ + tensor_node_id].serial_tensor + tensor_dist_attr = self._dist_context._dist_tensors_for_graph[ + tensor_node_id].dist_attr + tensor_dist_attr.process_mesh = ProcessMesh(list(range(ranks))) + self._dist_context._process_meshes.append( + tensor_dist_attr.process_mesh) + tensor_dist_attr.dims_mapping = [0] + [ + -1 for _ in range(len(tensor.shape) - 1) + ] + tensor_dist_attr.mark_annotated("process_mesh") + tensor_dist_attr.mark_annotated("dims_mapping") + print("Use dp as the init parallel strategy!", flush=True) + + # Do the sharding propagation + self._completer.complete_forward_annotation() + self._dist_context.block_state.parse_forward_blocks( + self._dist_context.serial_main_program) + + # Backup the intital parallel strategy + self._init_parallel_strategy[0] = copy.deepcopy( + self._dist_context._dist_tensors_for_program) + self._init_parallel_strategy[1] = copy.deepcopy( + self._dist_context._dist_ops_for_program) + self._init_parallel_strategy[2] = copy.deepcopy( + self._dist_context.process_meshes) + + # Initialize the best parallel strategy to the initial one + self._best_parallel_strategy[0] = copy.deepcopy( + self._dist_context._dist_tensors_for_program) + self._best_parallel_strategy[1] = copy.deepcopy( + self._dist_context._dist_ops_for_program) + self._best_parallel_strategy[2] = copy.deepcopy( + self._dist_context._process_meshes) + + def _store_best_parallel_strategy(self): + # Swap the best and the current parallel strategy + tmp = [None, None, None] + tmp[0] = self._best_parallel_strategy[0] + tmp[1] = self._best_parallel_strategy[1] + tmp[2] = self._best_parallel_strategy[2] + 
self._best_parallel_strategy[ + 0] = self._dist_context._dist_tensors_for_program + self._best_parallel_strategy[ + 1] = self._dist_context._dist_ops_for_program + self._best_parallel_strategy[2] = self._dist_context._process_meshes + self._dist_context._dist_tensors_for_program = tmp[0] + self._dist_context._dist_ops_for_program = tmp[1] + self._dist_context._process_meshes = tmp[2] + + def tune(self): + global_start_time = time.time() + self._dist_context._backup(serial=True, dist=True) + # This store statement must follow the above backup statement + self._store_init_parallel_strategy() + init_time = self._estimate_trial() # estimate_trial when init + # print_program_with_dist_attr(self._dist_context.serial_main_program, self._dist_context) + # We have to restore the distributed context, because the estimation of one trail need to + # generate the backward and update parts. Since we will do the tuning process, + # here we only need to reset all distributed information to the default one. + self._dist_context._restore(serial=True, + serial_mode="to_backup", + dist=True, + dist_mode="to_default") + + best_time = init_time + start_time = time.time() + self.construct_space() + end_time = time.time() + print("construct_space time", + self._num_trials, + end_time - start_time, + flush=True) + create_trial_time = 0.0 + eval_trial_time = 0.0 + self._sample_time = 0.0 + self._complete_time = 0.0 + self._estimate_time = 0.0 + while True: + start_time = time.time() + trial = self._create_trial() + if self._num_trials == 0: + num_prev_trials = 0 + else: + num_prev_trials = self._num_trials - 1 + end_time = time.time() + cur_create_trial_time = end_time - start_time + create_trial_time = (num_prev_trials * create_trial_time + + cur_create_trial_time) / self._num_trials + print("create_trial time", + num_prev_trials, + self._num_trials, + create_trial_time, + cur_create_trial_time, + flush=True) + if trial.status == TrialStatus.STOPPED: + break + # We need to backup the distributed context, because the evaluation of one trail will + # generate the backward and update parts which may change the context. + # However, the distributed information of the context aren't backup since a new one is used. + self._dist_context._backup(serial=True, dist=False) + + start_time = time.time() + results = self._eval_trial(trial) + end_time = time.time() + cur_eval_trial_time = end_time - start_time + eval_trial_time = (num_prev_trials * eval_trial_time + + cur_eval_trial_time) / self._num_trials + print("eval_trial time", + num_prev_trials, + self._num_trials, + eval_trial_time, + cur_eval_trial_time, + "\n", + flush=True) + + cur_time = results["estimate_time"] + if cur_time < best_time: + self._update_trail(trial, results) + self._store_best_parallel_strategy() + best_time = cur_time + # We need to restore the distributed context and reset the distributed information to the default. 
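# [Editor's note] Not part of the patch: the control flow of tune() reduced to
# a generic backup / evaluate / keep-best / restore loop, to make the
# bookkeeping around the distributed context easier to follow. The callables
# (create_trial, evaluate, backup, restore, keep_best) stand in for the real
# _create_trial/_eval_trial/_backup/_restore/_store_best_parallel_strategy and
# are assumptions of this sketch, not paddle APIs.
def search_loop(create_trial, evaluate, backup, restore, keep_best, init_cost):
    best = init_cost
    while True:
        trial = create_trial()
        if trial is None:          # corresponds to TrialStatus.STOPPED
            break
        backup()                   # evaluating a trial mutates the context
        cost = evaluate(trial)
        if cost is not None and cost < best:
            keep_best()            # swap the current strategy into the best slot
            best = cost
        restore()                  # reset to defaults before the next trial
    return best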
+ self._dist_context._restore(serial=True, + serial_mode="to_backup", + dist=True, + dist_mode="to_default") + # Select the best parallel strategy + self._dist_context._dist_tensors_for_program = self._best_parallel_strategy[ + 0] + self._dist_context._dist_ops_for_program = self._best_parallel_strategy[ + 1] + self._dist_context._process_meshes = self._best_parallel_strategy[2] diff --git a/python/paddle/distributed/auto_parallel/tuner/profiler.py b/python/paddle/distributed/auto_parallel/tuner/profiler.py index a894554c2facd5..4b2655028bf7f0 100644 --- a/python/paddle/distributed/auto_parallel/tuner/profiler.py +++ b/python/paddle/distributed/auto_parallel/tuner/profiler.py @@ -13,20 +13,17 @@ # limitations under the License. import os -import sys import argparse import traceback import pickle import json import time -import numpy as np -from functools import partial import paddle from paddle.fluid.framework import Program, _current_expected_place -from paddle.fluid.framework import Operator, Parameter -from paddle.distributed.auto_parallel.process_group import clear_all_process_groups, get_all_process_groups, new_process_group -from paddle.distributed.auto_parallel.dist_loader import NonIterableGeneratorLoader +from paddle.fluid.framework import Operator +from paddle.distributed.auto_parallel.process_group import get_all_process_groups, new_process_group +from paddle.distributed.auto_parallel.dist_loader import DistributedDataLoaderFromGenerator from paddle.distributed.collective import _get_global_env paddle.enable_static() @@ -135,13 +132,14 @@ def create_dataloader(main_program, # insert read op at the end of program places = paddle.static.cuda_places() with paddle.static.program_guard(main_program, startup_program): - dataloader = NonIterableGeneratorLoader( - dataset, - feed_list, - places, - dataset.batch_size, - epochs, - steps_per_epoch, + dataloader = DistributedDataLoaderFromGenerator( + dataset=dataset, + feed_list=feed_list, + capacity=70, + places=places, + batch_size=dataset.batch_size, + epochs=epochs, + steps_per_epoch=steps_per_epoch, data_parallel_world_size=dataset.dp_world_size, data_parallel_rank=dataset.dp_rank) diff --git a/python/paddle/distributed/auto_parallel/tuner/trial.py b/python/paddle/distributed/auto_parallel/tuner/trial.py index 3937ca9865181f..edc588b4c70fec 100644 --- a/python/paddle/distributed/auto_parallel/tuner/trial.py +++ b/python/paddle/distributed/auto_parallel/tuner/trial.py @@ -156,9 +156,10 @@ def summary(self): draws += h1_format.format("{} auto=True <-> {}".format(name, name)) draws += line + "\n" my_configs = getattr(self.space, name) - keys = my_configs.keys() + keys = my_configs.to_dict().keys() for key in keys: - draws += h2_format.format(key, str(my_configs.get(key, None))) + draws += h2_format.format( + key, str(my_configs.to_dict().get(key, None))) result_res = draws + border return result_res diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py index 93ae25c9c4dd1b..38dc142468e8aa 100644 --- a/python/paddle/distributed/auto_parallel/tuner/tunable_space.py +++ b/python/paddle/distributed/auto_parallel/tuner/tunable_space.py @@ -44,10 +44,18 @@ def __init__(self): def variables(self): return self._variables + @variables.setter + def variables(self, variables): + self._variables = variables + @property def values(self): return self._values + @values.setter + def values(self, values): + self._values = values + def get_value(self, name): if name in 
self.values: return self.values[name] diff --git a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py index 424b6b74bb154b..31dd07aad374c3 100644 --- a/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py +++ b/python/paddle/distributed/auto_parallel/tuner/tunable_variable.py @@ -90,6 +90,7 @@ def __init__(self, name, values, default=None): raise TypeError( "Choice can contain only one type of value, but found values: {} with types: {}." .format(str(values), str(types))) + self._is_unknown_type = False if isinstance(values[0], str): values = [str(v) for v in values] @@ -108,9 +109,8 @@ def __init__(self, name, values, default=None): if default is not None: default = bool(default) else: - raise TypeError( - "Choice can only contain str, int, float, or boll, but found: {} " - .format(str(values))) + self._is_unknown_type = True + self._indices = [i for i in range(len(values))] self.values = values if default is not None and default not in values: @@ -129,7 +129,11 @@ def default(self): def random(self, seed=None): rng = np.random.default_rng(seed) - return rng.choice(self.values) + if self._is_unknown_type: + indice = rng.choice(self._indices) + return self.values[indice] + else: + return rng.choice(self.values) def get_state(self): state = super(Choice, self).get_state() diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index d276df6ddbd062..a08a17288a456c 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -23,9 +23,33 @@ import paddle.fluid.core as core from paddle.distributed.fleet.meta_optimizers.common import OpRole -from paddle.distributed.auto_parallel.process_group import get_all_process_groups +from paddle.distributed.auto_parallel.process_group import ( + get_all_process_groups, +) from paddle.fluid.io import is_parameter, is_belong_to_optimizer -from paddle.distributed.auto_parallel.dist_attribute import TensorDistributedAttribute, OperatorDistributedAttribute +from paddle.distributed.auto_parallel.dist_attribute import ( + TensorDistributedAttribute, + OperatorDistributedAttribute, +) + +__not_shape_var_type__ = [ + core.VarDesc.VarType.READER, + core.VarDesc.VarType.STEP_SCOPES, +] + + +def get_logger(log_level, name="auto_parallel"): + logger = logging.getLogger(name) + logger.propagate = False + if not logger.handlers: + logger.setLevel(log_level) + log_handler = logging.StreamHandler() + log_format = logging.Formatter( + '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s' + ) + log_handler.setFormatter(log_format) + logger.addHandler(log_handler) + return logger def is_valid_list_index(list, index): @@ -49,6 +73,63 @@ def is_dim_replicate(mapping): return False +def verify_dims_mapping(dims_mapping, process_mesh): + if dims_mapping is None: + return False + if not all(isinstance(d, int) for d in dims_mapping): + return False + for i in range(len(dims_mapping)): + if dims_mapping[i] < -1 or dims_mapping[i] >= len(process_mesh.shape): + return False + for i in range(len(process_mesh.shape)): + if dims_mapping.count(i) > 1: + return False + return True + + +def convert_to_dims_mapping(shard_spec, process_mesh): + dims_mapping = [] + for shard in shard_spec: + if shard is None: + dims_mapping.append(-1) + elif process_mesh.topology[process_mesh.dim_names.index(shard)] == 1: + dims_mapping.append(-1) + else: + 
dims_mapping.append(process_mesh.dim_names.index(shard)) + return dims_mapping + + +def convert_to_shard_spec(dims_mapping, process_mesh): + shard_spec = [] + for dim_mapping in dims_mapping: + if dim_mapping == -1: + shard_spec.append(None) + else: + shard_spec.append(process_mesh.dim_names[dim_mapping]) + return shard_spec + + +def verify_shard_spec(shard_spec, tensor_shape, process_mesh): + if len(shard_spec) != len(tensor_shape): + return False + for shard in shard_spec: + if shard is not None and not isinstance(shard, str): + return False + if shard is not None and shard not in process_mesh.dim_names: + return False + dims_mapping = convert_to_dims_mapping(shard_spec, process_mesh) + if not verify_dims_mapping(dims_mapping, process_mesh): + return False + for i in range(len(tensor_shape)): + if ( + dims_mapping[i] != -1 + and tensor_shape[i] > 0 + and tensor_shape[i] % process_mesh.shape[dims_mapping[i]] != 0 + ): + return False + return True + + def compute_compatible_dim_mapping(dim_mappings): if not dim_mappings: return None @@ -70,14 +151,17 @@ def compute_compatible_dims_mapping(dims_mapping_list): return None length = len(dims_mapping_list[0]) for dims_mapping in dims_mapping_list: - assert dims_mapping is not None, \ - "Dims mapping must not be None for compatible computation" - assert len(dims_mapping) == length, \ - "The length of dims_mapping in list must be same for compatible computation." + assert ( + dims_mapping is not None + ), "Dims mapping must not be None for compatible computation" + assert ( + len(dims_mapping) == length + ), "The length of dims_mapping in list must be same for compatible computation." compatible_result = [] for dim_mappings in zip(*dims_mapping_list): compatible_dim_mapping = compute_compatible_dim_mapping( - list(dim_mappings)) + list(dim_mappings) + ) if compatible_dim_mapping is None: return None compatible_result.append(compatible_dim_mapping) @@ -90,7 +174,10 @@ def compute_compatible_process_mesh(process_mesh_list): return compatible_process_mesh for process_mesh in process_mesh_list: if process_mesh is not None: - if compatible_process_mesh is None or compatible_process_mesh == process_mesh: + if ( + compatible_process_mesh is None + or compatible_process_mesh == process_mesh + ): compatible_process_mesh = process_mesh else: return None @@ -130,15 +217,18 @@ def remove_distributed_attr_suffix(name): def check_distributed_attr_for_program(program, dist_context=None): from .dist_context import get_default_distributed_context + if dist_context is None: dist_context = get_default_distributed_context() - assert dist_context.is_initialized_for_program(), \ - "Distributed attributes must be initialized before check." + assert ( + dist_context.is_initialized_for_program() + ), "Distributed attributes must be initialized before check." 
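# ---- illustrative sketch (not part of the patch) ----
# A standalone rendering of the shard_spec <-> dims_mapping conversion added
# above: a shard_spec names the process-mesh dimension each tensor axis is
# split along (None = replicated), while a dims_mapping stores the index of
# that mesh dimension, with -1 for replicated axes. Function and mesh names
# here are assumptions for the example only.
def to_dims_mapping(shard_spec, dim_names, mesh_shape):
    dims_mapping = []
    for shard in shard_spec:
        if shard is None or mesh_shape[dim_names.index(shard)] == 1:
            dims_mapping.append(-1)  # replicated (or split over a size-1 dim)
        else:
            dims_mapping.append(dim_names.index(shard))
    return dims_mapping

def to_shard_spec(dims_mapping, dim_names):
    return [None if d == -1 else dim_names[d] for d in dims_mapping]

# A [batch, hidden] tensor sharded along the "mp" axis of a 2x4 mesh:
print(to_dims_mapping([None, "mp"], ["dp", "mp"], [2, 4]))  # [-1, 1]
print(to_shard_spec([-1, 1], ["dp", "mp"]))                 # [None, 'mp']
# ---- end sketch ----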
for block in program.blocks: for tensor in block.vars.values(): dist_tensor = dist_context.get_dist_tensor_for_graph(tensor) tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program( - tensor) + tensor + ) if (tensor_dist_attr is not None) and (not dist_tensor.is_valid()): return False for op in block.ops: @@ -158,6 +248,7 @@ def print_program_with_dist_attr(program, dist_context=None): lock.acquire() from .dist_context import get_default_distributed_context from .dist_context import set_default_distributed_context + if dist_context is None: dist_context = get_default_distributed_context() print(program, flush=True) @@ -171,7 +262,7 @@ def print_program_with_dist_attr(program, dist_context=None): def _get_comm_group(processes, shape, axis, rank): """ - Given a rank and the processes mesh the rank belongs to, + Given a rank and the processes mesh the rank belongs to, compute the communication peers of the rank based on the give axis in the mesh. Example: 16 processes managed in a 4-Dimensinal mesh with shape of [2, 2, 2, 2]. @@ -185,7 +276,8 @@ def _get_comm_group(processes, shape, axis, rank): # NOTE _linear_idx2coordinate assume processes mesh start with 0 and continuous # tricks to support processes mesh when it is not start with 0 or continuous assert rank in processes, "rank [{}] is NOT in processes group {}".format( - rank, processes) + rank, processes + ) rank_relatvie = processes.index(rank) coordinate = _linear_idx2coordinate(shape, rank_relatvie) coordinates_in_group = [coordinate[:] for i in range(shape[axis])] @@ -205,7 +297,7 @@ def _get_comm_group(processes, shape, axis, rank): def _get_idx_in_axis(processes, shape, axis, rank): """ - Given a rank and the processes mesh the rank belongs to, + Given a rank and the processes mesh the rank belongs to, compute the index of the rank in given axis. Example: 27 processes managed in a 3-Dimensinal mesh with shape of [3, 3, 3]. @@ -226,20 +318,20 @@ def _coordinate2linear_idx(mesh_shape, coordinate): """ convert a coordinate in multidimensional mesh space into a scala idx in linear space. - it use Row-major order for dimension conversion. + it use Row-major order for dimension conversion. so it has: [most_significant_dim, ..., least_significant_dim] - assume: + assume: the size of i-th dimension to be: S[i] the index of j-th dimension is: I[j] - linear_idx of a n dimensional coordinate is: + linear_idx of a n dimensional coordinate is: I[n-1] * (S[n-2] * S[n-3] * S[n-4] * .... S[0]) + - I[n-2] * ( S[n-3] * S[n-4] * .... S[0]) + - I[n-3] * ( S[n-4] * .... S[0]) + + I[n-2] * ( S[n-3] * S[n-4] * .... S[0]) + + I[n-3] * ( S[n-4] * .... S[0]) + ... - I[1] * ( S[0]) + + I[1] * ( S[0]) + I[0] """ @@ -254,14 +346,19 @@ def _coordinate2linear_idx(mesh_shape, coordinate): assert len(mesh_shape) == len( coordinate ), "coordinate should have the same size as mesh shape, but got shape: {}, coordinate: {}".format( - mesh_shape, coordinate) + mesh_shape, coordinate + ) for i in range(len(mesh_shape)): - assert coordinate[ - i] >= 0, "index in dimension [{}] is least than zero. coordinate: {}".format( - i, coordinate) - assert coordinate[i] < mesh_shape[ - i], "index beyond extent in dimension [{}]. shape: {}, coordinate: {}".format( - i, mesh_shape, coordinate) + assert ( + coordinate[i] >= 0 + ), "index in dimension [{}] is least than zero. coordinate: {}".format( + i, coordinate + ) + assert ( + coordinate[i] < mesh_shape[i] + ), "index beyond extent in dimension [{}]. 
shape: {}, coordinate: {}".format( + i, mesh_shape, coordinate + ) base = mesh_shape[-1] linear_idx = coordinate[-1] @@ -279,7 +376,7 @@ def _linear_idx2coordinate(mesh_shape, linear_idx): mapping a linear scala into multidimensional mesh space, return it coordinate in that space. it is the inverse function of _coordinate2linear_idx. - assume: + assume: the size of i-th dimension to be: S[i] the index of j-th dimension is: I[j] @@ -294,11 +391,13 @@ def _linear_idx2coordinate(mesh_shape, linear_idx): """ assert linear_idx >= 0, "linear index [{}] is least than zero".format( - linear_idx) + linear_idx + ) assert linear_idx < np.prod( mesh_shape ), "linear index beyond the extent of mesh shape. shape: {}, linear index: {}".format( - mesh_shape, linear_idx) + mesh_shape, linear_idx + ) base = 1 coordinate = [-1] * len(mesh_shape) @@ -321,15 +420,17 @@ def _get_corresponding_rank(dist_context, target_mesh, rank): coordinate = None for mesh in dist_context.process_meshes: if rank in mesh.processes and mesh.topology == target_mesh.topology: - coordinate = _linear_idx2coordinate(mesh.topology, - mesh.processes.index(rank)) + coordinate = _linear_idx2coordinate( + mesh.topology, mesh.processes.index(rank) + ) break # assert coordinate is not None, "could NOT found rank [{}] in any registered mesh".format( # rank) if coordinate is not None: - return target_mesh.processes[_coordinate2linear_idx( - mesh.topology, coordinate)] + return target_mesh.processes[ + _coordinate2linear_idx(mesh.topology, coordinate) + ] else: return target_mesh.processes[0] @@ -341,7 +442,8 @@ def _get_unshard_dist_shape(var, dist_attr): assert len(var_shape) == len( mapping ), "variable shape [{}] and dim_mapping [{}] is NOT match !".format( - var_shape, mapping) + var_shape, mapping + ) new_shape = [] for idx in range(len(var_shape)): if var_shape[idx] == -1 or mapping[idx] == -1: @@ -354,13 +456,15 @@ def _get_unshard_dist_shape(var, dist_attr): def make_data_unshard(dist_main_prog, dist_startup_prog, dist_context=None): from .dist_context import get_default_distributed_context + if dist_context is None: dist_context = get_default_distributed_context() for var in dist_main_prog.list_vars(): if var.is_data: tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program( - var) + var + ) inverse_shape = _get_unshard_dist_shape(var, tensor_dist_attr) var.desc.set_shape(inverse_shape) dim_mapping = tensor_dist_attr.dims_mapping @@ -370,62 +474,76 @@ def make_data_unshard(dist_main_prog, dist_startup_prog, dist_context=None): def _update_addition_info(addition_info): - """ Update default addition_info with inputs """ + """Update default addition_info with inputs""" add_info = {"epoch": 0, "batch": 0, "batch_size": 0} if not addition_info: return add_info elif not isinstance(addition_info, dict): - raise TypeError("The type of 'addition_info' should be 'dict', " - "but got '{}'.".format(str(type(addition_info)))) + raise TypeError( + "The type of 'addition_info' should be 'dict', " + "but got '{}'.".format(str(type(addition_info))) + ) else: for item, value in addition_info.items(): if item not in ["epoch", "batch", "batch_size"]: raise ValueError( "The key of 'addition_info' should be one of the " "['epoch', 'batch', 'batch_size'], but got '{}'.".format( - str(item))) + str(item) + ) + ) if not isinstance(value, int): raise ValueError( "The value of 'addition_info' should be 'int', " - "but got '{}'.".format(str(type(value)))) + "but got '{}'.".format(str(type(value))) + ) add_info[item] = value return add_info def 
_check_valid_path(file_path): - """ Validity check of input file path """ + """Validity check of input file path""" if not file_path: return file_path elif isinstance(file_path, list): for file in file_path: if not isinstance(file, str): - raise TypeError("The type of file path should be 'str', " - "but got '{}'.".format(str(type(file)))) + raise TypeError( + "The type of file path should be 'str', " + "but got '{}'.".format(str(type(file))) + ) if not os.path.exists(file): raise ValueError( - "The file path '{}' does not exist.".format(file)) + "The file path '{}' does not exist.".format(file) + ) return file_path else: - raise TypeError("The type of file path should be 'list', " - "but got '{}'.".format(str(type(file_path)))) + raise TypeError( + "The type of file path should be 'list', " + "but got '{}'.".format(str(type(file_path))) + ) def _check_param_dict(param_dict): if not param_dict: raise ValueError("'param_dict' cannot be None.") elif not isinstance(param_dict, dict): - raise TypeError("The type of 'param_dict' should be 'dict', " - "but got '{}'.".format(str(type(param_dict)))) + raise TypeError( + "The type of 'param_dict' should be 'dict', " + "but got '{}'.".format(str(type(param_dict))) + ) else: for name, value in param_dict.items(): if not isinstance(name, str): raise TypeError( "The type of key of 'param_dict' should be 'str', " - "but got '{}'.".format(str(type(name)))) + "but got '{}'.".format(str(type(name))) + ) if not isinstance(value, paddle.fluid.LoDTensor): raise TypeError( "The type of value of 'param_dict' should be 'LoDTensor', " - "but got '{}'.".format(str(type(value)))) + "but got '{}'.".format(str(type(value))) + ) return param_dict @@ -433,35 +551,42 @@ def _check_dist_attr(dist_attr): if not dist_attr: return dist_attr elif not isinstance(dist_attr, dict): - raise TypeError("The type of 'dist_attr' should be 'dict', " - "but got '{}'.".format(str(type(dist_attr)))) + raise TypeError( + "The type of 'dist_attr' should be 'dict', " + "but got '{}'.".format(str(type(dist_attr))) + ) else: for name, value in dist_attr.items(): if not isinstance(name, str): raise TypeError( "The type of param name of 'dist_attr' should be 'str', " - "but got '{}'.".format(str(type(name)))) + "but got '{}'.".format(str(type(name))) + ) if not isinstance(value, dict): raise TypeError( "The type of distributed attribute should be 'dict', " - "but got '{}'".format(str(type(value)))) + "but got '{}'".format(str(type(value))) + ) attr = ['process_shape', 'process_group', 'dims_mapping'] if list(value.keys()) != attr: raise ValueError( "The key of distributed attribute should be " "'['process_shape', 'process_group', 'dims_mapping']', " - "but got {}.".format(str(value.keys()))) + "but got {}.".format(str(value.keys())) + ) return dist_attr -def save_distributed_checkpoint(program, - checkpoint_path, - dist_attr_path, - addition_info=None, - is_integrated=False, - dist_context=None): - """ - Save model parameter state, optimzer state, distributed attribute and +def save_distributed_checkpoint( + program, + checkpoint_path, + dist_attr_path, + addition_info=None, + is_integrated=False, + dist_context=None, +): + """ + Save model parameter state, optimzer state, distributed attribute and additional information of each rank. Args: @@ -498,11 +623,12 @@ def save_distributed_checkpoint(program, else: # TODO: integrate param before save raise NotImplementedError( - "Integrating parameter has not been implemented.") + "Integrating parameter has not been implemented." 
+ ) def load_distributed_checkpoint(checkpoint_path, dist_attr_path): - """ + """ Load parameter, optimizer, distributed attribute and addition_info. Args: @@ -512,7 +638,7 @@ def load_distributed_checkpoint(checkpoint_path, dist_attr_path): Returns: param_dict(dict): parameters' value of all ranks. dist_attr(dict): parameters' distributed attribute. - addition_info(dict): additional information user saved in last training. + addition_info(dict): additional information user saved in last training. Notes: The return, 'addition_info', is belonging to the first file of checkpoint_path by default. @@ -520,16 +646,16 @@ def load_distributed_checkpoint(checkpoint_path, dist_attr_path): Examples: .. code-block:: python - ckpt_path = ['./model_state_rank0.pdmodel', + ckpt_path = ['./model_state_rank0.pdmodel', './model_state_rank1.pdmodel'] - dist_attr_path = ['./dist_attr_rank0.pdattr', + dist_attr_path = ['./dist_attr_rank0.pdattr', './dist_attr_rank1.pdattr'] param_dict, dist_attr, add_info = load_distributed_checkpoint(ckpt_path, dist_attr_path) """ - assert _check_valid_path(checkpoint_path), \ - "'checkpoint_path' cannot be None." - assert _check_valid_path(dist_attr_path), \ - "'dist_attr_path' cannot be None." + assert _check_valid_path( + checkpoint_path + ), "'checkpoint_path' cannot be None." + assert _check_valid_path(dist_attr_path), "'dist_attr_path' cannot be None." state_dict_info = _load_distributed_state_dict(checkpoint_path) dist_attr = _load_distributed_attribute(dist_attr_path) @@ -538,11 +664,10 @@ def load_distributed_checkpoint(checkpoint_path, dist_attr_path): return param_dict, dist_attr, addition_info -def load_checkpoint_into_program(checkpoint_path, - dist_attr_path, - program, - dist_context=None): - """ +def load_checkpoint_into_program( + checkpoint_path, dist_attr_path, program, dist_context=None +): + """ Load parameter, optimizer, distributed attribute and addition_info into model. Args: @@ -553,7 +678,7 @@ def load_checkpoint_into_program(checkpoint_path, Returns: addition_info(dict): user saved in last train. - + Notes: The return, 'addition_info', is belonging to the first file of checkpoint_path by default. @@ -561,19 +686,19 @@ def load_checkpoint_into_program(checkpoint_path, .. code-block:: python exe.run(startup_program) - ckpt_path = ['./model_state_rank0.pdmodel', + ckpt_path = ['./model_state_rank0.pdmodel', './model_state_rank1.pdmodel'] - dist_attr_path = ['./dist_attr_rank0.pdattr', + dist_attr_path = ['./dist_attr_rank0.pdattr', './dist_attr_rank1.pdattr'] load_checkpoint_into_program(ckpt_path, dist_attr_path, main_program) """ from .dist_context import get_default_distributed_context assert isinstance(program, paddle.fluid.framework.Program) - assert _check_valid_path(checkpoint_path), \ - "'checkpoint_path' cannot be None." - assert _check_valid_path(dist_attr_path), \ - "'dist_attr_path' cannot be None." + assert _check_valid_path( + checkpoint_path + ), "'checkpoint_path' cannot be None." + assert _check_valid_path(dist_attr_path), "'dist_attr_path' cannot be None." 
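# ---- illustrative sketch (not part of the patch) ----
# The dist_attr dict returned by load_distributed_checkpoint (and validated by
# _check_dist_attr above) maps each parameter name to exactly these three keys;
# the parameter name and values below are made up for illustration.
dist_attr = {
    "linear_0.w_0": {
        "process_shape": [2, 2],        # topology of the process mesh
        "process_group": [0, 1, 2, 3],  # ranks holding a piece of the tensor
        "dims_mapping": [-1, 0],        # axis 0 replicated, axis 1 split on mesh dim 0
    },
}
# ---- end sketch ----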
if dist_context is None: dist_context = get_default_distributed_context() all_state_dict_info = _load_distributed_state_dict(checkpoint_path) @@ -581,16 +706,16 @@ def load_checkpoint_into_program(checkpoint_path, all_cur_dist_attr = get_dist_attr(program, dist_context) all_param_dict = all_state_dict_info["model"] addition_info = all_state_dict_info["addition_info"] - sliced_param_dict = merge_and_slice_parameter(all_param_dict, - all_pre_dist_attr, - all_cur_dist_attr) + sliced_param_dict = merge_and_slice_parameter( + all_param_dict, all_pre_dist_attr, all_cur_dist_attr + ) load_parameter_into_program(sliced_param_dict, program) return addition_info def load_parameter_into_program(param_dict, program): - """ + """ Load parameters into program. Args: @@ -605,28 +730,31 @@ def load_parameter_into_program(param_dict, program): def _save_distributed_attribute(program, dist_attr_path, dist_context): - """ Save distributed attribute of all parameters """ + """Save distributed attribute of all parameters""" # TODO: just save a complete distributed attribute file rank_id = paddle.distributed.get_rank() - dist_attr_name = os.path.join(dist_attr_path, - "dist_attr_rank{}.pdattr".format(rank_id)) + dist_attr_name = os.path.join( + dist_attr_path, "dist_attr_rank{}.pdattr".format(rank_id) + ) dist_attr_dict = { "model": get_dist_attr(program, dist_context), - "world_size": paddle.distributed.get_world_size() + "world_size": paddle.distributed.get_world_size(), } paddle.save(dist_attr_dict, dist_attr_name) logging.info( - "Already saved distributed attribute to '{}'.".format(dist_attr_path)) + "Already saved distributed attribute to '{}'.".format(dist_attr_path) + ) def _load_distributed_attribute(dist_attr_path): - """ Load parameters' distributed attribute from dist_attr_path """ + """Load parameters' distributed attribute from dist_attr_path""" total_dist_attr = {} for dist_attr_file in dist_attr_path: dist_attr = paddle.load(dist_attr_file) pre_world_size = dist_attr["world_size"] - assert pre_world_size == len(dist_attr_path), \ - "The number of 'dist_attr_path' must be equal to the last training world size." + assert pre_world_size == len( + dist_attr_path + ), "The number of 'dist_attr_path' must be equal to the last training world size." 
for name, attr in dist_attr["model"].items(): if name not in total_dist_attr: total_dist_attr[name] = attr @@ -635,27 +763,29 @@ def _load_distributed_attribute(dist_attr_path): def _save_distributed_state_dict(program, addition_info, checkpoint_path): - """ Save parameters' state_dict """ + """Save parameters' state_dict""" rank = paddle.distributed.get_rank() - ckpt_file_name = os.path.join(checkpoint_path, - "model_state_rank{}.pdmodel".format(rank)) + ckpt_file_name = os.path.join( + checkpoint_path, "model_state_rank{}.pdmodel".format(rank) + ) state_dict = { "model": program.state_dict(), "world_size": paddle.distributed.get_world_size(), - "addition_info": addition_info + "addition_info": addition_info, } paddle.save(state_dict, ckpt_file_name) logging.info("Already saved model to '{}'.".format(checkpoint_path)) def _load_distributed_state_dict(checkpoint_path): - """ Load parameters' state_dict from checkpoint_path """ + """Load parameters' state_dict from checkpoint_path""" all_state_dict = {} for idx, ckpt_file in enumerate(checkpoint_path): state_dict_info = paddle.load(ckpt_file, return_numpy=True) pre_world_size = state_dict_info["world_size"] - assert pre_world_size == len(checkpoint_path), \ - "The number of 'checkpoint_path' must be equal to the last training world size." + assert pre_world_size == len( + checkpoint_path + ), "The number of 'checkpoint_path' must be equal to the last training world size." if idx == 0: addition_info = state_dict_info["addition_info"] for name, value in state_dict_info["model"].items(): @@ -666,13 +796,13 @@ def _load_distributed_state_dict(checkpoint_path): all_state_dict_info = { "model": all_state_dict, - "addition_info": addition_info + "addition_info": addition_info, } return all_state_dict_info def get_dist_attr(program, dist_context=None): - """ + """ Get distributed attribute of current rank. Args: @@ -687,13 +817,14 @@ def get_dist_attr(program, dist_context=None): for var in program.list_vars(): if is_parameter(var) or is_belong_to_optimizer(var): tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program( - var) + var + ) process_mesh = tensor_dist_attr.process_mesh dims_mapping = tensor_dist_attr.dims_mapping dist_attr[var.name] = { "process_shape": process_mesh.topology, "process_group": process_mesh.processes, - "dims_mapping": dims_mapping + "dims_mapping": dims_mapping, } return dist_attr @@ -711,19 +842,26 @@ def merge_and_slice_parameter(dist_param_dict, pre_dist_attr, cur_dist_attr): dist_param_dict(dict): parameters' value of current rank. """ assert _check_dist_attr(pre_dist_attr), "'pre_dist_attr' cannot be None." 
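# ---- illustrative sketch (not part of the patch) ----
# What the save helpers above write for one rank (directory and world size are
# made up). The matching load helpers assert that the number of files passed in
# equals the world_size recorded at save time.
rank, world_size = 0, 2
ckpt_file = "./ckpt/model_state_rank{}.pdmodel".format(rank)
# contents: {"model": <program state_dict>, "world_size": 2, "addition_info": {...}}
attr_file = "./ckpt/dist_attr_rank{}.pdattr".format(rank)
# contents: {"model": <per-parameter dist attributes>, "world_size": 2}
# ---- end sketch ----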
- assert isinstance(dist_param_dict, dict), \ - "The type of 'dist_param_dict' should be 'dict', but got {}.".format( - str(type(dist_param_dict))) + assert isinstance( + dist_param_dict, dict + ), "The type of 'dist_param_dict' should be 'dict', but got {}.".format( + str(type(dist_param_dict)) + ) for name, value in dist_param_dict.items(): if not isinstance(name, str): - raise TypeError("The key of 'dist_param_dict' is parameter's name, " - "and its type should be 'str', but got {}.".format( - str(type(name)))) + raise TypeError( + "The key of 'dist_param_dict' is parameter's name, " + "and its type should be 'str', but got {}.".format( + str(type(name)) + ) + ) if not isinstance(value, list) or not all( - isinstance(v, np.ndarray) for v in value): + isinstance(v, np.ndarray) for v in value + ): raise TypeError( "The value of 'dist_param_dict' is parameter's value of all ranks, " - "and its type should be 'list(numpy.ndarray)'.") + "and its type should be 'list(numpy.ndarray)'." + ) if cur_dist_attr is None: return {} @@ -751,7 +889,8 @@ def merge_and_slice_parameter(dist_param_dict, pre_dist_attr, cur_dist_attr): cur_dims_mapping = cur_attr["dims_mapping"] if len(set(pre_dims_mapping)) > 1 or -1 not in pre_dims_mapping: complete_param = _merge_parameter_with_dist_attr( - pre_param, pre_attr) + pre_param, pre_attr + ) dist_param_dict[var_name] = complete_param else: complete_param = pre_param[0] @@ -759,7 +898,8 @@ def merge_and_slice_parameter(dist_param_dict, pre_dist_attr, cur_dist_attr): if len(set(cur_dims_mapping)) > 1 or -1 not in cur_dims_mapping: sliced_param = _slice_parameter_with_dist_attr( - complete_param, cur_attr) + complete_param, cur_attr + ) dist_param_dict[var_name] = sliced_param for var_name in pre_dist_attr: @@ -770,67 +910,81 @@ def merge_and_slice_parameter(dist_param_dict, pre_dist_attr, cur_dist_attr): if param_not_in_pre: warnings.warn( "Parameters '{}' are not found in last training process.".format( - str(param_not_in_pre))) + str(param_not_in_pre) + ) + ) if param_not_in_cur: warnings.warn( "Parameters '{}' are not found in current training process.".format( - str(param_not_in_cur))) + str(param_not_in_cur) + ) + ) return dist_param_dict def _merge_parameter_with_dist_attr(param_list, dist_attr): - """ Merge parameter with distributed attribute """ + """Merge parameter with distributed attribute""" from .reshard import Resharder dims_mapping = dist_attr["dims_mapping"] process_shape = dist_attr["process_shape"] process_group = dist_attr["process_group"] # get the complete shape of the parameter - complete_shape = Resharder.compute_complete_shape(param_list[0].shape, - process_shape, - dims_mapping) + complete_shape = Resharder.compute_complete_shape( + param_list[0].shape, process_shape, dims_mapping + ) # merge the parameter with dist_attr partition_param_list = [] merged_partiton = [] for process in process_group: partition_index = Resharder.compute_partition_index( - process, complete_shape, dims_mapping, process_shape, process_group) + process, complete_shape, dims_mapping, process_shape, process_group + ) index = process_group.index(process) if partition_index not in merged_partiton: merged_partiton.append(partition_index) - _merge_parameter(partition_param_list, param_list[index], - partition_index, complete_shape) - - assert len(partition_param_list) == 1 or not partition_param_list, \ - "Fail to merge parameter" + _merge_parameter( + partition_param_list, + param_list[index], + partition_index, + complete_shape, + ) + + assert ( + 
len(partition_param_list) == 1 or not partition_param_list + ), "Fail to merge parameter" complete_param = partition_param_list[0][0] return complete_param def _slice_parameter_with_dist_attr(param, dist_attr): - """ Slice parameter with distributed attribute """ - param = np.array(param) if isinstance(param, - paddle.fluid.LoDTensor) else param + """Slice parameter with distributed attribute""" + param = ( + np.array(param) if isinstance(param, paddle.fluid.LoDTensor) else param + ) dims_mapping = dist_attr["dims_mapping"] process_shape = dist_attr["process_shape"] process_group = dist_attr["process_group"] # slice the parameter with dist_attr - partition_index_list = _get_split_indices(param.shape, dims_mapping, - process_shape, process_group) - sliced_param_list = _slice_parameter(param, partition_index_list, - len(partition_index_list)) + partition_index_list = _get_split_indices( + param.shape, dims_mapping, process_shape, process_group + ) + sliced_param_list = _slice_parameter( + param, partition_index_list, len(partition_index_list) + ) # get the current parameter's index in sliced_param_list rank_id = paddle.distributed.get_rank() - sliced_param_index = _get_sliced_param_index(rank_id, param.shape, - dims_mapping, process_shape, - process_group) + sliced_param_index = _get_sliced_param_index( + rank_id, param.shape, dims_mapping, process_shape, process_group + ) sliced_param = sliced_param_list[sliced_param_index] return sliced_param -def _merge_parameter(partition_param_list, param, partition_index, - complete_shape): +def _merge_parameter( + partition_param_list, param, partition_index, complete_shape +): """ Merge partitial parameters to a complete one. @@ -864,19 +1018,30 @@ def _merge_parameter(partition_param_list, param, partition_index, else: i = 0 while i < len(partition_param_list): - concat_axis, first_order, new_partition = Resharder.compute_concat_info( - partition_param_list[i][1], partition_index) + ( + concat_axis, + first_order, + new_partition, + ) = Resharder.compute_concat_info( + partition_param_list[i][1], partition_index + ) if concat_axis != -1: if first_order == 0: new_param = np.concatenate( - (partition_param_list[i][0], param), axis=concat_axis) + (partition_param_list[i][0], param), axis=concat_axis + ) else: new_param = np.concatenate( - (param, partition_param_list[i][0]), axis=concat_axis) + (param, partition_param_list[i][0]), axis=concat_axis + ) partition_param_list.pop(i) - _merge_parameter(partition_param_list, new_param, new_partition, - complete_shape) + _merge_parameter( + partition_param_list, + new_param, + new_partition, + complete_shape, + ) break i += 1 @@ -904,19 +1069,21 @@ def _slice_parameter(complete_param, partition_index_list, length): """ sliced_param_list = [] axis = len(complete_param.shape) - length - sliced_param = np.split(complete_param, - partition_index_list[axis], - axis=axis) + sliced_param = np.split( + complete_param, partition_index_list[axis], axis=axis + ) if length == 1: return sliced_param for param in sliced_param: sliced_param_list.extend( - _slice_parameter(param, partition_index_list, length - 1)) + _slice_parameter(param, partition_index_list, length - 1) + ) return sliced_param_list -def _get_sliced_param_index(rank, complete_shape, dims_mapping, process_shape, - process_group): +def _get_sliced_param_index( + rank, complete_shape, dims_mapping, process_shape, process_group +): """ Get sliced_param's index of current rank in all sliced parameters list. 
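# ---- illustrative sketch (not part of the patch) ----
# The array mechanics behind the slice/merge helpers above, in plain numpy: a
# complete parameter is cut with np.split at per-axis split indices, and the
# pieces can be stitched back with np.concatenate. The real helpers also track
# which process owns which partition; that bookkeeping is omitted here.
import numpy as np

complete = np.arange(12).reshape(2, 6)
pieces = np.split(complete, [2, 4], axis=1)   # cut axis 1 at columns 2 and 4
print([p.shape for p in pieces])              # [(2, 2), (2, 2), (2, 2)]
restored = np.concatenate(pieces, axis=1)
print(np.array_equal(restored, complete))     # True
# ---- end sketch ----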
@@ -935,7 +1102,7 @@ def _get_sliced_param_index(rank, complete_shape, dims_mapping, process_shape, process_group = [0, 1, 2] slice_param = _slice_parameter(complete_param, [[], [], [2, 4]], 3) - # slice_param: + # slice_param: # [array([[[1.11, 1.12]]]), array([[[1.13, 1.14]]]), array([[[1.15, 1.16]]])] index = _get_sliced_param_index(rank, complete_shape, dims_mapping @@ -944,10 +1111,9 @@ def _get_sliced_param_index(rank, complete_shape, dims_mapping, process_shape, """ from .reshard import Resharder - partition_index = Resharder.compute_partition_index(rank, complete_shape, - dims_mapping, - process_shape, - process_group) + partition_index = Resharder.compute_partition_index( + rank, complete_shape, dims_mapping, process_shape, process_group + ) sliced_param_index = 0 for i, shape in enumerate(complete_shape): if dims_mapping[i] == -1: @@ -962,8 +1128,9 @@ def _get_sliced_param_index(rank, complete_shape, dims_mapping, process_shape, return sliced_param_index -def _get_split_indices(complete_shape, dims_mapping, process_shape, - process_group): +def _get_split_indices( + complete_shape, dims_mapping, process_shape, process_group +): """ Get split indices of every dimension. @@ -988,15 +1155,20 @@ def _get_split_indices(complete_shape, dims_mapping, process_shape, split_indices_list = [] for process in process_group: partition_index = Resharder.compute_partition_index( - process, complete_shape, dims_mapping, process_shape, process_group) + process, complete_shape, dims_mapping, process_shape, process_group + ) if split_indices_list: for dim in range(len(partition_index)): split_indices_list[dim].extend(partition_index[dim]) else: split_indices_list = partition_index split_indices_list = list( - map(lambda x, y: list(set(x) - set([y]) - set([0])), split_indices_list, - complete_shape)) + map( + lambda x, y: list(set(x) - set([y]) - set([0])), + split_indices_list, + complete_shape, + ) + ) split_indices_list = [sorted(x) for x in split_indices_list] return split_indices_list @@ -1015,8 +1187,10 @@ def set_grad_var_shape(program, dist_context): if int(op.attr('op_role')) != int(OpRole.Backward): continue - if int(block.ops[idx-1].attr('op_role')) == int(OpRole.Forward) or \ - int(block.ops[idx-1].attr('op_role')) == 257: + if ( + int(block.ops[idx - 1].attr('op_role')) == int(OpRole.Forward) + or int(block.ops[idx - 1].attr('op_role')) == 257 + ): appended_grad_times += 1 if op.type in ["check_finite_and_unscale", "update_loss_scaling"]: @@ -1034,61 +1208,102 @@ def set_grad_var_shape(program, dist_context): continue if var_name in grad_var_to_var[appended_grad_times]: forward_var_name = grad_var_to_var[appended_grad_times][ - var_name] + var_name + ] else: - forward_var_name = var_name[:var_name.find("@GRAD")] + forward_var_name = var_name[: var_name.find("@GRAD")] if op.type in [ - "c_allreduce_sum", "c_identity", "scale", "cast", - "fill_zeros_like" + "c_allreduce_sum", + "c_identity", + "scale", + "cast", + "fill_any_like", ]: forward_var_name = op.input_arg_names[0] - elif op.type == "matmul_v2_grad" or op.type == "matmul_grad" or op.type == "mul_grad": + elif ( + op.type == "matmul_v2_grad" + or op.type == "matmul_grad" + or op.type == "mul_grad" + ): forward_var_name = None for output_name in op.output_names: if var_name in op.output(output_name): assert "@GRAD" in output_name - input_name = output_name[:output_name.find("@GRAD")] + input_name = output_name[: output_name.find("@GRAD")] assert len(op.input(input_name)) == 1 forward_var_name = op.input(input_name)[0] assert 
forward_var_name is not None need_set_shape_list = [ - "reshape2_grad", "softmax_with_cross_entropy_grad", - "transpose2_grad", "softmax_grad", "cross_entropy_grad2", - "dropout_grad", "tanh_grad", "slice", "assign", - "matmul_v2_triple_grad", "elementwise_add_triple_grad", - "fill_constant", "sqrt_grad", + "reshape2_grad", + "softmax_with_cross_entropy_grad", + "transpose2_grad", + "softmax_grad", + "cross_entropy_grad2", + "dropout_grad", + "tanh_grad", + "slice", + "assign", + "matmul_v2_triple_grad", + "elementwise_add_triple_grad", + "fill_constant", + "sqrt_grad", "fused_softmax_mask_upper_triangle_grad", - "flatten_contiguous_range_grad", "relu_grad" + "flatten_contiguous_range_grad", + "relu_grad", ] forward_list = [ - "reshape2", "softmax_with_cross_entropy", "transpose2", - "softmax", "cross_entropy2", "dropout", "tanh", - ["slice_grad", "c_allgather"], "assign", "matmul_v2_grad_grad", - "elementwise_add_grad_grad", "shape", "sqrt", - "fused_softmax_mask_upper_triangle", "flatten_contiguous_range", - "relu" + "reshape2", + "softmax_with_cross_entropy", + "transpose2", + "softmax", + "cross_entropy2", + "dropout", + "tanh", + ["slice_grad", "c_allgather"], + "assign", + "matmul_v2_grad_grad", + "elementwise_add_grad_grad", + "shape", + "sqrt", + "fused_softmax_mask_upper_triangle", + "flatten_contiguous_range", + "relu", ] if op.type in need_set_shape_list: for forward_op in block.ops: idx = need_set_shape_list.index(op.type) forward_op_name = forward_list[idx] - if forward_op.type in forward_op_name and forward_var_name in forward_op.input_arg_names: - op_dist_attr = dist_context.get_op_dist_attr_for_program( - forward_op) + if ( + forward_op.type in forward_op_name + and forward_var_name in forward_op.input_arg_names + ): + op_dist_attr = ( + dist_context.get_op_dist_attr_for_program( + forward_op + ) + ) break forward_input_dist_attr = op_dist_attr.get_input_dist_attr( - forward_var_name) - assert forward_input_dist_attr is not None, f"{forward_var_name, str(op)}" + forward_var_name + ) + assert ( + forward_input_dist_attr is not None + ), f"{forward_var_name, str(op)}" forward_var = vars[forward_var_name] - forward_var_dist_attr = dist_context.get_tensor_dist_attr_for_program( - forward_var) + forward_var_dist_attr = ( + dist_context.get_tensor_dist_attr_for_program(forward_var) + ) assert forward_var_dist_attr is not None grad_var = vars[var_name] - ref_shape = infer_shape(block, forward_var, forward_var_dist_attr, - forward_input_dist_attr) + ref_shape = infer_shape( + block, + forward_var, + forward_var_dist_attr, + forward_input_dist_attr, + ) if list(grad_var.shape) != ref_shape: grad_var.desc.set_shape(ref_shape) @@ -1100,28 +1315,33 @@ def set_grad_var_shape(program, dist_context): def is_forward_op(op): op_role = int(op.attr('op_role')) - return OP_ROLE_KEY in op.attr_names and (op_role == int(OpRole.Forward) - or op_role == int(OpRole.Loss)) + return OP_ROLE_KEY in op.attr_names and ( + op_role == int(OpRole.Forward) or op_role == int(OpRole.Loss) + ) def is_backward_op(op): - return OP_ROLE_KEY in op.attr_names and \ - int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Backward) + return OP_ROLE_KEY in op.attr_names and int( + op.all_attrs()[OP_ROLE_KEY] + ) & int(OpRole.Backward) def is_optimize_op(op): - return OP_ROLE_KEY in op.attr_names and \ - int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Optimize) + return OP_ROLE_KEY in op.attr_names and int( + op.all_attrs()[OP_ROLE_KEY] + ) & int(OpRole.Optimize) def is_lr_sched_op(op): - return OP_ROLE_KEY in op.attr_names and 
\ - int(op.all_attrs()[OP_ROLE_KEY]) & int(OpRole.Optimize.LRSched) + return OP_ROLE_KEY in op.attr_names and int( + op.all_attrs()[OP_ROLE_KEY] + ) & int(OpRole.Optimize.LRSched) def is_loss_op(op): - return OP_ROLE_KEY in op.attr_names and \ - int(op.all_attrs()[OP_ROLE_KEY]) == (int(OpRole.Forward) | int(OpRole.Loss)) + return OP_ROLE_KEY in op.attr_names and int( + op.all_attrs()[OP_ROLE_KEY] + ) == (int(OpRole.Forward) | int(OpRole.Loss)) def is_loss_grad_op(op): @@ -1132,8 +1352,9 @@ def is_loss_grad_op(op): def is_gradient_clip_op(op): - return op.desc.has_attr("op_namescope") \ - and op.desc.attr("op_namescope").startswith("/gradient_clip") + return op.desc.has_attr("op_namescope") and op.desc.attr( + "op_namescope" + ).startswith("/gradient_clip") def is_prim_op(op): @@ -1144,8 +1365,9 @@ def get_loss_op(block): loss_ops = [] for op in block.ops: if is_loss_op(op): - assert len(op.desc.output_arg_names() - ) == 1, "loss op should only output loss var" + assert ( + len(op.desc.output_arg_names()) == 1 + ), "loss op should only output loss var" loss_ops.append(op) assert len(loss_ops) == 1, "num of loss op is not equal to one" @@ -1165,7 +1387,8 @@ def set_var_dist_attr(dist_context, var, dims_mapping, process_mesh, **kwargs): def naive_set_dist_op_attr_for_program_by_mesh_and_mapping( - new_op, process_mesh, ref_mapping, ctx): + new_op, process_mesh, ref_mapping, ctx +): assert process_mesh is not None assert ref_mapping is not None @@ -1199,9 +1422,11 @@ def update_op_dims_mapping_by_default_dist_impl(dist_op): dims_mapping = op_dist_attr.get_input_dims_mapping(arg_name) if len(dims_mapping) > 1: for idx, mapping in enumerate(dims_mapping[1:]): - assert mapping == -1, \ - "{} only the batch dimension (0-dim) can be sharded, but the dimension {} is sharded by {} part."\ - .format(op_desc.type(), idx, mapping) + assert ( + mapping == -1 + ), "{} only the batch dimension (0-dim) can be sharded, but the dimension {} is sharded by {} part.".format( + op_desc.type(), idx, mapping + ) batch_dim_mappings.append(dims_mapping[0]) for arg_name in op_desc.output_arg_names(): serial_tensor = dist_op.get_serial_output(arg_name) @@ -1211,23 +1436,31 @@ def update_op_dims_mapping_by_default_dist_impl(dist_op): if arg_name not in xshape_arg_names: if len(dims_mapping) > 1: for idx, mapping in enumerate(dims_mapping[1:]): - assert mapping == -1, \ - "{} only the batch dimension (0-dim) can be sharded, but the dimension {} is sharded by {} part."\ - .format(op_desc.type(), idx, mapping) + assert ( + mapping == -1 + ), "{} only the batch dimension (0-dim) can be sharded, but the dimension {} is sharded by {} part.".format( + op_desc.type(), idx, mapping + ) batch_dim_mappings.append(dims_mapping[0]) else: - assert dims_mapping[0] == -1, \ - "{} only the batch dimension (1-dim) of XShape can be sharded, but the dimension 0 is sharded by {} part."\ - .format(op_desc.type(), mapping) + assert ( + dims_mapping[0] == -1 + ), "{} only the batch dimension (1-dim) of XShape can be sharded, but the dimension 0 is sharded by {} part.".format( + op_desc.type(), mapping + ) if len(dims_mapping) > 2: for idx, mapping in enumerate(dims_mapping[2:]): - assert mapping == -1, \ - "{} only the batch dimension (1-dim) of XShape can be sharded, but the dimension {} is sharded by {} part."\ - .format(op_desc.type(), idx, mapping) + assert ( + mapping == -1 + ), "{} only the batch dimension (1-dim) of XShape can be sharded, but the dimension {} is sharded by {} part.".format( + op_desc.type(), idx, mapping + ) 
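# ---- illustrative sketch (not part of the patch) ----
# The op-role checks above are bitmask tests. The flag values below mirror
# paddle's OpRole enum (Forward=0, Backward=1, Optimize=2, Loss=0x100) but are
# an assumption here; read the real values from OpRole. Note the literal 257
# special-cased in set_grad_var_shape is Backward | Loss.
FORWARD, BACKWARD, OPTIMIZE, LOSS = 0x0000, 0x0001, 0x0002, 0x0100

def is_backward(op_role):
    return bool(op_role & BACKWARD)      # bit test, as in is_backward_op

def is_loss(op_role):
    return op_role == (FORWARD | LOSS)   # exact match, as in is_loss_op

print(is_backward(BACKWARD | LOSS))      # True, the 257 case
print(is_loss(LOSS))                     # True, the loss op carries Forward | Loss
# ---- end sketch ----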
batch_dim_mappings.append(dims_mapping[1]) compatible_dim_mapping = compute_compatible_dim_mapping(batch_dim_mappings) - assert compatible_dim_mapping is not None, "There is no compatible dim mapping." + assert ( + compatible_dim_mapping is not None + ), "There is no compatible dim mapping." for arg_name in op_desc.input_arg_names(): serial_tensor = dist_op.get_serial_input(arg_name) if serial_tensor.is_parameter: @@ -1273,8 +1506,9 @@ def update_op_dims_mapping_by_elementwise_like_dist_impl(dist_op): if input_dims_mapping_lens[arg_name] < max_dims_mapping_len: new_dims_mapping = [-1 for _ in range(max_dims_mapping_len)] for i in range(input_dims_mapping_lens[arg_name]): - new_idx = (max_dims_mapping_len - - input_dims_mapping_lens[arg_name]) + i + new_idx = ( + max_dims_mapping_len - input_dims_mapping_lens[arg_name] + ) + i new_dims_mapping[new_idx] = input_dims_mapping_dict[arg_name][i] dims_mapping_list.append(new_dims_mapping) else: @@ -1286,7 +1520,9 @@ def update_op_dims_mapping_by_elementwise_like_dist_impl(dist_op): dims_mapping_list.append(dims_mapping) compatible_dims_mapping = compute_compatible_dims_mapping(dims_mapping_list) - assert compatible_dims_mapping is not None, "There is no compatible dim mapping." + assert ( + compatible_dims_mapping is not None + ), "There is no compatible dim mapping." for arg_name in input_arg_names: if input_dims_mapping_lens[arg_name] < max_dims_mapping_len: @@ -1294,55 +1530,64 @@ def update_op_dims_mapping_by_elementwise_like_dist_impl(dist_op): -1 for _ in range(input_dims_mapping_lens[arg_name]) ] for i in range(input_dims_mapping_lens[arg_name]): - new_idx = (max_dims_mapping_len - - input_dims_mapping_lens[arg_name]) + i + new_idx = ( + max_dims_mapping_len - input_dims_mapping_lens[arg_name] + ) + i new_dims_mapping[i] = compatible_dims_mapping[new_idx] if new_dims_mapping != input_dims_mapping_dict[arg_name]: op_dist_attr.set_input_dims_mapping(arg_name, new_dims_mapping) changed = True else: if compatible_dims_mapping != input_dims_mapping_dict[arg_name]: - op_dist_attr.set_input_dims_mapping(arg_name, - compatible_dims_mapping) + op_dist_attr.set_input_dims_mapping( + arg_name, compatible_dims_mapping + ) changed = True for arg_name in output_arg_names: dims_mapping = op_dist_attr.get_output_dims_mapping(arg_name) if compatible_dims_mapping != dims_mapping: - op_dist_attr.set_output_dims_mapping(arg_name, - compatible_dims_mapping) + op_dist_attr.set_output_dims_mapping( + arg_name, compatible_dims_mapping + ) changed = True return changed -def get_all_distributed_main_program(serial_program_info, dist_context, - parallelizer): +def get_all_distributed_main_program( + serial_program_info, dist_context, parallelizer +): "Get all distributed main programs by dist_context." 
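# ---- illustrative sketch (not part of the patch) ----
# Assumed semantics of the elementwise-like rule above: dims_mappings of
# different ranks are right-aligned with -1 padding (broadcast style), then
# merged per position, where -1 (replicated) yields to any concrete mesh dim
# and two different concrete dims conflict. Standalone re-implementation for
# illustration only.
def merge_dim(column):
    merged = -1
    for d in column:
        if d == -1:
            continue
        if merged not in (-1, d):
            return None                  # sharded along two different mesh dims
        merged = d
    return merged

def compatible_dims_mapping(mappings):
    max_len = max(len(m) for m in mappings)
    aligned = [[-1] * (max_len - len(m)) + list(m) for m in mappings]
    merged = [merge_dim(col) for col in zip(*aligned)]
    return None if None in merged else merged

# x: [batch, seq, hidden] sharded on mesh dim 0; bias: [hidden] replicated
print(compatible_dims_mapping([[0, -1, -1], [-1]]))  # [0, -1, -1]
print(compatible_dims_mapping([[0, -1], [1, -1]]))   # None, conflicting shards
# ---- end sketch ----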
from .dist_context import DistributedOperatorContext, DistributedContext + cluster = serial_program_info.cluster copied_parallelizer = copy.deepcopy(parallelizer) all_dist_main_program = [] - ranks = paddle.distributed.get_world_size() if cluster is None else len( - cluster.get_all_devices("GPU")) + ranks = ( + paddle.distributed.get_world_size() + if cluster is None + else len(cluster.get_all_devices("GPU")) + ) for rank_id in range(ranks): used_dist_context = copy.deepcopy(dist_context) used_dist_context._dist_op_context = DistributedOperatorContext() - _, _, dist_startup_program, dist_main_program, _ = copied_parallelizer._get_dist_program( - rank_id, used_dist_context) + ( + _, + _, + dist_startup_program, + dist_main_program, + _, + ) = copied_parallelizer._get_dist_program(rank_id, used_dist_context) all_dist_main_program.append(dist_main_program) return all_dist_main_program class SerialProgramInfo: - - def __init__(self, - train_program, - satrtup_program, - loss, - optimizer, - cluster=None): + def __init__( + self, train_program, satrtup_program, loss, optimizer, cluster=None + ): self._train_program = train_program self._startup_program = satrtup_program self._loss = loss @@ -1371,7 +1616,6 @@ def cluster(self): def get_standalone_cost_data(distributed_programs): - def _compute_runtime(op_cost, op, vars): runtime = 0 try: @@ -1384,32 +1628,47 @@ def _compute_runtime(op_cost, op, vars): parsed_info = op_config.split("\n") variable = "(Variable)" for info in parsed_info: - variable = "(Variable)" if "(Variable)" in info else "(list" + variable = ( + "(Variable)" if "(Variable)" in info else "(list" + ) if variable in info: - arg_name_lower = info[:info.find(variable) - 1] + arg_name_lower = info[: info.find(variable) - 1] shape_left_boundary = info.find("[") shape_right_boundary = info.find("]") - assert shape_left_boundary > 0 and shape_right_boundary > 0 and shape_right_boundary > shape_left_boundary, "Get shape failed." - shape = info[shape_left_boundary + - 1:shape_right_boundary].split(",") + assert ( + shape_left_boundary > 0 + and shape_right_boundary > 0 + and shape_right_boundary > shape_left_boundary + ), "Get shape failed." + shape = info[ + shape_left_boundary + 1 : shape_right_boundary + ].split(",") shape = list(map(lambda x: int(x.strip()), shape)) dtype_factor = 1 total_static_input_size += reduce(lambda x, y: x * y, shape) if op.type == "c_embedding": - arg_name_lower = "w" if arg_name_lower == "weight" else "ids" + arg_name_lower = ( + "w" if arg_name_lower == "weight" else "ids" + ) for arg_name in op.input_names: if arg_name.lower() == arg_name_lower: for var_name in op.input(arg_name): var = vars[var_name] total_actual_input_size += reduce( - lambda x, y: x * y, var.shape) + lambda x, y: x * y, var.shape + ) break - assert total_static_input_size > 0 and total_actual_input_size > 0, "Get input size failed." + assert ( + total_static_input_size > 0 and total_actual_input_size > 0 + ), "Get input size failed." 
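# ---- illustrative sketch (not part of the patch) ----
# The parsing in _compute_runtime above only needs the bracketed shape of each
# "(Variable)" line in the cost-model op config; the config line below is a
# made-up example of that layout, not a real record.
from functools import reduce

info = "w (Variable) [1024, 4096]"  # assumed layout for illustration
shape = [int(x) for x in info[info.find("[") + 1 : info.find("]")].split(",")]
static_size = reduce(lambda x, y: x * y, shape)
print(shape, static_size)           # [1024, 4096] 4194304
# ---- end sketch ----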
- actual_runtime = total_actual_input_size / total_static_input_size * runtime + actual_runtime = ( + total_actual_input_size / total_static_input_size * runtime + ) return actual_runtime import paddle.cost_model as cm + cost_model = cm.CostModel() cost_model.static_cost_data() DEFAULT_MULTIPLE = 2 @@ -1420,13 +1679,16 @@ def _compute_runtime(op_cost, op, vars): "reshape2": "reshape", "unsqueeze2": "unsqueeze", "reduce_sum": "sum", - "elementwise_div": "divide" + "elementwise_div": "divide", } standalone_cost_data = [] # skip ops not_enum_ops = [ - "create_py_reader", "create_double_buffer_reader", "read", "assign" + "create_py_reader", + "create_double_buffer_reader", + "read", + "assign", ] for distributed_program in distributed_programs: cost_data = {} @@ -1436,26 +1698,33 @@ def _compute_runtime(op_cost, op, vars): if op.type in not_enum_ops: cost_data[op.desc.id()] = runtime continue - dtype = str(vars[op.input_arg_names[0]].dtype - ) if op.input_arg_names else "float32" + dtype = ( + str(vars[op.input_arg_names[0]].dtype) + if op.input_arg_names + else "float32" + ) if int(op.attr('op_role')) == int(OpRole.Backward): if "_grad" in op.type: forward_op_name = op.type[:-5] if forward_op_name in OP_NAME_MAPPING.keys(): forward_op_name = OP_NAME_MAPPING[forward_op_name] - op_cost = cost_model.get_static_op_time(forward_op_name, - forward=False, - dtype=dtype) + op_cost = cost_model.get_static_op_time( + forward_op_name, forward=False, dtype=dtype + ) if op_cost: runtime = _compute_runtime(op_cost, op, vars) else: - op_cost = cost_model.get_static_op_time(forward_op_name, - dtype=dtype) + op_cost = cost_model.get_static_op_time( + forward_op_name, dtype=dtype + ) if op_cost: runtime = 2 * _compute_runtime(op_cost, op, vars) elif int(op.attr('op_role')) == int(OpRole.Forward): - op_name = OP_NAME_MAPPING[ - op.type] if op.type in OP_NAME_MAPPING.keys() else op.type + op_name = ( + OP_NAME_MAPPING[op.type] + if op.type in OP_NAME_MAPPING.keys() + else op.type + ) op_cost = cost_model.get_static_op_time(op_name) if op_cost: runtime = _compute_runtime(op_cost, op, vars) @@ -1494,7 +1763,8 @@ def to_list(value): def debug_program(program, path, name): filename = os.path.join( - path, name + '_program' + ".%d" % (paddle.distributed.get_rank())) + path, name + '_program' + ".%d" % (paddle.distributed.get_rank()) + ) with open(filename, 'w') as f: f.write(str(program)) @@ -1504,3 +1774,142 @@ def ring_id_to_process_group(ring_id): if g.id == ring_id: return g return None + + +def find_higher_order_backward_op(program): + + higher_order_op_suffix = ['_grad_grad', 'triple_grad'] + for block in program.blocks: + for op in block.ops: + for suffix in higher_order_op_suffix: + if suffix in op.type: + return True + + return False + + +def get_lr(optimizer): + if isinstance(optimizer, paddle.optimizer.Optimizer): + return optimizer.get_lr() + elif isinstance(optimizer, paddle.fluid.optimizer.Optimizer): + if isinstance(optimizer._learning_rate, float): + return optimizer._learning_rate + else: + return optimizer._learning_rate() + else: + raise TypeError( + "'optimizer' must be object of class `paddle.optimizer.Optimizer`" + " or `paddle.fluid.optimizer.Optimizer`, but got {}.".format( + type(optimizer) + ) + ) + + +def initialize_pg_in_full_mode(all_process_groups, cur_rank): + import socket + from ..collective import _get_global_env + + has_recv_by_socket = [] + # This is a magic number + magic_num = 500 + genv = _get_global_env() + cur_rank_ip, cur_rank_port = genv.current_endpoint.split(":") + 
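# ---- illustrative sketch (not part of the patch) ----
# The estimate above rescales the profiled static op time by the ratio of the
# op's actual input size to the profiled one, and roughly doubles the forward
# time when no backward timing exists. Numbers are illustrative.
def scale_runtime(static_time, static_input_size, actual_input_size):
    return actual_input_size / static_input_size * static_time

fwd = scale_runtime(1.5, 32 * 512, 64 * 512)  # 3.0: twice the profiled batch
bwd = 2 * fwd                                 # fallback when only forward cost is known
print(fwd, bwd)                               # 3.0 6.0
# ---- end sketch ----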
cur_rank_recv_port = int(cur_rank_port) + magic_num + server_socket = None + # Large enough for recv rank + buff_size = 1024 + server_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server_socket.bind((cur_rank_ip, cur_rank_recv_port)) + # The 10 is an empirical value + server_socket.listen(10) + client_sockets = {} + for process_group in all_process_groups: + if cur_rank not in process_group.ranks: + continue + if len(process_group.ranks) == 2: + index = process_group.ranks.index(cur_rank) + is_send = True if index == 0 else False + if is_send: + recv_rank = process_group.ranks[1] + recv_rank_ip, recv_rank_port = genv.trainer_endpoints[ + recv_rank + ].split(":") + connect_port = int(recv_rank_port) + magic_num + client_socket = socket.socket( + socket.AF_INET, socket.SOCK_STREAM + ) + client_socket.connect((recv_rank_ip, connect_port)) + client_socket.send(str(cur_rank).encode('utf-8')) + rank = client_socket.recv(buff_size).decode('utf-8') + rank = int(rank) + if rank != recv_rank: + raise ValueError( + "Please check comm pair, the recv rank should be {} but got {}.".format( + recv_rank, rank + ) + ) + else: + print( + "It is able to instantiate {} as sender now.".format( + process_group.ranks + ) + ) + client_socket.close() + else: + send_rank = process_group.ranks[0] + while True: + if send_rank not in has_recv_by_socket: + client_socket, recv_addr = server_socket.accept() + rank = int(client_socket.recv(buff_size).decode()) + client_sockets[rank] = client_socket + has_recv_by_socket.append(rank) + else: + client_sockets[send_rank].send( + str(cur_rank).encode("utf-8") + ) + client_sockets[send_rank].close() + print( + "It is able to instantiate {} as recver now.".format( + process_group.ranks + ) + ) + break + print( + "***process_group: id:", + process_group.id, + "rank:", + process_group.ranks, + ) + process_group.instantiate() + server_socket.close() + + +def get_input_split_info(cur_rank, var, dist_context): + # deduce how the input data is split among the cluster + tensor_dist_attr = dist_context.get_tensor_dist_attr_for_program(var) + process_mesh = tensor_dist_attr.process_mesh + dims_mapping = tensor_dist_attr.dims_mapping + + if cur_rank not in process_mesh.processes: + rank_id = _get_corresponding_rank(dist_context, process_mesh, cur_rank) + else: + rank_id = cur_rank + + batch_size_axis = dims_mapping[0] + if batch_size_axis > -1 and process_mesh.topology[batch_size_axis] > 1: + group_ranks = _get_comm_group( + process_mesh.processes, + process_mesh.topology, + batch_size_axis, + rank_id, + ) + return len(group_ranks), group_ranks.index(rank_id) + + return 1, 0 + + +def validate_opt(optimizer): + if optimizer is not None: + optimizer._parameter_list = None + optimizer._param_groups = None + return optimizer diff --git a/python/paddle/distributed/cloud_utils.py b/python/paddle/distributed/cloud_utils.py index a8eedb96a3ecda..651298d6d766f6 100644 --- a/python/paddle/distributed/cloud_utils.py +++ b/python/paddle/distributed/cloud_utils.py @@ -14,10 +14,8 @@ import os import paddle -from paddle.distributed.utils import get_cluster -from paddle.distributed.utils import logger -from paddle.distributed.utils import get_gpus -from paddle.distributed.utils import get_cluster_from_args +from paddle.distributed.utils.launch_utils import get_cluster, get_gpus, get_cluster_from_args +from paddle.distributed.utils.launch_utils import logger __all__ = [] diff --git a/python/paddle/distributed/collective.py b/python/paddle/distributed/collective.py index 
5960be4800de83..d2e972c4d02792 100644 --- a/python/paddle/distributed/collective.py +++ b/python/paddle/distributed/collective.py @@ -40,85 +40,24 @@ from paddle import _C_ops, _legacy_C_ops import paddle.fluid.dygraph_utils as dygraph_utils import contextlib +from .fleet.layers.mpu.mp_ops import split +from .fleet.layers.mpu.mp_ops import _c_identity +from .fleet.layers.mpu.mp_ops import _c_concat +from .fleet.layers.mpu.mp_ops import _c_split +from .fleet.layers.mpu.mp_ops import _mp_allreduce +from .fleet.layers.mpu.mp_ops import _c_lookup_table +from .fleet.layers.mpu.mp_ops import _Linear +from .fleet.layers.mpu.mp_ops import _set_var_distributed +from .fleet.layers.mpu.mp_ops import _c_softmax_with_cross_entropy +from .fleet.layers.mpu.mp_ops import _linear +from .fleet.layers.mpu.mp_ops import _parallel_linear +from .fleet.layers.mpu.mp_ops import _parallel_embedding +from .communication.group import Group, _add_new_group +from .communication.all_reduce import all_reduce +from .communication.reduce import _get_reduce_op, ReduceOp __all__ = [] - -class ReduceOp: - """ - Specify the type of operation used for element-wise reductions. - It should be one of the following values: - - ReduceOp.SUM - - ReduceOp.MAX - - ReduceOp.MIN - - ReduceOp.PROD - - Examples: - .. code-block:: python - - # required: distributed - import paddle - import paddle.distributed as dist - - dist.init_parallel_env() - if dist.get_rank() == 0: - data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) - else: - data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) - dist.all_reduce(data, op=dist.ReduceOp.SUM) - print(data) - # [[5, 7, 9], [5, 7, 9]] (2 GPUs) - """ - SUM = 0 - MAX = 1 - MIN = 2 - PROD = 3 - AVG = 4 - - -class Group(): - """ - The abstract representation of group. - """ - - def __init__(self, rank, rank_num, id=0, ranks=[], pg=None, name=None): - self.rank = rank - self.nranks = rank_num - self.id = id - self.ranks = ranks - self.pg = pg - self.name = name - - def is_member(self): - if self.rank < 0: - return False - if self.nranks < 2: - return False - return True - - def get_group_rank(self, rank): - if self.is_member() and rank in self.ranks: - return self.ranks.index(rank) - else: - return -1 - - @property - def process_group(self): - return self.pg - - def __repr__(self): - debug_str = "rank: {}, nranks: {}, id: {}, ranks: ".format( - self.rank, self.nranks, self.id) - debug_str += ", ".join(map(str, self.ranks)) - debug_str += "; name: " - debug_str += self.name if self.name else "None" - return debug_str - - _global_env = None @@ -166,9 +105,9 @@ def _get_group_map(): global _group_map if _global_env_gid not in _group_map: genv = _get_global_env() - _group_map[_global_env_gid] = Group(genv.rank, - genv.world_size, - ranks=list(range(genv.world_size))) + _group_map[_global_env_gid] = Group( + genv.rank, 0, list(range(genv.world_size)) + ) return _group_map @@ -183,8 +122,10 @@ def _get_group_map_by_name(): def _get_default_group(): global _group_map_by_name - assert is_initialized(), ("Call paddle.distributed.init_parallel_env first " - "to initialize the distributed environment.") + assert is_initialized(), ( + "Call paddle.distributed.init_parallel_env first " + "to initialize the distributed environment." 
+ ) return _get_group_map_by_name()[_default_group_name] @@ -216,19 +157,6 @@ def _new_ring_id(): return len(_get_group_map()) + max(_get_global_env().nrings, 9) -def _get_reduce_op(reduce_op, func_name): - if reduce_op == ReduceOp.SUM: - return core.ReduceOp.SUM - elif reduce_op == ReduceOp.MAX: - return core.ReduceOp.MAX - elif reduce_op == ReduceOp.MIN: - return core.ReduceOp.MIN - elif reduce_op == ReduceOp.PROD: - return core.ReduceOp.PRODUCT - else: - raise ValueError("Unknown reduce_op type for {}.".format(func_name)) - - def get_group(id=0): """ @@ -253,21 +181,23 @@ def get_group(id=0): return gm[id] if id in gm else None -def _new_process_group_impl(backend, - store, - rank, - world_size, - group_name, - pg_options, - group_id=0, - src_rank=None, - dst_rank=None): +def _new_process_group_impl( + backend, + store, + rank, + world_size, + group_name, + pg_options, + group_id=0, + src_rank=None, + dst_rank=None, +): pg = None genv = _get_global_env() if backend != 'heter': assert src_rank is None and dst_rank is None, ( - "src_rank and dst_rank " - "can only be set for heter backend.") + "src_rank and dst_rank " "can only be set for heter backend." + ) assert backend in _valid_backend_list, "Unsupported backend: %s." % backend if backend == "gloo": place = core.CPUPlace() @@ -296,24 +226,27 @@ def _new_process_group_impl(backend, switch_ep = os.getenv("CLUSTER_SWITCH", None) assert switch_ep, "please set the CLUSTER_SWITCH variable." cluster_size_cumsum = np.cumsum(cluster_size) - cluster_offset = 0 if cluster_id == 0 else cluster_size_cumsum[ - cluster_id - 1] + cluster_offset = ( + 0 if cluster_id == 0 else cluster_size_cumsum[cluster_id - 1] + ) global_rank = cluster_offset + rank global_world_size = cluster_size_cumsum[-1] global_rank, global_world_size = _get_global_config(backend, rank) - pg = core.ProcessGroupHeter(store, - rank=global_rank, - world_size=global_world_size, - place=place, - gid=group_id, - local_rank=rank, - local_size=world_size, - gloo_rank=cluster_id, - gloo_size=len(cluster_size), - with_switch=True, - switch_endpoint=switch_ep, - src_rank=src_rank, - dst_rank=dst_rank) + pg = core.ProcessGroupHeter( + store, + rank=global_rank, + world_size=global_world_size, + place=place, + gid=group_id, + local_rank=rank, + local_size=world_size, + gloo_rank=cluster_id, + gloo_size=len(cluster_size), + with_switch=True, + switch_endpoint=switch_ep, + src_rank=src_rank, + dst_rank=dst_rank, + ) return pg @@ -359,10 +292,12 @@ def barrier(group=None): if not isinstance(ring_id, int): raise ValueError("The type of 'group' for barrier must be int.") helper = LayerHelper(op_type, **locals()) - helper.append_op(type=op_type, - inputs={'X': [temp]}, - outputs={'Out': [temp]}, - attrs={'ring_id': ring_id}) + helper.append_op( + type=op_type, + inputs={'X': [temp]}, + outputs={'Out': [temp]}, + attrs={'ring_id': ring_id}, + ) # _custom_gid provides a way for users to @@ -384,7 +319,7 @@ def _barrier_by_tcp_store(group_name, store, timeout): return barrier_prefix = "Barrier/" + group_name + "/" - is_master = (global_rank == 0) + is_master = global_rank == 0 def _check_keys_ready(wait_keys): start_time = time.time() @@ -397,9 +332,12 @@ def _check_keys_ready(wait_keys): "Keys {} are not ready sinck rank {} is waiting them." "Two reason may cause this error:\n 1. The create process group api should be called by all ranks.\n" " 2. 
Try to increase the waiting time.\n".format( - group_name, wait_keys, global_rank)) + group_name, wait_keys, global_rank + ) + ) wait_keys = list( - filter(lambda key: int(store.get(key)) != 1, wait_keys)) + filter(lambda key: int(store.get(key)) != 1, wait_keys) + ) # all the workers set their exiting key and exit # the master will wait for all workers' exiting key, ensure to exit in the end @@ -433,7 +371,7 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): paddle.distributed.init_parallel_env() tindata = paddle.randn(shape=[2, 3]) gp = paddle.distributed.new_group([2,4,6]) - paddle.distributed.all_reduce(tindata, group=gp, use_calc_stream=False) + paddle.distributed.all_reduce(tindata, group=gp, sync_op=False) """ global _custom_gid @@ -451,29 +389,35 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): ranks = global_ranks assert len(ranks) <= len(global_ranks), ( "Size of new group must be less than or " - "equal to that of the default global group.") + "equal to that of the default global group." + ) size = len(ranks) ranks = sorted(ranks) if backend == 'heter' or (size > 1 and global_rank in ranks): rank = 0 if backend == 'heter' else ranks.index(global_rank) src_rank = ranks[0] if backend == 'heter' else None dst_rank = ranks[1] if backend == 'heter' else None - pg = _new_process_group_impl(backend, - _default_store, - rank, - size, - group_name, - pg_options=None, - group_id=gid, - src_rank=src_rank, - dst_rank=dst_rank) + pg = _new_process_group_impl( + backend, + _default_store, + rank, + size, + group_name, + pg_options=None, + group_id=gid, + src_rank=src_rank, + dst_rank=dst_rank, + ) else: rank = -1 pg = None - group = Group(rank, size, id=gid, ranks=ranks, pg=pg, name=group_name) + group = Group(rank, gid, ranks, pg=pg, name=group_name) _group_map_by_name[group_name] = group _group_map[gid] = group _group_map_backend[group] = backend + # TODO: The method below is a new method for group management, will replace the previous + # three in the future. 
+ _add_new_group(group) # TODO(shenliang03): This is a temporary solution to solve the problem of # hang caused by tcp @@ -487,7 +431,7 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): if not backend: backend = 'nccl' - assert backend == 'nccl', ("backend other than nccl is not supported yet") + assert backend == 'nccl', "backend other than nccl is not supported yet" genv = _get_global_env() global_rank = genv.rank @@ -495,13 +439,13 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): ring_id = _new_ring_id() if global_rank not in ranks: - gp = Group(-1, -1, ring_id, ranks) + gp = Group(-1, ring_id, ranks) _group_map[ring_id] = gp else: ranks = sorted(ranks) group_rank = ranks.index(global_rank) group_size = len(ranks) - gp = Group(group_rank, group_size, ring_id, ranks) + gp = Group(group_rank, ring_id, ranks) _group_map[ring_id] = gp if group_size >= 2: @@ -516,31 +460,37 @@ def new_group(ranks=None, backend=None, timeout=_default_timeout): if core.is_compiled_with_cuda(): place = core.CUDAPlace(genv.device_id) - core.NCCLParallelContext(strategy, - place).init_with_ring_id(ring_id) + core.NCCLParallelContext(strategy, place).init_with_ring_id( + ring_id + ) elif core.is_compiled_with_npu(): place = core.NPUPlace(genv.device_id) - core.HCCLParallelContext(strategy, - place).init_with_ring_id(ring_id) + core.HCCLParallelContext(strategy, place).init_with_ring_id( + ring_id + ) elif core.is_compiled_with_mlu(): place = core.MLUPlace(genv.device_id) - core.CNCLParallelContext(strategy, - place).init_with_ring_id(ring_id) + core.CNCLParallelContext(strategy, place).init_with_ring_id( + ring_id + ) elif core.is_compiled_with_xpu(): place = core.XPUPlace(genv.device_id) - core.BKCLParallelContext(strategy, - place).init_with_ring_id(ring_id) + core.BKCLParallelContext(strategy, place).init_with_ring_id( + ring_id + ) else: - assert False, ("no cuda device found") + assert False, "no cuda device found" else: return gp # TODO(shenliang03): This is a temporary solution to solve the problem of # hang caused by cross-creation of new_group - tmp = paddle.to_tensor( - [1], dtype="int32") if _non_static_mode() else fill_constant( - [0], dtype="int32", value="1") - paddle.distributed.all_reduce(tmp, use_calc_stream=True) + tmp = ( + paddle.to_tensor([1], dtype="int32") + if _non_static_mode() + else fill_constant([0], dtype="int32", value="1") + ) + paddle.distributed.all_reduce(tmp, sync_op=True) paddle.distributed.wait(tmp) return gp @@ -550,7 +500,8 @@ def is_initialized(): Check whether the distributed environment has been initialized - Returns (bool): `True` if distributed environment has been initialized, otherwise `False`. + Returns: + `True` if distributed environment has been initialized, otherwise `False`. Examples: .. code-block:: python @@ -575,10 +526,10 @@ def destroy_process_group(group=None): Destroy a given group for communication Args: - group (ProcessGroup, optional): The group to be destroyed. All of process groups, including - the default group, will be destroyed and the distributed + group (ProcessGroup, optional): The group to be destroyed. All of process groups, including + the default group, will be destroyed and the distributed environment will be deinitialized. 
- + Returns : None Examples: @@ -636,7 +587,7 @@ def wait(tensor, group=None, use_calc_stream=True): paddle.distributed.init_parallel_env() tindata = paddle.randn(shape=[2, 3]) - paddle.distributed.all_reduce(tindata, use_calc_stream=True) + paddle.distributed.all_reduce(tindata, sync_op=True) paddle.distributed.wait(tindata) """ @@ -670,8 +621,9 @@ def _sync_calc_stream(tensor): def _sync_comm_stream(tensor, ring_id=0): if _non_static_mode(): - return _legacy_C_ops.c_sync_comm_stream([tensor], [tensor], 'ring_id', - ring_id) + return _legacy_C_ops.c_sync_comm_stream( + [tensor], [tensor], 'ring_id', ring_id + ) op_type = 'c_sync_comm_stream' @@ -684,7 +636,7 @@ def _sync_comm_stream(tensor, ring_id=0): ) -def broadcast(tensor, src, group=None, use_calc_stream=True): +def broadcast(tensor, src, group=None, sync_op=True): """ Broadcast a tensor from the source to all others. @@ -698,11 +650,10 @@ def broadcast(tensor, src, group=None, use_calc_stream=True): Args: tensor (Tensor): The Tensor to send if current rank is the source, or the Tensor to receive otherwise. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank. - group (Group): The group instance return by new_group or None for global default group. - use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). - Default to True. + group (Group, optional): The group instance return by new_group or None for global default group. + sync_op (bool, optional): Whether this op is a sync op. The default value is True. Returns: None. @@ -733,139 +684,62 @@ def broadcast(tensor, src, group=None, use_calc_stream=True): if in_dygraph_mode(): group = _get_default_group() if group is None else group gsrc = group.get_group_rank(src) - assert gsrc >= 0, ("src rank out of group, need global rank") + assert gsrc >= 0, "src rank out of group, need global rank" task = group.process_group.broadcast(tensor, gsrc) - if use_calc_stream: + if sync_op: task.wait() return None else: return task + use_calc_stream = sync_op ring_id = ring_id = 0 if group is None else group.id gsrc = src if group is None else group.get_group_rank(src) - assert gsrc >= 0, ("src rank out of group, need global rank") + assert gsrc >= 0, "src rank out of group, need global rank" if _non_static_mode(): - return _legacy_C_ops.c_broadcast(tensor, tensor, 'root', gsrc, - 'use_calc_stream', use_calc_stream, - 'ring_id', ring_id) + return _legacy_C_ops.c_broadcast( + tensor, + tensor, + 'root', + gsrc, + 'use_calc_stream', + use_calc_stream, + 'ring_id', + ring_id, + ) op_type = 'c_broadcast' - check_variable_and_dtype(tensor, 'tensor', [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' - ], 'broadcast') - - helper = LayerHelper(op_type, **locals()) - helper.append_op(type=op_type, - inputs={'X': [tensor]}, - outputs={'Out': [tensor]}, - attrs={ - 'root': gsrc, - 'use_calc_stream': use_calc_stream, - 'ring_id': ring_id, - }) - - -def all_reduce(tensor, op=ReduceOp.SUM, group=None, use_calc_stream=True): - """ - - Reduce a tensor over all ranks so that all get the result. - As shown below, one process is started with a GPU and the data of this process is represented - by its group rank. The reduce operator is sum. Through all_reduce operator, - each GPU will have the sum of the data from all GPUs. - - .. 
image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/allreduce.png - :width: 800 - :alt: all_reduce - :align: center - - Args: - tensor (Tensor): The input Tensor. It also works as the output Tensor. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. - op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default value is ReduceOp.SUM. - group (Group): The group instance return by new_group or None for global default group. - use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). - Default to True. - - Returns: - None. - - Examples: - .. code-block:: python - - # required: distributed - import paddle - import paddle.distributed as dist - - dist.init_parallel_env() - if dist.get_rank() == 0: - data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) - else: - data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) - dist.all_reduce(data) - print(data) - # [[5, 7, 9], [5, 7, 9]] (2 GPUs) - """ - if group is not None and not group.is_member(): - return - - if in_dygraph_mode(): - op_type = _get_reduce_op(op, "all_reduce") - group = _get_default_group() if group is None else group - task = group.process_group.allreduce(tensor, op_type) - if use_calc_stream: - task.wait() - return None - else: - return task - - ring_id = 0 if group is None else group.id - if _non_static_mode(): - if op == ReduceOp.SUM: - return _legacy_C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id) - elif op == ReduceOp.MAX: - return _legacy_C_ops.c_allreduce_max_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id) - elif op == ReduceOp.MIN: - return _legacy_C_ops.c_allreduce_min_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id) - elif op == ReduceOp.PROD: - return _legacy_C_ops.c_allreduce_prod_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id) - else: - raise ValueError("Unknown parameter: {}.".format(op)) + check_variable_and_dtype( + tensor, + 'tensor', + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'int8', + 'uint8', + 'bool', + ], + 'broadcast', + ) - check_variable_and_dtype(tensor, 'tensor', [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' - ], 'all_reduce') - if op == ReduceOp.SUM: - op_type = 'c_allreduce_sum' - elif op == ReduceOp.MAX: - op_type = 'c_allreduce_max' - elif op == ReduceOp.MIN: - op_type = 'c_allreduce_min' - elif op == ReduceOp.PROD: - op_type = 'c_allreduce_prod' - if not isinstance(ring_id, int): - raise ValueError("The type of 'ring_id' for all_reduce should be int.") helper = LayerHelper(op_type, **locals()) - helper.append_op(type=op_type, - inputs={'X': [tensor]}, - outputs={'Out': [tensor]}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': use_calc_stream - }) + helper.append_op( + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + attrs={ + 'root': gsrc, + 'use_calc_stream': use_calc_stream, + 'ring_id': ring_id, + }, + ) -def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True): +def reduce(tensor, dst, op=ReduceOp.SUM, group=None, sync_op=True): """ Reduce a tensor to the destination from all others. 
As shown below, one process is started with a GPU and the data of this process is represented @@ -879,12 +753,11 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True): Args: tensor (Tensor): The output Tensor for the destination and the input Tensor otherwise. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank id. - op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default value is ReduceOp.SUM. - group (Group): The group instance return by new_group or None for global default group. - use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). - Default to True. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM. + group (Group, optional): The group instance return by new_group or None for global default group. + sync_op (bool, optional): Whether this op is a sync op. The default value is True. Returns: None. @@ -913,44 +786,83 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True): op_type = _get_reduce_op(op, "reduce") group = _get_default_group() if group is None else group gdst = group.get_group_rank(dst) - assert gdst >= 0, ("dst rank out of group, need global rank") + assert gdst >= 0, "dst rank out of group, need global rank" task = group.process_group.reduce(tensor, gdst, op_type) - if use_calc_stream: + if sync_op: task.wait() return None else: return task + use_calc_stream = sync_op ring_id = 0 if group is None else group.id gdst = dst if group is None else group.get_group_rank(dst) - assert gdst >= 0, ("dst rank out of group, need global rank") + assert gdst >= 0, "dst rank out of group, need global rank" if _non_static_mode(): if op == ReduceOp.SUM: - return _legacy_C_ops.c_reduce_sum(tensor, tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id, 'root_id', gdst) + return _legacy_C_ops.c_reduce_sum( + tensor, + tensor, + 'use_calc_stream', + use_calc_stream, + 'ring_id', + ring_id, + 'root_id', + gdst, + ) elif op == ReduceOp.MAX: - return _legacy_C_ops.c_reduce_max(tensor, tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id, 'root_id', gdst) + return _legacy_C_ops.c_reduce_max( + tensor, + tensor, + 'use_calc_stream', + use_calc_stream, + 'ring_id', + ring_id, + 'root_id', + gdst, + ) elif op == ReduceOp.MIN: - return _legacy_C_ops.c_reduce_min(tensor, tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id, 'root_id', gdst) + return _legacy_C_ops.c_reduce_min( + tensor, + tensor, + 'use_calc_stream', + use_calc_stream, + 'ring_id', + ring_id, + 'root_id', + gdst, + ) elif op == ReduceOp.PROD: - return _legacy_C_ops.c_reduce_prod(tensor, tensor, - 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id, 'root_id', gdst) + return _legacy_C_ops.c_reduce_prod( + tensor, + tensor, + 'use_calc_stream', + use_calc_stream, + 'ring_id', + ring_id, + 'root_id', + gdst, + ) else: raise ValueError("Unknown parameter: {}.".format(op)) op_type = 'c_reduce' - check_variable_and_dtype(tensor, 'tensor', [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' - ], 'reduce') + check_variable_and_dtype( + tensor, + 'tensor', + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'int8', + 'uint8', + 'bool', + ], + 'reduce', + ) if op == ReduceOp.SUM: op_type = 'c_reduce_sum' @@ 
-962,17 +874,19 @@ def reduce(tensor, dst, op=ReduceOp.SUM, group=None, use_calc_stream=True): op_type = 'c_reduce_prod' helper = LayerHelper(op_type, **locals()) - helper.append_op(type=op_type, - inputs={'X': [tensor]}, - outputs={'Out': [tensor]}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': use_calc_stream, - 'root_id': gdst, - }) + helper.append_op( + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': use_calc_stream, + 'root_id': gdst, + }, + ) -def all_gather(tensor_list, tensor, group=None, use_calc_stream=True): +def all_gather(tensor_list, tensor, group=None, sync_op=True): """ Gather tensors from all participators and all get the result. As shown @@ -987,12 +901,11 @@ def all_gather(tensor_list, tensor, group=None, use_calc_stream=True): Args: tensor_list (list): A list of output Tensors. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8, bool, complex64 or complex128. + should be float16, float32, float64, int32, int64, int8, uint8, bool, bfloat16, complex64 or complex128. tensor (Tensor): The Tensor to send. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool, complex64 or complex128. - group (Group): The group instance return by new_group or None for global default group. - use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). - Default to True. + group (Group, optional): The group instance return by new_group or None for global default group. + sync_op (bool, optional): Whether this op is a sync op. The default value is True. Returns: None. @@ -1023,8 +936,9 @@ def convert_to_complex(list_of_tensor): list_of_complex.append(paddle.as_complex(tensor)) return list_of_complex - is_input_complex = (tensor.dtype == paddle.complex64 - or tensor.dtype == paddle.complex128) + is_input_complex = ( + tensor.dtype == paddle.complex64 or tensor.dtype == paddle.complex128 + ) if is_input_complex: tensor = paddle.as_real(tensor) @@ -1046,37 +960,73 @@ def convert_to_complex(list_of_tensor): tensor_list.extend(list_of_tensor) return + use_calc_stream = sync_op ring_id = 0 if group is None else group.id nranks = _get_global_group().nranks if group is None else group.nranks if _non_static_mode(): - out = _legacy_C_ops.c_allgather(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id, - 'nranks', nranks) + out = _legacy_C_ops.c_allgather( + tensor, + 'use_calc_stream', + use_calc_stream, + 'ring_id', + ring_id, + 'nranks', + nranks, + ) else: op_type = 'c_allgather' helper = LayerHelper(op_type, **locals()) out = helper.create_variable_for_type_inference(dtype=tensor.dtype) if not isinstance(tensor_list, list): - raise ValueError("The type of 'tensor_list' for all_gather " - "should be list.") + raise ValueError( + "The type of 'tensor_list' for all_gather " "should be list." 
+ ) for elem in tensor_list: - check_variable_and_dtype(elem, 'tensor_list', [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'bool', - 'int8', 'uint8', 'complex64', 'complex128' - ], 'all_gather') - check_variable_and_dtype(tensor, 'tensor', [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'bool', 'int8', - 'uint8', 'complex64', 'complex128' - ], 'all_gather') - helper.append_op(type=op_type, - inputs={'X': [tensor]}, - outputs={'Out': [out]}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': use_calc_stream, - 'nranks': nranks - }) + check_variable_and_dtype( + elem, + 'tensor_list', + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'bool', + 'int8', + 'uint8', + 'complex64', + 'complex128', + ], + 'all_gather', + ) + check_variable_and_dtype( + tensor, + 'tensor', + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'bool', + 'int8', + 'uint8', + 'complex64', + 'complex128', + ], + 'all_gather', + ) + helper.append_op( + type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [out]}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': use_calc_stream, + 'nranks': nranks, + }, + ) list_of_tensor = paddle.split(out, nranks, 0) if is_input_complex: @@ -1132,7 +1082,8 @@ def all_gather_object(object_list, obj, group=None): print(object_list) # [{'foo': [1, 2, 3]}, {'bar': [4, 5, 6]}] (2 GPUs) """ - assert in_dygraph_mode( + assert ( + in_dygraph_mode() ), "all_gather_object doesn't support static graph mode." tensor, len_of_tensor = _convert_object_to_tensor(obj) @@ -1153,10 +1104,11 @@ def all_gather_object(object_list, obj, group=None): all_gather(tensor_list, input_tensor, group) for i, tensor in enumerate(tensor_list): object_list.append( - _convert_tensor_to_object(tensor, list_len_of_tensor[i])) + _convert_tensor_to_object(tensor, list_len_of_tensor[i]) + ) -def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True): +def scatter(tensor, tensor_list=None, src=0, group=None, sync_op=True): """ Scatter a tensor to all participators. As shown below, one process is started with a GPU and the source of the scatter @@ -1169,13 +1121,12 @@ def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True): Args: tensor (Tensor): The output Tensor. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. tensor_list (list|tuple): A list/tuple of Tensors to scatter. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. Default value is None. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. Default value is None. src (int): The source rank id. Default value is 0. - group (Group): The group instance return by new_group or None for global default group. - use_calc_stream (bool): Wether to use calculation stream (True) or communication stream (False). - Default to True. + group (Group, optional): The group instance return by new_group or None for global default group. + sync_op (bool, optional): Whether this op is a sync op. The default value is True. Returns: None. 
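Across the hunks above, the public collectives (broadcast, reduce, all_gather, scatter) swap the use_calc_stream flag for sync_op. A minimal usage sketch of the renamed argument, assuming a two-GPU launch with paddle.distributed.launch; the tensor values simply mirror the ReduceOp docstring example earlier in this diff:

    # Sketch of the sync_op flag that replaces use_calc_stream (illustrative values).
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    if dist.get_rank() == 0:
        data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]])
    else:
        data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]])

    # sync_op=True (the default) blocks until the collective finishes,
    # matching what use_calc_stream=True meant in dygraph mode.
    dist.all_reduce(data, sync_op=True)
    print(data)  # [[5, 7, 9], [5, 7, 9]] on both ranks

    # sync_op=False returns a task handle that must be waited on before reading data.
    task = dist.all_reduce(data, sync_op=False)
    task.wait()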
@@ -1216,7 +1167,7 @@ def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True): gsrc = src if group is None else group.get_group_rank(src) rank = _get_global_group().rank if group is None else group.rank nranks = _get_global_group().nranks if group is None else group.nranks - assert gsrc >= 0, ("src rank out of group, need global rank") + assert gsrc >= 0, "src rank out of group, need global rank" if rank != gsrc: tensor_list = [] @@ -1225,775 +1176,57 @@ def scatter(tensor, tensor_list=None, src=0, group=None, use_calc_stream=True): temp = paddle.concat(tensor_list, axis=0) if in_dygraph_mode(): task = group.process_group.scatter(temp, tensor, gsrc) - if use_calc_stream: + if sync_op: task.wait() return None else: return task + use_calc_stream = sync_op if _non_static_mode(): - return _legacy_C_ops.c_scatter(temp, tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', ring_id, - 'nranks', nranks, 'root', gsrc) + return _legacy_C_ops.c_scatter( + temp, + tensor, + 'use_calc_stream', + use_calc_stream, + 'ring_id', + ring_id, + 'nranks', + nranks, + 'root', + gsrc, + ) op_type = 'c_scatter' - check_variable_and_dtype(tensor, 'tensor', [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' - ], 'scatter') - helper = LayerHelper(op_type, **locals()) - helper.append_op(type=op_type, - inputs={'X': [temp]}, - outputs={'Out': [tensor]}, - attrs={ - 'ring_id': ring_id, - 'root': gsrc, - 'use_calc_stream': use_calc_stream, - 'nranks': nranks, - }) - - -def _c_identity(tensor, group=None): - """ - Return a copy of the tensor, mainly used with model parallel. - - Args: - tensor (Tensor): The input Tensor. Its data type - should be float16, float32, float64, int32 or int64. - group (int): The id of the process group to work on. - - Returns: - Tensor. - """ - if group is not None and not group.is_member(): - return - ring_id = 0 if group is None else group.id - - if _non_static_mode(): - return _legacy_C_ops.c_identity(tensor, 'use_calc_stream', True, - 'ring_id', ring_id, - 'use_model_parallel', True) - op_type = 'c_identity' - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(dtype=tensor.dtype) - - check_variable_and_dtype( - tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], - '_c_identity') - - helper.append_op(type=op_type, - inputs={'X': tensor}, - outputs={'Out': out}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': True, - 'use_model_parallel': True, - }) - return out - - -def _c_concat(tensor, group=None): - """ - Return allgather of the tensor, mainly used with model parallel. - - Args: - tensor (Tensor): The input Tensor. Its data type - should be float16, float32, float64, int32 or int64. - group (int): The id of the process group to work on. - - Returns: - Tensor. 
- """ - if group is not None and not group.is_member(): - return - group = _get_default_group() if group is None else group - ring_id = group.id - - global_rank = _get_global_env().rank - rank = group.rank - nranks = group.nranks - - if _non_static_mode(): - return _legacy_C_ops.c_concat(tensor, 'ring_id', ring_id, - 'use_calc_stream', True, 'rank', rank, - 'nranks', nranks, 'use_model_parallel', - True) - - op_type = 'c_concat' - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(dtype=tensor.dtype) - check_variable_and_dtype( - tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], - '_c_concat') - - helper.append_op(type=op_type, - inputs={'X': tensor}, - outputs={'Out': out}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': True, - 'use_model_parallel': True, - 'nranks': nranks, - 'rank': rank - }) - return out - - -def _c_split(tensor, group=None): - """ - Split tensor evenly among all members, mainly used with model parallel. - - Args: - tensor (Tensor): The input Tensor. Its data type - should be float16, float32, float64, int32 or int64. - rank (int): The rank of the current process. - group (int): The id of the process group to work on. - - Returns: - Tensor. - """ - if group is not None and not group.is_member(): - return - ring_id = 0 if group is None else group.id - - global_rank = _get_global_env().rank - rank = global_rank if group is None else group.get_group_rank(global_rank) - nranks = _get_global_env().world_size if group is None else group.nranks - - if _non_static_mode(): - return _legacy_C_ops.c_split(tensor, 'use_calc_stream', True, 'ring_id', - ring_id, 'rank', rank, 'nranks', nranks, - 'use_model_parallel', True) - - op_type = 'c_split' - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(dtype=tensor.dtype) - - check_variable_and_dtype( - tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], - '_c_split') - - helper.append_op(type=op_type, - inputs={'X': tensor}, - outputs={'Out': out}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': True, - 'rank': rank, - 'nranks': nranks, - 'use_model_parallel': True, - }) - return out - - -def _mp_allreduce(tensor, - op=ReduceOp.SUM, - group=None, - use_calc_stream=True, - use_model_parallel=True): - """[it is same as allreduce above, but it supports model parallel. 
And it support inplace startegy] - """ - if group is not None and not group.is_member(): - return - - if in_dygraph_mode(): - group = _get_default_group() if group is None else group - assert op == ReduceOp.SUM, "Unknown parameter: {}.".format(op) - - from paddle.autograd import PyLayer - - class mp_allreduce_eager(PyLayer): - - @staticmethod - def forward(ctx, tensor, group, use_calc_stream, - use_model_parallel): - ctx.ring_id = group.id - - if use_calc_stream: - op_type = _get_reduce_op(op, "_mp_allreduce") - group.process_group.allreduce_on_calc_stream( - tensor, op_type) - return tensor - else: - return _legacy_C_ops.c_allreduce_sum_( - tensor, 'use_calc_stream', use_calc_stream, 'ring_id', - ring_id, "use_model_parallel", use_model_parallel) - - @staticmethod - def backward(ctx, dy): - return _legacy_C_ops.c_identity(dy, 'use_calc_stream', True, - 'ring_id', ctx.ring_id, - 'use_model_parallel', True) - - return mp_allreduce_eager.apply(tensor, group, use_calc_stream, - use_model_parallel) - - ring_id = 0 if group is None else group.id - if _in_legacy_dygraph(): - if op == ReduceOp.SUM: - return _legacy_C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', - use_calc_stream, 'ring_id', - ring_id, "use_model_parallel", - use_model_parallel) - else: - raise ValueError("Unknown parameter: {}.".format(op)) - - op_type = 'c_allreduce_sum' - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(dtype=tensor.dtype) - - check_variable_and_dtype( - tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], - op_type) - - helper.append_op(type=op_type, - inputs={'X': tensor}, - outputs={'Out': out}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': use_calc_stream, - 'use_model_parallel': use_model_parallel, - }) - return out - - -def _c_lookup_table(table, index, start_index=0, name=None): - """ - Lookup table according to index. - - Args: - table (Tensor): The input Tensor. Its data type - should be float16, float32, float64. - index (Tensor): The index to lookup table. - start_index (int): The initial index for table range. - name (string): The name of the api - - Returns: - Tensor. 
- """ - if _non_static_mode(): - return _legacy_C_ops.c_embedding(table, index, "start_index", - start_index) - - op_type = 'c_embedding' + tensor, + 'tensor', + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'int8', + 'uint8', + 'bool', + ], + 'scatter', + ) helper = LayerHelper(op_type, **locals()) - dtype = helper.input_dtype(input_param_name='table') - check_variable_and_dtype(index, 'input', ['int32', 'int64'], op_type) - tmp = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='c_embedding', - inputs={ - 'Ids': index, - 'W': table - }, - outputs={'Out': tmp}, - attrs={"start_index": start_index}) - return tmp - - -class _Linear(layers.Layer): - """ - Linear - """ - - def __init__(self, - in_features, - out_features, - weight_attr=None, - bias_attr=None, - name=None): - super(_Linear, self).__init__() - self._dtype = self._helper.get_default_dtype() - self._weight_attr = weight_attr - self._bias_attr = bias_attr - self.weight = self.create_parameter(shape=[in_features, out_features], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False) - self.bias = self.create_parameter(shape=[out_features], - attr=self._bias_attr, - dtype=self._dtype, - is_bias=True) - self.name = name - - def forward(self, input): - out = _linear(x=input, - weight=self.weight, - bias=self.bias, - name=self.name) - return out - - def extra_repr(self): - name_str = ', name={}'.format(self.name) if self.name else '' - return 'in_features={}, out_features={}, dtype={}{}'.format( - self.weight.shape[0], self.weight.shape[1], self._dtype, name_str) - - -def _c_softmax_with_cross_entropy(logits, - label, - group=None, - return_softmax=False): - if group is not None and not group.is_member(): - return - ring_id = 0 if group is None else group.id - global_rank = _get_global_env().rank - rank = global_rank if group is None else group.get_group_rank(global_rank) - nranks = _get_global_env().world_size if group is None else group.nranks - - input_dims = len(list(logits.shape)) - label_dims = len(list(label.shape)) - if input_dims - 1 != label_dims and input_dims != label_dims: - raise ValueError( - 'Expected nput_dims - 1 = label_dims or input_dims == label_dims\ - (got nput_dims{}, label_dims{})'.format(input_dims, label_dims)) - if input_dims - 1 == label_dims: - label = paddle.unsqueeze(label, axis=-1) - - if _non_static_mode(): - softmax, loss = _legacy_C_ops.c_softmax_with_cross_entropy( - logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks) - if not return_softmax: - return loss - else: - return loss, softmax - - attrs = { - 'ring_id': ring_id, - 'rank': rank, - 'nranks': nranks, - } - helper = LayerHelper('c_softmax_with_cross_entropy', **locals()) - softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) - loss = helper.create_variable_for_type_inference(dtype=logits.dtype) - helper.append_op(type='c_softmax_with_cross_entropy', - inputs={ - 'Logits': logits, - 'Label': label - }, - outputs={ - 'Softmax': softmax, - 'Loss': loss - }, - attrs=attrs) - - if return_softmax: - return loss, softmax - - return loss - - -def _linear(x, weight, bias=None, name=None): - """ - Fuction Linear - """ - if _non_static_mode(): - pre_bias = _varbase_creator(dtype=x.dtype) - _legacy_C_ops.matmul(x, weight, pre_bias, 'transpose_X', False, - 'transpose_Y', False, "alpha", 1) - return dygraph_utils._append_bias_in_dygraph(pre_bias, - bias, - axis=len(x.shape) - 1) - else: - helper = LayerHelper('linear', **locals()) - dtype = x.dtype - assert len( - x.shape) 
< 4, "X latitude is not supported greater than 3 now." - - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'linear') - check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear') - - inputs = {'X': [x], 'Y': [weight]} - attrs = { - 'transpose_X': False, - 'transpose_Y': False, - 'alpha': 1, - } - tmp = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='matmul_v2', - inputs=inputs, - outputs={'Out': tmp}, - attrs=attrs) - if bias is not None: - res = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='elementwise_add', - inputs={ - 'X': [tmp], - 'Y': [bias] - }, - outputs={'Out': [res]}, - attrs={'axis': len(x.shape) - 1}) - else: - res = tmp - return res - - -def _set_var_distributed(var): - if var is None: - return - - var.is_distributed = True - - # NOTE: use current_block and find_var_recursive to support while_loop - startup_block = paddle.static.default_startup_program().current_block() - main_block = paddle.static.default_main_program().current_block() - startup_block._find_var_recursive(var.name).is_distributed = True - main_block._find_var_recursive(var.name).is_distributed = True - - -def _parallel_linear(x, - num_rows, - num_cols, - axis, - param_attr, - bias_attr, - gather_out, - inner_rank, - nranks, - split_tensor, - name, - group=None): - """ - Parallel Linear - - axis the dimension of the parameter of linear layer. - axis = 0: the row dimension - axis = 1: the col dimension - - """ - if group is not None and not group.is_member(): - return - ring_id = 0 if group is None else group.id - - if axis == 0: - if split_tensor: - x = _c_split(x, group=group) - else: - x = _c_identity(x, group=group) - - linear = paddle.nn.Linear(num_rows, - num_cols, - weight_attr=param_attr, - bias_attr=bias_attr, - name=name) - - # NOTE: npu linear function use matmul_v2 but linear use matmul - linear_function = _linear if core.is_compiled_with_npu()\ - else paddle.nn.functional.linear - linear_out = linear_function( - x, - linear.weight, - # NOTE(wangxi): row split, bias need add after allreduce - None if axis == 0 else linear.bias, - linear.name) - - _set_var_distributed(linear.weight) - # set is_distributed for splited bias - # if a linear layer is splited by row, each rank would hold a complete bias and they should be the same in each rank. 
- # if a linear layer is splited by col, the bias would also be split into each rank as its weight - if axis == 1 and linear._bias_attr != False: - _set_var_distributed(linear.bias) - - if not gather_out: return linear_out - - out_shape = list(linear_out.shape) - out_shape[0] *= 1 if axis == 0 else nranks - main_block = paddle.static.default_main_program().current_block() - out = main_block.create_var( - shape=out_shape, - dtype=linear_out.dtype, - type=linear_out.type, - lod_level=linear_out.lod_level, - persistable=False, - is_data=False, - need_check_feed=linear_out.desc.need_check_feed()) - if axis == 0: - main_block.append_op(type='c_allreduce_sum', - inputs={'X': linear_out}, - outputs={'Out': out}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': True, - 'use_model_parallel': True - }) - if linear.bias is not None: - out = out + linear.bias - else: - main_block.append_op(type='c_concat', - inputs={'X': linear_out}, - outputs={'Out': out}, - attrs={ - 'rank': inner_rank, - 'ring_id': ring_id, - 'nranks': nranks, - 'use_calc_stream': True, - 'use_model_parallel': True - }) - return out - - -def _parallel_embedding(x, - per_part_embeddings, - origin_size, - param_attr, - inner_rank, - num_partitions, - name, - group=None): - """ - Parallel Embedding - """ - if group is not None and not group.is_member(): - return - ring_id = 0 if group is None else group.id - - helper = LayerHelper("_parallel_embedding", **locals()) - - per_part_size = per_part_embeddings - rank = inner_rank - - vocab_start_index = rank * per_part_size - dtype = helper.get_default_dtype() - size = [per_part_size, origin_size[1]] - - weight = helper.create_parameter(attr=param_attr, - shape=size, - dtype=dtype, - is_bias=False) - - if num_partitions == 1: - return paddle.nn.functional.embedding(x, - weight=weight, - padding_idx=None, - sparse=False, - name=name) - - startup_block = paddle.static.default_startup_program().global_block() - main_block = paddle.static.default_main_program().global_block() - startup_block.vars[weight.name].is_distributed = True - main_block.vars[weight.name].is_distributed = True - - output_parallel = paddle.distributed.collective._c_lookup_table( - weight, x, start_index=vocab_start_index, name=name) - out = paddle.distributed.collective._mp_allreduce(output_parallel, - group=group, - use_calc_stream=True, - use_model_parallel=True) - return out - - -def split(x, - size, - operation, - axis=0, - num_partitions=1, - gather_out=True, - weight_attr=None, - bias_attr=None, - name=None): - """ - - Split the weight of the specified operation into multiple devices - and do the computation in parallel. - - Now the following three cases are supported. - - Case 1: Parallel Embedding - The weight of the embedding operation is a NxM matrix with N rows and M columns. - With parallel embedding, the weight is split into num_partitions partitions, each - of which is a matrix with (N/num_partitions + 1) rows and M column where the last - row as the padding idx. - - Suppose we split the NxM weight into two partitons on device_0 and device_1 - respectively. Then, one each device, the final weight has (N/2 + 1) rows with the - index range from 0 to N/2. On device_0, all values in the input within [0, N/2 -1] - keep unchanged and all other values are changed to N/2 which is the padding index and - are mapped to all zeros after embedding. 
In the same way, on device_1, the value V in the - input within [N/2, N-1] will be changed to (V - N/2), and all other values are changed - to N/2 and are mapped to all zeros after embedding. Finally, the results on the two - devices are sum-reduced. - - The Embedding put on single card is as shown below: - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_embedding_single.png - :width: 800 - :height: 350 - :alt: single_embedding - :align: center - - Parallel Embedding is shown as below: - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_embedding_split.png - :width: 800 - :alt: split_embedding - :align: center - - Case 2: Row Parallel Linear - The weight of the linear operation is a NxM matrix with N rows and M columns. - With row parallel linear, the weight is split into num_partitions partitions, each - of which is a matrix with N/num_partitions rows and M column. - - The linear layer put on single card is shown as below, the input variable is represented by X, - the weight matrix is represented by W and the output vaiable is O. The linear layer on single card is - simple matrix multiplication operation, O = X * W. - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_single.png - :width: 800 - :alt: single_linear - :align: center - - Row Parallel Linear is shown as below. As the name suggests, Row Parallel Linear splits the weight matrix W into - [[W_row1], [W_row2]] along the row. And accordingly the input is splitted along the column into [X_col1, X_col2] and multiply their - respective weight matrices. Finally apply AllReduce on the output from each card to get the final output. - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_row.png - :width: 800 - :alt: split_row - :align: center - - Case 3: Column Parallel Linear - The weight of the linear operation is a NxM matrix with N rows and M columns. - With column parallel linear, the weight is split into num_paratitions partitions, each - of which is a matrix with N rows and M/num_partitions column. - - The linear layer put on single card has been illustrated on case 2 and Column Parallel Linear - is shown as below. The Column Parallel Linear splits the weight matrix W into [W_col1, W_col2] along the column and - these splitted matrices respectively multiply the input. Finally apply AllGather on the output from each card to get the final output. - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_col.png - :width: 800 - :alt: split_col - :align: center - - As observed, the column parallel linear and row parallel linear can be combined to skip one ALLGATHER communication - operator. Furthermore the Attention and MLP can be combined to imporve the performance as shown below. - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_col_row.png - :width: 800 - :alt: split_col_row - :align: center - - Args: - x (Tensor): Input tensor. It's data type should be float16, float32, float64, int32 or int64. - size (list|tuple): A list or tuple with two elements indicating the shape of the weight. - operation (str): The name of the operation. The supported operations are 'linear' and 'embedding'. - axis (int, Optional): Indicate along which axis to split the weight. Default: 0. 
- num_partitions (int, Optional): How many parts the weight is partitioned. Default: 1. - gather_out (bool, Optional): Whether to gather the output after computation. By default, the output - on each partitions will be gathered after computation. Default: True. - weight_attr (ParamAttr, Optional): The parameter attribute for the learnable - weights(Parameter) of the specified operation. Default: None. - bias_attr (ParamAttr, Optional): The parameter attribute for the bias - of the specified operation. Default: None. - name (str, Optional): The default value is None. Normally there is no need for user to set this - property. Default: None. For more information, please refer to :ref:`api_guide_Name`. - - Returns: - Tensor. - - Examples: - .. code-block:: python - - # required: distributed - import paddle - import paddle.distributed.fleet as fleet + helper.append_op( + type=op_type, + inputs={'X': [temp]}, + outputs={'Out': [tensor]}, + attrs={ + 'ring_id': ring_id, + 'root': gsrc, + 'use_calc_stream': use_calc_stream, + 'nranks': nranks, + }, + ) - paddle.enable_static() - paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) - fleet.init(is_collective=True) - data = paddle.randint(0, 8, shape=[10,4]) - emb_out = paddle.distributed.split( - data, - (8, 8), - operation="embedding", - num_partitions=2) - """ - assert isinstance( - size, - (list, tuple)), ("The type of size for " - "paddle.distributed.split must be list or tuple.") - assert len(size) == 2, ("Number of elements in size of " - "paddle.distributed.split must be two.") - assert isinstance(operation, str), ("The type of operation for " - "paddle.distributed.split must be str.") - supported_operations = [ - 'linear', - 'embedding', - ] - assert operation in supported_operations, ( - "The operation for " - "paddle.distributed.split must be one of {}.".format( - supported_operations)) - if _non_static_mode(): - raise ValueError( - "paddle.distributed.split cannot be used in dynamic " - "graph mode, plese use ParallelEmbedding, ParallelRowLinear, " - "ParallelColumnLinear instead.") - else: - from .fleet import fleet - assert fleet._role_maker, ("To use paddle.distributed.split, " - "you must call fleet.init() firstly.") - rank = fleet.worker_index() - nranks = fleet.worker_num() - - # rank within a model parallel group - inner_rank = rank % num_partitions - - if operation == "embedding": - assert axis == 0, ("We only support to split the weight of embedding " - "along the first axis now.") - assert size[0] % num_partitions == 0, \ - "The length of the vocabulary must be divisible by num_partitions " \ - "but received vocabulary={} num_partitions={}".format(size[0], num_partitions) - - per_part_size = size[0] // num_partitions - emb_out = _parallel_embedding(x, - per_part_size, - size, - weight_attr, - inner_rank, - num_partitions, - name, - group=None) - return emb_out - else: - should_split = False - if axis == 0: - assert size[0] % num_partitions == 0, ( - "Number of rows of the weight for linear ({}) must be" - " divisible by num_partitions ({})".format( - size[0], num_partitions)) - per_part_size = size[0] // num_partitions - linear_size = (per_part_size, size[1]) - if x.shape[-1] == size[0]: should_split = True - - elif axis == 1: - assert size[1] % num_partitions == 0, ( - "Number of column of the weight for linear ({}) must be" - " divisible by num_partitions ({})".format( - size[1], num_partitions)) - per_part_size = size[1] // num_partitions - linear_size = (size[0], per_part_size) - else: - raise ValueError("The 
value of axis must be 0 or 1, but the value " - "given is {}.".format(axis)) - - linear_out = _parallel_linear(x, - linear_size[0], - linear_size[1], - axis, - weight_attr, - bias_attr, - gather_out, - inner_rank, - num_partitions, - should_split, - name=name, - group=None) - return linear_out - - -def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True): +def alltoall(in_tensor_list, out_tensor_list, group=None, sync_op=True): """ Scatter tensors in in_tensor_list to all participators averagely and gather the result tensors in out_tensor_list. As shown below, the in_tensor_list in GPU0 includes 0_0 and 0_1, and GPU1 includes 1_0 and 1_1. @@ -2007,15 +1240,15 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True): Args: in_tensor_list (list): A list of input Tensors. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. out_tensor_list (list): A list of output Tensors. The data type of its elements should be the same as the data type of the input Tensors. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. - use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True. - + sync_op (bool, optional): Whether this op is a sync op. The default value is True. + Returns: None. - + Examples: .. code-block:: python @@ -2042,7 +1275,7 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True): if in_dygraph_mode(): group = _get_default_group() if group is None else group backend = _group_map_backend[group] - assert backend != 'gloo', ("backend gloo is not supported yet") + assert backend != 'gloo', "backend gloo is not supported yet" else: ring_id = 0 if group is None else group.id @@ -2061,64 +1294,77 @@ def alltoall(in_tensor_list, out_tensor_list, group=None, use_calc_stream=True): out_tensor_list.extend(paddle.split(out, nranks, 0)) return + use_calc_stream = sync_op if _non_static_mode(): - out = _legacy_C_ops.alltoall(temp, 'use_calc_stream', use_calc_stream, - 'ring_id', ring_id) + out = _legacy_C_ops.alltoall( + temp, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id + ) else: op_type = 'alltoall' helper = LayerHelper(op_type, **locals()) out = helper.create_variable_for_type_inference( - dtype=in_tensor_list[0].dtype) + dtype=in_tensor_list[0].dtype + ) if not isinstance(in_tensor_list, list): - raise ValueError("The type of 'in_tensor_list' for all_to_all " - "should be list.") + raise ValueError( + "The type of 'in_tensor_list' for all_to_all " "should be list." + ) for elem in in_tensor_list: check_variable_and_dtype( - elem, 'in_tensor_list', + elem, + 'in_tensor_list', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'all_to_all') + 'all_to_all', + ) if not isinstance(out_tensor_list, list): - raise ValueError("The type of 'out_tensor_list' for all_to_all " - "should be list.") + raise ValueError( + "The type of 'out_tensor_list' for all_to_all " + "should be list." + ) if len(out_tensor_list) != 0: - raise ValueError("The 'out_tensor_list' for all_to_all " - "must be an empty list.") - helper.append_op(type=op_type, - inputs={'X': [temp]}, - outputs={'Out': [out]}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': use_calc_stream, - }) + raise ValueError( + "The 'out_tensor_list' for all_to_all " "must be an empty list." 
+ ) + helper.append_op( + type=op_type, + inputs={'X': [temp]}, + outputs={'Out': [out]}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': use_calc_stream, + }, + ) out_tensor_list.extend(paddle.split(out, nranks, 0)) -def alltoall_single(in_tensor, - out_tensor, - in_split_sizes=None, - out_split_sizes=None, - group=None, - use_calc_stream=True): +def alltoall_single( + in_tensor, + out_tensor, + in_split_sizes=None, + out_split_sizes=None, + group=None, + sync_op=True, +): """ Scatter a single input tensor to all participators and gather the received tensors in out_tensor. - .. note:: + Note: ``alltoall_single`` is only supported in eager mode. Args: - in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + in_tensor (Tensor): Input tensor. The data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. out_tensor (Tensor): Output Tensor. The data type should be the same as the data type of the input Tensor. - in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor`` + in_split_sizes (list[int], optional): Split sizes of ``in_tensor`` for dim[0]. If not given, dim[0] of ``in_tensor`` must be divisible by group size and ``in_tensor`` will be scattered averagely to all participators. Default: None. - out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor`` + out_split_sizes (list[int], optional): Split sizes of ``out_tensor`` for dim[0]. If not given, dim[0] of ``out_tensor`` must be divisible by group size and ``out_tensor`` will be gathered averagely from all participators. Default: None. group (Group, optional): The group instance return by ``new_group`` or None for global default group. Default: None. - use_calc_stream (bool, optional): Whether to use calculation stream (True) or communication stream. Default: True. - + sync_op (bool, optional): Whether this op is a sync op. The default value is True. + Returns: - None, if ``use_calc_stream`` is set to ``True``; ``Task`` of ``group``, if ``use_calc_stream`` is set to ``False``. - + None, if ``sync_op`` is set to ``True``; ``Task`` of ``group``, if ``sync_op`` is set to ``False``. + Examples: .. code-block:: python @@ -2156,7 +1402,7 @@ def alltoall_single(in_tensor, output, in_split_sizes, out_split_sizes, - use_calc_stream=False, + sync_op=False, group=group) task.wait() print(output) @@ -2172,14 +1418,15 @@ def alltoall_single(in_tensor, group = _get_default_group() if group is None else group backend = _group_map_backend[group] - assert backend != 'gloo', ("backend gloo is not supported yet") + assert backend != 'gloo', "backend gloo is not supported yet" in_split_sizes = [] if in_split_sizes is None else in_split_sizes out_split_sizes = [] if out_split_sizes is None else out_split_sizes - task = group.process_group.alltoall_single(in_tensor, out_tensor, - in_split_sizes, out_split_sizes) - if use_calc_stream: + task = group.process_group.alltoall_single( + in_tensor, out_tensor, in_split_sizes, out_split_sizes + ) + if sync_op: task.wait() return else: @@ -2190,17 +1437,17 @@ def _get_group_rank(global_rank, group=None): return global_rank if group is None else group.get_group_rank(global_rank) -def send(tensor, dst=0, group=None, use_calc_stream=True): +def send(tensor, dst=0, group=None, sync_op=True): """ Send a tensor to the receiver. Args: tensor (Tensor): The Tensor to send. 
Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. - use_calc_stream (bool, optional): Whether to use calculate stream or communication stream. Default: True. - + sync_op (bool, optional): Whether this op is a sync op. The default value is True. + Returns: None. @@ -2227,45 +1474,58 @@ def send(tensor, dst=0, group=None, use_calc_stream=True): if in_dygraph_mode(): group = _get_default_group() if group is None else group backend = _group_map_backend[group] - assert backend != 'gloo', ("backend gloo is not supported yet") + assert backend != 'gloo', "backend gloo is not supported yet" task = group.process_group.send(tensor, dst) - if use_calc_stream: + if sync_op: task.wait() return None else: return task + use_calc_stream = sync_op ring_id = 0 if group is None else group.id if _non_static_mode(): - return _legacy_C_ops.send_v2(tensor, 'use_calc_stream', use_calc_stream, - 'ring_id', ring_id, 'peer', dst) + return _legacy_C_ops.send_v2( + tensor, + 'use_calc_stream', + use_calc_stream, + 'ring_id', + ring_id, + 'peer', + dst, + ) op_type = 'send_v2' check_variable_and_dtype( - tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'send') + tensor, + 'tensor', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'send', + ) helper = LayerHelper(op_type, **locals()) - helper.append_op(type=op_type, - inputs={'X': [tensor]}, - attrs={ - 'ring_id': ring_id, - 'peer': dst, - 'use_calc_stream': use_calc_stream, - }) + helper.append_op( + type=op_type, + inputs={'X': [tensor]}, + attrs={ + 'ring_id': ring_id, + 'peer': dst, + 'use_calc_stream': use_calc_stream, + }, + ) -def recv(tensor, src=0, group=None, use_calc_stream=True): +def recv(tensor, src=0, group=None, sync_op=True): """ Receive a tensor to the sender. Args: tensor (Tensor): The Tensor to receive. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. - use_calc_stream (bool, optional): Whether to use calculate stream or communication stream. Default: True. - + sync_op (bool, optional): Whether this op is a sync op. The default value is True. + Returns: None. 
@@ -2293,48 +1553,70 @@ def recv(tensor, src=0, group=None, use_calc_stream=True): if in_dygraph_mode(): group = _get_default_group() if group is None else group backend = _group_map_backend[group] - assert backend != 'gloo', ("backend gloo is not supported yet") + assert backend != 'gloo', "backend gloo is not supported yet" task = group.process_group.recv(tensor, src) - if use_calc_stream: + if sync_op: task.wait() return None else: return task + use_calc_stream = sync_op ring_id = 0 if group is None else group.id if _non_static_mode(): - return _legacy_C_ops.recv_v2(tensor, 'use_calc_stream', use_calc_stream, - 'ring_id', ring_id, 'peer', src, 'dtype', - tensor.dtype, 'out_shape', tensor.shape) + return _legacy_C_ops.recv_v2( + tensor, + 'use_calc_stream', + use_calc_stream, + 'ring_id', + ring_id, + 'peer', + src, + 'dtype', + tensor.dtype, + 'out_shape', + tensor.shape, + ) op_type = 'recv_v2' check_variable_and_dtype( - tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'recv') + tensor, + 'tensor', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'recv', + ) helper = LayerHelper(op_type, **locals()) - helper.append_op(type=op_type, - outputs={'Out': [tensor]}, - attrs={ - 'ring_id': ring_id, - 'peer': src, - 'out_shape': tensor.shape, - 'dtype': tensor.dtype, - 'use_calc_stream': use_calc_stream, - }) + helper.append_op( + type=op_type, + outputs={'Out': [tensor]}, + attrs={ + 'ring_id': ring_id, + 'peer': src, + 'out_shape': tensor.shape, + 'dtype': tensor.dtype, + 'use_calc_stream': use_calc_stream, + }, + ) def _check_single_tensor(tensor, tensor_name): if not isinstance(tensor, (core.eager.Tensor, paddle.Tensor)): - raise RuntimeError("Invalid function argument. Expected parameter {}" - "to be of type paddle.Tensor, but it's {}".format( - tensor_name, type(tensor))) + raise RuntimeError( + "Invalid function argument. Expected parameter {}" + "to be of type paddle.Tensor, but it's {}".format( + tensor_name, type(tensor) + ) + ) def _check_tensor_list(tensor_list, tensor_name): - if not isinstance(tensor_list, list) or \ - not all(isinstance(t, (core.eager.Tensor, paddle.Tensor)) for t in tensor_list): - raise RuntimeError("Invalid function argument. Expected parameter {}" - "to be of type paddle.Tensor".format(tensor_name)) + if not isinstance(tensor_list, list) or not all( + isinstance(t, (core.eager.Tensor, paddle.Tensor)) for t in tensor_list + ): + raise RuntimeError( + "Invalid function argument. Expected parameter {}" + "to be of type paddle.Tensor".format(tensor_name) + ) def isend(tensor, dst, group=None): @@ -2343,14 +1625,14 @@ def isend(tensor, dst, group=None): Args: tensor (Tensor): The Tensor to send. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. dst (int): The destination rank. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. - + Returns: A distributed task object. - Warning: + Warning: This API only supports the dygraph mode. 
Examples: @@ -2379,9 +1661,9 @@ def isend(tensor, dst, group=None): if in_dygraph_mode(): group = _get_default_group() if group is None else group backend = _group_map_backend[group] - assert backend != 'gloo', ("backend gloo is not supported yet") + assert backend != 'gloo', "backend gloo is not supported yet" group_dst_rank = group.get_group_rank(dst) - assert group_dst_rank >= 0, ("dst rank out of group, need global rank") + assert group_dst_rank >= 0, "dst rank out of group, need global rank" return group.process_group.send(tensor, group_dst_rank) else: raise RuntimeError("Only support eager dygraph mode.") @@ -2393,14 +1675,14 @@ def irecv(tensor, src=None, group=None): Args: tensor (Tensor): The Tensor to receive. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. src (int): The source rank id. group (Group, optional): The group instance return by new_group or None for global default group. Default: None. Returns: A distributed task object. - Warning: + Warning: This API only supports the dygraph mode. Examples: @@ -2428,9 +1710,9 @@ def irecv(tensor, src=None, group=None): if in_dygraph_mode(): group = _get_default_group() if group is None else group backend = _group_map_backend[group] - assert backend != 'gloo', ("backend gloo is not supported yet") + assert backend != 'gloo', "backend gloo is not supported yet" group_src_rank = group.get_group_rank(src) - assert group_src_rank >= 0, ("src rank out of group, need global rank") + assert group_src_rank >= 0, "src rank out of group, need global rank" return group.process_group.recv(tensor, group_src_rank) else: raise RuntimeError("Only support eager dygraph mode.") @@ -2449,16 +1731,18 @@ class P2POp(object): The type of ``op`` is either ``paddle.distributed.isend`` or ``paddle.distributed.irecv``. tensor (Tensor): Tensor to send or receive. peer (int): The destination or source rank. - group (Group, optional): The group instance return by new_group or None for global + group (Group, optional): The group instance return by new_group or None for global default group. Default: None. """ def __init__(self, op, tensor, peer, group=None): if op not in [isend, irecv]: - raise RuntimeError("Invalid ``op`` function. Expected ``op`` " - "to be of type ``paddle.distributed.isend`` or " - "``paddle.distributed.irecv``.") + raise RuntimeError( + "Invalid ``op`` function. Expected ``op`` " + "to be of type ``paddle.distributed.isend`` or " + "``paddle.distributed.irecv``." + ) _check_single_tensor(tensor, "tensor") self.op = op @@ -2484,13 +1768,17 @@ def _check_p2p_op_list(p2p_op_list): all ops use the same backend. """ if not isinstance(p2p_op_list, list) or not all( - isinstance(p2p_op, P2POp) for p2p_op in p2p_op_list): - raise RuntimeError("Invalid ``p2p_op_list``. Each op is expected to " - "to be of type ``paddle.distributed.P2POp``.") + isinstance(p2p_op, P2POp) for p2p_op in p2p_op_list + ): + raise RuntimeError( + "Invalid ``p2p_op_list``. Each op is expected to " + "to be of type ``paddle.distributed.P2POp``." 
+ ) backend = _group_map_backend[p2p_op_list[0].group] - if not all(backend == _group_map_backend[p2p_op.group] - for p2p_op in p2p_op_list): + if not all( + backend == _group_map_backend[p2p_op.group] for p2p_op in p2p_op_list + ): raise RuntimeError("All groups need to use the same backend.") @@ -2498,20 +1786,20 @@ def batch_isend_irecv(p2p_op_list): """ Send or Receive a batch of tensors asynchronously and return a list of requests. - Process each of the point-to-point operations in ``p2p_op_list`` and return the + Process each of the point-to-point operations in ``p2p_op_list`` and return the corresponding tasks. NCCL are currently supported. Args: - p2p_op_list: A list of point-to-point operations(type of each operator is + p2p_op_list (List[P2POp]): A list of point-to-point operations(type of each operator is ``paddle.distributed.P2POp``). The order of the isend/irecv in the list matters and it needs to match with corresponding isend/irecv on the remote end. Returns: A list of distributed tasks returned by calling the corresponding - op in the op_list. + op in the op_list. - Warning: + Warning: This API only supports the dygraph mode. Examples: @@ -2539,7 +1827,7 @@ def batch_isend_irecv(p2p_op_list): for task in tasks: task.wait() - + print(recv_t) # paddle.tensor([1, 2]) # Rank-0 # paddle.tensor([0, 1]) # Rank-1 @@ -2567,28 +1855,26 @@ def batch_isend_irecv(p2p_op_list): raise RuntimeError("Don't support static graph mode currently.") -def reduce_scatter(tensor, - tensor_list, - op=ReduceOp.SUM, - group=None, - use_calc_stream=True): +def reduce_scatter( + tensor, tensor_list, op=ReduceOp.SUM, group=None, sync_op=True +): """ Reduces, then scatters a list of tensors to all processes in a group Args: - tensor (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + tensor (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. tensor_list (list[Tensor]): List of tensors to reduce and scatter. Every element in the list must be a Tensor whose data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. - group (Group, optional): The group instance return by new_group or None for global + group (Group, optional): The group instance return by new_group or None for global default group. Default: None. - use_calc_stream (bool, optional): Whether this op should be an async op. + sync_op (bool, optional): Whether this op is a sync op. The default value is True. Returns: - Async task handle, if use_calc_stream is set to False. - None, if use_calc_stream or if not part of the group. - - Warning: + Async task handle, if sync_op is set to False. + None, if sync_op or if not part of the group. + + Warning: This API only supports the dygraph mode. 
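A minimal sketch of the collective ``reduce_scatter`` call with the renamed ``sync_op`` argument (illustrative only, not part of the diff; assumes two GPUs in dygraph mode):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    if dist.get_rank() == 0:
        t0, t1 = paddle.to_tensor([0, 1]), paddle.to_tensor([2, 3])
    else:
        t0, t1 = paddle.to_tensor([4, 5]), paddle.to_tensor([6, 7])
    out = paddle.empty_like(t0)
    # sync_op=False returns a task; wait() blocks until the result is ready
    task = dist.reduce_scatter(out, [t0, t1], sync_op=False)
    task.wait()
    print(out)
    # rank 0: [4, 6], rank 1: [8, 10]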
@@ -2622,11 +1908,11 @@ def reduce_scatter(tensor, op_type = _get_reduce_op(op, "reduce_scatter") group = _get_default_group() if group is None else group backend = _group_map_backend[group] - assert backend != 'gloo', ("backend gloo is not supported yet") + assert backend != 'gloo', "backend gloo is not supported yet" temp = paddle.concat(tensor_list, axis=0) task = group.process_group._reduce_scatter_base(tensor, temp, op_type) - if use_calc_stream: + if sync_op: task.wait() return None else: @@ -2635,26 +1921,24 @@ def reduce_scatter(tensor, raise RuntimeError("Don't support static graph mode currently.") -def _reduce_scatter_base(output, - input, - op=ReduceOp.SUM, - group=None, - use_calc_stream=True): +def _reduce_scatter_base( + output, input, op=ReduceOp.SUM, group=None, sync_op=True +): """ Reduces, then scatters a flattened tensor to all processes in a group. Args: - output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. - input (Tensor): Input tensor that is of size output tensor size times world size. Its data type - should be float16, float32, float64, int32, int64, int8, uint8 or bool. + output (Tensor): Output tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. + input (Tensor): Input tensor that is of size output tensor size times world size. Its data type + should be float16, float32, float64, int32, int64, int8, uint8, bool or bfloat16. op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD): Optional. The operation used. Default: ReduceOp.SUM. group (ProcessGroup, optional): The process group to work on. If None, the default process group will be used. - use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream (False). - Default to True. + sync_op (bool, optional): Whether this op is a sync op. The default value is True. + Returns: - Async task handle, if use_calc_stream is set to False. - None, if use_calc_stream or if not part of the group. + Async task handle, if sync_op is set to False. + None, if sync_op or if not part of the group. Examples: .. code-block:: python @@ -2685,7 +1969,7 @@ def _reduce_scatter_base(output, op_type = _get_reduce_op(op, "_reduce_scatter_base") group = _get_default_group() if group is None else group task = group.process_group._reduce_scatter_base(output, input, op_type) - if use_calc_stream: + if sync_op: task.wait() return None else: diff --git a/python/paddle/distributed/communication/__init__.py b/python/paddle/distributed/communication/__init__.py index 95d6c31580a886..97043fd7ba6885 100644 --- a/python/paddle/distributed/communication/__init__.py +++ b/python/paddle/distributed/communication/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -__all__ = ["stream"] diff --git a/python/paddle/distributed/communication/all_reduce.py b/python/paddle/distributed/communication/all_reduce.py new file mode 100644 index 00000000000000..737e0cbbfb56c0 --- /dev/null +++ b/python/paddle/distributed/communication/all_reduce.py @@ -0,0 +1,87 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid.framework as framework +from paddle.distributed.communication import stream as stream +from paddle.distributed.communication.reduce import ReduceOp + + +def all_reduce(tensor, op=ReduceOp.SUM, group=None, sync_op=True): + """ + + Reduce a tensor over all ranks so that all get the result. + As shown below, one process is started with a GPU and the data of this process is represented + by its group rank. The reduce operator is sum. Through all_reduce operator, + each GPU will have the sum of the data from all GPUs. + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/allreduce.png + :width: 800 + :alt: all_reduce + :align: center + + Args: + tensor (Tensor): The input Tensor. It also works as the output Tensor. Its data type + should be float16, float32, float64, int32, int64, int8, uint8 or bool. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The operation used. Default value is ReduceOp.SUM. + group (Group, optional): The group instance return by new_group or None for global default group. + sync_op (bool, optional): Wether this op is a sync op. Default value is True. + + Returns: + Return a task object. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + dist.all_reduce(data) + print(data) + # [[5, 7, 9], [5, 7, 9]] (2 GPUs) + """ + if not framework._in_legacy_dygraph(): + return stream.all_reduce(tensor, + op=op, + group=group, + sync_op=sync_op, + use_calc_stream=False) + + # code below will be removed after we remove the old dygraph + use_calc_stream = sync_op + ring_id = 0 if group is None else group.id + if op == ReduceOp.SUM: + return paddle._legacy_C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) + elif op == ReduceOp.MAX: + return paddle._legacy_C_ops.c_allreduce_max_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) + elif op == ReduceOp.MIN: + return paddle._legacy_C_ops.c_allreduce_min_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id) + elif op == ReduceOp.PROD: + return paddle._legacy_C_ops.c_allreduce_prod_(tensor, 'use_calc_stream', + use_calc_stream, + 'ring_id', ring_id) + else: + raise ValueError("Unknown parameter: {}.".format(op)) diff --git a/python/paddle/distributed/communication/group.py b/python/paddle/distributed/communication/group.py new file mode 100644 index 00000000000000..6b4e545b245d1e --- /dev/null +++ b/python/paddle/distributed/communication/group.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class Group(): + """ + The abstract representation of group. + """ + + def __init__(self, rank_in_group, id, ranks, pg=None, name=None): + self._rank_in_group = rank_in_group + self._world_size = len(ranks) if rank_in_group >= 0 else -1 + self._id = id + self._ranks = ranks + self._pg = pg + self._name = name + + @property + def rank(self): + return self._rank_in_group + + @property + def ranks(self): + return self._ranks + + @property + def nranks(self): + return len(self._ranks) + + @property + def name(self): + return self._name + + @property + def process_group(self): + return self._pg + + @property + def world_size(self): + return self._world_size + + @property + def id(self): + return self._id + + def is_member(self): + if self.rank < 0: + return False + if self.nranks < 2: + return False + return True + + def get_group_rank(self, rank): + if self.is_member(): + return self.ranks.index(rank) + else: + return -1 + + def __repr__(self): + debug_str = "rank: {}, nranks: {}, id: {}, ranks: ".format( + self.rank, self.nranks, self.id) + debug_str += ", ".join(map(str, self.ranks)) + debug_str += "; name: " + debug_str += self.name if self.name else "None" + return debug_str + + +class _GroupManager(): + global_group_id = 0 + group_map_by_id = {} + + +def _get_global_group(): + if _GroupManager.global_group_id not in _GroupManager.group_map_by_id: + raise RuntimeError("The global group is not initialized.") + return _GroupManager.group_map_by_id[_GroupManager.global_group_id] + + +def _add_new_group(group): + if group.id in _GroupManager.group_map_by_id: + raise RuntimeError("The group with id {} already exist.".format( + group.id)) + _GroupManager.group_map_by_id[group.id] = group diff --git a/python/paddle/distributed/communication/reduce.py b/python/paddle/distributed/communication/reduce.py new file mode 100644 index 00000000000000..5caa5bebedfd81 --- /dev/null +++ b/python/paddle/distributed/communication/reduce.py @@ -0,0 +1,76 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid.framework as framework +import paddle.fluid.core as core + + +class ReduceOp: + """ + + Specify the type of operation used for element-wise reductions. + It should be one of the following values: + + ReduceOp.SUM + + ReduceOp.MAX + + ReduceOp.MIN + + ReduceOp.PROD + + Examples: + .. 
code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + dist.all_reduce(data, op=dist.ReduceOp.SUM) + print(data) + # [[5, 7, 9], [5, 7, 9]] (2 GPUs) + """ + SUM = 0 + MAX = 1 + MIN = 2 + PROD = 3 + AVG = 4 + + +def _get_reduce_op(reduce_op, func_name): + if framework.in_dygraph_mode(): + if reduce_op == ReduceOp.SUM: + return core.ReduceOp.SUM + elif reduce_op == ReduceOp.MAX: + return core.ReduceOp.MAX + elif reduce_op == ReduceOp.MIN: + return core.ReduceOp.MIN + elif reduce_op == ReduceOp.PROD: + return core.ReduceOp.PRODUCT + else: + if reduce_op == ReduceOp.SUM: + return 'c_allreduce_sum' + elif reduce_op == ReduceOp.MAX: + return 'c_allreduce_max' + elif reduce_op == ReduceOp.MIN: + return 'c_allreduce_min' + elif reduce_op == ReduceOp.PROD: + return 'c_allreduce_prod' + + raise ValueError("Unknown reduce_op type for {}.".format(func_name)) diff --git a/python/paddle/distributed/communication/stream/__init__.py b/python/paddle/distributed/communication/stream/__init__.py index 24194dd9fb1e2c..43952ce5541a33 100644 --- a/python/paddle/distributed/communication/stream/__init__.py +++ b/python/paddle/distributed/communication/stream/__init__.py @@ -12,6 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .all_gather import all_gather from .all_reduce import all_reduce +from .alltoall import alltoall +from .alltoall_single import alltoall_single +from .broadcast import broadcast +from .reduce import reduce +from .reduce_scatter import reduce_scatter +from .recv import recv +from .scatter import scatter +from .send import send -__all__ = ["all_reduce"] +__all__ = [ + "all_gather", "all_reduce", "alltoall", "alltoall_single", "broadcast", + "reduce", "reduce_scatter", "recv", "scatter", "send" +] diff --git a/python/paddle/distributed/communication/stream/all_gather.py b/python/paddle/distributed/communication/stream/all_gather.py new file mode 100644 index 00000000000000..9eb961cda171d4 --- /dev/null +++ b/python/paddle/distributed/communication/stream/all_gather.py @@ -0,0 +1,138 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
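The new stream API in the file below exposes a ``use_calc_stream`` knob in addition to ``sync_op``; a hedged usage sketch (assumes two GPUs in dygraph mode; ``use_calc_stream=True`` requires ``sync_op=True``):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    data = paddle.to_tensor([dist.get_rank()])
    tensor_list = []
    # run the communication on the calculation stream; sync_op must stay True
    dist.stream.all_gather(tensor_list, data, sync_op=True, use_calc_stream=True)
    print(tensor_list)
    # [[0], [1]] on both ranks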
+ +import paddle +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _check_tensor_shape(tensor, shape, nranks=1): + expect_shape = list(shape) + expect_shape[0] *= nranks + if list(tensor.shape) != expect_shape: + raise RuntimeError('The tensor for all_gather is not correctly-sized.') + + +def _check_tensor_list_shape(tensor_list, shape, nranks=1): + if len(tensor_list) != nranks: + raise RuntimeError( + 'The tensor_list for all_gather is not correctly-sized.') + for tensor in tensor_list: + if tensor.shape != shape: + raise RuntimeError( + 'The tensor_list for all_gather is not correctly-sized.') + + +def _all_gather_into_tensor_in_dygraph(out_tensor, in_tensor, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks) + + if use_calc_stream: + return group.process_group.allgather_into_tensor_on_calc_stream( + in_tensor, out_tensor) + + task = group.process_group.allgather_into_tensor(in_tensor, out_tensor, + sync_op) + if sync_op: + task.wait() + + return task + + +def _all_gather_in_dygraph(tensor_list, tensor, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + if len(tensor_list) == 0: + tensor_list += [paddle.empty_like(tensor) for _ in range(group.nranks)] + else: + _check_tensor_list_shape(tensor_list, tensor.shape, group.nranks) + + if use_calc_stream: + return group.process_group.allgather_on_calc_stream(tensor, tensor_list) + + task = group.process_group.allgather(tensor, tensor_list, sync_op) + if sync_op: + task.wait() + + return task + + +def all_gather(tensor_or_tensor_list, + tensor, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Gather tensors across devices to a correctly-sized tensor or a tensor list. + + Args: + tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The output. If it is a tensor, it should be correctly-sized. If it is a list, it + should be empty or contain correctly-sized tensors. + tensor (Tensor): The input tensor on each rank. The result will overwrite this tenor after communication. Support + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + tensor_list = [] + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.all_gather(tensor_list, data, sync_op=False) + task.wait() + print(tensor_list) + # [[[4, 5, 6], [4, 5, 6]], [[1, 2, 3], [1, 2, 3]]] (2 GPUs) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." 
+ ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + if paddle.is_tensor(tensor_or_tensor_list): + return _all_gather_into_tensor_in_dygraph(tensor_or_tensor_list, + tensor, group, sync_op, + use_calc_stream) + else: + return _all_gather_in_dygraph(tensor_or_tensor_list, tensor, group, + sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.all_gather is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/all_reduce.py b/python/paddle/distributed/communication/stream/all_reduce.py index 6a0b622cf0dfe4..0ba161a078ab89 100644 --- a/python/paddle/distributed/communication/stream/all_reduce.py +++ b/python/paddle/distributed/communication/stream/all_reduce.py @@ -13,12 +13,16 @@ # limitations under the License. import paddle.fluid.framework as framework -from ...collective import _get_default_group, _get_reduce_op, ReduceOp +import paddle.fluid.data_feeder as data_feeder +import paddle.fluid.layer_helper as layer_helper +from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp +from paddle.distributed.communication.group import _get_global_group def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream): op_type = _get_reduce_op(op, "all_reduce") - group = _get_default_group() if group is None else group + + group = _get_global_group() if group is None else group if use_calc_stream: return group.process_group.allreduce_on_calc_stream(tensor, op_type) @@ -29,6 +33,32 @@ def _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream): return task +def _all_reduce_in_static_mode(tensor, op, group, sync_op, use_calc_stream): + data_feeder.check_variable_and_dtype(tensor, 'tensor', [ + 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', + 'bool' + ], 'all_reduce') + + op_type = _get_reduce_op(op, "all_reduce") + ring_id = 0 if group is None else group.id + + if not isinstance(ring_id, int): + raise ValueError("The type of 'ring_id' for all_reduce should be int.") + + # TODO: Support task and use task.wait in static mode + # Use use_calc_stream rather than sync_op + helper = layer_helper.LayerHelper(op_type, **locals()) + helper.append_op(type=op_type, + inputs={'X': [tensor]}, + outputs={'Out': [tensor]}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': sync_op + }) + + return None + + def all_reduce(tensor, op=ReduceOp.SUM, group=None, @@ -40,8 +70,8 @@ def all_reduce(tensor, Args: tensor (Tensor): The input tensor on each rank. The result will overwrite this tenor after communication. Support - float16, float32, float64, int32 or int64 as the input data type. - op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.Min|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. group (Group, optional): Communicate in which group. If none is given, use the global group as default. sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This @@ -50,9 +80,6 @@ def all_reduce(tensor, Returns: Return a task object. 
- Warning: - This API only supports the dygraph mode now. - Examples: .. code-block:: python @@ -84,7 +111,6 @@ def all_reduce(tensor, if framework.in_dygraph_mode(): return _all_reduce_in_dygraph(tensor, op, group, sync_op, use_calc_stream) - - raise RuntimeError( - "paddle.distributed.stream.all_reduce is only supported in dygraph mode now." - ) + else: + return _all_reduce_in_static_mode(tensor, op, group, sync_op, + use_calc_stream) diff --git a/python/paddle/distributed/communication/stream/alltoall.py b/python/paddle/distributed/communication/stream/alltoall.py new file mode 100644 index 00000000000000..b216906d045688 --- /dev/null +++ b/python/paddle/distributed/communication/stream/alltoall.py @@ -0,0 +1,157 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _check_tensor_shape(tensor, shape, nranks=1): + if tensor.shape != shape: + raise RuntimeError('The tensor for alltoall is not correctly-sized.') + + +def _check_tensor_list_shape(tensor_list, shape, nranks=1): + if len(tensor_list) != nranks: + raise RuntimeError( + 'The tensor_list for alltoall is not correctly-sized.') + for tensor in tensor_list: + if tensor.shape != shape: + raise RuntimeError( + 'The tensor_list for alltoall is not correctly-sized.') + + +def _alltoall_tensor_in_dygraph(out_tensor, in_tensor, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks) + + if use_calc_stream: + return group.process_group.alltoall_tensor_on_calc_stream( + in_tensor, out_tensor) + + task = group.process_group.alltoall_tensor(in_tensor, out_tensor, sync_op) + if sync_op: + task.wait() + + return task + + +def _alltoall_in_dygraph(out_tensor_list, in_tensor_list, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + if len(in_tensor_list) == 0: + raise RuntimeError("The input tensor_list should not be empty.") + + if len(out_tensor_list) == 0: + out_tensor_list += [ + paddle.empty_like(tensor) for tensor in in_tensor_list + ] + else: + _check_tensor_list_shape(out_tensor_list, in_tensor_list[0].shape, + group.nranks) + + if use_calc_stream: + return group.process_group.alltoall_on_calc_stream( + in_tensor_list, out_tensor_list) + + task = group.process_group.alltoall(in_tensor_list, out_tensor_list, + sync_op) + if sync_op: + task.wait() + + return task + + +def alltoall(out_tensor_or_tensor_list, + in_tensor_or_tensor_list, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Scatter a tensor (or a tensor list) across devices and gather outputs to another tensor (or a tensor list, respectively). + + Args: + out_tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The output. If it is a tensor, it should be correctly-sized. 
+ If it is a list, it should be empty or contain correctly-sized tensors. Its data type should be the same as the input. + in_tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The input to scatter (must be specified on the source rank). + If it is a tensor, it should be correctly-sized. If it is a list, it should contain correctly-sized tensors. Support + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + out_tensor_list = [] + if dist.get_rank() == 0: + data1 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) + data2 = paddle.to_tensor([[7, 8, 9], [10, 11, 12]]) + else: + data1 = paddle.to_tensor([[13, 14, 15], [16, 17, 18]]) + data2 = paddle.to_tensor([[19, 20, 21], [22, 23, 24]]) + task = dist.stream.alltoall(out_tensor_list, [data1, data2], sync_op=False) + task.wait() + print(out_tensor_list) + # [[[1, 2, 3], [4, 5, 6]], [[13, 14, 15], [16, 17, 18]]] (2 GPUs, out for rank 0) + # [[[7, 8, 9], [10, 11, 12]], [[19, 20, 21], [22, 23, 24]]] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if out_tensor_or_tensor_list is None: + raise RuntimeError("The output should be specified.") + if in_tensor_or_tensor_list is None: + raise RuntimeError("The input should be specified.") + + if framework.in_dygraph_mode(): + out_is_tensor = paddle.is_tensor(out_tensor_or_tensor_list) + in_is_tensor = paddle.is_tensor(in_tensor_or_tensor_list) + if out_is_tensor and in_is_tensor: + return _alltoall_tensor_in_dygraph(out_tensor_or_tensor_list, + in_tensor_or_tensor_list, group, + sync_op, use_calc_stream) + elif not out_is_tensor and not in_is_tensor: + return _alltoall_in_dygraph(out_tensor_or_tensor_list, + in_tensor_or_tensor_list, group, + sync_op, use_calc_stream) + else: + raise RuntimeError( + "The output and input should be both tensor or tensor list.") + + raise RuntimeError( + "paddle.distributed.stream.alltoall is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/alltoall_single.py b/python/paddle/distributed/communication/stream/alltoall_single.py new file mode 100644 index 00000000000000..b2187cc06e3439 --- /dev/null +++ b/python/paddle/distributed/communication/stream/alltoall_single.py @@ -0,0 +1,128 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _alltoall_single_in_dygraph(out_tensor, in_tensor, out_split_sizes, + in_split_sizes, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + if out_split_sizes is None: + out_split_sizes = [] + if in_split_sizes is None: + in_split_sizes = [] + + if use_calc_stream: + return group.process_group.alltoall_single_on_calc_stream( + in_tensor, out_tensor, in_split_sizes, out_split_sizes) + + task = group.process_group.alltoall_single(in_tensor, out_tensor, + in_split_sizes, out_split_sizes, + sync_op) + if sync_op: + task.wait() + + return task + + +def alltoall_single(out_tensor, + in_tensor, + out_split_sizes=None, + in_split_sizes=None, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Split and Scatter the splitted input tensor to the out tensor across devices. + + Args: + out_tensor(Tensor): The output tensor. Its data type should be the same as the input. + in_tensor (Tensor): The input tensor. Its data type should be float16, float32, float64, int32, int64, int8, uint8 or bool. + out_split_sizes (List[int], optional): Split sizes of out_tensor for dim[0]. If not given, dim[0] of out_tensor must be divisible + by group size and out_tensor will be gathered averagely from all participators. If none is given, use a empty list as default. + in_split_sizes (List[int], optional): Split sizes of in_tensor for dim[0]. If not given, dim[0] of in_tensor must be divisible + by group size and in_tensor will be scattered averagely to all participators. If none is given, use a empty list as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. 
code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + + # case 1 + output = paddle.empty([2], dtype="int64") + if local_rank == 0: + data = paddle.to_tensor([0, 1]) + else: + data = paddle.to_tensor([2, 3]) + task = dist.stream.alltoall_single(output, data, sync_op=False) + task.wait() + out = output.numpy() + # [0, 2] (2 GPUs, out for rank 0) + # [1, 3] (2 GPUs, out for rank 1) + + # case 2 + size = dist.get_world_size() + output = paddle.empty([(local_rank + 1) * size, size], dtype='float32') + if local_rank == 0: + data = paddle.to_tensor([[0., 0.], [0., 0.], [0., 0.]]) + else: + data = paddle.to_tensor([[1., 1.], [1., 1.], [1., 1.]]) + out_split_sizes = [local_rank + 1 for i in range(size)] + in_split_sizes = [i + 1 for i in range(size)] + task = dist.stream.alltoall_single(output, + data, + out_split_sizes, + in_split_sizes, + sync_op=False) + task.wait() + out = output.numpy() + # [[0., 0.], [1., 1.]] (2 GPUs, out for rank 0) + # [[0., 0.], [0., 0.], [1., 1.], [1., 1.]] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + return _alltoall_single_in_dygraph(out_tensor, in_tensor, + out_split_sizes, in_split_sizes, + group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.alltoall_single is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/broadcast.py b/python/paddle/distributed/communication/stream/broadcast.py new file mode 100644 index 00000000000000..06bde316937a9d --- /dev/null +++ b/python/paddle/distributed/communication/stream/broadcast.py @@ -0,0 +1,83 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _broadcast_in_dygraph(tensor, src, group, sync_op, use_calc_stream): + group = collective._get_default_group() if group is None else group + if use_calc_stream: + return group.process_group.broadcast_on_calc_stream(tensor, src) + + task = group.process_group.broadcast(tensor, src, sync_op) + if sync_op: + task.wait() + + return task + + +def broadcast(tensor, src=0, group=None, sync_op=True, use_calc_stream=False): + """ + + Broadcast a tensor to all devices. + + Args: + tensor (Tensor): The tensor to broadcast. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type. + src (int, optional): Rank of the source device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. 
+ sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.broadcast(data, src=1, sync_op=False) + task.wait() + out = data.numpy() + # [[1, 2, 3], [1, 2, 3]] (2 GPUs) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be True in sync op behavior.") + + if framework.in_dygraph_mode(): + return _broadcast_in_dygraph(tensor, src, group, sync_op, + use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.broadcast is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/recv.py b/python/paddle/distributed/communication/stream/recv.py new file mode 100644 index 00000000000000..25a8173788473a --- /dev/null +++ b/python/paddle/distributed/communication/stream/recv.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream): + group = collective._get_default_group() if group is None else group + if use_calc_stream: + return group.process_group.recv_on_calc_stream(tensor, src) + + task = group.process_group.recv(tensor, src, sync_op) + if sync_op: + task.wait() + + return task + + +def recv(tensor, src=0, group=None, sync_op=True, use_calc_stream=False): + """ + + Receive a tensor from the source device. + + Args: + tensor (Tensor): The tensor to receive. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type. + src (int, optional): Rank of the source device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. 
This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + task = dist.stream.send(data, dst=1, sync_op=False) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.recv(data, src=0, sync_op=False) + task.wait() + out = data.numpy() + # [[4, 5, 6], [4, 5, 6]] (2 GPUs) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be True in sync op behavior.") + + if framework.in_dygraph_mode(): + return _recv_in_dygraph(tensor, src, group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.recv is only supported in dygraph mode now.") diff --git a/python/paddle/distributed/communication/stream/reduce.py b/python/paddle/distributed/communication/stream/reduce.py new file mode 100644 index 00000000000000..b0f7f5c884743d --- /dev/null +++ b/python/paddle/distributed/communication/stream/reduce.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid.framework as framework +from paddle.distributed.communication.group import _get_global_group +from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp + + +def _reduce_in_dygraph(tensor, dst, op, group, sync_op, use_calc_stream): + op_type = _get_reduce_op(op, "reduce") + group = _get_global_group() if group is None else group + if use_calc_stream: + return group.process_group.reduce_on_calc_stream(tensor, dst, op_type) + + task = group.process_group.reduce(tensor, dst, op_type, sync_op) + if sync_op: + task.wait() + + return task + + +def reduce(tensor, + dst=0, + op=ReduceOp.SUM, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Perform specific reduction (for example, sum, max) on a tensor across devices and send to the destintion device. + + Args: + tensor (Tensor): The input tensor on each rank. The result will overwrite this tenor after communication. Support + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + dst (int, optional): Rank of the destination device. If none is given, use `0` as default. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. 
+ sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.reduce(data, dst=0, sync_op=False) + task.wait() + out = data.numpy() + # [[5, 7, 9], [5, 7, 9]] (2 GPUs, out for rank 0) + # [[1, 2, 3], [1, 2, 3]] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + return _reduce_in_dygraph(tensor, dst, op, group, sync_op, + use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.reduce is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/reduce_scatter.py b/python/paddle/distributed/communication/stream/reduce_scatter.py new file mode 100644 index 00000000000000..71fc93478448fc --- /dev/null +++ b/python/paddle/distributed/communication/stream/reduce_scatter.py @@ -0,0 +1,215 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
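The shape checks in the stream ``reduce_scatter`` file below require the output's dim[0] to be the input's dim[0] divided by the group size when a single tensor is passed; a hedged sketch of that sizing rule (illustrative only; assumes two GPUs in dygraph mode):

    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    nranks = dist.get_world_size()
    data = paddle.arange(4, dtype="int64") + dist.get_rank()   # input dim[0] == 4
    out = paddle.empty([4 // nranks], dtype="int64")           # output dim[0] == 4 // nranks
    # defaults: op=ReduceOp.SUM, sync_op=True
    dist.stream.reduce_scatter(out, data)
    print(out)
    # rank 0: [1, 3], rank 1: [5, 7]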
+ +import paddle +import paddle.fluid.framework as framework +from paddle.distributed.communication.group import _get_global_group +from paddle.distributed.communication.reduce import _get_reduce_op, ReduceOp + + +def _check_tensor_shape(tensor, shape, nranks=1): + expect_shape = list(shape) + expect_shape[0] //= nranks + if list(tensor.shape) != expect_shape: + raise RuntimeError( + "The in_tensor for reduce_scatter is not correctly-sized.") + + +def _check_tensor_list_shape(tensor_list, shape, nranks=1): + if len(tensor_list) != nranks: + raise RuntimeError( + f"The tensor_list for reduce_scatter is not correctly-sized.") + for tensor in tensor_list: + if tensor.shape != shape: + raise RuntimeError( + f"The tensor_list for reduce_scatter is not correctly-sized.") + + +def _reduce_scatter_tensor_in_dygraph(out_tensor, + in_tensor, + op, + group, + sync_op, + use_calc_stream, + caller="reduce_scatter"): + op_type = _get_reduce_op(op, caller) + group = _get_global_group() if group is None else group + + _check_tensor_shape(out_tensor, in_tensor.shape, group.nranks) + + if use_calc_stream: + return group.process_group.reduce_scatter_tensor_on_calc_stream( + in_tensor, out_tensor, op_type) + + task = group.process_group.reduce_scatter_tensor(in_tensor, out_tensor, + op_type, sync_op) + if sync_op: + task.wait() + + return task + + +def _reduce_scatter_in_dygraph(tensor, tensor_list, op, group, sync_op, + use_calc_stream): + op_type = _get_reduce_op(op, "reduce_scatter") + group = _get_global_group() if group is None else group + + _check_tensor_list_shape(tensor_list, tensor.shape, group.nranks) + + if use_calc_stream: + return group.process_group.reduce_scatter_on_calc_stream( + tensor_list, tensor, op_type) + + task = group.process_group.reduce_scatter(tensor_list, tensor, op_type, + sync_op) + if sync_op: + task.wait() + + return task + + +def reduce_scatter(tensor, + tensor_or_tensor_list, + op=ReduceOp.SUM, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Reduce, then scatter a tensor (or a tensor list) across devices. + + Args: + tensor (Tensor): The output tensor on each rank. The result will overwrite this tenor after communication. Support + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. + tensor_list (List[Tensor]]): The input to scatter. + If it is a tensor, it should be correctly-sized. If it is a list, it should contain correctly-sized tensors. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. 
code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data1 = paddle.to_tensor([0, 1]) + data2 = paddle.to_tensor([2, 3]) + else: + data1 = paddle.to_tensor([4, 5]) + data2 = paddle.to_tensor([6, 7]) + dist.stream.reduce_scatter(data1, [data1, data2]) + out = data1.numpy() + # [4, 6] (2 GPUs, out for rank 0) + # [8, 10] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + if paddle.is_tensor(tensor_or_tensor_list): + return _reduce_scatter_tensor_in_dygraph(tensor, + tensor_or_tensor_list, op, + group, sync_op, + use_calc_stream) + else: + return _reduce_scatter_in_dygraph(tensor, tensor_or_tensor_list, op, + group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.reduce_scatter is only supported in dygraph mode now." + ) + + +def _reduce_scatter_base(out_tensor, + in_tensor, + op=ReduceOp.SUM, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Reduce, then scatter a flattened tensor across devices. + + Args: + out_tensor (Tensor): The output tensor on each rank. The result will overwrite this tenor after communication. Support + float16, float32, float64, int32 or int64 as the input data type. + in_tensor (Tensor): The input tensor to reduce and scatter. + op (ReduceOp.SUM|ReduceOp.MAX|ReduceOp.MIN|ReduceOp.PROD, optional): The reduction used. If none is given, use ReduceOp.SUM as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API will be deprecated in the future, and only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data1 = paddle.to_tensor([7, 8, 9]) + data2 = paddle.to_tensor([10, 11, 12]) + dist.stream.scatter(data1, src=1) + else: + data1 = paddle.to_tensor([1, 2, 3]) + data2 = paddle.to_tensor([4, 5, 6]) + dist.stream.scatter(data1, [data1, data2], src=1) + out = data1.numpy() + # [1, 2, 3] (2 GPUs, out for rank 0) + # [4, 5, 6] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if framework.in_dygraph_mode(): + return _reduce_scatter_tensor_in_dygraph(out_tensor, in_tensor, op, + group, sync_op, + use_calc_stream, + "_reduce_scatter_base") + + raise RuntimeError( + "paddle.distributed.stream._reduce_scatter_base is only supported in dygraph mode now." 
+ ) diff --git a/python/paddle/distributed/communication/stream/scatter.py b/python/paddle/distributed/communication/stream/scatter.py new file mode 100644 index 00000000000000..ee75583d161448 --- /dev/null +++ b/python/paddle/distributed/communication/stream/scatter.py @@ -0,0 +1,161 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _check_tensor_shape(tensor, shape, nranks=1): + expect_shape = list(shape) + expect_shape[0] //= nranks + if list(tensor.shape) != expect_shape: + raise RuntimeError("The in_tensor for scatter is not correctly-sized.") + + +def _check_tensor_list_shape(tensor_list, shape, nranks=1): + if len(tensor_list) != nranks: + raise RuntimeError( + f"The tensor_list for scatter is not correctly-sized.") + for tensor in tensor_list: + if tensor.shape != shape: + raise RuntimeError( + f"The tensor_list for scatter is not correctly-sized.") + + +def _scatter_tensor_in_dygraph(out_tensor, in_tensor, src, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + src_rank = group.get_group_rank(src) + if src_rank == -1: + raise RuntimeError("Src rank out of group.") + + nranks = group.nranks + rank = paddle.distributed.get_rank() + if rank == src_rank: + _check_tensor_shape(out_tensor, in_tensor.shape, nranks) + + if use_calc_stream: + return group.process_group.scatter_tensor_on_calc_stream( + in_tensor, out_tensor, src) + + task = group.process_group.scatter_tensor(in_tensor, out_tensor, src, + sync_op) + if sync_op: + task.wait() + + return task + + +def _scatter_in_dygraph(tensor, tensor_list, src, group, sync_op, + use_calc_stream): + group = collective._get_default_group() if group is None else group + + src_rank = group.get_group_rank(src) + if src_rank == -1: + raise RuntimeError("Src rank out of group.") + + nranks = group.nranks + rank = paddle.distributed.get_rank() + if rank == src_rank: + if len(tensor_list) == 0: + raise RuntimeError( + "The tensor_list should not be empty on src rank.") + _check_tensor_list_shape(tensor_list, tensor.shape, nranks) + else: + tensor_list = [tensor for _ in range(nranks)] + + if use_calc_stream: + return group.process_group.scatter_on_calc_stream( + tensor_list, tensor, src) + + task = group.process_group.scatter(tensor_list, tensor, src, sync_op) + if sync_op: + task.wait() + + return task + + +def scatter(tensor, + tensor_or_tensor_list=None, + src=0, + group=None, + sync_op=True, + use_calc_stream=False): + """ + + Scatter a tensor (or a tensor list) across devices. + + Args: + tensor (Tensor): The output tensor on each rank. The result will overwrite this tenor after communication. Support + float16, float32, float64, int32, int64, int8, uint8 or bool as the input data type. 
+ tensor_or_tensor_list (Union[Tensor, List[Tensor]]): The input to scatter (default is `None`, must be specified on the source rank). + If it is a tensor, it should be correctly-sized. If it is a list, it should contain correctly-sized tensors. + src (int, optional): Rank of the source device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + if dist.get_rank() == 0: + data1 = paddle.to_tensor([7, 8, 9]) + data2 = paddle.to_tensor([10, 11, 12]) + dist.stream.scatter(data1, src=1) + else: + data1 = paddle.to_tensor([1, 2, 3]) + data2 = paddle.to_tensor([4, 5, 6]) + dist.stream.scatter(data1, [data1, data2], src=1) + out = data1.numpy() + # [1, 2, 3] (2 GPUs, out for rank 0) + # [4, 5, 6] (2 GPUs, out for rank 1) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be true in sync op behavior.") + + if tensor_or_tensor_list is None: + raise RuntimeError("The input should be specified.") + + if framework.in_dygraph_mode(): + if paddle.is_tensor(tensor_or_tensor_list): + return _scatter_tensor_in_dygraph(tensor, tensor_or_tensor_list, + src, group, sync_op, + use_calc_stream) + else: + return _scatter_in_dygraph(tensor, tensor_or_tensor_list, src, + group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.scatter is only supported in dygraph mode now." + ) diff --git a/python/paddle/distributed/communication/stream/send.py b/python/paddle/distributed/communication/stream/send.py new file mode 100644 index 00000000000000..41ec2c0141b122 --- /dev/null +++ b/python/paddle/distributed/communication/stream/send.py @@ -0,0 +1,82 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
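
For the scatter wrapper above, note that the top-level function rejects a None input on every rank even though only the source rank's tensor list actually feeds the communication, so the safest call passes the list everywhere. A minimal 2-GPU launch sketch (assuming the script is started with paddle.distributed.launch; the file name is hypothetical), mirroring the values and the src=1 choice of the docstring example:

.. code-block:: python

    # Launch with e.g.: python -m paddle.distributed.launch --gpus "0,1" demo_scatter.py
    import paddle
    import paddle.distributed as dist

    dist.init_parallel_env()
    if dist.get_rank() == 1:  # rank 1 is the source, as in the docstring example
        data1 = paddle.to_tensor([1, 2, 3])
        data2 = paddle.to_tensor([4, 5, 6])
    else:
        data1 = paddle.to_tensor([7, 8, 9])
        data2 = paddle.to_tensor([10, 11, 12])

    # Pass the tensor list on every rank; non-source ranks ignore its contents.
    dist.stream.scatter(data1, [data1, data2], src=1)
    print(data1.numpy())  # rank 0 -> [1 2 3], rank 1 -> [4 5 6]
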
+ +import paddle.fluid.framework as framework +from paddle.distributed import collective + + +def _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream): + group = collective._get_default_group() if group is None else group + if use_calc_stream: + return group.process_group.send_on_calc_stream(tensor, dst) + + task = group.process_group.send(tensor, dst, sync_op) + if sync_op: + task.wait() + + return task + + +def send(tensor, dst=0, group=None, sync_op=True, use_calc_stream=False): + """ + + Send a tensor to the destination device. + + Args: + tensor (Tensor): The tensor to send. Support float16, float32, float64, int32, int64, int8, uint8 or bool as its data type. + dst (int, optional): Rank of the destination device. If none is given, use `0` as default. + group (Group, optional): Communicate in which group. If none is given, use the global group as default. + sync_op (bool, optional): Indicate whether the communication is sync or not. If none is given, use true as default. + use_calc_stream (bool, optional): Indicate whether the communication is done on calculation stream. If none is given, use false as default. This + option is designed for high performance demand, be careful to turn it on except you are clearly know its meaning. + + Returns: + Return a task object. + + Warning: + This API only supports the dygraph mode now. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed as dist + + dist.init_parallel_env() + local_rank = dist.get_rank() + if local_rank == 0: + data = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) + task = dist.stream.send(data, dst=1, sync_op=False) + else: + data = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) + task = dist.stream.recv(data, src=0, sync_op=False) + task.wait() + out = data.numpy() + # [[4, 5, 6], [4, 5, 6]] (2 GPUs) + """ + if group is not None and not group.is_member(): + raise RuntimeError( + "The group should not be None and all ranks which invoke this operation should be the member of this group." + ) + + if not sync_op and use_calc_stream: + raise RuntimeError( + "use_calc_stream can only be True in sync op behavior.") + + if framework.in_dygraph_mode(): + return _send_in_dygraph(tensor, dst, group, sync_op, use_calc_stream) + + raise RuntimeError( + "paddle.distributed.stream.send is only supported in dygraph mode now.") diff --git a/python/paddle/distributed/fleet/__init__.py b/python/paddle/distributed/fleet/__init__.py index 11d7643c676dd4..b75d84edf29b28 100755 --- a/python/paddle/distributed/fleet/__init__.py +++ b/python/paddle/distributed/fleet/__init__.py @@ -32,6 +32,7 @@ from .model import distributed_model from .optimizer import distributed_optimizer from .scaler import distributed_scaler +from .utils import log_util __all__ = [ #noqa "CommunicateTopology", "UtilBase", "HybridCommunicateGroup", @@ -90,3 +91,7 @@ shrink = fleet.shrink get_hybrid_communicate_group = fleet.get_hybrid_communicate_group distributed_scaler = distributed_scaler +set_log_level = log_util.set_log_level +get_log_level_code = log_util.get_log_level_code +get_log_level_name = log_util.get_log_level_name +from .. 
import auto_parallel as auto diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py index 2a11dd7eace7f6..2cfded8c96013d 100755 --- a/python/paddle/distributed/fleet/base/distributed_strategy.py +++ b/python/paddle/distributed/fleet/base/distributed_strategy.py @@ -26,7 +26,6 @@ def __non_auto_func_called__(func): - def __impl__(*args, **kwargs): global non_auto_func_called non_auto_func_called = False @@ -112,14 +111,15 @@ class DistributedStrategy(object): def __init__(self): """ + DistributedStrategy is the main configuration entry for distributed training of Paddle. All of the distributed training configurations can be configured in DistributedStrategy, - such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS), + such as automatic mixed precision (AMP), Layer-wise Adaptive Rate Scaling (LARS), asynchronous update parameter server(ASGD), etc. DistributedStrategy can be serialized into protobuf file or deserialized from protobuf file - Users who run local training usually configure BuildStrategy and ExecutionStrategy, and + Users who run local training usually configure BuildStrategy and ExecutionStrategy, and DistributedStrategy supports configurations from BuildStrategy and ExecutionStrategy """ @@ -129,7 +129,8 @@ def __init__(self): key = 'FLAGS_cudnn_batchnorm_spatial_persistent' if _global_flags().is_public(key): self.strategy.cudnn_batchnorm_spatial_persistent = bool( - _global_flags()[key]) + _global_flags()[key] + ) key = 'FLAGS_conv_workspace_size_limit' if _global_flags().is_public(key): self.strategy.conv_workspace_size_limit = int(_global_flags()[key]) @@ -144,43 +145,47 @@ def __init__(self): def __setattr__(self, key, value): if self.__lock_attr and not hasattr(self, key): - raise TypeError("%s is not a attribute of %s" % - (key, self.__class__.__name__)) + raise TypeError( + "%s is not a attribute of %s" % (key, self.__class__.__name__) + ) object.__setattr__(self, key, value) def save_to_prototxt(self, output): """ + Serialize current DistributedStrategy to string and save to output file Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.dgc = True + strategy.recompute = True + strategy.recompute_configs = {"checkpoints": ["x"]} + strategy.save_to_prototxt("dist_strategy.prototxt") - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.dgc = True - strategy.recompute = True - strategy.recompute_configs = {"checkpoints": ["x"]} - strategy.save_to_prototxt("dist_strategy.prototxt") """ with open(output, "w") as fout: fout.write(str(self.strategy)) def load_from_prototxt(self, pb_file): """ + Load from prototxt file for DistributedStrategy initialization Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.load_from_prototxt("dist_strategy.prototxt") - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.load_from_prototxt("dist_strategy.prototxt") """ with open(pb_file, 'r') as f: self.strategy = google.protobuf.text_format.Merge( - str(f.read()), self.strategy) + str(f.read()), self.strategy + ) @property def execution_strategy(self): @@ -188,23 +193,26 @@ def execution_strategy(self): Configure ExecutionStrategy for DistributedStrategy Examples: + .. code-block:: python - .. 
code-block:: python + import paddle + exe_strategy = paddle.static.ExecutionStrategy() + exe_strategy.num_threads = 10 + exe_strategy.num_iteration_per_drop_scope = 10 + exe_strategy.num_iteration_per_run = 10 - import paddle - exe_strategy = paddle.static.ExecutionStrategy() - exe_strategy.num_threads = 10 - exe_strategy.num_iteration_per_drop_scope = 10 - exe_strategy.num_iteration_per_run = 10 + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.execution_strategy = exe_strategy - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.execution_strategy = exe_strategy """ execution_strategy = paddle.fluid.ExecutionStrategy() fields = self.strategy.execution_strategy.DESCRIPTOR.fields for f in fields: - setattr(execution_strategy, f.name, - getattr(self.strategy.execution_strategy, f.name)) + setattr( + execution_strategy, + f.name, + getattr(self.strategy.execution_strategy, f.name), + ) return execution_strategy @execution_strategy.setter @@ -212,33 +220,37 @@ def execution_strategy(self): def execution_strategy(self, strategy): fields = self.strategy.execution_strategy.DESCRIPTOR.fields for f in fields: - setattr(self.strategy.execution_strategy, f.name, - getattr(strategy, f.name)) + setattr( + self.strategy.execution_strategy, + f.name, + getattr(strategy, f.name), + ) @property def build_strategy(self): """ + Configure BuildStrategy for DistributedStrategy Note that the properties of BuildStrategy are valid in DistributedStrategy only if the property is non-distributed strategy. Examples: + .. code-block:: python - .. code-block:: python + import paddle + build_strategy = paddle.static.BuildStrategy() + build_strategy.enable_sequential_execution = True + build_strategy.fuse_elewise_add_act_ops = True + build_strategy.fuse_bn_act_ops = True + build_strategy.enable_auto_fusion = True + build_strategy.fuse_relu_depthwise_conv = True + build_strategy.fuse_broadcast_ops = True + build_strategy.fuse_all_optimizer_ops = True + build_strategy.enable_inplace = True - import paddle - build_strategy = paddle.static.BuildStrategy() - build_strategy.enable_sequential_execution = True - build_strategy.fuse_elewise_add_act_ops = True - build_strategy.fuse_bn_act_ops = True - build_strategy.enable_auto_fusion = True - build_strategy.fuse_relu_depthwise_conv = True - build_strategy.fuse_broadcast_ops = True - build_strategy.fuse_all_optimizer_ops = True - build_strategy.enable_inplace = True + strategy = paddle.distributed.fleet.DistributedStrategy() + strategy.build_strategy = build_strategy - strategy = paddle.distributed.fleet.DistributedStrategy() - strategy.build_strategy = build_strategy """ build_strategy = paddle.fluid.BuildStrategy() @@ -261,52 +273,60 @@ def build_strategy(self, strategy): value = ReduceStrategyFleet(value) setattr(self.strategy.build_strategy, f.name, value) elif f.label == 3: # repeated field - getattr(self.strategy.build_strategy, - f.name).extend(getattr(strategy, f.name)) + getattr(self.strategy.build_strategy, f.name).extend( + getattr(strategy, f.name) + ) @property def gradient_scale_configs(self): """ + Set the strategy of gradient scale + Examples: + .. code-block:: python - .. 
code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.gradient_scale_configs = {'scale_strategy': 'avg'} + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.gradient_scale_configs = {'scale_strategy': 'avg'} Note that, strategy must be in 'avg', 'sum' or 'customized' + """ return get_msg_dict(self.strategy.gradient_scale_configs) @gradient_scale_configs.setter @is_strict_auto def gradient_scale_configs(self, config): - check_configs_key(self.strategy.gradient_scale_configs, config, - 'gradient_scale_configs') + check_configs_key( + self.strategy.gradient_scale_configs, + config, + 'gradient_scale_configs', + ) assign_configs_value(self.strategy.gradient_scale_configs, config) @property def a_sync(self): """ + Indicating whether we are using asynchronous stocastic gradient descent updates - for training. This property is valid when we are using parameter server training, + for training. This property is valid when we are using parameter server training, which is implied by setting approperate RoleMaker Default value: True Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) + strategy = fleet.DistributedStrategy() + strategy.a_sync = True # by default this is True - strategy = fleet.DistributedStrategy() - strategy.a_sync = True # by default this is True + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.a_sync @@ -318,12 +338,15 @@ def a_sync(self, flag): self.a_sync_configs = {"k_steps": 0} else: raise ValueError( - "The type of `flag` is invalid, expected type is bool, but received {}" - .format(type(flag))) + "The type of `flag` is invalid, expected type is bool, but received {}".format( + type(flag) + ) + ) @property def a_sync_configs(self): """ + Set a_sync update configurations. In general, asynchronous parameter server training has serveral configurable settings that can be configured through a dict. @@ -344,20 +367,19 @@ def a_sync_configs(self): runtime_split_send_recv(bool): if we are using Tensor split for send and recv during runtime Examples: + .. code-block:: python - .. 
code-block:: python + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) + strategy = fleet.DistributedStrategy() + strategy.a_sync = True # by default this is True + configs = {"k_steps": 1024, "send_queue_size": 32} + strategy.a_sync_configs = configs - strategy = fleet.DistributedStrategy() - strategy.a_sync = True # by default this is True - configs = {"k_steps": 1024, "send_queue_size": 32} - strategy.a_sync_configs = configs - - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) """ return get_msg_dict(self.strategy.a_sync_configs) @@ -365,14 +387,16 @@ def a_sync_configs(self): @a_sync_configs.setter @is_strict_auto def a_sync_configs(self, configs): - check_configs_key(self.strategy.a_sync_configs, configs, - "a_sync_configs") + check_configs_key( + self.strategy.a_sync_configs, configs, "a_sync_configs" + ) assign_configs_value(self.strategy.a_sync_configs, configs) @property def trainer_desc_configs(self): """ - Set trainer desc configurations. + + Set trainer desc configurations. **Notes**: dump_fields_path(str): the path of dump fields @@ -381,22 +405,21 @@ def trainer_desc_configs(self): dump_param(list(str)): the param that you want to dump - stat_var_names(list(str)): + stat_var_names(list(str)): Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) + strategy = fleet.DistributedStrategy() + configs = {"dump_fields_path": "./dump_data", "dump_fields": ["xxx", "yyy"]} + strategy.trainer_desc_configs = configs - strategy = fleet.DistributedStrategy() - configs = {"dump_fields_path": "./dump_data", "dump_fields": ["xxx", "yyy"]} - strategy.trainer_desc_configs = configs - - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) """ return get_msg_dict(self.strategy.trainer_desc_configs) @@ -404,22 +427,23 @@ def trainer_desc_configs(self): @property def adam_d2sum(self): """ + set adam_d2sum Default value: False Examples: + .. code-block:: python - .. 
code-block:: python + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) + strategy = fleet.DistributedStrategy() + strategy.adam_d2sum = True # by default this is False - strategy = fleet.DistributedStrategy() - strategy.adam_d2sum = True # by default this is False + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.adam_d2sum @@ -430,43 +454,55 @@ def adam_d2sum(self, flag): self.strategy.adam_d2sum = flag else: raise ValueError( - "The type of `flag` is invalid, expected type is bool, but received {}" - .format(type(flag))) + "The type of `flag` is invalid, expected type is bool, but received {}".format( + type(flag) + ) + ) @trainer_desc_configs.setter @is_strict_auto def trainer_desc_configs(self, configs): - check_configs_key(self.strategy.trainer_desc_configs, configs, - "trainer_desc_configs") + check_configs_key( + self.strategy.trainer_desc_configs, configs, "trainer_desc_configs" + ) assign_configs_value(self.strategy.trainer_desc_configs, configs) @property def fs_client_param(self): """ - Set fs client configurations. - **Notes**: + + Set fs client configurations. + + Note: uri(str): the uri of fs client + user(str): the user_name of fs client + passwd(str): the passwd of fs client - hadoop_bin(str): + + hadoop_bin(str): + Examples: - .. code-block:: python - import paddle.distributed.fleet as fleet - role_maker = fleet.PaddleCloudRoleMaker() - fleet.init(role_maker) - strategy = fleet.DistributedStrategy() - configs = {"uri": "xxx", "user": "xxx", passwd: "xxx"} - strategy.fs_client_param = configs - # code block for defining loss and local optimizer - # sgd = fleet.distributed_optimizer(optimizer, strategy) + .. 
code-block:: python + + import paddle.distributed.fleet as fleet + role_maker = fleet.PaddleCloudRoleMaker() + fleet.init(role_maker) + strategy = fleet.DistributedStrategy() + configs = {"uri": "xxx", "user": "xxx", passwd: "xxx"} + strategy.fs_client_param = configs + # code block for defining loss and local optimizer + # sgd = fleet.distributed_optimizer(optimizer, strategy) + """ return self.strategy.fs_client_param @fs_client_param.setter @is_strict_auto def fs_client_param(self, configs): - check_configs_key(self.strategy.fs_client_param, configs, - "fs_client_param") + check_configs_key( + self.strategy.fs_client_param, configs, "fs_client_param" + ) assign_configs_value(self.strategy.fs_client_param, configs) @property @@ -477,6 +513,7 @@ def sparse_table_configs(self): @is_strict_auto def sparse_table_configs(self, configs): from google.protobuf.descriptor import FieldDescriptor + table_param = self.strategy.downpour_table_param def set_table_config(msg, config_name, configs, index=0): @@ -493,8 +530,9 @@ def set_table_config(msg, config_name, configs, index=0): data = getattr(msg, field.name).add() set_table_config(data, name, configs, i) else: - set_table_config(getattr(msg, field.name), name, - configs) + set_table_config( + getattr(msg, field.name), name, configs + ) else: # print("not message:", name) if name not in configs: @@ -513,133 +551,206 @@ def set_table_config(msg, config_name, configs, index=0): for table_name in configs: table_data = table_param.add() table_data.table_name = table_name - set_table_config(table_data, "table_parameters." + table_name, - configs[table_name]) + set_table_config( + table_data, + "table_parameters." + table_name, + configs[table_name], + ) @sparse_table_configs.setter def fleet_desc_configs(self, configs): - support_sparse_key_list = ['sparse_table_class', 'sparse_compress_in_save', 'sparse_shard_num', \ - 'sparse_accessor_class', 'sparse_learning_rate', 'sparse_initial_g2sum', 'sparse_initial_range', \ - 'sparse_weight_bounds', 'sparse_fea_dim', 'sparse_embedx_dim', 'sparse_embedx_threshold', 'sparse_nonclk_coeff', \ - 'sparse_click_coeff', 'sparse_base_threshold', 'sparse_delta_threshold', 'sparse_delta_keep_days', \ - 'sparse_delete_after_unseen_days', 'sparse_show_click_decay_rate', 'sparse_delete_threshold', \ - 'sparse_converter', 'sparse_deconverter', 'sparse_enable_cache', 'sparse_cache_rate', \ - 'sparse_cache_file_num', 'sparse_beta1_decay_rate', 'sparse_beta2_decay_rate', \ - 'sparse_ada_epsilon', 'sparse_optimizer', 'sparse_ssd_unseenday_threshold', - 'embed_sparse_optimizer', 'embed_sparse_learning_rate', 'embed_sparse_weight_bounds', \ - 'embed_sparse_initial_range', 'embed_sparse_initial_g2sum', 'embed_sparse_beta1_decay_rate', \ - 'embed_sparse_beta2_decay_rate', 'embedx_sparse_optimizer', 'embedx_sparse_learning_rate', \ - 'embedx_sparse_weight_bounds', 'embedx_sparse_initial_range', 'embedx_sparse_initial_g2sum', \ - 'embedx_sparse_beta1_decay_rate', 'embedx_sparse_beta2_decay_rate', 'feature_learning_rate', 'nodeid_slot'] + support_sparse_key_list = [ + 'sparse_table_class', + 'sparse_compress_in_save', + 'sparse_shard_num', + 'sparse_accessor_class', + 'sparse_learning_rate', + 'sparse_initial_g2sum', + 'sparse_initial_range', + 'sparse_weight_bounds', + 'sparse_fea_dim', + 'sparse_embedx_dim', + 'sparse_embedx_threshold', + 'sparse_nonclk_coeff', + 'sparse_click_coeff', + 'sparse_base_threshold', + 'sparse_delta_threshold', + 'sparse_delta_keep_days', + 'sparse_delete_after_unseen_days', + 
'sparse_show_click_decay_rate', + 'sparse_delete_threshold', + 'sparse_converter', + 'sparse_deconverter', + 'sparse_enable_cache', + 'sparse_cache_rate', + 'sparse_cache_file_num', + 'sparse_beta1_decay_rate', + 'sparse_beta2_decay_rate', + 'sparse_ada_epsilon', + 'sparse_optimizer', + 'sparse_ssd_unseenday_threshold', + 'embed_sparse_optimizer', + 'embed_sparse_learning_rate', + 'embed_sparse_weight_bounds', + 'embed_sparse_initial_range', + 'embed_sparse_initial_g2sum', + 'embed_sparse_beta1_decay_rate', + 'embed_sparse_beta2_decay_rate', + 'embedx_sparse_optimizer', + 'embedx_sparse_learning_rate', + 'embedx_sparse_weight_bounds', + 'embedx_sparse_initial_range', + 'embedx_sparse_initial_g2sum', + 'embedx_sparse_beta1_decay_rate', + 'embedx_sparse_beta2_decay_rate', + 'feature_learning_rate', + 'nodeid_slot', + ] support_sparse_table_class = ['DownpourSparseTable'] support_sparse_accessor_class = [ - 'DownpourSparseValueAccessor', 'DownpourCtrAccessor', - 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', - 'DownpourDoubleUnitAccessor', 'DownpourCtrDymfAccessor' + 'DownpourSparseValueAccessor', + 'DownpourCtrAccessor', + 'DownpourCtrDoubleAccessor', + 'DownpourUnitAccessor', + 'DownpourDoubleUnitAccessor', + 'DownpourCtrDymfAccessor', ] from google.protobuf.descriptor import FieldDescriptor + table_param = self.strategy.downpour_table_param def add_graph_config(graph, strategy): - graph.feature_learning_rate = strategy.get('feature_learning_rate', - 0.05) + graph.feature_learning_rate = strategy.get( + 'feature_learning_rate', 0.05 + ) graph.nodeid_slot = strategy.get('nodeid_slot', 9008) def sparse_optimizer_config(sgd, strategy, prefix): - optimizer_name = strategy.get(prefix + "sparse_optimizer", - "adagrad") + optimizer_name = strategy.get( + prefix + "sparse_optimizer", "adagrad" + ) sgd.name = optimizer_name if optimizer_name == "naive": sgd.name = "SparseNaiveSGDRule" sgd.naive.learning_rate = strategy.get( - prefix + 'sparse_learning_rate', 0.05) + prefix + 'sparse_learning_rate', 0.05 + ) sgd.naive.initial_range = strategy.get( - prefix + 'sparse_initial_range', 1e-4) - bounds = strategy.get(prefix + 'sparse_weight_bounds', - [-10, 10]) + prefix + 'sparse_initial_range', 1e-4 + ) + bounds = strategy.get( + prefix + 'sparse_weight_bounds', [-10, 10] + ) sgd.naive.weight_bounds.extend(bounds) elif optimizer_name == "adagrad": sgd.name = 'SparseAdaGradSGDRule' sgd.adagrad.learning_rate = strategy.get( - prefix + 'sparse_learning_rate', 0.05) + prefix + 'sparse_learning_rate', 0.05 + ) sgd.adagrad.initial_range = strategy.get( - prefix + 'sparse_initial_range', 1e-4) + prefix + 'sparse_initial_range', 1e-4 + ) if prefix == "embed_": sgd.adagrad.initial_range = 0 sgd.adagrad.initial_g2sum = strategy.get( - prefix + 'sparse_initial_g2sum', 3) - bounds = strategy.get(prefix + 'sparse_weight_bounds', - [-10, 10]) + prefix + 'sparse_initial_g2sum', 3 + ) + bounds = strategy.get( + prefix + 'sparse_weight_bounds', [-10, 10] + ) sgd.adagrad.weight_bounds.extend(bounds) elif optimizer_name == "std_adagrad": sgd.name = 'StdAdaGradSGDRule' sgd.adagrad.learning_rate = strategy.get( - prefix + 'sparse_learning_rate', 0.05) + prefix + 'sparse_learning_rate', 0.05 + ) sgd.adagrad.initial_range = strategy.get( - prefix + 'sparse_initial_range', 1e-4) + prefix + 'sparse_initial_range', 1e-4 + ) if prefix == "embed_": sgd.adagrad.initial_range = 0 sgd.adagrad.initial_g2sum = strategy.get( - prefix + 'sparse_initial_g2sum', 3) - bounds = strategy.get(prefix + 'sparse_weight_bounds', - [-10, 10]) 
+ prefix + 'sparse_initial_g2sum', 3 + ) + bounds = strategy.get( + prefix + 'sparse_weight_bounds', [-10, 10] + ) sgd.adagrad.weight_bounds.extend(bounds) elif optimizer_name == "adam": sgd.name = 'SparseAdamSGDRule' sgd.adam.learning_rate = strategy.get( - prefix + 'sparse_learning_rate', 0.001) + prefix + 'sparse_learning_rate', 0.001 + ) sgd.adam.initial_range = strategy.get( - prefix + 'sparse_initial_range', 1e-4) + prefix + 'sparse_initial_range', 1e-4 + ) sgd.adam.beta1_decay_rate = strategy.get( - prefix + 'sparse_beta1_decay_rate', 0.9) + prefix + 'sparse_beta1_decay_rate', 0.9 + ) sgd.adam.beta2_decay_rate = strategy.get( - prefix + 'sparse_beta2_decay_rate', 0.999) + prefix + 'sparse_beta2_decay_rate', 0.999 + ) sgd.adam.ada_epsilon = strategy.get( - prefix + 'sparse_ada_epsilon', 1e-8) - bounds = strategy.get(prefix + 'sparse_weight_bounds', - [-10, 10]) + prefix + 'sparse_ada_epsilon', 1e-8 + ) + bounds = strategy.get( + prefix + 'sparse_weight_bounds', [-10, 10] + ) sgd.adam.weight_bounds.extend(bounds) elif optimizer_name == "shared_adam": sgd.name = 'SparseSharedAdamSGDRule' sgd.adam.learning_rate = strategy.get( - prefix + 'sparse_learning_rate', 0.001) + prefix + 'sparse_learning_rate', 0.001 + ) sgd.adam.initial_range = strategy.get( - prefix + 'sparse_initial_range', 1e-4) + prefix + 'sparse_initial_range', 1e-4 + ) sgd.adam.beta1_decay_rate = strategy.get( - prefix + 'sparse_beta1_decay_rate', 0.9) + prefix + 'sparse_beta1_decay_rate', 0.9 + ) sgd.adam.beta2_decay_rate = strategy.get( - prefix + 'sparse_beta2_decay_rate', 0.999) + prefix + 'sparse_beta2_decay_rate', 0.999 + ) sgd.adam.ada_epsilon = strategy.get( - prefix + 'sparse_ada_epsilon', 1e-8) - bounds = strategy.get(prefix + 'sparse_weight_bounds', - [-10, 10]) + prefix + 'sparse_ada_epsilon', 1e-8 + ) + bounds = strategy.get( + prefix + 'sparse_weight_bounds', [-10, 10] + ) sgd.adam.weight_bounds.extend(bounds) def set_sparse_table_config(table_data, config): for key in config: if key not in support_sparse_key_list: raise ValueError("strategy key '%s' not support" % (key)) - table_class = config.get("sparse_table_class", - "DownpourSparseTable") + table_class = config.get( + "sparse_table_class", "DownpourSparseTable" + ) if table_class not in support_sparse_table_class: raise ValueError( "support sparse_table_class: ['DownpourSparseTable'], but actual %s" - % (table_class)) + % (table_class) + ) table_data.table_class = 'MemorySparseTable' table_data.shard_num = config.get('sparse_shard_num', 1000) table_data.enable_sparse_table_cache = config.get( - 'sparse_enable_cache', True) + 'sparse_enable_cache', True + ) table_data.sparse_table_cache_rate = config.get( - 'sparse_cache_rate', 0.00055) + 'sparse_cache_rate', 0.00055 + ) table_data.sparse_table_cache_file_num = config.get( - 'sparse_cache_file_num', 16) + 'sparse_cache_file_num', 16 + ) - accessor_class = config.get("sparse_accessor_class", - "DownpourCtrAccessor") + accessor_class = config.get( + "sparse_accessor_class", "DownpourCtrAccessor" + ) if accessor_class not in support_sparse_accessor_class: raise ValueError( "support sparse_accessor_class: ['DownpourSparseValueAccessor', 'DownpourCtrAccessor', 'DownpourCtrDoubleAccessor', 'DownpourUnitAccessor', 'DownpourDoubleUnitAccessor'], but actual %s" - % (accessor_class)) + % (accessor_class) + ) if accessor_class.find("Double") >= 0: table_data.accessor.accessor_class = 'CtrDoubleAccessor' @@ -654,7 +765,8 @@ def set_sparse_table_config(table_data, config): table_data.accessor.embedx_dim = 
config.get('sparse_embedx_dim', 8) table_data.accessor.fea_dim = table_data.accessor.embedx_dim + 3 table_data.accessor.embedx_threshold = config.get( - 'sparse_embedx_threshold', 10) + 'sparse_embedx_threshold', 10 + ) if accessor_class == 'DownpourUnitAccessor': table_data.accessor.ctr_accessor_param.show_scale = False @@ -662,23 +774,32 @@ def set_sparse_table_config(table_data, config): table_data.accessor.ctr_accessor_param.show_scale = True table_data.accessor.ctr_accessor_param.nonclk_coeff = config.get( - 'sparse_nonclk_coeff', 0.1) + 'sparse_nonclk_coeff', 0.1 + ) table_data.accessor.ctr_accessor_param.click_coeff = config.get( - 'sparse_click_coeff', 1) + 'sparse_click_coeff', 1 + ) table_data.accessor.ctr_accessor_param.base_threshold = config.get( - 'sparse_base_threshold', 1.5) + 'sparse_base_threshold', 1.5 + ) table_data.accessor.ctr_accessor_param.delta_threshold = config.get( - 'sparse_delta_threshold', 0.25) + 'sparse_delta_threshold', 0.25 + ) table_data.accessor.ctr_accessor_param.delta_keep_days = config.get( - 'sparse_delta_keep_days', 16) - table_data.accessor.ctr_accessor_param.show_click_decay_rate = config.get( - 'sparse_show_click_decay_rate', 0.98) - table_data.accessor.ctr_accessor_param.delete_threshold = config.get( - 'sparse_delete_threshold', 0.8) - table_data.accessor.ctr_accessor_param.delete_after_unseen_days = config.get( - 'sparse_delete_after_unseen_days', 30) - table_data.accessor.ctr_accessor_param.ssd_unseenday_threshold = config.get( - 'sparse_ssd_unseenday_threshold', 1) + 'sparse_delta_keep_days', 16 + ) + table_data.accessor.ctr_accessor_param.show_click_decay_rate = ( + config.get('sparse_show_click_decay_rate', 0.98) + ) + table_data.accessor.ctr_accessor_param.delete_threshold = ( + config.get('sparse_delete_threshold', 0.8) + ) + table_data.accessor.ctr_accessor_param.delete_after_unseen_days = ( + config.get('sparse_delete_after_unseen_days', 30) + ) + table_data.accessor.ctr_accessor_param.ssd_unseenday_threshold = ( + config.get('sparse_ssd_unseenday_threshold', 1) + ) converter = config.get('sparse_converter', "") deconverter = config.get('sparse_deconverter', "") @@ -692,23 +813,33 @@ def set_sparse_table_config(table_data, config): save_data2.converter = converter save_data2.deconverter = deconverter - if accessor_class == 'DownpourCtrAccessor' or accessor_class == 'DownpourCtrDoubleAccessor': - sparse_optimizer_config(table_data.accessor.embed_sgd_param, - config, '') - sparse_optimizer_config(table_data.accessor.embedx_sgd_param, - config, '') + if ( + accessor_class == 'DownpourCtrAccessor' + or accessor_class == 'DownpourCtrDoubleAccessor' + ): + sparse_optimizer_config( + table_data.accessor.embed_sgd_param, config, '' + ) + sparse_optimizer_config( + table_data.accessor.embedx_sgd_param, config, '' + ) else: - sparse_optimizer_config(table_data.accessor.embed_sgd_param, - config, 'embed_') - sparse_optimizer_config(table_data.accessor.embedx_sgd_param, - config, 'embedx_') + sparse_optimizer_config( + table_data.accessor.embed_sgd_param, config, 'embed_' + ) + sparse_optimizer_config( + table_data.accessor.embedx_sgd_param, config, 'embedx_' + ) add_graph_config(table_data.accessor.graph_sgd_param, config) if not configs: print("fleet desc config is empty") else: for table_name in configs: - if table_name == 'dense_table' or table_name == 'datanorm_table': + if ( + table_name == 'dense_table' + or table_name == 'datanorm_table' + ): continue if type(configs[table_name]) != dict: continue @@ -744,6 +875,7 @@ def amp(self, flag): 
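
Read together, the supported key list and the per-table loop above suggest that fleet_desc_configs expects a dict keyed by sparse-table name (with any dense_table / datanorm_table entries skipped), each value holding sparse_* options. The following is a hedged sketch, not an authoritative recipe: the table name "embedding" is hypothetical, only keys from support_sparse_key_list are used with values the checks above accept, and it assumes a paddle build that includes the fleet parameter-server protobufs.

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.fleet_desc_configs = {
        "embedding": {  # hypothetical sparse table name
            "sparse_table_class": "DownpourSparseTable",
            "sparse_accessor_class": "DownpourCtrAccessor",
            "sparse_optimizer": "adagrad",      # mapped to SparseAdaGradSGDRule above
            "sparse_learning_rate": 0.05,
            "sparse_initial_range": 1e-4,
            "sparse_weight_bounds": [-10, 10],
            "sparse_embedx_dim": 8,
            "sparse_shard_num": 1000,
        }
    }
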
@property def amp_configs(self): """ + Set automatic mixed precision training configurations. In general, amp has serveral configurable settings that can be configured through a dict. @@ -772,28 +904,27 @@ def amp_configs(self): Default True. Only takes effect when `use_pure_fp16` is turned on. Examples 1: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.amp = True - strategy.amp_configs = { - "init_loss_scaling": 32768, - "custom_white_list": ['conv2d']} + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.amp = True + strategy.amp_configs = { + "init_loss_scaling": 32768, + "custom_white_list": ['conv2d']} Examples 2: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.amp = True + # pure fp16 + strategy.amp_configs = { + "init_loss_scaling": 32768, + "use_pure_fp16": True + } - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.amp = True - # pure fp16 - strategy.amp_configs = { - "init_loss_scaling": 32768, - "use_pure_fp16": True - } """ return get_msg_dict(self.strategy.amp_configs) @@ -806,16 +937,16 @@ def amp_configs(self, configs): @property def asp(self): """ + Indicating whether we are using automatic sparsity training Default Value: False Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.asp = True # by default this is false + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.asp = True # by default this is false """ return self.strategy.asp @@ -835,30 +966,31 @@ def recompute(self): Default value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.recompute = True + # suppose x and y are names of checkpoint tensors for recomputation + strategy.recompute_configs = {"checkpoints": ["x", "y"]} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.recompute = True - # suppose x and y are names of checkpoint tensors for recomputation - strategy.recompute_configs = {"checkpoints": ["x", "y"]} """ return self.strategy.recompute @property def sync_nccl_allreduce(self): """ + Indicating whether we are using synchronized all reduce in each communication thread We note that system overhead is usually lower when sync_nccl_allreduce = True Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.sync_nccl_allreduce = True - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.sync_nccl_allreduce = True """ return self.strategy.sync_nccl_allreduce @@ -873,17 +1005,18 @@ def sync_nccl_allreduce(self, flag): @property def use_hierarchical_allreduce(self): """ + Indicating whether we are using hierarchical allreduce in collective communication Hierarchical allreduce often does allreduce within a certain node group and then do allreduce among the leaders of each group Examples: + .. code-block:: python - .. 
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.use_hierarchical_allreduce = True - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.use_hierarchical_allreduce = True """ return self.strategy.use_hierarchical_allreduce @@ -900,16 +1033,17 @@ def use_hierarchical_allreduce(self, flag): @property def hierarchical_allreduce_inter_nranks(self): """ + Number of ranks for low level node groups in hierarchical allreduce Default value: number of GPU cards on each single GPU machine Example: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.hierarchical_allreduce_inter_nranks = 8 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.hierarchical_allreduce_inter_nranks = 8 """ return self.strategy.hierarchical_allreduce_inter_nranks @@ -926,17 +1060,18 @@ def hierarchical_allreduce_inter_nranks(self, value): @property def sync_batch_norm(self): """ + Indicating whether we are using sync_batch_norm to do synchronous batch normalization among all training nodes. Default value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.sync_batch_norm = True - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.sync_batch_norm = True """ return self.strategy.sync_batch_norm @@ -952,16 +1087,17 @@ def sync_batch_norm(self, flag): @property def fuse_all_reduce_ops(self): """ + Indicating whether we are using fuse_all_reduce_ops for gradient fusion during backward phase of training Default value: True Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.fuse_all_reduce_ops = False - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fuse_all_reduce_ops = False """ return self.strategy.fuse_all_reduce_ops @@ -976,17 +1112,18 @@ def fuse_all_reduce_ops(self, flag): @property def fuse_grad_size_in_MB(self): """ + Specifying the size of gradient to fuse in Mega-Bytes Default value: 32 Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.fuse_grad_size_in_MB = 50 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fuse_grad_size_in_MB = 50 """ return self.strategy.fuse_grad_size_in_MB @@ -1001,18 +1138,20 @@ def fuse_grad_size_in_MB(self, value): @property def last_comm_group_size_MB(self): """ - Specifying the size of gradient to fuse in Mega-Bytes when - the last group of each batch communicates. Making the last group - small is useful to improve performance. + + Specifying the size of gradient to fuse in Mega-Bytes when + the last group of each batch communicates. Making the last group + small is useful to improve performance. Default value: 1 Examples: - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.last_comm_group_size_MB = 2 + .. 
code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.last_comm_group_size_MB = 2 + """ return self.strategy.last_comm_group_size_MB @@ -1027,18 +1166,19 @@ def last_comm_group_size_MB(self, value): @property def find_unused_parameters(self): """ - Indicating whether we are using find_unused_parameters to + + Indicating whether we are using find_unused_parameters to find unused parameters in DataParallel. Default value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.find_unused_parameters = True - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.find_unused_parameters = True """ return self.strategy.find_unused_parameters @@ -1070,17 +1210,18 @@ def _fuse_grad_size_in_TFLOPS(self, value): @property def nccl_comm_num(self): """ + Specifying the number of NCCL communicator Default value: 1 Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.nccl_comm_num = 2 - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.nccl_comm_num = 2 """ return self.strategy.nccl_comm_num @@ -1104,32 +1245,32 @@ def recompute(self, flag): @property def recompute_configs(self): """ - Set recompute configurations. - + + Set recompute configurations. + **Note**: checkpoints(list): list of string name of checkpoints. In general, the recompute strategy of current implementation should have some manually assign checkpoints. - enable_offload(bool): enable recompute checkpoints offload feature. this feature + enable_offload(bool): enable recompute checkpoints offload feature. this feature will offload the checkpoint to host memory to allow even larger batch size. since the memcpy from host to device takes time, it is a trade off between larger batch size and training speed. checkpoint_shape(list): list of int that specific the shape of checkpoint. so far recompute-offload requires that all checkpoint to be same shape, and every dimension - specific here should be determined ("-1" is not allowed). + specific here should be determined ("-1" is not allowed). Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.recompute = True - strategy.recompute_configs = { - "checkpoints": ["x", "y"], - "enable_offload": True, - "checkpoint_shape": [100, 512, 1024] } + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.recompute = True + strategy.recompute_configs = { + "checkpoints": ["x", "y"], + "enable_offload": True, + "checkpoint_shape": [100, 512, 1024] } """ return get_msg_dict(self.strategy.recompute_configs) @@ -1137,15 +1278,17 @@ def recompute_configs(self): @recompute_configs.setter @is_strict_auto def recompute_configs(self, configs): - check_configs_key(self.strategy.recompute_configs, configs, - "checkpoint_configs") + check_configs_key( + self.strategy.recompute_configs, configs, "checkpoint_configs" + ) assign_configs_value(self.strategy.recompute_configs, configs) @property def sharding(self): """ + Indicating whether we are using sharding Optimizer for memory - optimization. We implement the sharding optimizer following the ZeRO-DP + optimization. 
We implement the sharding optimizer following the ZeRO-DP idea from [ZeRO: Memory Optimizations Toward Training Trillion Parameter Models](https://arxiv.org/abs/1910.02054). Model parameters and Optimizer State are sharded into different ranks allowing to fit larger model. @@ -1154,12 +1297,12 @@ def sharding(self): Default value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.sharding = True - import paddle.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.sharding = True """ return self.strategy.sharding @@ -1174,26 +1317,27 @@ def sharding(self, flag): @property def sharding_configs(self): """ - Set sharding configurations. + + Set sharding configurations. **Note**: - sharding_segment_strategy(string, optional): strategy used to segment the program(forward & backward operations). two strategise are - available: "segment_broadcast_MB" and "segment_anchors". segment is a concept used in sharding to overlap computation and + sharding_segment_strategy(string, optional): strategy used to segment the program(forward & backward operations). two strategise are + available: "segment_broadcast_MB" and "segment_anchors". segment is a concept used in sharding to overlap computation and communication. Default is segment_broadcast_MB. - segment_broadcast_MB(float, optional): segment by the parameters broadcast volume. sharding will introduce parameter broadcast operations into program, and + segment_broadcast_MB(float, optional): segment by the parameters broadcast volume. sharding will introduce parameter broadcast operations into program, and after every segment_broadcast_MB size parameter being broadcasted, the program will be cutted into one segment. This configuration will affect the communication speed in sharding training, and should be an empirical value decided by your model size and network topology. Only enable when sharding_segment_strategy = segment_broadcast_MB. Default is 32.0 . - segment_anchors(list): list of anchors used to segment the program, which allows a finner control of program segmentation. + segment_anchors(list): list of anchors used to segment the program, which allows a finner control of program segmentation. this strategy is experimental by now. Only enable when sharding_segment_strategy = segment_anchors. sharding_degree(int, optional): specific the number of gpus within each sharding parallelism group; and sharding will be turn off if sharding_degree=1. Default is 8. gradient_merge_acc_step(int, optional): specific the accumulation steps in gradient merge; and gradient merge will be turn off if gradient_merge_acc_step=1. Default is 1. - optimize_offload(bool, optional): enable the optimizer offload which will offload the moment vars to Host memory in order to saving GPU memory for fitting larger model. + optimize_offload(bool, optional): enable the optimizer offload which will offload the moment vars to Host memory in order to saving GPU memory for fitting larger model. the moment var will be prefetch from and offloaded to Host memory during update stage. it is a stragtegy that trades off between training speed and GPU memory, and is recommened to be turn on only when gradient_merge_acc_step large, where the number of time of update stage will be relatively small compared with forward&backward's. Default is False. 
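
As a rule of thumb for the degrees discussed in these notes, the parallel groups are expected to partition the launched processes, so their product should normally match the world size (the hybrid_configs section further below states the same relation as total_number_GPUs = dp_degree * mp_degree * pp_degree). A purely illustrative check, using the 2-node, 8-GPU-per-node sharding-DP figures from the example that follows:

.. code-block:: python

    # 2 nodes x 8 GPUs per node = 16 processes in total.
    world_size = 16
    degrees = {"sharding_degree": 8, "dp_degree": 2, "mp_degree": 1, "pp_degree": 1}

    product = 1
    for degree in degrees.values():
        product *= degree

    # In the common case the configured groups tile all launched processes exactly.
    assert product == world_size, f"{degrees} does not cover {world_size} processes"
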
@@ -1203,7 +1347,7 @@ def sharding_configs(self): pp_degree(int, optional): [Hybrid parallelism ONLY] specific the number of gpus within each pipeline parallelism group; and pipeline parallelism will turn be off if pp_degree=1. Default is 1. - pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from backward stage to update(optimize) stage when pipeline parallelsim is on. + pp_allreduce_in_optimize(bool, optional): [Hybrid parallelism ONLY] move the allreduce operations from backward stage to update(optimize) stage when pipeline parallelsim is on. This configuration will affect the communication speed of Hybrid parallelism training depeneded on network topology. this strategy is experimental by now.. Default is False. optimize_cast(bool, optional): [Hybrid parallelism ONLY] Move the cast op of AMP which cast fp32 param to fp16 param to optimizer. optimize_cast will persist fp16 param, it @@ -1211,42 +1355,43 @@ def sharding_configs(self): Examples: + .. code-block:: python + + # sharding-DP, 2 nodes with 8 gpus per node + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.sharding = True + strategy.sharding_configs = { + "sharding_segment_strategy": "segment_broadcast_MB", + "segment_broadcast_MB": 32, + "sharding_degree": 8, + "dp_degree": 2, + "gradient_merge_acc_step": 4, + } - .. code-block:: python - - # sharding-DP, 2 nodes with 8 gpus per node - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.sharding = True - strategy.sharding_configs = { - "sharding_segment_strategy": "segment_broadcast_MB", - "segment_broadcast_MB": 32, - "sharding_degree": 8, - "dp_degree": 2, - "gradient_merge_acc_step": 4, - } """ return get_msg_dict(self.strategy.sharding_configs) @sharding_configs.setter @is_strict_auto def sharding_configs(self, configs): - check_configs_key(self.strategy.sharding_configs, configs, - "sharding_configs") + check_configs_key( + self.strategy.sharding_configs, configs, "sharding_configs" + ) assign_configs_value(self.strategy.sharding_configs, configs) @property def without_graph_optimization(self): """ + Run program using Executor other than ParallelExecutor. Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.without_graph_optimization = True + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.without_graph_optimization = True """ return self.strategy.without_graph_optimization @@ -1264,14 +1409,18 @@ def without_graph_optimization(self, flag): @property def _calc_comm_same_stream(self): """ + This based on raw_program_optimizer program Set whether use same stream for calc and comm when fuse allreduce The default value for the calc_comm_same_stream is False + Examples: - .. code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.calc_comm_same_stream = True + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.calc_comm_same_stream = True + """ return self.strategy.calc_comm_same_stream @@ -1288,14 +1437,18 @@ def _calc_comm_same_stream(self, same): @property def fuse_grad_merge(self): """ + Set whether fuse the grad for gradient merge. Note: this flag will only effect the gradient merge under pipeline mode The default value for the fuse_grad_merge is False + Examples: - .. 
code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fuse_param_grad = True + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.fuse_param_grad = True + """ return self.strategy.fuse_grad_merge @@ -1310,12 +1463,17 @@ def fuse_grad_merge(self, fuse_grad_merge): @property def fuse_grad_size_in_num(self): """ + This based on raw_program_optimizer program and allreduce the num of the fused op + Examples: - .. code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fuse_grad_size_in_num = 2 + .. code-block:: python + + import paddle.distributed.fleet as fleet + + strategy = fleet.DistributedStrategy() + strategy.fuse_grad_size_in_num = 2 + """ return self.strategy.fuse_grad_size_in_num @@ -1332,18 +1490,18 @@ def fuse_grad_size_in_num(self, num): @property def pipeline(self): """ + Indicating whether we are using pipeline parallelism for distributed training. Current implementation mainly focus on single GPU machine pipeline parallelism and data parallelism across GPU machine. The pipeline information is indicated through device_guard information in user-defined program. Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.pipeline = True + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.pipeline = True """ return self.strategy.pipeline @@ -1383,13 +1541,14 @@ def pipeline(self, flag): @property def pipeline_configs(self): """ + Set pipeline parallelism configurations. In pipeline parallelism, different parts of neural networks are running on different GPUS. - There are Tensor queue buffer between each pair of neighborhood GPUS + There are Tensor queue buffer between each pair of neighborhood GPUS that are responsible for synchronizing hidden Tensor results between GPUs. Pipeline parallelism consists of serveral producer-consumer style hardware pairs, such as GPU-GPU, CPU-GPU, GPU-XPU. The best way to speedup - pipeline parallelism is to make the size of Tensor in Tensor queue smaller, + pipeline parallelism is to make the size of Tensor in Tensor queue smaller, so that we will have a faster producer for downstream consumers. **Notes**: @@ -1398,13 +1557,12 @@ def pipeline_configs(self): **micro_batch_size**: the number of small batches in each user defined batch Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.pipeline = True - strategy.pipeline_configs = {"micro_batch_size": 12} + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.pipeline = True + strategy.pipeline_configs = {"micro_batch_size": 12} """ @@ -1413,22 +1571,23 @@ def pipeline_configs(self): @pipeline_configs.setter @is_strict_auto def pipeline_configs(self, configs): - check_configs_key(self.strategy.pipeline_configs, configs, - "pipeline_configs") + check_configs_key( + self.strategy.pipeline_configs, configs, "pipeline_configs" + ) assign_configs_value(self.strategy.pipeline_configs, configs) @property def tensor_parallel(self): """ + Indicating whether we are using tensor parallel for distributed training. Examples: + .. code-block:: python - .. 
code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.tensor_parallel = True + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.tensor_parallel = True """ return self.strategy.tensor_parallel @@ -1444,23 +1603,25 @@ def tensor_parallel(self, flag): @property def tensor_parallel_configs(self): """ + Set tensor_parallel configurations. **Notes**: **Detailed arguments for tensor_parallel_configs** + **tensor_parallel_degree**: degree of tensor parallel + **tensor_init_seed**: parameter initialization random seed Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.tensor_parallel = True - strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4, - "tensor_init_seed": 123} + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.tensor_parallel = True + strategy.tensor_parallel_configs = {"tensor_parallel_degree": 4, + "tensor_init_seed": 123} """ return get_msg_dict(self.strategy.tensor_parallel_configs) @@ -1468,59 +1629,67 @@ def tensor_parallel_configs(self): @tensor_parallel_configs.setter @is_strict_auto def tensor_parallel_configs(self, configs): - check_configs_key(self.strategy.tensor_parallel_configs, configs, - "tensor_parallel_configs") + check_configs_key( + self.strategy.tensor_parallel_configs, + configs, + "tensor_parallel_configs", + ) assign_configs_value(self.strategy.tensor_parallel_configs, configs) @property def hybrid_configs(self): """ - Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism + + Dynamic graph hybrid parallel strategy configuration. Three-way hybrid parallelism needs to meet the following relationships total_number_GPUs = dp_degree * mp_degree * pp_degree **Note**: - dp_degree(int): set number of GPUs in a data parallel group. Default -1. + **dp_degree(int)**: set number of GPUs in a data parallel group. Default -1. This value should be an integer greater than 0. - If it is not set, or set to -1, its value will be inferred + If it is not set, or set to -1, its value will be inferred based on the total number of cards. - mp_degree(int): set number of GPUs in a model parallel group. Default 1 - pp_degree(int): set number of GPUs in a pipeline parallel group. Default 1 + **mp_degree(int)**: set number of GPUs in a model parallel group. Default 1 + + **pp_degree(int)**: set number of GPUs in a pipeline parallel group. Default 1 Examples: - .. code-block:: python - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.hybrid_configs = { - "dp_degree": 1, - "mp_degree": 2, - "pp_degree": 1} + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.hybrid_configs = { + "dp_degree": 1, + "mp_degree": 2, + "pp_degree": 1} + """ return get_msg_dict(self.strategy.hybrid_configs) @hybrid_configs.setter def hybrid_configs(self, configs): - check_configs_key(self.strategy.hybrid_configs, configs, - "hybrid_configs") + check_configs_key( + self.strategy.hybrid_configs, configs, "hybrid_configs" + ) assign_configs_value(self.strategy.hybrid_configs, configs) @property def localsgd(self): """ + Indicating whether we are using Local SGD training. Default Value: False For more details, please refer to `Don't Use Large Mini-Batches, Use Local SGD `_. Examples: + .. code-block:: python - .. 
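The constraint stated above for hybrid_configs (total_number_GPUs = dp_degree * mp_degree * pp_degree) can be checked up front; the degrees and card count below are assumptions for illustration only.

.. code-block:: python

    dp_degree, mp_degree, pp_degree = 2, 2, 2   # assumed parallel degrees
    total_number_gpus = 8                       # assumed number of cards
    # the three-way hybrid layout must tile the whole device set exactly
    assert dp_degree * mp_degree * pp_degree == total_number_gpus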
code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.localsgd = True # by default this is false + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.localsgd = True # by default this is false """ return self.strategy.localsgd @@ -1536,6 +1705,7 @@ def localsgd(self, flag): @property def localsgd_configs(self): """ + Set LocalSGD training configurations. LocalSGD has a configurable setting that can be configured through a dict. @@ -1544,14 +1714,14 @@ def localsgd_configs(self): begin_step(int) The step of beginning training by localsgd. Default 1. Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.localsgd = True + strategy.localsgd_configs = {"k_steps": 4, + "begin_step": 30} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.localsgd = True - strategy.localsgd_configs = {"k_steps": 4, - "begin_step": 30} """ return get_msg_dict(self.strategy.localsgd_configs) @@ -1559,25 +1729,25 @@ def localsgd_configs(self): @localsgd_configs.setter @is_strict_auto def localsgd_configs(self, configs): - check_configs_key(self.strategy.localsgd_configs, configs, - "localsgd_configs") + check_configs_key( + self.strategy.localsgd_configs, configs, "localsgd_configs" + ) assign_configs_value(self.strategy.localsgd_configs, configs) @property def adaptive_localsgd(self): """ + Indicating whether we are using Adaptive Local SGD training. Default Value: False - For more details, please refer to `Adaptive Communication Strategies to Achieve + For more details, please refer to `Adaptive Communication Strategies to Achieve the Best Error-Runtime Trade-off in Local-Update SGD `_. - Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.adaptive_localsgd = True # by default this is false + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.adaptive_localsgd = True # by default this is false """ return self.strategy.adaptive_localsgd @@ -1593,6 +1763,7 @@ def adaptive_localsgd(self, flag): @property def adaptive_localsgd_configs(self): """ + Set AdaptiveLocalSGD training configurations. AdaptiveLocalSGD has a configurable setting that can be configured through a dict. @@ -1600,17 +1771,18 @@ def adaptive_localsgd_configs(self): init_k_steps(int) The initial steps for training before adaptive localsgd. Then, the adaptive localsgd method will modify init_k_steps automatically. Default 1. + begin_step(int) The step of beginning training by adaptive localsgd. Default 1. Examples: + .. code-block:: python - .. 
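As a rough sketch of what k_steps and begin_step above control: the exact synchronization rule lives inside the LocalSGD optimizer, so the helper below is only an approximation under the assumed values from the docstring example.

.. code-block:: python

    def synchronizes_at(step, begin_step=30, k_steps=4):
        # before begin_step every step behaves like plain data parallelism;
        # afterwards parameters are averaged only every k_steps local steps
        if step < begin_step:
            return True
        return (step - begin_step) % k_steps == 0

    assert synchronizes_at(10) and synchronizes_at(30)
    assert not synchronizes_at(31) and synchronizes_at(34)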
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.adaptive_localsgd = True + strategy.adaptive_localsgd_configs = {"init_k_steps": 1, + "begin_step": 30} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.adaptive_localsgd = True - strategy.adaptive_localsgd_configs = {"init_k_steps": 1, - "begin_step": 30} """ return get_msg_dict(self.strategy.adaptive_localsgd_configs) @@ -1618,25 +1790,28 @@ def adaptive_localsgd_configs(self): @adaptive_localsgd_configs.setter @is_strict_auto def adaptive_localsgd_configs(self, configs): - check_configs_key(self.strategy.adaptive_localsgd_configs, configs, - "adaptive_localsgd_configs") + check_configs_key( + self.strategy.adaptive_localsgd_configs, + configs, + "adaptive_localsgd_configs", + ) assign_configs_value(self.strategy.adaptive_localsgd_configs, configs) @property def dgc(self): """ + Indicating whether we are using Deep Gradient Compression training. For more details, please refer to [Deep Gradient Compression](https://arxiv.org/abs/1712.01887). Default Value: False Examples: + .. code-block:: python - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.dgc = True # by default this is false + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.dgc = True # by default this is false """ return self.strategy.dgc @@ -1652,6 +1827,7 @@ def dgc(self, flag): @property def dgc_configs(self): r""" + Set Deep Gradient Compression training configurations. In general, dgc has serveral configurable settings that can be configured through a dict. @@ -1668,13 +1844,13 @@ def dgc_configs(self): element will be transmitted. Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.dgc = True + strategy.dgc_configs = {"rampup_begin_step": 1252} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.dgc = True - strategy.dgc_configs = {"rampup_begin_step": 1252} """ return get_msg_dict(self.strategy.dgc_configs) @@ -1687,16 +1863,17 @@ def dgc_configs(self, configs): @property def fp16_allreduce(self): """ + Indicating whether we are using fp16 gradient allreduce training Default Value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.fp16_allreduce = True # by default this is false + strategy = fleet.DistributedStrategy() + strategy.fp16_allreduce = True # by default this is false """ return self.strategy.fp16_allreduce @@ -1711,6 +1888,7 @@ def fp16_allreduce(self, flag): @property def gradient_merge(self): """ + Gradient Merge, also called as Gradient Accumulation, is a strategy for large batch training. With this strategy, model parameter will not be updated until user-defined steps. @@ -1721,13 +1899,13 @@ def gradient_merge(self): to model parameters. Examples: + .. code-block:: python - .. 
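Gradient merge, as described above, delays the parameter update for k_steps mini-batches, so the effective batch size is simply the product below; the mini-batch size is an assumption.

.. code-block:: python

    mini_batch_size = 32   # assumed per-step batch size
    k_steps = 4            # as in the gradient_merge_configs example
    # parameters are updated once per k_steps mini-batches
    effective_batch_size = mini_batch_size * k_steps
    assert effective_batch_size == 128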
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.gradient_merge = True + strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.gradient_merge = True - strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} """ return self.strategy.gradient_merge @@ -1742,6 +1920,7 @@ def gradient_merge(self, flag): @property def gradient_merge_configs(self): """ + the key-value configs of distribute_strategy **Note**: @@ -1750,39 +1929,41 @@ def gradient_merge_configs(self): avg(bool): whether to average the gradients of each mini-batch, the default value is `True` Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.gradient_merge = True + strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.gradient_merge = True - strategy.gradient_merge_configs = {"k_steps": 4, "avg": True} """ return get_msg_dict(self.strategy.gradient_merge_configs) @gradient_merge_configs.setter @is_strict_auto def gradient_merge_configs(self, configs): - check_configs_key(self.strategy.gradient_merge_configs, configs, - "gradient_configs") + check_configs_key( + self.strategy.gradient_merge_configs, configs, "gradient_configs" + ) assign_configs_value(self.strategy.gradient_merge_configs, configs) @property def lars(self): """ - Set lars configurations. lars is used to deal with the convergence problems when the global - batch size is larger than 8k. For more details, please refer to + + Set lars configurations. lars is used to deal with the convergence problems when the global + batch size is larger than 8k. For more details, please refer to [Large Batch Training of Convolutional Networks](https://arxiv.org/abs/1708.03888). Default Value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.lars = True # by default this is false - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.lars = True # by default this is false """ return self.strategy.lars @@ -1797,29 +1978,30 @@ def lars(self, flag): @property def lars_configs(self): """ + Set Lars training configurations. **Notes**: **lars_coeff (float)**: trust ratio in lars formula. **lars_weight_decay** (float): weight decay coefficient in lars formula. - **epsilon (float)**: argument is used to avoid potential devision-by-zero - when compute the local lr; + **epsilon (float)**: argument is used to avoid potential devision-by-zero + when compute the local lr; **exclude_from_weight_decay ([string])**: is a list of name strings of layers which will be exclude from weight decay in lars formula. Examples: + .. code-block:: python - .. 
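For orientation, lars_coeff, lars_weight_decay and epsilon above feed the LARS trust ratio. The helper below follows the formula from the LARS paper and is only an approximation of Paddle's kernel; the weight and gradient norms are made up.

.. code-block:: python

    def lars_local_lr(w_norm, g_norm, lars_coeff=0.01,
                      lars_weight_decay=0.0005, epsilon=0.0):
        # per-layer trust ratio that scales the global learning rate
        return lars_coeff * w_norm / (g_norm + lars_weight_decay * w_norm + epsilon)

    print(lars_local_lr(w_norm=10.0, g_norm=2.0))  # roughly 0.05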
code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.lars = True + strategy.lars_configs = { + "lars_coeff": 0.01, + "lars_weight_decay": 0.0005, + "epsilon": 0, + "exclude_from_weight_decay": ['batch_norm', '.b_0'] + } - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.lars = True - strategy.lars_configs = { - "lars_coeff": 0.01, - "lars_weight_decay": 0.0005, - "epsilon": 0, - "exclude_from_weight_decay": ['batch_norm', '.b_0'] - } """ return get_msg_dict(self.strategy.lars_configs) @@ -1832,20 +2014,21 @@ def lars_configs(self, configs): @property def lamb(self): """ - Set lamb configurations. lamb is used to deal with the convergence problems for large - batch size training, specially for attention-related model like BERT. For more details, - please refer to + + Set lamb configurations. lamb is used to deal with the convergence problems for large + batch size training, specially for attention-related model like BERT. For more details, + please refer to [Large Batch Optimization for Deep Learning: Training BERT in 76 minutes](https://arxiv.org/abs/1904.00962). Default Value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.lamb = True # by default this is false - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.lamb = True # by default this is false """ return self.strategy.lamb @@ -1861,6 +2044,7 @@ def lamb(self, flag): @property def lamb_configs(self): """ + Set Lars training configurations. **Notes**: @@ -1869,16 +2053,16 @@ def lamb_configs(self): will be exclude from weight decay in lamb formula. Examples: + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.lamb = True + strategy.lamb_configs = { + 'lamb_weight_decay': 0.01, + 'exclude_from_weight_decay': [], + } - .. code-block:: python - - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.lamb = True - strategy.lamb_configs = { - 'lamb_weight_decay': 0.01, - 'exclude_from_weight_decay': [], - } """ return get_msg_dict(self.strategy.lamb_configs) @@ -1891,8 +2075,10 @@ def lamb_configs(self, configs): @property def elastic(self): """ + Indicating whether we want to do current distributed training on clusters with elastic resources. Currently, this is configuration is not valid. + """ return self.strategy.elastic @@ -1907,28 +2093,29 @@ def elastic(self, flag): @property def auto(self): """ + Indicating whether we are using auto-parallel configuration - This feature is currently an experimental feature. Currently, + This feature is currently an experimental feature. Currently, auto-parallelism can be used only when a user does not set any other strategy configs except auto. For details, please reference the following code example Default Value: False Examples: + .. code-block:: python - .. 
code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.auto = True + # if set other strategy at the same time, auto will not apply + # strategy.amp = True - strategy = fleet.DistributedStrategy() - strategy.auto = True - # if set other strategy at the same time, auto will not apply - # strategy.amp = True + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.auto @@ -1942,28 +2129,29 @@ def auto(self, flag): @property def semi_auto(self): """ + Indicating whether we are using semi-auto parallel function - This feature is currently an experimental feature. Currently, + This feature is currently an experimental feature. Currently, auto-parallelism can be used only when a user does not set any other strategy configs except semi-auto. For details, please reference the following code example Default Value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.semi_auto = True + # if set other strategy at the same time, auto will not apply + # strategy.amp = True - strategy = fleet.DistributedStrategy() - strategy.semi_auto = True - # if set other strategy at the same time, auto will not apply - # strategy.amp = True + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.semi_auto @@ -1977,16 +2165,21 @@ def semi_auto(self, flag): @property def auto_search(self): """ + Indicating whether we are using auto-search parallel function For details, please reference the following code example Default Value: False + Examples: - .. code-block:: python - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.auto_search = True + .. code-block:: python + + import paddle + + paddle.enable_static() + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.auto_search = True + """ return self.strategy.auto_search @@ -2000,15 +2193,20 @@ def auto_search(self, flag): @property def split_data(self): """ + Indicating whether we split the data. If True, we split the data. Default Value: True + Examples: - .. code-block:: python - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.split_data = True + .. 
code-block:: python + + import paddle + + paddle.enable_static() + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.split_data = True + """ return self.strategy.split_data @@ -2022,8 +2220,10 @@ def split_data(self, flag): @property def qat(self): """ + Indicating whether we are using quantization training Default Value: False + """ return self.strategy.qat @@ -2037,6 +2237,7 @@ def qat(self, flag): @property def qat_configs(self): """ + Set quantization training configurations. In general, qat has serveral configurable settings that can be configured through a dict. @@ -2047,23 +2248,23 @@ def qat_configs(self): activation_bits(int): quantization bit number for activation. Default is 8. - not_quant_pattern(list[str]): When the skip pattern is detected in an op's name scope, + not_quant_pattern(list[str]): When the skip pattern is detected in an op's name scope, the corresponding op will not be quantized. algo(str): Other quantization training algorithm. Exampless: + .. code-block:: python - .. code-block:: python + import paddle.distributed.fleet as fleet - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.qat = True - strategy.qat_configs = { - "channel_wise_abs_max": True, - "weight_bits": 8, - "activation_bits: 8, - "not_quant_pattern": ['skip_quant']} + strategy = fleet.DistributedStrategy() + strategy.qat = True + strategy.qat_configs = { + "channel_wise_abs_max": True, + "weight_bits": 8, + "activation_bits: 8, + "not_quant_pattern": ['skip_quant']} """ return get_msg_dict(self.strategy.qat_configs) @@ -2076,24 +2277,25 @@ def qat_configs(self, configs): @property def heter_ccl_mode(self): """ + Indicating whether we are using heter_ccl_mode for model training. This feature is currently an experimental feature. Currently, heter_ccl_mode can be used only for dataparallel with dygraph mode. Default Value: False Examples: + .. code-block:: python - .. code-block:: python + import paddle + import paddle.distributed.fleet as fleet - import paddle - import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.heter_ccl_mode = True - strategy = fleet.DistributedStrategy() - strategy.heter_ccl_mode = True + # for initialize parallel env, only need to call + paddle.distributed.init_parallel_env() + # then the heterogenous context will be created. - # for initialize parallel env, only need to call - paddle.distributed.init_parallel_env() - # then the heterogenous context will be created. """ return self.strategy.heter_ccl_mode @@ -2107,6 +2309,7 @@ def heter_ccl_mode(self, flag): @property def cudnn_exhaustive_search(self): """ + Indicating whether to use exhaustive search method to choose convolution algorithms. Exhaustive search attempts all cuDNN algorithms to choose the fastest algorithm. This method is time-consuming, the choosed algorithm will be cached for the given layer specifications. @@ -2114,17 +2317,18 @@ def cudnn_exhaustive_search(self): Default Value: True Examples: + .. code-block:: python - .. 
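A qat_configs dict is ordinary Python; written out with consistent quoting (the values are the defaults mentioned in the docstring) it looks like this:

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.qat = True
    strategy.qat_configs = {
        "channel_wise_abs_max": True,
        "weight_bits": 8,
        "activation_bits": 8,
        "not_quant_pattern": ["skip_quant"],
    }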
code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.cudnn_exhaustive_search = False + strategy = fleet.DistributedStrategy() + strategy.cudnn_exhaustive_search = False + + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.cudnn_exhaustive_search @@ -2141,6 +2345,7 @@ def cudnn_exhaustive_search(self, flag): @property def conv_workspace_size_limit(self): """ + The workspace limit size in MB unit for choosing cuDNN convolution algorithms. The inner funciton of cuDNN obtain the fastest suited algorithm that fits within this memory limit. Usually, large workspace size may lead to choose faster algorithms, @@ -2148,17 +2353,17 @@ def conv_workspace_size_limit(self): Default Value: 4000 Examples: + .. code-block:: python - .. code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.conv_workspace_size_limit = 1024 + strategy = fleet.DistributedStrategy() + strategy.conv_workspace_size_limit = 1024 - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.conv_workspace_size_limit @@ -2176,22 +2381,23 @@ def conv_workspace_size_limit(self, value): @property def cudnn_batchnorm_spatial_persistent(self): """ + Indicates whether to use the mode CUDNN_BATCHNORM_SPATIAL_PERSISTENT function in batchnorm. This is only useful in cudnn. Default Value: True Examples: + .. code-block:: python - .. 
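The cuDNN-related switches above are often set together on one strategy; a sketch with assumed values (static graph mode), before wrapping the optimizer as the docstrings show:

.. code-block:: python

    import paddle
    import paddle.distributed.fleet as fleet

    paddle.enable_static()
    strategy = fleet.DistributedStrategy()
    strategy.cudnn_exhaustive_search = False
    strategy.conv_workspace_size_limit = 1024          # workspace limit in MB (assumed)
    strategy.cudnn_batchnorm_spatial_persistent = True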
code-block:: python + import paddle + paddle.enable_static() + import paddle.distributed.fleet as fleet - import paddle - paddle.enable_static() - import paddle.distributed.fleet as fleet - strategy = fleet.DistributedStrategy() - strategy.cudnn_batchnorm_spatial_persistent = True + strategy = fleet.DistributedStrategy() + strategy.cudnn_batchnorm_spatial_persistent = True - optimizer = paddle.optimizer.SGD(learning_rate=0.01) - optimizer = fleet.distributed_optimizer(optimizer, strategy) + optimizer = paddle.optimizer.SGD(learning_rate=0.01) + optimizer = fleet.distributed_optimizer(optimizer, strategy) """ return self.strategy.cudnn_batchnorm_spatial_persistent @@ -2244,7 +2450,8 @@ def __repr__(self): h1_format = " " + "|{{:^{}s}}|\n".format(length) h2_format = " " + "|{{:>{}s}}{}{{:^{}s}}|\n".format( - max_k, " " * spacing, max_v) + max_k, " " * spacing, max_v + ) border = " +" + "".join(["="] * length) + "+" line = " +" + "".join(["-"] * length) + "+" @@ -2269,37 +2476,48 @@ def __repr__(self): if getattr(self.strategy, f.name): draws += border + "\n" draws += h1_format.format( - "{}=True <-> {}_configs".format(f.name, f.name)) + "{}=True <-> {}_configs".format(f.name, f.name) + ) draws += line + "\n" - my_configs = getattr(self.strategy, - f.name + "_configs") + my_configs = getattr( + self.strategy, f.name + "_configs" + ) config_fields = my_configs.DESCRIPTOR.fields for ff in config_fields: if isinstance( - getattr(my_configs, - ff.name), google.protobuf.pyext. - _message.RepeatedScalarContainer): + getattr(my_configs, ff.name), + google.protobuf.pyext._message.RepeatedScalarContainer, + ): values = getattr(my_configs, ff.name) for i, v in enumerate(values): if i == 0: draws += h2_format.format( - ff.name, str(v)) + ff.name, str(v) + ) else: draws += h2_format.format( - "", str(v)) + "", str(v) + ) else: draws += h2_format.format( ff.name, - str(getattr(my_configs, ff.name))) + str(getattr(my_configs, ff.name)), + ) else: env_draws += h2_format.format( - f.name, str(getattr(self.strategy, f.name))) + f.name, str(getattr(self.strategy, f.name)) + ) else: env_draws += h2_format.format( - f.name, str(getattr(self.strategy, f.name))) - - result_res = draws + border + "\n" + h1_format.format( - "Environment Flags, Communication Flags") + f.name, str(getattr(self.strategy, f.name)) + ) + + result_res = ( + draws + + border + + "\n" + + h1_format.format("Environment Flags, Communication Flags") + ) result_res += env_draws build_strategy_str = border + "\n" @@ -2309,7 +2527,8 @@ def __repr__(self): fields = self.strategy.build_strategy.DESCRIPTOR.fields for f in fields: build_strategy_str += h2_format.format( - f.name, str(getattr(self.strategy.build_strategy, f.name))) + f.name, str(getattr(self.strategy.build_strategy, f.name)) + ) build_strategy_str += border + "\n" execution_strategy_str = h1_format.format("Execution Strategy") @@ -2318,7 +2537,8 @@ def __repr__(self): fields = self.strategy.execution_strategy.DESCRIPTOR.fields for f in fields: execution_strategy_str += h2_format.format( - f.name, str(getattr(self.strategy.execution_strategy, f.name))) + f.name, str(getattr(self.strategy.execution_strategy, f.name)) + ) execution_strategy_str += border + "\n" result_res += build_strategy_str + execution_strategy_str diff --git a/python/paddle/distributed/fleet/base/private_helper_function.py b/python/paddle/distributed/fleet/base/private_helper_function.py index 8e2871272a9911..ff18a44ac0c42d 100644 --- a/python/paddle/distributed/fleet/base/private_helper_function.py +++ 
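The summary-table logic being reflowed above is what drives printing a strategy; a tiny usage sketch, with the flag choice purely illustrative:

.. code-block:: python

    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    strategy.gradient_merge = True
    print(strategy)   # renders the bordered "Environment Flags, Communication Flags" table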
b/python/paddle/distributed/fleet/base/private_helper_function.py @@ -24,15 +24,15 @@ def wait_server_ready(endpoints): """ Wait until parameter servers are ready, use connext_ex to detect port readiness. - + Args: endpoints (list|tuple): endpoints string list, like: ["127.0.0.1:8080", "127.0.0.1:8081"] - + Examples: - .. code-block:: python + .. code-block:: python - wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) + wait_server_ready(["127.0.0.1:8080", "127.0.0.1:8081"]) """ assert not isinstance(endpoints, str) while True: @@ -40,8 +40,9 @@ def wait_server_ready(endpoints): not_ready_endpoints = [] for ep in endpoints: ip_port = ep.split(":") - with closing(socket.socket(socket.AF_INET, - socket.SOCK_STREAM)) as sock: + with closing( + socket.socket(socket.AF_INET, socket.SOCK_STREAM) + ) as sock: sock.settimeout(2) sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) if hasattr(socket, 'SO_REUSEPORT'): @@ -53,8 +54,9 @@ def wait_server_ready(endpoints): not_ready_endpoints.append(ep) if not all_ok: sys.stderr.write("server not ready, wait 3 sec to retry...\n") - sys.stderr.write("not ready endpoints:" + str(not_ready_endpoints) + - "\n") + sys.stderr.write( + "not ready endpoints:" + str(not_ready_endpoints) + "\n" + ) sys.stderr.flush() time.sleep(3) else: diff --git a/python/paddle/distributed/fleet/base/topology.py b/python/paddle/distributed/fleet/base/topology.py index bbaca8951205b9..7ad6ce3bd0033b 100644 --- a/python/paddle/distributed/fleet/base/topology.py +++ b/python/paddle/distributed/fleet/base/topology.py @@ -28,12 +28,13 @@ class ParallelMode(object): """ + There are all the parallel modes currently supported: - - DATA_PARALLEL: Distribute input data to different devices. - - TENSOR_PARALLEL: Shards tensors in the network to different devices. - - PIPELINE_PARALLEL: Place different layers of the network on different devices. - - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states - corresponding to the parameters to each device. + + - DATA_PARALLEL: Distribute input data to different devices. + - TENSOR_PARALLEL: Shards tensors in the network to different devices. + - PIPELINE_PARALLEL: Place different layers of the network on different devices. + - SHARDING_PARALLEL: Segment the model parameters, parameter gradients and optimizer states corresponding to the parameters to each device. Examples: .. 
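wait_server_ready above simply polls each endpoint with connect_ex until every port accepts a connection; a stripped-down, single-pass version of that check follows, with the endpoint list assumed for illustration.

.. code-block:: python

    import socket
    from contextlib import closing

    def ports_ready(endpoints):
        # returns True only if every "ip:port" endpoint accepts a TCP connection
        for ep in endpoints:
            ip, port = ep.split(":")
            with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
                sock.settimeout(2)
                if sock.connect_ex((ip, int(port))) != 0:
                    return False
        return True

    print(ports_ready(["127.0.0.1:8080", "127.0.0.1:8081"]))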
code-block:: python @@ -43,6 +44,7 @@ class ParallelMode(object): print(parallel_mode.DATA_PARALLEL) # 0 """ + DATA_PARALLEL = 0 TENSOR_PARALLEL = 1 PIPELINE_PARALLEL = 2 @@ -50,14 +52,16 @@ class ParallelMode(object): class CommunicateTopology(object): - - def __init__(self, - hybrid_group_names=["data", "pipe", "sharding", "model"], - dims=[1, 1, 1, 1]): + def __init__( + self, + hybrid_group_names=["data", "pipe", "sharding", "model"], + dims=[1, 1, 1, 1], + ): self._parallel_names = hybrid_group_names self._dims = dims - self.coordinate = collections.namedtuple('Coordinate', - self._parallel_names) + self.coordinate = collections.namedtuple( + 'Coordinate', self._parallel_names + ) self._world_size = reduce(lambda x, y: x * y, self._dims) ranges = [range(d) for d in self._dims] @@ -65,7 +69,8 @@ def __init__(self, self._coord2rank = dict(zip(all_coordinate, range(len(all_coordinate)))) self._rank2coord = dict( - zip(self._coord2rank.values(), self._coord2rank.keys())) + zip(self._coord2rank.values(), self._coord2rank.keys()) + ) def get_hybrid_group_names(self): return self._parallel_names @@ -90,7 +95,8 @@ def get_coord(self, rank): def get_axis_list(self, axis_name, index): axis = self._parallel_names.index(axis_name) ranks = [ - self._coord2rank[coord] for coord in self._coord2rank.keys() + self._coord2rank[coord] + for coord in self._coord2rank.keys() if coord[axis] == index ] ranks.sort() @@ -132,7 +138,6 @@ def get_rank_from_stage(self, global_rank, **kwargs): class HybridCommunicateGroup(object): - def __init__(self, topology): self.nranks = paddle.distributed.get_world_size() self.global_rank = paddle.distributed.get_rank() @@ -148,10 +153,16 @@ def __init__(self, topology): self._sharding_parallel_id = self._get_sharding_parallel_id() self.stage_id = self._get_pipe_parallel_id() - assert self._check_vaild_topo( - ), "Here is an unreasonable topogy setting. world_size: {}, but" \ - "mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}".format(self.nranks, - self._mp_degree, self._sharding_degree, self._pp_degree, self._dp_degree) + assert self._check_vaild_topo(), ( + "Here is an unreasonable topogy setting. 
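CommunicateTopology above enumerates every (data, pipe, sharding, model) coordinate and numbers them consecutively. Below is a self-contained replica of that mapping for an assumed dims=[2, 1, 1, 2] layout (4 ranks), using itertools.product as the constructor presumably does.

.. code-block:: python

    import itertools

    dims = [2, 1, 1, 2]   # assumed (data, pipe, sharding, model) degrees
    coords = list(itertools.product(*[range(d) for d in dims]))
    coord2rank = {c: r for r, c in enumerate(coords)}
    rank2coord = {r: c for c, r in coord2rank.items()}

    assert coord2rank[(0, 0, 0, 0)] == 0
    assert rank2coord[3] == (1, 0, 0, 1)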
world_size: {}, but" + "mp_num: {}, sharding_num: {}, pp_num: {}, dp_num: {}".format( + self.nranks, + self._mp_degree, + self._sharding_degree, + self._pp_degree, + self._dp_degree, + ) + ) # create comm group for data parallel self._dp_group, self._dp_comm_group = self._set_comm_group("data") @@ -164,26 +175,43 @@ def __init__(self, topology): # create comm group for sharding parallel self._sharding_group, self._sharding_comm_group = self._set_comm_group( - "sharding") + "sharding" + ) # create global group for check inf_nan / clip global norm self._check_group, self._check_comm_group = self._set_check_group( - "data") + "data" + ) # create p2p group - self.is_first_stage = (self.stage_id == 0) - self.is_last_stage = (self.stage_id == (self._pp_degree - 1)) + self.is_first_stage = self.stage_id == 0 + self.is_last_stage = self.stage_id == (self._pp_degree - 1) # create p2p_groups if self._pp_degree > 1: self._set_p2p_group() - debug_str = "HybridParallelInfo: rank_id: %d, mp_degree: %d, " \ - "sharding_degree: %d, pp_degree: %d, dp_degree: %d" % (self.global_rank, self._mp_degree, - self._sharding_degree, self._pp_degree, self._dp_degree) - debug_str += ", mp_group: %s, sharding_group: %s, pp_group: %s, dp_group: %s, check/clip group: %s" % ( - self._mp_group, self._sharding_group, self._pp_group, - self._dp_group, self._check_group) + debug_str = ( + "HybridParallelInfo: rank_id: %d, mp_degree: %d, " + "sharding_degree: %d, pp_degree: %d, dp_degree: %d" + % ( + self.global_rank, + self._mp_degree, + self._sharding_degree, + self._pp_degree, + self._dp_degree, + ) + ) + debug_str += ( + ", mp_group: %s, sharding_group: %s, pp_group: %s, dp_group: %s, check/clip group: %s" + % ( + self._mp_group, + self._sharding_group, + self._pp_group, + self._dp_group, + self._check_group, + ) + ) logger.info(debug_str) global _HYBRID_PARALLEL_GROUP @@ -195,7 +223,12 @@ def get_parallel_mode(self): # adding its parallel logic within that parallelism # when use sharding alone, it should have its own parallelism for its parallel logic # TODO modify 3 others parallel to support sharding - if self._mp_degree == 1 and self._pp_degree == 1 and self._dp_degree == 1 and self._sharding_degree > 1: + if ( + self._mp_degree == 1 + and self._pp_degree == 1 + and self._dp_degree == 1 + and self._sharding_degree > 1 + ): return ParallelMode.SHARDING_PARALLEL elif self._mp_degree == 1 and self._pp_degree == 1: return ParallelMode.DATA_PARALLEL @@ -206,7 +239,13 @@ def get_parallel_mode(self): return ParallelMode.PIPELINE_PARALLEL def _check_vaild_topo(self): - return self._dp_degree * self._mp_degree * self._pp_degree * self._sharding_degree == self.nranks + return ( + self._dp_degree + * self._mp_degree + * self._pp_degree + * self._sharding_degree + == self.nranks + ) def _set_comm_group(self, parallel_method="data"): parallel_group = [] @@ -268,14 +307,16 @@ def _set_p2p_group(self): self.prev_rank = prev_rank next_group = paddle.distributed.new_group( - ranks=[curr_rank, next_rank]) + ranks=[curr_rank, next_rank] + ) if self.global_rank == curr_rank: self.send_next_group = next_group elif self.global_rank == next_rank: self.recv_prev_group = next_group prev_group = paddle.distributed.new_group( - ranks=[prev_rank, curr_rank]) + ranks=[prev_rank, curr_rank] + ) if self.global_rank == curr_rank: self.send_prev_group = prev_group @@ -339,7 +380,12 @@ def get_pipe_parallel_group(self): return self._pp_comm_group def get_p2p_groups(self): - return self.send_next_group, self.send_prev_group, self.recv_next_group, 
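The topology check being reformatted above boils down to one product; the same arithmetic stand-alone, with assumed degrees for a 16-card job:

.. code-block:: python

    dp_num, mp_num, pp_num, sharding_num = 2, 2, 2, 2   # assumed degrees
    world_size = 16                                      # assumed number of ranks
    # the four degrees must tile the world size exactly
    assert dp_num * mp_num * pp_num * sharding_num == world_size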
self.recv_prev_group + return ( + self.send_next_group, + self.send_prev_group, + self.recv_next_group, + self.recv_prev_group, + ) # sharding parallel message: def _get_sharding_parallel_id(self): @@ -363,23 +409,25 @@ def get_check_parallel_group(self): return self._check_comm_group def get_rank_from_stage(self, stage_id, **kwargs): - return self._topo.get_rank_from_stage(self.global_rank, - pipe=stage_id, - **kwargs) + return self._topo.get_rank_from_stage( + self.global_rank, pipe=stage_id, **kwargs + ) class _CommunicateGroup(object): - """ tmp for static """ + """tmp for static""" def __init__(self): global _HYBRID_PARALLEL_GROUP _HYBRID_PARALLEL_GROUP = self self.groups = dict() - def set_comm_group(self, group_name, group_rank, group_size, ring_id, - group_ranks): - group = paddle.distributed.collective.Group(group_rank, group_size, - ring_id, group_ranks) + def set_comm_group( + self, group_name, group_rank, group_size, ring_id, group_ranks + ): + group = paddle.distributed.collective.Group( + group_rank, ring_id, group_ranks + ) self.groups[group_name] = group def get_group(self, group_name): diff --git a/python/paddle/distributed/fleet/dataset/dataset.py b/python/paddle/distributed/fleet/dataset/dataset.py index 3c6da4bd957cf9..56bc6eb268a779 100755 --- a/python/paddle/distributed/fleet/dataset/dataset.py +++ b/python/paddle/distributed/fleet/dataset/dataset.py @@ -54,7 +54,7 @@ def init(self, thread_num(int): thread num, it is the num of readers. default is 1. use_var(list): list of variables. Variables which you will use. default is []. pipe_command(str): pipe command of current dataset. A pipe command is a UNIX pipeline command that can be used only. default is "cat" - input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. defalut is 0. + input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. default is 0. fs_name(str): fs name. default is "". fs_ugi(str): fs ugi. default is "". download_cmd(str): customized download command. default is "cat" @@ -441,7 +441,7 @@ def update_settings(self, **kwargs): batch_size(int): batch size. It will be effective during training. default is 1. thread_num(int): thread num, it is the num of readers. default is 1. use_var(list): list of variables. Variables which you will use. default is []. - input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. defalut is 0. + input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. default is 0. fs_name(str): fs name. default is "". fs_ugi(str): fs ugi. default is "". pipe_command(str): pipe command of current dataset. A pipe command is a UNIX pipeline command that can be used only. default is "cat" @@ -522,7 +522,7 @@ def init(self, **kwargs): batch_size(int): batch size. It will be effective during training. default is 1. thread_num(int): thread num, it is the num of readers. default is 1. use_var(list): list of variables. Variables which you will use. default is []. - input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. defalut is 0. + input_type(int): the input type of generated input. 0 is for one sample, 1 is for one batch. default is 0. fs_name(str): fs name. default is "". fs_ugi(str): fs ugi. default is "". pipe_command(str): pipe command of current dataset. A pipe command is a UNIX pipeline command that can be used only. 
default is "cat" diff --git a/python/paddle/distributed/fleet/elastic/manager.py b/python/paddle/distributed/fleet/elastic/manager.py index e0a6bd81c8ee88..451ed76741cb93 100644 --- a/python/paddle/distributed/fleet/elastic/manager.py +++ b/python/paddle/distributed/fleet/elastic/manager.py @@ -26,13 +26,9 @@ from paddle.distributed.fleet import cloud_utils from paddle.distributed.fleet import launch_utils -logger = logging.getLogger("ELASTIC") -logger.setLevel(logging.INFO) -formatter = logging.Formatter( - fmt='%(name)s %(levelname)s %(asctime)s %(message)s') -ch = logging.StreamHandler() -ch.setFormatter(formatter) -logger.addHandler(ch) +from paddle.distributed.utils.log_utils import get_logger + +logger = get_logger("INFO", "ELASTIC") ELASTIC_EXIT_CODE = 101 ELASTIC_AUTO_PARALLEL_EXIT_CODE = 102 @@ -354,7 +350,7 @@ def pre_hook(self): stderr=subprocess.PIPE, shell=True).communicate() if err: - logger.warn("pre_hook exec failed") + logger.warning("pre_hook exec failed") else: logger.info(f"pre_hook exec result: {out.decode('utf-8').strip()}") diff --git a/python/paddle/distributed/fleet/fleet.py b/python/paddle/distributed/fleet/fleet.py index 63e4c5ec1822b5..f624b7bdb075b7 100644 --- a/python/paddle/distributed/fleet/fleet.py +++ b/python/paddle/distributed/fleet/fleet.py @@ -13,14 +13,17 @@ # limitations under the License. import copy -import warnings import paddle import os from types import MethodType import numpy as np from paddle.fluid.framework import _global_flags from paddle.fluid import compiler -from .base.role_maker import UserDefinedRoleMaker, PaddleCloudRoleMaker, RoleMakerBase +from .base.role_maker import ( + UserDefinedRoleMaker, + PaddleCloudRoleMaker, + RoleMakerBase, +) from .base.strategy_compiler import StrategyCompiler from .base.distributed_strategy import DistributedStrategy from .base.meta_optimizer_factory import MetaOptimizerFactory @@ -32,6 +35,8 @@ from .meta_parallel import model_parallel_random_seed from paddle import _C_ops, _legacy_C_ops from paddle.fluid import core +from .utils.log_util import logger, set_log_level +import logging __all__ = [] @@ -54,17 +59,17 @@ def apply_ir_passes(main_program, startup_program, config): # RawProgramOptimizer also inserts coalesce_tensor # into program. These two procedures may conflict # in which vars are to be fused. - warnings.warn( + logger.warning( 'Currently, the fuse_all_optimizer_ops pass has conflict with fuse_all_reduce_ops pass. Disable the fuse_all_optimizer_ops pass temporarily.' ) build_strategy.fuse_all_optimizer_ops = False - return apply_build_strategy(main_program, startup_program, build_strategy, - pass_attrs) + return apply_build_strategy( + main_program, startup_program, build_strategy, pass_attrs + ) def _inited_runtime_handler_(func): - def __impl__(*args, **kwargs): cls = args[0] @@ -77,15 +82,17 @@ def __impl__(*args, **kwargs): def _is_non_distributed_check_(func): - def __impl__(*args, **kwargs): cls = args[0] - if cls._role_maker is not None and cls._role_maker._is_non_distributed( - ) is True: - warnings.warn( - "%s() function doesn't work when use non_distributed fleet." % - (func.__name__)) + if ( + cls._role_maker is not None + and cls._role_maker._is_non_distributed() is True + ): + logger.warning( + "%s() function doesn't work when use non_distributed fleet." 
+ % (func.__name__) + ) return return func(*args, **kwargs) @@ -100,7 +107,7 @@ def __impl__(*args, **kwargs): class Fleet(object): """ Unified API for distributed training of PaddlePaddle - Please reference the https://github.com/PaddlePaddle/FleetX for details + Please reference the https://github.com/PaddlePaddle/PaddleFleetX for details Returns: @@ -165,7 +172,13 @@ def __init__(self): self._context = {} self.user_defined_optimizer = paddle.optimizer.Optimizer(0.0) - def init(self, role_maker=None, is_collective=False, strategy=None): + def init( + self, + role_maker=None, + is_collective=False, + strategy=None, + log_level="INFO", + ): """ Initialize role_maker in Fleet. @@ -174,14 +187,16 @@ def init(self, role_maker=None, is_collective=False, strategy=None): Args: role_maker (RoleMakerBase, optional): A ``RoleMakerBase`` containing the configuration - of environment variables related to distributed training.If you did not initialize + of environment variables related to distributed training.If you did not initialize the rolemaker by yourself, it will be automatically initialized to PaddleRoleMaker. The default value is None. - is_collective (Boolean, optional): A ``Boolean`` variable determines whether the program + is_collective (Boolean, optional): A ``Boolean`` variable determines whether the program runs on the CPU or GPU. False means set distributed training using CPU, and True means GPU.The default value is False.The default value is False. - strategy (DistributedStrategy): Extra properties for distributed training. + strategy (DistributedStrategy): Extra properties for distributed training. For details, please refer to paddle.distributed.fleet.DistributedStrategy. Default: None. + log_level (Integer, String, optional): A ``Integer`` or ``String`` Variable determining how hight + the logging level is. Default is "INFO". Returns: @@ -217,7 +232,18 @@ def init(self, role_maker=None, is_collective=False, strategy=None): strategy = fleet.DistributedStrategy() fleet.init(strategy=strategy) + Examples5: + + .. code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + fleet.init(log_level = "DEBUG") + """ + + set_log_level(log_level) + if strategy is None: strategy = DistributedStrategy() self._user_defined_strategy = copy.deepcopy(strategy) @@ -226,22 +252,28 @@ def init(self, role_maker=None, is_collective=False, strategy=None): if isinstance(is_collective, bool): self._is_collective = is_collective self._role_maker = PaddleCloudRoleMaker( - is_collective=self._is_collective) + is_collective=self._is_collective + ) else: raise ValueError( - "`is_collective` should be instance of `bool`, but got {}". 
- format(type(is_collective))) + "`is_collective` should be instance of `bool`, but got {}".format( + type(is_collective) + ) + ) else: if isinstance(role_maker, RoleMakerBase): self._role_maker = role_maker self._is_collective = role_maker._is_collective else: raise ValueError( - "`role_maker` should be subclass of `RoleMakerBase`, but got {}" - .format(type(role_maker))) + "`role_maker` should be subclass of `RoleMakerBase`, but got {}".format( + type(role_maker) + ) + ) self._role_maker._generate_role() import paddle.distributed.fleet as fleet + fleet.util._set_role_maker(self._role_maker) self.strategy_compiler = StrategyCompiler() @@ -261,18 +293,21 @@ def init(self, role_maker=None, is_collective=False, strategy=None): self._hcg = tp.HybridCommunicateGroup(self._topology) return if parallel_helper._is_parallel_ctx_initialized(): - warnings.warn( - "The dygraph parallel environment has been initialized.") + logger.warning( + "The dygraph parallel environment has been initialized." + ) else: # FLAGS_nccl_nrings is used for dynamic graph multi-stream communication if "FLAGS_nccl_nrings" in os.environ: - warnings.warn( + logger.warning( "You have set the environment variable FLAGS_nccl_nrings " "outside the program, so the nccl_comm_num in " - "DistributedStrategy will not take effect here.") + "DistributedStrategy will not take effect here." + ) else: os.environ["FLAGS_nccl_nrings"] = str( - self._user_defined_strategy.nccl_comm_num) + self._user_defined_strategy.nccl_comm_num + ) paddle.distributed.init_parallel_env() # hybrid parallel not support for npu/xpu @@ -281,7 +316,7 @@ def init(self, role_maker=None, is_collective=False, strategy=None): if tp._HYBRID_PARALLEL_GROUP is None: self._init_hybrid_parallel_env() else: - warnings.warn( + logger.warning( "The dygraph hybrid parallel environment has been initialized." 
) elif self._is_collective: @@ -294,17 +329,24 @@ def init(self, role_maker=None, is_collective=False, strategy=None): global_ring_id = 3 if use_sharding else 0 global_ranks = list(range(global_world_size)) - if tp._HYBRID_PARALLEL_GROUP is None: tp._CommunicateGroup() + if tp._HYBRID_PARALLEL_GROUP is None: + tp._CommunicateGroup() cg = tp._HYBRID_PARALLEL_GROUP self._hcg = cg - cg.set_comm_group('global', global_rank, global_world_size, - global_ring_id, global_ranks) + cg.set_comm_group( + 'global', + global_rank, + global_world_size, + global_ring_id, + global_ranks, + ) use_tensor_parallel = self._user_defined_strategy.tensor_parallel use_mp = use_sharding or use_tensor_parallel # hybrid group - if use_mp is False: return + if use_mp is False: + return mp_degree_sharding = 1 mp_degree_tensor_parallel = 1 @@ -313,14 +355,21 @@ def init(self, role_maker=None, is_collective=False, strategy=None): mp_degree_sharding = int(sharding_configs['mp_degree']) if use_tensor_parallel: - tensor_parallel_configs = self._user_defined_strategy.tensor_parallel_configs + tensor_parallel_configs = ( + self._user_defined_strategy.tensor_parallel_configs + ) mp_degree_tensor_parallel = int( - tensor_parallel_configs['tensor_parallel_degree']) + tensor_parallel_configs['tensor_parallel_degree'] + ) if use_sharding and use_tensor_parallel: assert mp_degree_sharding == mp_degree_tensor_parallel - mp_degree = mp_degree_sharding if use_sharding else mp_degree_tensor_parallel + mp_degree = ( + mp_degree_sharding + if use_sharding + else mp_degree_tensor_parallel + ) if mp_degree > 1: assert global_world_size % mp_degree == 0 @@ -329,16 +378,17 @@ def init(self, role_maker=None, is_collective=False, strategy=None): mp_rank = global_rank % mp_degree mp_group_id = global_rank // mp_degree mp_group_ranks = [ - idx for idx in global_ranks + idx + for idx in global_ranks if idx // mp_degree == mp_group_id ] - cg.set_comm_group('model', mp_rank, mp_degree, mp_ring_id, - mp_group_ranks) + cg.set_comm_group( + 'model', mp_rank, mp_degree, mp_ring_id, mp_group_ranks + ) return self def _init_hybrid_parallel_env(self): - """initialize the hybrid environment - """ + """initialize the hybrid environment""" self.hybrid_configs = self._user_defined_strategy.hybrid_configs self.dp_degree = self.hybrid_configs["dp_degree"] self.mp_degree = self.hybrid_configs["mp_degree"] @@ -347,7 +397,9 @@ def _init_hybrid_parallel_env(self): assert self.mp_degree >= 0, "mp_degree should be greater or equal to 0" assert self.pp_degree >= 0, "pp_degree should be greater or equal to 0" - assert self.sharding_degree >= 0, "sharding_degree should be greater or equal to 0" + assert ( + self.sharding_degree >= 0 + ), "sharding_degree should be greater or equal to 0" self.mp_degree = max(self.mp_degree, 1) self.pp_degree = max(self.pp_degree, 1) @@ -361,14 +413,19 @@ def _init_hybrid_parallel_env(self): self._topology = tp.CommunicateTopology( hybrid_group_names=["data", "pipe", "sharding", "model"], dims=[ - self.dp_degree, self.pp_degree, self.sharding_degree, - self.mp_degree - ]) + self.dp_degree, + self.pp_degree, + self.sharding_degree, + self.mp_degree, + ], + ) self._hcg = tp.HybridCommunicateGroup(self._topology) if self.mp_degree > 1: - tensor_parallel_configs = self._user_defined_strategy.tensor_parallel_configs + tensor_parallel_configs = ( + self._user_defined_strategy.tensor_parallel_configs + ) tensor_init_seed = tensor_parallel_configs["tensor_init_seed"] if tensor_init_seed == -1: model_parallel_random_seed() @@ -808,29 +865,29 @@ 
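The model-parallel group construction above assigns each rank to the group of ranks sharing global_rank // mp_degree; the same arithmetic stand-alone, assuming 8 ranks and mp_degree = 4:

.. code-block:: python

    mp_degree = 4                    # assumed tensor-parallel degree
    global_ranks = list(range(8))    # assumed world size of 8

    mp_groups = {}
    for rank in global_ranks:
        # ranks with the same integer quotient land in the same model-parallel group
        mp_groups.setdefault(rank // mp_degree, []).append(rank)

    assert mp_groups[0] == [0, 1, 2, 3]
    assert mp_groups[1] == [4, 5, 6, 7]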
def save(self, dirname, feed=[], fetch=[], **configs): for name in fetch_var_names ] - self._runtime_handle._save_inference_model(executor, dirname, - feeded_var_names, - fetch_vars, None, True, - 0) + self._runtime_handle._save_inference_model( + executor, dirname, feeded_var_names, fetch_vars, None, True, 0 + ) else: increment_mode = 0 if "mode" in configs: increment_mode = int(configs["mode"]) - self._runtime_handle._save_persistables(executor, - dirname, - main_program=None, - mode=increment_mode) + self._runtime_handle._save_persistables( + executor, dirname, main_program=None, mode=increment_mode + ) @is_non_distributed_check @inited_runtime_handler - def save_inference_model(self, - executor, - dirname, - feeded_var_names, - target_vars, - main_program=None, - export_for_deployment=True, - mode=0): + def save_inference_model( + self, + executor, + dirname, + feeded_var_names, + target_vars, + main_program=None, + export_for_deployment=True, + mode=0, + ): """ save inference model for inference. @@ -850,14 +907,16 @@ def save_inference_model(self, fleet.init_server() """ - # warnings.warn( - # "'save_inference_model' is a deprecated, will be deleted after v2.2.0, Please use fleet.save instead." - # ) - self._runtime_handle._save_inference_model(executor, dirname, - feeded_var_names, - target_vars, main_program, - export_for_deployment, mode) + self._runtime_handle._save_inference_model( + executor, + dirname, + feeded_var_names, + target_vars, + main_program, + export_for_deployment, + mode, + ) @is_non_distributed_check @inited_runtime_handler @@ -902,12 +961,9 @@ def save_persistables(self, executor, dirname, main_program=None, mode=0): fleet.save_persistables(exe, "dirname", paddle.static.default_main_program()) """ - # warnings.warn( - # "'save_persistables' is a deprecated, will be deleted after v2.2.0, Please use fleet.save instead." - # ) - - self._runtime_handle._save_persistables(executor, dirname, main_program, - mode) + self._runtime_handle._save_persistables( + executor, dirname, main_program, mode + ) @is_non_distributed_check @inited_runtime_handler @@ -946,12 +1002,9 @@ def save_one_table(self, table_id, path, mode): @is_non_distributed_check @inited_runtime_handler - def save_dense_params(self, - executor, - dirname, - scope, - program, - var_names=None): + def save_dense_params( + self, executor, dirname, scope, program, var_names=None + ): """ save fleet one table from path @@ -975,8 +1028,9 @@ def save_dense_params(self, fleet.save_dense_params(exe, "path", scope=paddle.static.global_scope(), program=paddle.static.default_main_program()) """ - self._runtime_handle._save_dense_params(executor, dirname, scope, - program, var_names) + self._runtime_handle._save_dense_params( + executor, dirname, scope, program, var_names + ) def shrink(self, threshold=None): self._runtime_handle._shrink(threshold) @@ -990,10 +1044,10 @@ def distributed_optimizer(self, optimizer, strategy=None): Args: optimizer(Optimizer): The executor to run for init server. - strategy(DistributedStrategy): Extra properties for distributed optimizer. + strategy(DistributedStrategy): Extra properties for distributed optimizer. It is recommended to use DistributedStrategy in fleet.init(). The strategy - here is for compatibility. If the strategy in fleet.distributed_optimizer() - is not None, then it will overwrite the DistributedStrategy in fleet.init(), + here is for compatibility. 
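As the distributed_optimizer docstring above points out, a strategy passed here overrides the one given to fleet.init(). A minimal sketch of the recommended pattern (set the strategy once, in init); this is meant to run under the distributed launcher, so it is not a stand-alone script.

.. code-block:: python

    import paddle
    import paddle.distributed.fleet as fleet

    strategy = fleet.DistributedStrategy()
    fleet.init(is_collective=True, strategy=strategy)

    optimizer = paddle.optimizer.SGD(learning_rate=0.01)
    # no second strategy here, so the one from fleet.init() takes effect
    optimizer = fleet.distributed_optimizer(optimizer)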
If the strategy in fleet.distributed_optimizer() + is not None, then it will overwrite the DistributedStrategy in fleet.init(), which will take effect in distributed training. Returns: @@ -1015,12 +1069,13 @@ def distributed_optimizer(self, optimizer, strategy=None): if strategy is not None: if self._is_collective: - warnings.warn( + logger.warning( "It is recommended to use DistributedStrategy " "in fleet.init(). The strategy here is only for compatibility. " "If the strategy in fleet.distributed_optimizer() is " "not None, then it will overwrite the DistributedStrategy in fleet.init(), " - "which will take effect in distributed training.") + "which will take effect in distributed training." + ) self._user_defined_strategy = copy.deepcopy(strategy) self._context = {} @@ -1039,35 +1094,32 @@ def _get_amp_optimizer(self): if hasattr(self.user_defined_optimizer, 'amp_init'): amp_optimizer = self.user_defined_optimizer - assert amp_optimizer is not None, \ - "amp_init can only be used when the amp(auto mixed precision) strategy is turned on." + assert ( + amp_optimizer is not None + ), "amp_init can only be used when the amp(auto mixed precision) strategy is turned on." return amp_optimizer def get_loss_scaling(self): - """Return the real-time loss scaling factor. - """ + """Return the real-time loss scaling factor.""" amp_optimizer = self._get_amp_optimizer() return amp_optimizer.get_loss_scaling() - def amp_init(self, - place, - scope=None, - test_program=None, - use_fp16_test=False): + def amp_init( + self, place, scope=None, test_program=None, use_fp16_test=False + ): """ Init the amp training, such as cast fp32 parameters to fp16 type. - + Args: - place(CUDAPlace): place is used to initialize + place(CUDAPlace): place is used to initialize fp16 parameters with fp32 values. scope(Scope): The scope is used to find fp32 parameters. test_program(Program): The program is used for testing. use_fp16_test(bool): Whether to use fp16 testing. - + Examples: .. code-block:: python - import numpy as np import paddle import paddle.nn.functional as F paddle.enable_static() @@ -1085,7 +1137,7 @@ def run_example_code(): loss = paddle.mean(hidden) # 2) Create the optimizer and set `multi_precision` to True. # Setting `multi_precision` to True can avoid the poor accuracy - # or the slow convergence in a way. + # or the slow convergence in a way. optimizer = paddle.optimizer.Momentum(learning_rate=0.01, multi_precision=True) # 3) These ops in `custom_black_list` will keep in the float32 computation type. amp_list = paddle.static.amp.CustomOpLists( @@ -1105,9 +1157,9 @@ def run_example_code(): # 5) Use `amp_init` after FP32 parameters initialization(such as `exe.run(startup_program)`). # If you want to perform the testing process, you should pass `test_program` into `amp_init`. optimizer.amp_init(place, scope=paddle.static.global_scope()) - + if paddle.is_compiled_with_cuda() and len(paddle.static.cuda_places()) > 0: - run_example_code() + run_example_code() """ amp_optimizer = self._get_amp_optimizer() return amp_optimizer.amp_init(place, scope, test_program, use_fp16_test) @@ -1139,11 +1191,9 @@ def _get_applied_graph_list(self): else: return self._context["applied_graph_list"] - def minimize(self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None): + def minimize( + self, loss, startup_program=None, parameter_list=None, no_grad_set=None + ): """ Add distributed operations to minimize ``loss`` by updating ``parameter_list``. 
@@ -1191,27 +1241,31 @@ def minimize(self, optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) - # for more examples, please reference https://github.com/PaddlePaddle/FleetX + # for more examples, please reference https://github.com/PaddlePaddle/PaddleFleetX """ if not isinstance(loss, list): - return self._minimize_impl(loss, startup_program, parameter_list, - no_grad_set) + return self._minimize_impl( + loss, startup_program, parameter_list, no_grad_set + ) else: - if paddle.fluid.framework._non_static_mode( - ) or self._role_maker._is_non_distributed() or self._is_collective: + if ( + paddle.fluid.framework._non_static_mode() + or self._role_maker._is_non_distributed() + or self._is_collective + ): raise ValueError("loss can be list only in PS mode") - return self._minimize_losses_impl(loss, startup_program, - parameter_list, no_grad_set) - - def _minimize_impl(self, - loss, - startup_program=None, - parameter_list=None, - no_grad_set=None): + return self._minimize_losses_impl( + loss, startup_program, parameter_list, no_grad_set + ) + + def _minimize_impl( + self, loss, startup_program=None, parameter_list=None, no_grad_set=None + ): context = {} context["user_defined_strategy"] = copy.deepcopy( - self._user_defined_strategy) + self._user_defined_strategy + ) if paddle.fluid.framework._non_static_mode(): # imitate target optimizer retrieval target_opt = self.user_defined_optimizer @@ -1224,49 +1278,62 @@ def _minimize_impl(self, if not hasattr(self.origin_main_program, "distributed_info_"): setattr(self.origin_main_program, "distributed_info_", dict()) self.origin_main_program.distributed_info_[ - "dp_degree"] = self._user_defined_strategy.sharding_configs[ - "dp_degree"] + "dp_degree" + ] = self._user_defined_strategy.sharding_configs["dp_degree"] self.origin_main_program.distributed_info_[ - "mp_degree"] = self._user_defined_strategy.sharding_configs[ - "mp_degree"] + "mp_degree" + ] = self._user_defined_strategy.sharding_configs["mp_degree"] self.origin_main_program.distributed_info_[ - "pp_degree"] = self._user_defined_strategy.sharding_configs[ - "pp_degree"] + "pp_degree" + ] = self._user_defined_strategy.sharding_configs["pp_degree"] self.origin_main_program.distributed_info_[ - "sharding_degree"] = self._user_defined_strategy.sharding_configs[ - "sharding_degree"] + "sharding_degree" + ] = self._user_defined_strategy.sharding_configs["sharding_degree"] context["origin_main_program"] = self.origin_main_program context["origin_main_programs"] = [self.origin_main_program] context["loss"] = loss if startup_program == None: - self.origin_startup_program = \ + self.origin_startup_program = ( paddle.static.default_startup_program().clone(for_test=False) + ) startup_program = paddle.static.default_startup_program() else: - self.origin_startup_program = \ - startup_program.clone(for_test=False) + self.origin_startup_program = startup_program.clone(for_test=False) context["origin_startup_program"] = startup_program context["origin_startup_programs"] = [startup_program] context["role_maker"] = self._role_maker # Use the auto-parallel's routines instead - if self._user_defined_strategy.semi_auto or self._user_defined_strategy.auto_search: + if ( + self._user_defined_strategy.semi_auto + or self._user_defined_strategy.auto_search + ): from ..auto_parallel.parallelizer import AutoParallelizer + auto_parallelizer = AutoParallelizer(self) - optimize_ops, params_grads, dist_startup_prog, dist_main_prog = 
auto_parallelizer.parallelize( - loss, startup_program, parameter_list, no_grad_set) + ( + optimize_ops, + params_grads, + dist_startup_prog, + dist_main_prog, + ) = auto_parallelizer.parallelize( + loss, startup_program, parameter_list, no_grad_set + ) return optimize_ops, params_grads, dist_startup_prog, dist_main_prog # compile time - distributed_optimizer_list = \ + distributed_optimizer_list = ( MetaOptimizerFactory()._get_valid_meta_optimizers( - self.user_defined_optimizer) + self.user_defined_optimizer + ) + ) context["user_defined_strategy"] = copy.deepcopy( - self._user_defined_strategy) + self._user_defined_strategy + ) copy_user_defined_strategy = copy.deepcopy(self._user_defined_strategy) # trigger the auto-parallel in very strict condition @@ -1284,9 +1351,12 @@ def _minimize_impl(self, can_not_apply_optimizer_list = [] # recall meta optimizers for ranking for opt in distributed_optimizer_list: - opt._set_basic_info(loss, self._role_maker, - self.user_defined_optimizer, - copy_user_defined_strategy) + opt._set_basic_info( + loss, + self._role_maker, + self.user_defined_optimizer, + copy_user_defined_strategy, + ) if opt._can_apply() and not opt._is_graph_out(): valid_optimizer_list.append(opt) elif opt._can_apply() and opt._is_graph_out(): @@ -1294,18 +1364,27 @@ def _minimize_impl(self, else: can_not_apply_optimizer_list.append(opt) # combine recalled meta optimizers to be a valid meta optimizer - meta_optimizer, graph_optimizer = \ - self.strategy_compiler.generate_optimizer( - loss, self._role_maker, self.user_defined_optimizer, - copy_user_defined_strategy, valid_optimizer_list, - valid_graph_optimizer_list) + ( + meta_optimizer, + graph_optimizer, + ) = self.strategy_compiler.generate_optimizer( + loss, + self._role_maker, + self.user_defined_optimizer, + copy_user_defined_strategy, + valid_optimizer_list, + valid_graph_optimizer_list, + ) valid_strategy = self.strategy_compiler._get_valid_strategy( - copy_user_defined_strategy, can_not_apply_optimizer_list) + copy_user_defined_strategy, can_not_apply_optimizer_list + ) context["valid_strategy"] = copy.deepcopy(valid_strategy) - # print("valid_strategy:", context["valid_strategy"]) - # print("user_defined_strategy:", context["user_defined_strategy"]) + logger.debug("valid_strategy: " + str(context["valid_strategy"])) + logger.debug( + "user_defined_strategy: " + str(context["user_defined_strategy"]) + ) applied_meta_list = self.strategy_compiler._get_applied_meta_list() applied_graph_list = self.strategy_compiler._get_applied_graph_list() @@ -1326,38 +1405,48 @@ def _minimize_impl(self, self._runtime_handle = RuntimeFactory()._create_runtime(context) compiled_program = compiler.CompiledProgram( - self.origin_main_program).with_data_parallel( - loss_name=loss.name, share_vars_from=None) + self.origin_main_program + ).with_data_parallel(loss_name=loss.name, share_vars_from=None) loss.block.program._graph = compiled_program - return self.user_defined_optimizer.minimize(loss, - startup_program, - parameter_list, - no_grad_set=no_grad_set) + return self.user_defined_optimizer.minimize( + loss, startup_program, parameter_list, no_grad_set=no_grad_set + ) if meta_optimizer: - # print("before minimize program id:", id(loss.block.program)) + logger.debug( + "before minimize program id: " + str(id(loss.block.program)) + ) optimize_ops, params_grads = meta_optimizer.minimize( - loss, startup_program, parameter_list, no_grad_set=no_grad_set) - # print("after minimize program id:", id(loss.block.program)) - + loss, startup_program, 
parameter_list, no_grad_set=no_grad_set + ) + logger.debug( + "after minimize program id: " + str(id(loss.block.program)) + ) default_program = paddle.static.default_main_program() - # print("default program id:", id(default_program)) + logger.debug("default program id: " + str(id(default_program))) if id(default_program) != id(loss.block.program): paddle.fluid.framework.switch_main_program(loss.block.program) - # print("default program id after switch:", id(default_program)) + logger.debug( + "default program id after switch: " + str(id(default_program)) + ) else: optimize_ops, params_grads = self.user_defined_optimizer.minimize( - loss, startup_program, parameter_list, no_grad_set=no_grad_set) + loss, startup_program, parameter_list, no_grad_set=no_grad_set + ) context["program_optimize_ops"] = optimize_ops context["program_params_grads"] = params_grads if graph_optimizer: - # print("before graph minimize program id:", id(loss.block.program)) + logger.debug( + "before graph minimize program id: " + + str(id(loss.block.program)) + ) optimize_ops, params_grads = graph_optimizer.minimize( - loss, startup_program, parameter_list, no_grad_set=no_grad_set) + loss, startup_program, parameter_list, no_grad_set=no_grad_set + ) # since we do not encourage users to use graph operations # if a graph optimizer takes effect, mostly # optimizers_ops and params_grads are None @@ -1372,8 +1461,10 @@ def _minimize_impl(self, opt_info = {} if program._fleet_opt is None else program._fleet_opt opt_info["mpi_size"] = self.worker_num() opt_info["mpi_rank"] = self.worker_index() - for k, v in self._user_defined_strategy.trainer_desc_configs.items( - ): + for ( + k, + v, + ) in self._user_defined_strategy.trainer_desc_configs.items(): if v or k not in opt_info: opt_info[k] = v program._fleet_opt = opt_info @@ -1382,15 +1473,18 @@ def _minimize_impl(self, self._runtime_handle = RuntimeFactory()._create_runtime(context) import paddle.distributed.fleet as fleet + fleet.util._set_strategy(context["valid_strategy"]) return optimize_ops, params_grads - def _minimize_losses_impl(self, - losses, - startup_programs=None, - parameter_list=None, - no_grad_set=None): + def _minimize_losses_impl( + self, + losses, + startup_programs=None, + parameter_list=None, + no_grad_set=None, + ): context = {} # cache original feed forward program @@ -1406,7 +1500,8 @@ def _minimize_losses_impl(self, startup_programs = [paddle.static.default_startup_program()] else: raise ValueError( - "startup_program can't be None when loss is list.") + "startup_program can't be None when loss is list." 
+ ) self.origin_startup_program = startup_programs[0].clone(for_test=False) context["origin_startup_program"] = startup_programs[0] context["origin_startup_programs"] = [] @@ -1416,7 +1511,8 @@ def _minimize_losses_impl(self, context["role_maker"] = self._role_maker context["user_defined_strategy"] = copy.deepcopy( - self._user_defined_strategy) + self._user_defined_strategy + ) context["valid_strategy"] = copy.deepcopy(self._user_defined_strategy) @@ -1429,12 +1525,17 @@ def _minimize_losses_impl(self, params_grads = [] from .meta_optimizers import ParameterServerOptimizer + ps_optimizer = ParameterServerOptimizer(self.user_defined_optimizer) - ps_optimizer._set_basic_info(losses, self._role_maker, - self.user_defined_optimizer, - self._user_defined_strategy) + ps_optimizer._set_basic_info( + losses, + self._role_maker, + self.user_defined_optimizer, + self._user_defined_strategy, + ) optimize_ops, params_grads = ps_optimizer.minimize_losses_impl( - losses, startup_programs, parameter_list, no_grad_set=no_grad_set) + losses, startup_programs, parameter_list, no_grad_set=no_grad_set + ) # default_program = paddle.static.default_main_program() @@ -1449,17 +1550,24 @@ def _minimize_losses_impl(self, opt_info = {} if program._fleet_opt is None else program._fleet_opt opt_info["mpi_size"] = self.worker_num() opt_info["mpi_rank"] = self.worker_index() - for k, v in self._user_defined_strategy.trainer_desc_configs.items( - ): + for ( + k, + v, + ) in self._user_defined_strategy.trainer_desc_configs.items(): if v or k not in opt_info: opt_info[k] = v program._fleet_opt = opt_info - # print("fleet base opt info:", id(program), program._fleet_opt) + logger.debug( + "fleet base opt info: " + + str(id(program)) + + str(program._fleet_opt) + ) if self._runtime_handle is None: self._runtime_handle = RuntimeFactory()._create_runtime(context) import paddle.distributed.fleet as fleet + fleet.util._set_strategy(context["valid_strategy"]) return optimize_ops, params_grads diff --git a/python/paddle/distributed/fleet/fleet_executor_utils.py b/python/paddle/distributed/fleet/fleet_executor_utils.py index f5a1d8b18148da..a53f2e73511eaf 100644 --- a/python/paddle/distributed/fleet/fleet_executor_utils.py +++ b/python/paddle/distributed/fleet/fleet_executor_utils.py @@ -22,38 +22,47 @@ class TaskNode: Python side TaskNode, connection to the c++ side TaskNode """ - def __init__(self, - rank, - max_run_times, - max_slot_times, - role=None, - node_type=None, - task_id=0, - ops=None, - program=None, - lazy_initialize=False): + def __init__( + self, + rank, + max_run_times, + role=None, + node_type=None, + task_id=0, + ops=None, + program=None, + lazy_initialize=False, + cond_var_name=None, + vars_to_dtype=None, + vars_to_shape=None, + ): """ :param rank (int): Current rank of the task node. :param max_run_times (int): The max run times of the task node. - :param max_slot_times (int): The mas slot times of the task node. :param role (int): The role of the task node. (Will be removed in the future) :param node_type (str): The type of the task node. :param task_id (int): The id of task node. - :param ops (list): A list of op.desc to init the task node. (Will be removed in the future) + :param ops (list): A list of op.desc to init the task node. (Will be removed in the future) :param program (Program): An instance of Program to init the task node. :param lazy_initialize (bool): In user-defined task, the program may change adding feed/fetch op. As efficient consideration, the task node will have the C++ object later. 
+ :param cond_var_name (string): Indicate the cond var name of while. + :param vars_list (list): A list of var name to send. """ - assert ((ops is not None) ^ (program is not None)), \ - "Should provide only one of ops or program to task node." - assert (not ((ops is not None) and lazy_initialize)), \ - "Lazy initialization doesn't support with ops list" + assert (ops is not None) ^ ( + program is not None + ), "Should provide only one of ops or program to task node." + assert not ( + (ops is not None) and lazy_initialize + ), "Lazy initialization doesn't support with ops list" self.id = int(task_id) self.rank = rank self.max_run_times = max_run_times - self.max_slot_times = max_slot_times self.node_type = node_type self.program = program self.lazy_initialize = lazy_initialize + self.cond_var_name = cond_var_name + self.vars_to_dtype = vars_to_dtype + self.vars_to_shape = vars_to_shape self.run_pre_steps = None self.run_at_offset = None self.node = None @@ -61,40 +70,63 @@ def __init__(self, self.downstreams = [] if not lazy_initialize: if ops is not None: - assert role is not None and task_id is not None, \ - "If init task node with ops, should provide `role` and `task_id`." - self.node = core.TaskNode(role, ops, rank, task_id, - max_run_times, max_slot_times) + assert ( + role is not None and task_id is not None + ), "If init task node with ops, should provide `role` and `task_id`." + self.node = core.TaskNode( + role, + ops, + rank, + task_id, + max_run_times, + ) else: - self.node = core.TaskNode(program.desc, rank, self.id, - max_run_times, max_slot_times) + self.node = core.TaskNode( + program.desc, + rank, + self.id, + max_run_times, + ) if self.node_type: self.node.set_type(self.node_type) def task_node(self): if self.lazy_initialize: - self.node = core.TaskNode(self.program.desc, self.rank, self.id, - self.max_run_times, self.max_slot_times) + self.node = core.TaskNode( + self.program.desc, + self.rank, + self.id, + self.max_run_times, + ) if self.node_type: self.node.set_type(self.node_type) if self.run_pre_steps: self.node.set_run_pre_steps(self.run_pre_steps) if self.run_at_offset: self.node.set_run_at_offset(self.run_at_offset) + if self.cond_var_name: + self.node.set_cond_var_name(self.cond_var_name) + if self.vars_to_shape: + self.node.set_vars_to_shape(self.vars_to_shape) + if self.vars_to_dtype: + self.node.set_vars_to_dtype(self.vars_to_dtype) for up in self.upstreams: - self.node.add_upstream_task(up[0], up[1]) + self.node.add_upstream_task(up[0], up[1], up[2]) for down in self.downstreams: - self.node.add_downstream_task(down[0], down[1]) + self.node.add_downstream_task(down[0], down[1], down[2]) self.lazy_initialize = False return self.node def set_program(self, program): - assert self.lazy_initialize, \ - "Inside program is unchangable for immediate initialized task node. Set the lazy_initialize to be true if the inside program need to be update. Remember to do all your change before eval node.task_node()." + assert ( + self.lazy_initialize + ), "Inside program is unchangable for immediate initialized task node. Set the lazy_initialize to be true if the inside program need to be update. Remember to do all your change before eval node.task_node()." 
self.program = program def get_program(self): - assert self.program is not None, "The task node is not initialized using program" + assert ( + self.program is not None + ), "The task node is not initialized using program" return self.program def set_run_pre_steps(self, steps): @@ -109,17 +141,21 @@ def set_run_at_offset(self, offset): else: self.node.set_run_at_offset(offset) - def add_upstream_task(self, upstream, buffer_size=2): + def add_upstream_task( + self, upstream, buffer_size=2, depend_type=core.DependType.NORMAL + ): if self.lazy_initialize: - self.upstreams.append((upstream, buffer_size)) + self.upstreams.append((upstream, buffer_size, depend_type)) else: - self.node.add_upstream_task(upstream, buffer_size) + self.node.add_upstream_task(upstream, buffer_size, depend_type) - def add_downstream_task(self, downstream, buffer_size=2): + def add_downstream_task( + self, downstream, buffer_size=2, depend_type=core.DependType.NORMAL + ): if self.lazy_initialize: - self.downstreams.append((downstream, buffer_size)) + self.downstreams.append((downstream, buffer_size, depend_type)) else: - self.node.add_downstream_task(downstream, buffer_size) + self.node.add_downstream_task(downstream, buffer_size, depend_type) def task_id(self): return self.id @@ -142,10 +178,16 @@ def _invalide_coord(self, coord): :param coord: The coord to be tested :return: False if valid, True if invalid. """ - return coord['mp_idx'] < 0 or coord['mp_idx'] >= self.mp_degree or \ - coord['sharding_idx'] < 0 or coord['sharding_idx'] >= self.sharding_degree or \ - coord['pp_idx'] < 0 or coord['pp_idx'] >= self.pp_degree or \ - coord['dp_idx'] < 0 or coord['dp_idx'] >= self.dp_degree + return ( + coord['mp_idx'] < 0 + or coord['mp_idx'] >= self.mp_degree + or coord['sharding_idx'] < 0 + or coord['sharding_idx'] >= self.sharding_degree + or coord['pp_idx'] < 0 + or coord['pp_idx'] >= self.pp_degree + or coord['dp_idx'] < 0 + or coord['dp_idx'] >= self.dp_degree + ) def coord_to_rank(self, coord): """ @@ -155,9 +197,15 @@ def coord_to_rank(self, coord): """ if self._invalide_coord(coord): return -1 - return int(coord['dp_idx'] * self.pp_degree * self.sharding_degree * self.mp_degree + \ - coord['pp_idx'] * self.sharding_degree * self.mp_degree + \ - coord['sharding_idx'] * self.mp_degree + coord['mp_idx']) + return int( + coord['dp_idx'] + * self.pp_degree + * self.sharding_degree + * self.mp_degree + + coord['pp_idx'] * self.sharding_degree * self.mp_degree + + coord['sharding_idx'] * self.mp_degree + + coord['mp_idx'] + ) def rank_to_coord(self, rank): """ @@ -176,17 +224,14 @@ def rank_to_coord(self, rank): 'mp_idx': int(mp_idx), 'sharding_idx': int(sharding_idx), 'pp_idx': int(pp_idx), - 'dp_idx': int(dp_idx) + 'dp_idx': int(dp_idx), } class FleetExecutorUtils: - - def __init__(self, - dist_strategy=None, - rank=None, - nrank=None, - max_run_times=None): + def __init__( + self, dist_strategy=None, rank=None, nrank=None, max_run_times=None + ): self.dist_strategy = dist_strategy self.rank = rank self.nrank = nrank @@ -206,12 +251,14 @@ def is_lr_sched_op(self, op_role): return op_role == int(OpRole.Optimize.LRSched) def is_forward_op(self, op_role): - return (op_role == int(OpRole.Forward)) or \ - (op_role == (int(OpRole.Forward) | int(OpRole.Loss))) + return (op_role == int(OpRole.Forward)) or ( + op_role == (int(OpRole.Forward) | int(OpRole.Loss)) + ) def is_backward_op(self, op_role): - return (op_role == int(OpRole.Backward)) or \ - (op_role == (int(OpRole.Backward) | int(OpRole.Loss))) + return (op_role == 
int(OpRole.Backward)) or ( + op_role == (int(OpRole.Backward) | int(OpRole.Loss)) + ) def split_program_to_op_list(self, program): op_list_map = {"lr": [], "fwd": [], "bwd": [], "opt": []} @@ -233,17 +280,19 @@ def split_program_to_op_list(self, program): return op_list_map def convert_op_list_to_program(self, op_list, complete_program): - #TODO(liyurui): Complete this convert logic + # TODO(liyurui): Complete this convert logic program_map = { "lr": Program(), "fwd": Program(), "bwd": Program(), - "opt": Program() + "opt": Program(), } return program_map def build_1f1b_dependency(self, task_node_map): - assert not self.is_auto_parallel, "Handly add dependency should not be invoked in auto parallel mode" + assert ( + not self.is_auto_parallel + ), "Handly add dependency should not be invoked in auto parallel mode" # Generated the dependency based on this graph: # lr(1:m) -> forward -> backward -> (m:1)optimize # ↑ ↓ @@ -253,8 +302,9 @@ def build_1f1b_dependency(self, task_node_map): # add dependency intra stage cur_start_id = self.rank * self.num_of_functionality - pp_buff_size = int(self.dist_strategy['pp_degree'] - - self.coord['pp_idx']) + pp_buff_size = int( + self.dist_strategy['pp_degree'] - self.coord['pp_idx'] + ) task_node_map["lr"].add_downstream_task(cur_start_id + 1) task_node_map["fwd"].add_upstream_task(cur_start_id) task_node_map["fwd"].add_downstream_task(cur_start_id + 2, pp_buff_size) @@ -267,8 +317,8 @@ def build_1f1b_dependency(self, task_node_map): downstream_coord['pp_idx'] = downstream_coord['pp_idx'] + 1 pp_upstream = self.coord_sys.coord_to_rank(upstream_coord) pp_downstream = self.coord_sys.coord_to_rank(downstream_coord) - first_stage = (pp_upstream == -1) - last_stage = (pp_downstream == -1) + first_stage = pp_upstream == -1 + last_stage = pp_downstream == -1 prev_pp_start_id = pp_upstream * self.num_of_functionality next_pp_start_id = pp_downstream * self.num_of_functionality if not first_stage: @@ -280,33 +330,36 @@ def build_1f1b_dependency(self, task_node_map): return task_node_map def construct_task_nodes_1f1b(self, program_map): - max_slot_times = int(self.max_run_times - self.coord['pp_idx']) cur_start_id = int(self.rank * self.num_of_functionality) - lr_task_node = TaskNode(rank=self.rank, - max_run_times=self.max_run_times, - max_slot_times=max_slot_times, - program=program_map["lr"], - task_id=cur_start_id) - fwd_task_node = TaskNode(rank=self.rank, - max_run_times=self.max_run_times, - max_slot_times=max_slot_times, - program=program_map["fwd"], - task_id=cur_start_id + 1) - bwd_task_node = TaskNode(rank=self.rank, - max_run_times=self.max_run_times, - max_slot_times=max_slot_times, - program=program_map["bwd"], - task_id=cur_start_id + 2) - opt_task_node = TaskNode(rank=self.rank, - max_run_times=self.max_run_times, - max_slot_times=max_slot_times, - program=program_map["opt"], - task_id=cur_start_id + 3) + lr_task_node = TaskNode( + rank=self.rank, + max_run_times=self.max_run_times, + program=program_map["lr"], + task_id=cur_start_id, + ) + fwd_task_node = TaskNode( + rank=self.rank, + max_run_times=self.max_run_times, + program=program_map["fwd"], + task_id=cur_start_id + 1, + ) + bwd_task_node = TaskNode( + rank=self.rank, + max_run_times=self.max_run_times, + program=program_map["bwd"], + task_id=cur_start_id + 2, + ) + opt_task_node = TaskNode( + rank=self.rank, + max_run_times=self.max_run_times, + program=program_map["opt"], + task_id=cur_start_id + 3, + ) return { "lr": lr_task_node, "fwd": fwd_task_node, "bwd": bwd_task_node, - "opt": 
opt_task_node + "opt": opt_task_node, } def task_id_to_rank(self): @@ -317,53 +370,58 @@ def task_id_to_rank(self): return task_id_to_rank def construct_task_nodes_1f1b_op_list(self, op_list_map): - max_slot_times = int(self.max_run_times - self.coord['pp_idx']) cur_start_id = int(self.rank * self.num_of_functionality) - lr_task_node = TaskNode(rank=self.rank, - max_run_times=self.max_run_times, - max_slot_times=max_slot_times, - role=int(OpRole.Optimize.LRSched), - ops=op_list_map["lr"], - task_id=cur_start_id, - node_type="Amplifier") + lr_task_node = TaskNode( + rank=self.rank, + max_run_times=self.max_run_times, + role=int(OpRole.Optimize.LRSched), + ops=op_list_map["lr"], + task_id=cur_start_id, + node_type="Amplifier", + ) lr_task_node.set_run_pre_steps(self.max_run_times) - fwd_task_node = TaskNode(rank=self.rank, - max_run_times=self.max_run_times, - max_slot_times=max_slot_times, - role=int(OpRole.Forward), - ops=op_list_map["fwd"], - task_id=cur_start_id + 1, - node_type="Compute") - bwd_task_node = TaskNode(rank=self.rank, - max_run_times=self.max_run_times, - max_slot_times=max_slot_times, - role=int(OpRole.Backward), - ops=op_list_map["bwd"], - task_id=cur_start_id + 2, - node_type="Compute") - opt_task_node = TaskNode(rank=self.rank, - max_run_times=self.max_run_times, - max_slot_times=max_slot_times, - role=int(OpRole.Optimize), - ops=op_list_map["opt"], - task_id=cur_start_id + 3, - node_type="Amplifier") + fwd_task_node = TaskNode( + rank=self.rank, + max_run_times=self.max_run_times, + role=int(OpRole.Forward), + ops=op_list_map["fwd"], + task_id=cur_start_id + 1, + node_type="Compute", + ) + bwd_task_node = TaskNode( + rank=self.rank, + max_run_times=self.max_run_times, + role=int(OpRole.Backward), + ops=op_list_map["bwd"], + task_id=cur_start_id + 2, + node_type="Compute", + ) + opt_task_node = TaskNode( + rank=self.rank, + max_run_times=self.max_run_times, + role=int(OpRole.Optimize), + ops=op_list_map["opt"], + task_id=cur_start_id + 3, + node_type="Amplifier", + ) opt_task_node.set_run_pre_steps(self.max_run_times) opt_task_node.set_run_at_offset(self.max_run_times - 1) return { "lr": lr_task_node, "fwd": fwd_task_node, "bwd": bwd_task_node, - "opt": opt_task_node + "opt": opt_task_node, } -def run1f1b(program, - rank, - max_run_times, - dist_opt, - nrank, - with_standalone_executor=False): +def run1f1b( + program, + rank, + max_run_times, + dist_opt, + nrank, + with_standalone_executor=False, +): """ Split the program to support 1f1b pipeline scheduler. This funct will split the program based on the op_role. 
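For reference, a minimal, hedged sketch of how the updated TaskNode API above fits together: the constructor no longer takes max_slot_times, and upstream/downstream edges now carry an explicit buffer_size (and optionally a depend_type). The fwd_prog/bwd_prog names are hypothetical placeholders, not part of this patch.

    # Sketch only: assumes fwd_prog/bwd_prog are existing paddle.static.Program objects.
    from paddle.distributed.fleet.fleet_executor_utils import TaskNode

    fwd = TaskNode(rank=0, max_run_times=4, program=fwd_prog,
                   task_id=1, node_type="Compute", lazy_initialize=True)
    bwd = TaskNode(rank=0, max_run_times=4, program=bwd_prog,
                   task_id=2, node_type="Compute", lazy_initialize=True)
    # Dependencies are recorded as (task_id, buffer_size, depend_type) while lazy.
    fwd.add_downstream_task(bwd.task_id(), buffer_size=2)
    bwd.add_upstream_task(fwd.task_id(), buffer_size=2)
    # The C++ TaskNode objects are only built here, after all edges are registered.
    task_nodes = [fwd.task_node(), bwd.task_node()]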
@@ -380,24 +438,29 @@ def run1f1b(program, task_id_to_rank (dict): task nodes' ids to it's corresponding rank """ print("fleet executor will use python side 1f1b scheduler.") - fleet_executor_utils = FleetExecutorUtils(dist_strategy=dist_opt, - rank=rank, - nrank=nrank, - max_run_times=max_run_times) + fleet_executor_utils = FleetExecutorUtils( + dist_strategy=dist_opt, + rank=rank, + nrank=nrank, + max_run_times=max_run_times, + ) op_list_map = fleet_executor_utils.split_program_to_op_list(program) task_node_map = None if with_standalone_executor: program_map = fleet_executor_utils.convert_op_list_to_program( - op_list_map, program) + op_list_map, program + ) task_node_map = fleet_executor_utils.construct_task_nodes_1f1b( - program_map) + program_map + ) else: op_desc_list_map = {"lr": [], "fwd": [], "bwd": [], "opt": []} for key in op_list_map: for op in op_list_map[key]: op_desc_list_map[key].append(op.desc) task_node_map = fleet_executor_utils.construct_task_nodes_1f1b_op_list( - op_desc_list_map) + op_desc_list_map + ) task_node_map = fleet_executor_utils.build_1f1b_dependency(task_node_map) task_id_to_rank = fleet_executor_utils.task_id_to_rank() task_node_list = [task_node_map[key].task_node() for key in task_node_map] @@ -414,10 +477,11 @@ def origin(program, rank): task_id_to_rank (dict): a fake dict, since there is no upstream or downstream, this dict won't be used """ print("fleet executor will use python side origin scheduler.") - task_node = TaskNode(program=program, - rank=rank, - node_type="Compute", - max_run_times=1, - max_slot_times=1) + task_node = TaskNode( + program=program, + rank=rank, + node_type="Compute", + max_run_times=1, + ) task_id_to_rank = {task_node.task_id(): rank} return [task_node.task_node()], task_id_to_rank diff --git a/python/paddle/distributed/fleet/layers/mpu/__init__.py b/python/paddle/distributed/fleet/layers/mpu/__init__.py new file mode 100644 index 00000000000000..11b6970265003f --- /dev/null +++ b/python/paddle/distributed/fleet/layers/mpu/__init__.py @@ -0,0 +1,24 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .mp_layers import VocabParallelEmbedding +from .mp_layers import ColumnParallelLinear +from .mp_layers import RowParallelLinear +from .mp_layers import ParallelCrossEntropy + +from .random import RNGStatesTracker +from .random import get_rng_state_tracker +from .random import model_parallel_random_seed +from .random import determinate_seed +from .random import dropout diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_layers.py b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py new file mode 100644 index 00000000000000..2ba9ce9ed76a9b --- /dev/null +++ b/python/paddle/distributed/fleet/layers/mpu/mp_layers.py @@ -0,0 +1,466 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from . import mp_ops +from paddle.fluid import core +from paddle.fluid.dygraph.layers import Layer +from .random import get_rng_state_tracker +from paddle.nn import functional as F +from paddle import framework +from paddle.autograd import PyLayer +from ...base import topology as tp + +__all__ = [] + +# Follow this paper to achieve the file: +# Shoeybi M, Patwary M, Puri R, et al. Megatron-lm: Training multi-billion parameter +# language models using model parallelism[J]. arXiv preprint arXiv:1909.08053, 2019. (https://arxiv.org/abs/1909.08053) + + +def is_fused_matmul_bias_supported(): + if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): + return hasattr(core.ops, 'fused_gemm_epilogue') + else: + return False + + +class VocabParallelEmbedding(Layer): + """Embedding mp parallelized in the vocabulary dimension. + this class is used for splitting embedding in mp group. + + Args: + num_embeddings(int): One element which indicate the size of the dictionary of embeddings. + embedding_dim(int): One element which indicate the size of each embedding vector respectively. + weight_attr(ParamAttr|None): To specify the weight parameter property. Default: None, which means the + default weight parameter property is used. See usage for details in :ref:`api_ParamAttr` . In addition, + user-defined or pre-trained word vectors can be loaded with the :attr:`param_attr` parameter. + The local word vector needs to be transformed into numpy format, and the shape of local word + vector should be consistent with :attr:`num_embeddings` . Then :ref:`api_initializer_NumpyArrayInitializer` + is used to load custom or pre-trained word vectors. See code example for details. + mp_group(Group): The tensor parallel group. + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Examples: + .. 
code-block:: python + import paddle + from paddle.distributed import fleet + + class SimpleMPNet(paddle.nn.Layer): + def __init__(self, vocab_size, hidden_size, inner_size, output_size): + super(SimpleMPNet, self).__init__() + self.linear1 = fleet.meta_parallel.ColumnParallelLinear( + hidden_size, + inner_size, + gather_output=False, + has_bias=True) + + self.linear2 = fleet.meta_parallel.RowParallelLinear( + inner_size, + hidden_size, + input_is_parallel=True, + has_bias=True) + + self.linear3 = paddle.nn.Linear(hidden_size, output_size) + + self.embedding = fleet.meta_parallel.VocabParallelEmbedding( + vocab_size, + hidden_size) + + def forward(self, x): + x = self.embedding(x) + x = self.linear1(x) + x = self.linear2(x) + x = self.linear3(x) + return x + """ + + def __init__(self, + num_embeddings, + embedding_dim, + weight_attr=None, + mp_group=None, + name=None): + super(VocabParallelEmbedding, self).__init__() + + self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( + ) if mp_group is None else mp_group + self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( + ) if mp_group is None else mp_group.nranks + self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank( + ) if mp_group is None else mp_group.rank + + self.origin_num_embeddings = num_embeddings + self.is_mp = (self.world_size > 1) + + assert num_embeddings % self.world_size == 0, ( + "The length of the vocabulary must be divisible by the parallelism degree of MP" + ) + + per_part_size = num_embeddings // self.world_size + + self.vocab_start_index = self.rank * per_part_size + self._dtype = self._helper.get_default_dtype() + self._size = [per_part_size, embedding_dim] + self._weight_attr = weight_attr + self._name = name + + if self.is_mp and paddle.in_dynamic_mode(): + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter(attr=self._weight_attr, + shape=self._size, + dtype=self._dtype, + is_bias=False) + else: + self.weight = self.create_parameter(attr=self._weight_attr, + shape=self._size, + dtype=self._dtype, + is_bias=False) + + self.weight.is_distributed = True if self.is_mp else False + + def forward(self, x): + if self.is_mp: + output_parallel = mp_ops._c_lookup_table( + self.weight, + x, + start_index=self.vocab_start_index, + name=self._name) + output = mp_ops._mp_allreduce(output_parallel, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True) + else: + output = F.embedding(x, + weight=self.weight, + padding_idx=None, + sparse=False, + name=self._name) + return output + + +class ColumnParallelLinear(Layer): + """Linear layer with mp parallelized(column). + this class is used for splitting Linear Layer in mp group, column split the weight of the Linear layer. + + Args: + in_features(int): The number of input units. + out_features(int): The number of output units. + weight_attr(ParamAttr|None): The attribute for the learnable weight of this layer. The default value is None + and the weight will be initialized to zero. For detailed information, please refer to paddle.ParamAttr. + has_bias(bool): whether to add bias. + gather_output(bool): whether to do allgahter for the output of each rank. + fuse_matmul_bias(bool): whether to fuse matmul and bias. + mp_group(Group): The tensor parallel group. + name(str, optional): Normally there is no need for user to set this parameter. + For detailed information, please refer to :ref:`api_guide_Name` . + + Examples: + .. 
code-block:: python + import paddle + from paddle.distributed import fleet + + class SimpleMPNet(paddle.nn.Layer): + def __init__(self, vocab_size, hidden_size, inner_size, output_size): + super(SimpleMPNet, self).__init__() + self.linear1 = fleet.meta_parallel.ColumnParallelLinear( + hidden_size, + inner_size, + gather_output=False, + has_bias=True) + + self.linear2 = fleet.meta_parallel.RowParallelLinear( + inner_size, + hidden_size, + input_is_parallel=True, + has_bias=True) + + self.linear3 = paddle.nn.Linear(hidden_size, output_size) + + self.embedding = fleet.meta_parallel.VocabParallelEmbedding( + vocab_size, + hidden_size) + + def forward(self, x): + x = self.embedding(x) + x = self.linear1(x) + x = self.linear2(x) + x = self.linear3(x) + return x + """ + + def __init__(self, + in_features, + out_features, + weight_attr=None, + has_bias=None, + gather_output=True, + fuse_matmul_bias=False, + mp_group=None, + name=None): + super(ColumnParallelLinear, self).__init__() + + self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( + ) if mp_group is None else mp_group + self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( + ) if mp_group is None else mp_group.nranks + self._name = name + self.is_mp = (self.world_size > 1) + + self.gather_output = gather_output + assert out_features % self.world_size == 0, ( + "Number of column of the weight for linear ({}) must be" + " divisible by model parallel size ({})".format( + out_features, self.world_size)) + self.output_size_per_partition = out_features // self.world_size + + self._weight_attr = weight_attr + self._dtype = self._helper.get_default_dtype() + + if self.is_mp and paddle.in_dynamic_mode(): + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[in_features, self.output_size_per_partition], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + else: + self.weight = self.create_parameter( + shape=[in_features, self.output_size_per_partition], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + + self.weight.is_distributed = True if self.is_mp else False + + if has_bias: + # initialize bias to zero like Megatron + self.bias = self.create_parameter( + shape=[self.output_size_per_partition], + attr=paddle.nn.initializer.Constant(value=0.0), + dtype=self._dtype, + is_bias=True) + self.bias.is_distributed = True if self.is_mp else False + else: + self.bias = None + + self.linear = F.linear + + if fuse_matmul_bias: + if not is_fused_matmul_bias_supported(): + raise NotImplementedError( + "You set fuse_matmul_bias=True in ColumnParallelLinear, " + "however, the paddle you are using not support this operation. " + "Please set fuse_matmul_bias=False or use paddle compiled " + "with cuda 11.6 or higher.") + from paddle.incubate.nn.functional import fused_linear + self.linear = fused_linear + + def forward(self, x): + # use inner api to process identity + if self.is_mp: + input_parallel = mp_ops._c_identity(x, + group=self.model_parallel_group) + else: + input_parallel = x + + output_parallel = self.linear(input_parallel, + self.weight, + self.bias, + name=self._name) + + if self.gather_output and self.is_mp: + output = mp_ops._c_concat(output_parallel, + group=self.model_parallel_group) + else: + output = output_parallel + return output + + +class RowParallelLinear(Layer): + """Linear layer with mp parallelized(row). + this class is used for splitting Linear Layer in mp group, row split the weight of the Linear layer. 
+ + Args: + in_features(int): The number of input units. + out_features(int): The number of output units. + weight_attr(ParamAttr|None): The attribute for the learnable weight of this layer. The default value is None + and the weight will be initialized to zero. For detailed information, please refer to paddle.ParamAttr. + has_bias(bool): whether to add bias. + input_is_parallel(bool): whether the input has alreadly been splitted across the mp group. + fuse_matmul_bias(bool): whether to fuse matmul and bias. + mp_group(Group): The tensor parallel group. + name(str, optional): Normally there is no need for user to set this parameter. + For detailed information, please refer to :ref:`api_guide_Name` . + + Examples: + .. code-block:: python + import paddle + from paddle.distributed import fleet + + class SimpleMPNet(paddle.nn.Layer): + def __init__(self, vocab_size, hidden_size, inner_size, output_size): + super(SimpleMPNet, self).__init__() + self.linear1 = fleet.meta_parallel.ColumnParallelLinear( + hidden_size, + inner_size, + gather_output=False, + has_bias=True) + + self.linear2 = fleet.meta_parallel.RowParallelLinear( + inner_size, + hidden_size, + input_is_parallel=True, + has_bias=True) + + self.linear3 = paddle.nn.Linear(hidden_size, output_size) + + self.embedding = fleet.meta_parallel.VocabParallelEmbedding( + vocab_size, + hidden_size) + + def forward(self, x): + x = self.embedding(x) + x = self.linear1(x) + x = self.linear2(x) + x = self.linear3(x) + return x + """ + + def __init__(self, + in_features, + out_features, + weight_attr=None, + has_bias=True, + input_is_parallel=False, + fuse_matmul_bias=False, + mp_group=None, + name=None): + super(RowParallelLinear, self).__init__() + + self.in_features = in_features + self.out_features = out_features + self.input_is_parallel = input_is_parallel + self._weight_attr = weight_attr + self._dtype = self._helper.get_default_dtype() + self._name = name + + self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( + ) if mp_group is None else mp_group + self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( + ) if mp_group is None else mp_group.nranks + self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank( + ) if mp_group is None else mp_group.rank + + self.is_mp = (self.world_size > 1) + assert in_features % self.world_size == 0, ( + "Number of row of the weight for linear ({}) must be" + " divisible by model parallel size ({})".format( + in_features, self.world_size)) + + self.input_size_per_partition = in_features // self.world_size + + if self.is_mp and paddle.in_dynamic_mode(): + with get_rng_state_tracker().rng_state(): + self.weight = self.create_parameter( + shape=[self.input_size_per_partition, self.out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + else: + self.weight = self.create_parameter( + shape=[self.input_size_per_partition, self.out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + + self.weight.is_distributed = True if self.is_mp else False + + if has_bias: + self.bias = self.create_parameter( + shape=[self.out_features], + attr=paddle.nn.initializer.Constant(value=0.0), + dtype=self._dtype, + is_bias=True) + else: + self.bias = None + + self.linear = F.linear + + if fuse_matmul_bias: + if not is_fused_matmul_bias_supported(): + raise NotImplementedError( + "You set fuse_matmul_bias=True in RowParallelLinear, " + "however, the paddle you are using not support this operation. 
" + "Please set fuse_matmul_bias=False or use paddle compiled " + "with cuda 11.6 or higher.") + from paddle.incubate.nn.functional import fused_linear + self.linear = fused_linear + + def forward(self, x): + if self.input_is_parallel or (not self.is_mp): + input_parallel = x + else: + # split last dim + input_parallel = mp_ops._c_split(x, group=self.model_parallel_group) + + if self.is_mp: + output_parallel = self.linear(input_parallel, + self.weight, + name=self._name) + output_ = mp_ops._mp_allreduce(output_parallel, + group=self.model_parallel_group, + use_calc_stream=True, + use_model_parallel=True) + output = output_ + self.bias if self.bias is not None else output_ + else: + output = self.linear(input_parallel, + self.weight, + self.bias, + name=self._name) + + return output + + +class ParallelCrossEntropy(Layer): + """CrossEntropy with mp parallelized. + this class is used for splitting softmax cross entropy in mp group. + + Args: + mp_group(Group): The tensor parallel group. + name(str, optional): Normally there is no need for user to set this parameter. + For detailed information, please refer to :ref:`api_guide_Name` . + + Examples: + .. code-block:: python + loss_func = ParallelCrossEntropy() + loss = loss_func(img, lable) + """ + + def __init__(self, mp_group=None, name=None): + super(ParallelCrossEntropy, self).__init__() + self.name = name + self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( + ) if mp_group is None else mp_group + self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( + ) if mp_group is None else mp_group.nranks + self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank( + ) if mp_group is None else mp_group.rank + + def forward(self, input, label): + loss = mp_ops._c_softmax_with_cross_entropy( + input, label, group=self.model_parallel_group) + return loss diff --git a/python/paddle/distributed/fleet/layers/mpu/mp_ops.py b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py new file mode 100644 index 00000000000000..18e7b6617783e2 --- /dev/null +++ b/python/paddle/distributed/fleet/layers/mpu/mp_ops.py @@ -0,0 +1,791 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle import _C_ops, _legacy_C_ops +from paddle.fluid import core +from paddle.fluid.framework import _non_static_mode +from paddle.fluid.framework import _in_legacy_dygraph +from paddle.fluid.framework import in_dygraph_mode +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle.fluid.dygraph import layers +from paddle.distributed import collective +from ....communication.reduce import ReduceOp +from paddle.fluid.data_feeder import check_dtype +import paddle.fluid.dygraph_utils as dygraph_utils + + +def _c_identity(tensor, group=None): + """ + Return a copy of the tensor, mainly used with model parallel. + + Args: + tensor (Tensor): The input Tensor. 
Its data type + should be float16, float32, float64, int32 or int64. + group (int): The id of the process group to work on. + + Returns: + Tensor. + """ + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + + if in_dygraph_mode(): + from paddle.autograd import PyLayer + + class c_identity_eager(PyLayer): + + @staticmethod + def forward(ctx, tensor): + return _legacy_C_ops.c_identity(tensor, 'use_calc_stream', True, + 'ring_id', group.id, + 'use_model_parallel', True) + + @staticmethod + def backward(ctx, dy): + op_type = collective._get_reduce_op(ReduceOp.SUM, "_c_identity") + group.process_group.allreduce_on_calc_stream(dy, op_type) + return dy + + return c_identity_eager.apply(tensor) + + elif _in_legacy_dygraph(): + return _legacy_C_ops.c_identity(tensor, 'use_calc_stream', True, + 'ring_id', ring_id, + 'use_model_parallel', True) + op_type = 'c_identity' + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=tensor.dtype) + + check_variable_and_dtype( + tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], + '_c_identity') + + helper.append_op(type=op_type, + inputs={'X': tensor}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + 'use_model_parallel': True, + }) + return out + + +def _c_concat(tensor, group=None): + """ + Return allgather of the tensor, mainly used with model parallel. + + Args: + tensor (Tensor): The input Tensor. Its data type + should be float16, float32, float64, int32 or int64. + group (int): The id of the process group to work on. + + Returns: + Tensor. + """ + if group is not None and not group.is_member(): + return + group = collective._get_default_group() if group is None else group + ring_id = group.id + + global_rank = collective._get_global_env().rank + rank = group.rank + nranks = group.nranks + + if _non_static_mode(): + return _legacy_C_ops.c_concat(tensor, 'ring_id', ring_id, + 'use_calc_stream', True, 'rank', rank, + 'nranks', nranks, 'use_model_parallel', + True) + + op_type = 'c_concat' + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=tensor.dtype) + + check_variable_and_dtype( + tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], + '_c_concat') + + helper.append_op(type=op_type, + inputs={'X': tensor}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + 'use_model_parallel': True, + 'nranks': nranks, + 'rank': rank + }) + return out + + +def _c_split(tensor, group=None): + """ + Split tensor evenly among all members, mainly used with model parallel. + + Args: + tensor (Tensor): The input Tensor. Its data type + should be float16, float32, float64, int32 or int64. + rank (int): The rank of the current process. + group (int): The id of the process group to work on. + + Returns: + Tensor. 
+ """ + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + + global_rank = collective._get_global_env().rank + rank = global_rank if group is None else group.get_group_rank(global_rank) + nranks = collective._get_global_env( + ).world_size if group is None else group.nranks + + if _non_static_mode(): + return _legacy_C_ops.c_split(tensor, 'use_calc_stream', True, 'ring_id', + ring_id, 'rank', rank, 'nranks', nranks, + 'use_model_parallel', True) + + op_type = 'c_split' + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=tensor.dtype) + + check_variable_and_dtype( + tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], + '_c_split') + + helper.append_op(type=op_type, + inputs={'X': tensor}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + 'rank': rank, + 'nranks': nranks, + 'use_model_parallel': True, + }) + return out + + +def _mp_allreduce(tensor, + op=ReduceOp.SUM, + group=None, + use_calc_stream=True, + use_model_parallel=True): + """[it is same as allreduce above, but it supports model parallel. And it support inplace startegy] + """ + if group is not None and not group.is_member(): + return + + if in_dygraph_mode(): + group = collective._get_default_group() if group is None else group + assert op == ReduceOp.SUM, "Unknown parameter: {}.".format(op) + + from paddle.autograd import PyLayer + + class mp_allreduce_eager(PyLayer): + + @staticmethod + def forward(ctx, tensor, group, use_calc_stream, + use_model_parallel): + ctx.ring_id = group.id + + if use_calc_stream: + op_type = collective._get_reduce_op(op, "_mp_allreduce") + group.process_group.allreduce_on_calc_stream( + tensor, op_type) + return tensor + else: + return _legacy_C_ops.c_allreduce_sum_( + tensor, 'use_calc_stream', use_calc_stream, 'ring_id', + ring_id, "use_model_parallel", use_model_parallel) + + @staticmethod + def backward(ctx, dy): + return _legacy_C_ops.c_identity(dy, 'use_calc_stream', True, + 'ring_id', ctx.ring_id, + 'use_model_parallel', True) + + return mp_allreduce_eager.apply(tensor, group, use_calc_stream, + use_model_parallel) + + ring_id = 0 if group is None else group.id + if _in_legacy_dygraph(): + if op == ReduceOp.SUM: + return _legacy_C_ops.c_allreduce_sum_(tensor, 'use_calc_stream', + use_calc_stream, 'ring_id', + ring_id, "use_model_parallel", + use_model_parallel) + else: + raise ValueError("Unknown parameter: {}.".format(op)) + + op_type = 'c_allreduce_sum' + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=tensor.dtype) + + check_variable_and_dtype( + tensor, 'tensor', ['float16', 'float32', 'float64', 'int32', 'int64'], + op_type) + + helper.append_op(type=op_type, + inputs={'X': tensor}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': use_calc_stream, + 'use_model_parallel': use_model_parallel, + }) + return out + + +def _c_lookup_table(table, index, start_index=0, name=None): + """ + Lookup table according to index. + + Args: + table (Tensor): The input Tensor. Its data type + should be float16, float32, float64. + index (Tensor): The index to lookup table. + start_index (int): The initial index for table range. + name (string): The name of the api + + Returns: + Tensor. 
+ """ + if _non_static_mode(): + return _legacy_C_ops.c_embedding(table, index, "start_index", + start_index) + + op_type = 'c_embedding' + helper = LayerHelper(op_type, **locals()) + dtype = helper.input_dtype(input_param_name='table') + check_variable_and_dtype(index, 'input', ['int32', 'int64'], op_type) + tmp = helper.create_variable_for_type_inference(dtype) + helper.append_op(type='c_embedding', + inputs={ + 'Ids': index, + 'W': table + }, + outputs={'Out': tmp}, + attrs={"start_index": start_index}) + return tmp + + +class _Linear(layers.Layer): + """ + Linear + """ + + def __init__(self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None): + super(_Linear, self).__init__() + self._dtype = self._helper.get_default_dtype() + self._weight_attr = weight_attr + self._bias_attr = bias_attr + self.weight = self.create_parameter(shape=[in_features, out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False) + self.bias = self.create_parameter(shape=[out_features], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True) + self.name = name + + def forward(self, input): + out = _linear(x=input, + weight=self.weight, + bias=self.bias, + name=self.name) + return out + + def extra_repr(self): + name_str = ', name={}'.format(self.name) if self.name else '' + return 'in_features={}, out_features={}, dtype={}{}'.format( + self.weight.shape[0], self.weight.shape[1], self._dtype, name_str) + + +def _c_softmax_with_cross_entropy(logits, + label, + group=None, + return_softmax=False): + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + global_rank = collective._get_global_env().rank + rank = global_rank if group is None else group.get_group_rank(global_rank) + nranks = collective._get_global_env( + ).world_size if group is None else group.nranks + + input_dims = len(list(logits.shape)) + label_dims = len(list(label.shape)) + if input_dims - 1 != label_dims and input_dims != label_dims: + raise ValueError( + 'Expected nput_dims - 1 = label_dims or input_dims == label_dims\ + (got nput_dims{}, label_dims{})'.format(input_dims, label_dims)) + if input_dims - 1 == label_dims: + label = paddle.unsqueeze(label, axis=-1) + + if _non_static_mode(): + softmax, loss = _legacy_C_ops.c_softmax_with_cross_entropy( + logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks) + if not return_softmax: + return loss + else: + return loss, softmax + + attrs = { + 'ring_id': ring_id, + 'rank': rank, + 'nranks': nranks, + } + helper = LayerHelper('c_softmax_with_cross_entropy', **locals()) + softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) + loss = helper.create_variable_for_type_inference(dtype=logits.dtype) + helper.append_op(type='c_softmax_with_cross_entropy', + inputs={ + 'Logits': logits, + 'Label': label + }, + outputs={ + 'Softmax': softmax, + 'Loss': loss + }, + attrs=attrs) + + if return_softmax: + return loss, softmax + + return loss + + +def _linear(x, weight, bias=None, name=None): + """ + Fuction Linear + """ + if _non_static_mode(): + pre_bias = _varbase_creator(dtype=x.dtype) + _legacy_C_ops.matmul(x, weight, pre_bias, 'transpose_X', False, + 'transpose_Y', False, "alpha", 1) + return dygraph_utils._append_bias_in_dygraph(pre_bias, + bias, + axis=len(x.shape) - 1) + else: + helper = LayerHelper('linear', **locals()) + dtype = x.dtype + assert len( + x.shape) < 4, "X latitude is not supported greater than 3 now." 
+ + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'linear') + check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear') + + inputs = {'X': [x], 'Y': [weight]} + attrs = { + 'transpose_X': False, + 'transpose_Y': False, + 'alpha': 1, + } + tmp = helper.create_variable_for_type_inference(dtype) + helper.append_op(type='matmul_v2', + inputs=inputs, + outputs={'Out': tmp}, + attrs=attrs) + if bias is not None: + res = helper.create_variable_for_type_inference(dtype) + helper.append_op(type='elementwise_add', + inputs={ + 'X': [tmp], + 'Y': [bias] + }, + outputs={'Out': [res]}, + attrs={'axis': len(x.shape) - 1}) + else: + res = tmp + return res + + +def _set_var_distributed(var): + if var is None: + return + + var.is_distributed = True + + # NOTE: use current_block and find_var_recursive to support while_loop + startup_block = paddle.static.default_startup_program().current_block() + main_block = paddle.static.default_main_program().current_block() + startup_block._find_var_recursive(var.name).is_distributed = True + main_block._find_var_recursive(var.name).is_distributed = True + + +def _parallel_linear(x, + num_rows, + num_cols, + axis, + param_attr, + bias_attr, + gather_out, + inner_rank, + nranks, + split_tensor, + name, + group=None): + """ + Parallel Linear + + axis the dimension of the parameter of linear layer. + axis = 0: the row dimension + axis = 1: the col dimension + + """ + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + + if axis == 0: + if split_tensor: + x = _c_split(x, group=group) + else: + x = _c_identity(x, group=group) + + linear = paddle.nn.Linear(num_rows, + num_cols, + weight_attr=param_attr, + bias_attr=bias_attr, + name=name) + + # NOTE: npu linear function use matmul_v2 but linear use matmul + linear_function = _linear if core.is_compiled_with_npu()\ + else paddle.nn.functional.linear + linear_out = linear_function( + x, + linear.weight, + # NOTE(wangxi): row split, bias need add after allreduce + None if axis == 0 else linear.bias, + linear.name) + + _set_var_distributed(linear.weight) + # set is_distributed for splited bias + # if a linear layer is splited by row, each rank would hold a complete bias and they should be the same in each rank. 
+ # if a linear layer is splited by col, the bias would also be split into each rank as its weight + if axis == 1 and linear._bias_attr != False: + _set_var_distributed(linear.bias) + + if not gather_out: return linear_out + + out_shape = list(linear_out.shape) + out_shape[0] *= 1 if axis == 0 else nranks + main_block = paddle.static.default_main_program().current_block() + out = main_block.create_var( + shape=out_shape, + dtype=linear_out.dtype, + type=linear_out.type, + lod_level=linear_out.lod_level, + persistable=False, + is_data=False, + need_check_feed=linear_out.desc.need_check_feed()) + if axis == 0: + main_block.append_op(type='c_allreduce_sum', + inputs={'X': linear_out}, + outputs={'Out': out}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': True, + 'use_model_parallel': True + }) + if linear.bias is not None: + out = out + linear.bias + else: + main_block.append_op(type='c_concat', + inputs={'X': linear_out}, + outputs={'Out': out}, + attrs={ + 'rank': inner_rank, + 'ring_id': ring_id, + 'nranks': nranks, + 'use_calc_stream': True, + 'use_model_parallel': True + }) + return out + + +def _parallel_embedding(x, + per_part_embeddings, + origin_size, + param_attr, + inner_rank, + num_partitions, + name, + group=None): + """ + Parallel Embedding + """ + if group is not None and not group.is_member(): + return + ring_id = 0 if group is None else group.id + + helper = LayerHelper("_parallel_embedding", **locals()) + + per_part_size = per_part_embeddings + rank = inner_rank + + vocab_start_index = rank * per_part_size + dtype = helper.get_default_dtype() + size = [per_part_size, origin_size[1]] + + weight = helper.create_parameter(attr=param_attr, + shape=size, + dtype=dtype, + is_bias=False) + + if num_partitions == 1: + return paddle.nn.functional.embedding(x, + weight=weight, + padding_idx=None, + sparse=False, + name=name) + + startup_block = paddle.static.default_startup_program().global_block() + main_block = paddle.static.default_main_program().global_block() + startup_block.vars[weight.name].is_distributed = True + main_block.vars[weight.name].is_distributed = True + + output_parallel = _c_lookup_table(weight, + x, + start_index=vocab_start_index, + name=name) + out = _mp_allreduce(output_parallel, + group=group, + use_calc_stream=True, + use_model_parallel=True) + return out + + +def split(x, + size, + operation, + axis=0, + num_partitions=1, + gather_out=True, + weight_attr=None, + bias_attr=None, + name=None): + """ + + Split the weight of the specified operation into multiple devices + and do the computation in parallel. + + Now the following three cases are supported. + + Case 1: Parallel Embedding + The weight of the embedding operation is a NxM matrix with N rows and M columns. + With parallel embedding, the weight is split into num_partitions partitions, each + of which is a matrix with (N/num_partitions + 1) rows and M column where the last + row as the padding idx. + + Suppose we split the NxM weight into two partitons on device_0 and device_1 + respectively. Then, one each device, the final weight has (N/2 + 1) rows with the + index range from 0 to N/2. On device_0, all values in the input within [0, N/2 -1] + keep unchanged and all other values are changed to N/2 which is the padding index and + are mapped to all zeros after embedding. In the same way, on device_1, the value V in the + input within [N/2, N-1] will be changed to (V - N/2), and all other values are changed + to N/2 and are mapped to all zeros after embedding. 
Finally, the results on the two + devices are sum-reduced. + + The Embedding put on single card is as shown below: + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_embedding_single.png + :width: 800 + :height: 350 + :alt: single_embedding + :align: center + + Parallel Embedding is shown as below: + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_embedding_split.png + :width: 800 + :alt: split_embedding + :align: center + + Case 2: Row Parallel Linear + The weight of the linear operation is a NxM matrix with N rows and M columns. + With row parallel linear, the weight is split into num_partitions partitions, each + of which is a matrix with N/num_partitions rows and M column. + + The linear layer put on single card is shown as below, the input variable is represented by X, + the weight matrix is represented by W and the output vaiable is O. The linear layer on single card is + simple matrix multiplication operation, O = X * W. + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_single.png + :width: 800 + :alt: single_linear + :align: center + + Row Parallel Linear is shown as below. As the name suggests, Row Parallel Linear splits the weight matrix W into + [[W_row1], [W_row2]] along the row. And accordingly the input is splitted along the column into [X_col1, X_col2] and multiply their + respective weight matrices. Finally apply AllReduce on the output from each card to get the final output. + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_row.png + :width: 800 + :alt: split_row + :align: center + + Case 3: Column Parallel Linear + The weight of the linear operation is a NxM matrix with N rows and M columns. + With column parallel linear, the weight is split into num_paratitions partitions, each + of which is a matrix with N rows and M/num_partitions column. + + The linear layer put on single card has been illustrated on case 2 and Column Parallel Linear + is shown as below. The Column Parallel Linear splits the weight matrix W into [W_col1, W_col2] along the column and + these splitted matrices respectively multiply the input. Finally apply AllGather on the output from each card to get the final output. + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_col.png + :width: 800 + :alt: split_col + :align: center + + As observed, the column parallel linear and row parallel linear can be combined to skip one ALLGATHER communication + operator. Furthermore the Attention and MLP can be combined to imporve the performance as shown below. + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/split_col_row.png + :width: 800 + :alt: split_col_row + :align: center + + Args: + x (Tensor): Input tensor. It's data type should be float16, float32, float64, int32 or int64. + size (list|tuple): A list or tuple with two elements indicating the shape of the weight. + operation (str): The name of the operation. The supported operations are 'linear' and 'embedding'. + axis (int, Optional): Indicate along which axis to split the weight. Default: 0. + num_partitions (int, Optional): How many parts the weight is partitioned. Default: 1. + gather_out (bool, Optional): Whether to gather the output after computation. 
By default, the output + on each partitions will be gathered after computation. Default: True. + weight_attr (ParamAttr, Optional): The parameter attribute for the learnable + weights(Parameter) of the specified operation. Default: None. + bias_attr (ParamAttr, Optional): The parameter attribute for the bias + of the specified operation. Default: None. + name (str, Optional): The default value is None. Normally there is no need for user to set this + property. Default: None. For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor. + + Examples: + .. code-block:: python + + # required: distributed + import paddle + import paddle.distributed.fleet as fleet + + paddle.enable_static() + paddle.set_device('gpu:%d'%paddle.distributed.ParallelEnv().dev_id) + fleet.init(is_collective=True) + data = paddle.randint(0, 8, shape=[10,4]) + emb_out = paddle.distributed.split( + data, + (8, 8), + operation="embedding", + num_partitions=2) + + """ + assert isinstance( + size, + (list, tuple)), ("The type of size for " + "paddle.distributed.split must be list or tuple.") + assert len(size) == 2, ("Number of elements in size of " + "paddle.distributed.split must be two.") + assert isinstance(operation, str), ("The type of operation for " + "paddle.distributed.split must be str.") + supported_operations = [ + 'linear', + 'embedding', + ] + assert operation in supported_operations, ( + "The operation for " + "paddle.distributed.split must be one of {}.".format( + supported_operations)) + if _non_static_mode(): + raise ValueError( + "paddle.distributed.split cannot be used in dynamic " + "graph mode, plese use ParallelEmbedding, ParallelRowLinear, " + "ParallelColumnLinear instead.") + else: + from paddle.distributed.fleet import fleet + assert fleet._role_maker, ("To use paddle.distributed.split, " + "you must call fleet.init() firstly.") + rank = fleet.worker_index() + nranks = fleet.worker_num() + + # rank within a model parallel group + inner_rank = rank % num_partitions + + if operation == "embedding": + assert axis == 0, ("We only support to split the weight of embedding " + "along the first axis now.") + assert size[0] % num_partitions == 0, \ + "The length of the vocabulary must be divisible by num_partitions " \ + "but received vocabulary={} num_partitions={}".format(size[0], num_partitions) + + per_part_size = size[0] // num_partitions + emb_out = _parallel_embedding(x, + per_part_size, + size, + weight_attr, + inner_rank, + num_partitions, + name, + group=None) + return emb_out + else: + should_split = False + if axis == 0: + assert size[0] % num_partitions == 0, ( + "Number of rows of the weight for linear ({}) must be" + " divisible by num_partitions ({})".format( + size[0], num_partitions)) + per_part_size = size[0] // num_partitions + linear_size = (per_part_size, size[1]) + if x.shape[-1] == size[0]: should_split = True + + elif axis == 1: + assert size[1] % num_partitions == 0, ( + "Number of column of the weight for linear ({}) must be" + " divisible by num_partitions ({})".format( + size[1], num_partitions)) + per_part_size = size[1] // num_partitions + linear_size = (size[0], per_part_size) + else: + raise ValueError("The value of axis must be 0 or 1, but the value " + "given is {}.".format(axis)) + + linear_out = _parallel_linear(x, + linear_size[0], + linear_size[1], + axis, + weight_attr, + bias_attr, + gather_out, + inner_rank, + num_partitions, + should_split, + name=name, + group=None) + return linear_out diff --git 
a/python/paddle/distributed/fleet/layers/mpu/random.py b/python/paddle/distributed/fleet/layers/mpu/random.py new file mode 100644 index 00000000000000..7577be6253cbfa --- /dev/null +++ b/python/paddle/distributed/fleet/layers/mpu/random.py @@ -0,0 +1,243 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numpy as np +import contextlib +from paddle import _C_ops, _legacy_C_ops +from paddle.fluid import core +from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle.fluid.framework import _non_static_mode, default_main_program, Variable +from paddle.fluid.layer_helper import LayerHelper + +__all__ = [] + +MODEL_PARALLEL_RNG = 'model_parallel_rng' + +# This file is inspired by Megatron to control random states for MP: +# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/mpu/random.py + + +class RNGStatesTracker: + """ + Tracker the RNG states. + """ + + def __init__(self): + # Map from name to the rng state. + self.states_ = {} + self.seeds_ = set() + + def reset(self): + self.states_ = {} + self.seeds_ = set() + + def add(self, name, seed): + if seed in self.seeds_: + raise ValueError('seed {} already exists'.format(seed)) + self.seeds_.add(seed) + if name in self.states_: + raise ValueError('state {} already exists'.format(name)) + orig_rng_state = paddle.get_cuda_rng_state() + paddle.seed(seed) + self.states_[name] = paddle.get_cuda_rng_state() + paddle.set_cuda_rng_state(orig_rng_state) + + def get_states_tracker(self): + states = {} + for name in self.states_: + states[name] = self.states_[name] + return states + + def set_states_tracker(self, states): + self.states_ = states + + @contextlib.contextmanager + def rng_state(self, name=MODEL_PARALLEL_RNG): + if name not in self.states_: + raise ValueError('state {} does not exist'.format(name)) + orig_cuda_rng_state = paddle.get_cuda_rng_state() + paddle.set_cuda_rng_state(self.states_[name]) + try: + yield + finally: + self.states_[name] = paddle.get_cuda_rng_state() + paddle.set_cuda_rng_state(orig_cuda_rng_state) + + +RNG_STATE_TRACKER = RNGStatesTracker() + + +def get_rng_state_tracker(): + return RNG_STATE_TRACKER + + +def model_parallel_random_seed(seed=None): + import paddle.distributed.fleet as fleet + hcg = fleet.get_hybrid_communicate_group() + rank = hcg.get_model_parallel_rank() + + if seed: + global_seed = seed + local_seed = seed * 1024 + rank * 100 + else: + global_seed = np.random.randint(0, 655350) + local_seed = np.random.randint(rank * 10000, (rank + 1) * 10000 - 1) + + RNG_STATE_TRACKER.reset() + RNG_STATE_TRACKER.add(MODEL_PARALLEL_RNG, local_seed) + paddle.seed(global_seed) + + +def determinate_seed(rng_name): + assert rng_name is not None and rng_name != "" + helper = LayerHelper('seed', **locals()) + out = helper.create_variable_for_type_inference(dtype=paddle.int32) + # set force_cpu to reduce sync copy from CPU->GPU->CPU, and reduce pipeline hang + helper.append_op(type='seed', + 
outputs={'Out': out}, + attrs={ + 'deterministic': True, + 'rng_name': rng_name, + 'force_cpu': True + }) + return out + + +def dropout(x, + p=0.5, + axis=None, + rng_name=None, + training=True, + mode="upscale_in_train", + name=None): + """ + Dropout is a regularization technique for reducing overfitting by preventing + neuron co-adaption during training. The dropout operator randomly sets the + outputs of some units to zero, while upscale others according to the given + dropout probability. + + Args: + x (Tensor): The input tensor. The data type is float32 or float64. + p (float|int): Probability of setting units to zero. Default 0.5. + axis (int|list|tuple): The axis along which the dropout is performed. Default None. + rng_name (str): The random seed generator name, which used to obtain deterministic results. + training (bool): A flag indicating whether it is in train phrase or not. Default True. + mode(str): ['upscale_in_train'(default) | 'downscale_in_infer']. + + 1. upscale_in_train(default), upscale the output at training time + + - train: out = input * mask / ( 1.0 - dropout_prob ) + - inference: out = input + + 2. downscale_in_infer, downscale the output at inference + + - train: out = input * mask + - inference: out = input * (1.0 - dropout_prob) + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A Tensor representing the dropout, has same shape and data type as `x` . + + + Examples: + We use ``p=0.5`` in the following description for simplicity. + + 1. When ``axis=None`` , this is commonly used dropout, which dropout each element of x randomly. + + .. code-block:: text + + Let's see a simple case when x is a 2d tensor with shape 2*3: + [[1 2 3] + [4 5 6]] + we generate mask with the same shape as x, which is 2*3. The value of mask is + sampled from a Bernoulli distribution randomly. For example, we may get such mask: + [[0 1 0] + [1 0 1]] + So the output is obtained from elementwise multiply of x and mask: + [[0 2 0] + [4 0 6]] + Using default setting, i.e. ``mode='upscale_in_train'`` , + if in training phase, the final upscale output is: + [[0 4 0 ] + [8 0 12]] + if in test phase, the output is the same as input: + [[1 2 3] + [4 5 6]] + we can also set ``mode='downscale_in_infer'`` , then + if in training phase, the final output is: + [[0 2 0] + [4 0 6]] + if in test phase, the scale output is: + [[0.5 1. 1.5] + [2. 2.5 3. 
]] + + """ + if rng_name is None: + return paddle.nn.functional.dropout(x, p, axis, training, mode, name) + + if not isinstance(p, (float, int, Variable)): + raise TypeError("p argument should be a number(int|float) or Variable") + + # fast return for p == 0 + if isinstance(p, (int, float)) and p == 0: return x + + assert 0 <= p <= 1, ValueError("p argument should between 0 and 1") + assert mode in ('downscale_in_infer', 'upscale_in_train'), \ + ValueError( + "mode argument should be 'downscale_in_infer' or 'upscale_in_train'") + + assert axis is None, \ + TypeError("unsupport axis when using random seed generator") + + mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + + # dygraph using tracker, doesn't need determinate seed + if _non_static_mode(): + out, mask = _legacy_C_ops.dropout(x, 'dropout_prob', p, 'is_test', + not training, 'fix_seed', False, + 'seed', 0, 'dropout_implementation', + mode) + return out + + seed = determinate_seed(rng_name) + + if isinstance(p, Variable) and not p.shape != [1]: + raise TypeError( + "Required p.shape == [1] if type(p) is Variable, but received p.shape = {}" + .format(p.shape)) + + helper = LayerHelper('dropout', **locals()) + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], + 'dropout') + + out = helper.create_variable_for_type_inference(dtype=x.dtype) + mask = helper.create_variable_for_type_inference( + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + + helper.append_op(type='dropout', + inputs={ + 'X': [x], + 'Seed': seed + }, + outputs={ + 'Out': [out], + 'Mask': [mask] + }, + attrs={ + 'dropout_prob': p, + 'is_test': not training, + 'dropout_implementation': mode, + }) + return out diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py index 8a6ec33b39b739..d34f1cad270332 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/dygraph_sharding_optimizer.py @@ -146,7 +146,7 @@ def _sharding_sync_parameters(self): # instead of the relative logic rank id within group src=self._hcg.get_sharding_parallel_group().ranks[rank], group=self._hcg.get_sharding_parallel_group(), - use_calc_stream=True) + sync_op=True) def _update_trainable(self): """ diff --git a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py index 3359e63b1deff8..ad5cbf83ecb862 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/sharding_optimizer_stage2.py @@ -150,7 +150,7 @@ def _sync_params_and_buffers(self): broadcast(p, src=self._global_root_rank, group=self.group, - use_calc_stream=True) + sync_op=True) # Multi stream operation will be supported later wait(tensor=p, group=self.group, use_calc_stream=True) @@ -415,7 +415,7 @@ def _broadcast_params(self): broadcast(tensor=internal_storage.buffer, src=self.group.ranks[dst_rank], group=self.group, - use_calc_stream=True) + sync_op=True) # Multi stream operation will be supported later wait(tensor=internal_storage.buffer, diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py 
b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py index fcecc3a9a671ec..ccac803e72130e 100755 --- a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py @@ -30,15 +30,8 @@ from .sharding import utils # FIXME: import * from .sharding.utils import * - import logging - -logger = logging.getLogger(__name__) -formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s', - datefmt='%Y-%m-%d %H:%M:%S') -ch = logging.StreamHandler() -ch.setFormatter(formatter) -logger.addHandler(ch) +from ..utils.log_util import logger __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py index 6cb69bc73ce617..66a1c87756220e 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/mp_layers.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,298 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle -from paddle.fluid import core -from paddle.fluid.dygraph.layers import Layer -from .random import get_rng_state_tracker -from paddle.nn import functional as F -from paddle import framework -from ...base import topology as tp -from paddle.autograd import PyLayer +from ...layers.mpu.mp_layers import VocabParallelEmbedding # noqa: F401 +from ...layers.mpu.mp_layers import ColumnParallelLinear # noqa: F401 +from ...layers.mpu.mp_layers import RowParallelLinear # noqa: F401 +from ...layers.mpu.mp_layers import ParallelCrossEntropy # noqa: F401 __all__ = [] - -# Follow this paper to achieve the file: -# Shoeybi M, Patwary M, Puri R, et al. Megatron-lm: Training multi-billion parameter -# language models using model parallelism[J]. arXiv preprint arXiv:1909.08053, 2019. 
(https://arxiv.org/abs/1909.08053) - - -def is_fused_matmul_bias_supported(): - if paddle.is_compiled_with_cuda() and not paddle.is_compiled_with_rocm(): - return hasattr(core.ops, 'fused_gemm_epilogue') - else: - return False - - -class VocabParallelEmbedding(Layer): - - def __init__(self, - num_embeddings, - embedding_dim, - weight_attr=None, - mp_group=None, - name=None): - super(VocabParallelEmbedding, self).__init__() - - self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( - ) if mp_group is None else mp_group - self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( - ) if mp_group is None else mp_group.nranks - self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank( - ) if mp_group is None else mp_group.rank - - self.origin_num_embeddings = num_embeddings - self.is_mp = (self.world_size > 1) - - assert num_embeddings % self.world_size == 0, ( - "The length of the vocabulary must be divisible by the parallelism degree of MP" - ) - - per_part_size = num_embeddings // self.world_size - - self.vocab_start_index = self.rank * per_part_size - self._dtype = self._helper.get_default_dtype() - self._size = [per_part_size, embedding_dim] - self._weight_attr = weight_attr - self._name = name - - if self.is_mp and paddle.in_dynamic_mode(): - with get_rng_state_tracker().rng_state(): - self.weight = self.create_parameter(attr=self._weight_attr, - shape=self._size, - dtype=self._dtype, - is_bias=False) - else: - self.weight = self.create_parameter(attr=self._weight_attr, - shape=self._size, - dtype=self._dtype, - is_bias=False) - - self.weight.is_distributed = True if self.is_mp else False - - def forward(self, x): - if self.is_mp: - output_parallel = paddle.distributed.collective._c_lookup_table( - self.weight, - x, - start_index=self.vocab_start_index, - name=self._name) - output = paddle.distributed.collective._mp_allreduce( - output_parallel, - group=self.model_parallel_group, - use_calc_stream=True, - use_model_parallel=True) - else: - output = F.embedding(x, - weight=self.weight, - padding_idx=None, - sparse=False, - name=self._name) - return output - - -class ColumnParallelLinear(Layer): - - def __init__(self, - in_features, - out_features, - weight_attr=None, - has_bias=None, - gather_output=True, - fuse_matmul_bias=False, - mp_group=None, - name=None): - super(ColumnParallelLinear, self).__init__() - - self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( - ) if mp_group is None else mp_group - self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( - ) if mp_group is None else mp_group.nranks - self._name = name - self.is_mp = (self.world_size > 1) - - self.gather_output = gather_output - assert out_features % self.world_size == 0, ( - "Number of column of the weight for linear ({}) must be" - " divisible by model parallel size ({})".format( - out_features, self.world_size)) - self.output_size_per_partition = out_features // self.world_size - - self._weight_attr = weight_attr - self._dtype = self._helper.get_default_dtype() - - if self.is_mp and paddle.in_dynamic_mode(): - with get_rng_state_tracker().rng_state(): - self.weight = self.create_parameter( - shape=[in_features, self.output_size_per_partition], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False) - else: - self.weight = self.create_parameter( - shape=[in_features, self.output_size_per_partition], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False) - - self.weight.is_distributed = True if self.is_mp else 
False - - if has_bias: - # initialize bias to zero like Megatron - self.bias = self.create_parameter( - shape=[self.output_size_per_partition], - attr=paddle.nn.initializer.Constant(value=0.0), - dtype=self._dtype, - is_bias=True) - self.bias.is_distributed = True if self.is_mp else False - else: - self.bias = None - - self.linear = F.linear - - if fuse_matmul_bias: - if not is_fused_matmul_bias_supported(): - raise NotImplementedError( - "You set fuse_matmul_bias=True in ColumnParallelLinear, " - "however, the paddle you are using not support this operation. " - "Please set fuse_matmul_bias=False or use paddle compiled " - "with cuda 11.6 or higher.") - from paddle.incubate.nn.functional import fused_linear - self.linear = fused_linear - - def forward(self, x): - # use inner api to process identity - if self.is_mp: - input_parallel = paddle.distributed.collective._c_identity( - x, group=self.model_parallel_group) - else: - input_parallel = x - - output_parallel = self.linear(input_parallel, - self.weight, - self.bias, - name=self._name) - - if self.gather_output and self.is_mp: - output = paddle.distributed.collective._c_concat( - output_parallel, group=self.model_parallel_group) - else: - output = output_parallel - return output - - -class RowParallelLinear(Layer): - - def __init__(self, - in_features, - out_features, - weight_attr=None, - has_bias=True, - input_is_parallel=False, - fuse_matmul_bias=False, - mp_group=None, - name=None): - super(RowParallelLinear, self).__init__() - - self.in_features = in_features - self.out_features = out_features - self.input_is_parallel = input_is_parallel - self._weight_attr = weight_attr - self._dtype = self._helper.get_default_dtype() - self._name = name - - self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( - ) if mp_group is None else mp_group - self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( - ) if mp_group is None else mp_group.nranks - self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank( - ) if mp_group is None else mp_group.rank - - self.is_mp = (self.world_size > 1) - assert in_features % self.world_size == 0, ( - "Number of row of the weight for linear ({}) must be" - " divisible by model parallel size ({})".format( - in_features, self.world_size)) - - self.input_size_per_partition = in_features // self.world_size - - if self.is_mp and paddle.in_dynamic_mode(): - with get_rng_state_tracker().rng_state(): - self.weight = self.create_parameter( - shape=[self.input_size_per_partition, self.out_features], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False) - else: - self.weight = self.create_parameter( - shape=[self.input_size_per_partition, self.out_features], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False) - - self.weight.is_distributed = True if self.is_mp else False - - if has_bias: - self.bias = self.create_parameter( - shape=[self.out_features], - attr=paddle.nn.initializer.Constant(value=0.0), - dtype=self._dtype, - is_bias=True) - else: - self.bias = None - - self.linear = F.linear - - if fuse_matmul_bias: - if not is_fused_matmul_bias_supported(): - raise NotImplementedError( - "You set fuse_matmul_bias=True in RowParallelLinear, " - "however, the paddle you are using not support this operation. 
" - "Please set fuse_matmul_bias=False or use paddle compiled " - "with cuda 11.6 or higher.") - from paddle.incubate.nn.functional import fused_linear - self.linear = fused_linear - - def forward(self, x): - if self.input_is_parallel or (not self.is_mp): - input_parallel = x - else: - # split last dim - input_parallel = paddle.distributed.collective._c_split( - x, group=self.model_parallel_group) - - if self.is_mp: - output_parallel = self.linear(input_parallel, - self.weight, - name=self._name) - output_ = paddle.distributed.collective._mp_allreduce( - output_parallel, - group=self.model_parallel_group, - use_calc_stream=True, - use_model_parallel=True) - output = output_ + self.bias if self.bias is not None else output_ - else: - output = self.linear(input_parallel, - self.weight, - self.bias, - name=self._name) - - return output - - -class ParallelCrossEntropy(Layer): - - def __init__(self, mp_group=None, name=None): - super(ParallelCrossEntropy, self).__init__() - self.name = name - self.model_parallel_group = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_group( - ) if mp_group is None else mp_group - self.world_size = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_world_size( - ) if mp_group is None else mp_group.nranks - self.rank = tp._HYBRID_PARALLEL_GROUP.get_model_parallel_rank( - ) if mp_group is None else mp_group.rank - - def forward(self, input, label): - loss = paddle.distributed.collective._c_softmax_with_cross_entropy( - input, label, group=self.model_parallel_group) - return loss diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py index a6e8661f7a6eae..0e14d141238caf 100755 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/pp_layers.py @@ -49,14 +49,14 @@ import paddle from paddle.fluid.dygraph.layers import Layer from ...utils.log_util import logger, layer_to_str -from ..pp_utils.utils import _hp_recompute, _initialize_recompute_setting +from paddle.distributed import fleet from paddle.fluid.framework import in_dygraph_mode +from paddle.incubate.distributed.fleet import recompute_hybrid __all__ = [] class LayerDesc(object): - def __init__(self, layer_func, *inputs, **kwargs): self.layer_func = layer_func self.inputs = inputs @@ -64,25 +64,28 @@ def __init__(self, layer_func, *inputs, **kwargs): if not issubclass(layer_func, Layer): raise TypeError( - "The input(layer_func) should be a derived class of Layer.") + "The input(layer_func) should be a derived class of Layer." 
+ ) def build_layer(self): return self.layer_func(*self.inputs, **self.kwargs) def __repr__(self): - return layer_to_str(self.layer_func.__name__, *self.inputs, - **self.kwargs) + return layer_to_str( + self.layer_func.__name__, *self.inputs, **self.kwargs + ) class SharedLayerDesc(LayerDesc): - - def __init__(self, - key, - layer_func, - forward_func=None, - shared_weight_attr='weight', - *inputs, - **kwargs): + def __init__( + self, + key, + layer_func, + forward_func=None, + shared_weight_attr='weight', + *inputs, + **kwargs + ): super(SharedLayerDesc, self).__init__(layer_func, *inputs, **kwargs) self.layer_name = key self.forward_func = forward_func @@ -90,12 +93,13 @@ def __init__(self, class SegmentLayers(object): - - def __init__(self, - layers_desc, - num_parts, - method="uniform", - num_virtual_pipeline_stage=None): + def __init__( + self, + layers_desc, + num_parts, + method="uniform", + num_virtual_pipeline_stage=None, + ): self._layers_desc = layers_desc self.method = method self.num_parts = num_parts @@ -103,7 +107,9 @@ def __init__(self, self.num_virtual_pipeline_stage = num_virtual_pipeline_stage if self.num_virtual_pipeline_stage is not None: self.total_parts = num_parts * self.num_virtual_pipeline_stage - assert self.num_items >= self.num_parts, "layer number should be greater than number of segments" + assert ( + self.num_items >= self.num_parts + ), "layer number should be greater than number of segments" def do_segment(self): if self.method == "uniform": @@ -117,12 +123,17 @@ def do_segment(self): for idx in weight_idxs: weights[idx] = 1 - actual_num_parts = self.num_parts if self.num_virtual_pipeline_stage is None else self.total_parts - - assert sum( - weights - ) % actual_num_parts == 0, "number of layers ({}) should be divided by part number({})".format( - sum(weights), actual_num_parts) + actual_num_parts = ( + self.num_parts + if self.num_virtual_pipeline_stage is None + else self.total_parts + ) + + assert ( + sum(weights) % actual_num_parts == 0 + ), "number of layers ({}) should be divided by part number({})".format( + sum(weights), actual_num_parts + ) part_size = sum(weights) // actual_num_parts result = [0 for _ in range(actual_num_parts + 1)] @@ -155,21 +166,23 @@ def _gen_layer_weight(self, layername): if regex.search(name): weight_idxs.append(idx) - assert len( - weight_idxs) > 0, "weight_idxs' length should be greater than 0" + assert ( + len(weight_idxs) > 0 + ), "weight_idxs' length should be greater than 0" return weight_idxs def uniform(self, num_items, num_parts): result = [0 for _ in range(num_parts + 1)] part_size = math.floor(num_items / num_parts) - for i in range(num_parts): - result[i] = int(min(part_size * i, num_items)) + extra_layers = num_items % num_parts + for i in range(1, num_parts): + offset = 1 if i > (num_parts - extra_layers) else 0 + result[i] = int(min(result[i - 1] + part_size + offset, num_items)) result[num_parts] = num_items return result class PipelineLayerChunk(Layer): - def __init__(self): super(PipelineLayerChunk, self).__init__() self.run_function = [] @@ -191,18 +204,19 @@ def forward(self, *args, **kwargs): # behavior under recompute circumstance. raise PermissionError( "The forward function of PipelineLayerChunk cannot be called directly. " - "Please call forward function of PipelineLayer.") + "Please call forward function of PipelineLayer." + ) class PipelineLayer(Layer): """PipelineLayer Args: layers(Iterable): A sequence of layers description to define the structure for pipeline. 
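
The rewritten uniform segmentation above no longer gives every stage the same floored size (which previously pushed the whole remainder onto the final stage); when num_items does not divide num_parts evenly, the extra layers are now spread one apiece over the last num_items % num_parts stages. A standalone sketch of the new boundary computation (uniform_segment is an illustrative free function mirroring SegmentLayers.uniform):

import math

def uniform_segment(num_items, num_parts):
    # Returns cumulative boundaries; stage i owns layers [result[i], result[i + 1]).
    result = [0 for _ in range(num_parts + 1)]
    part_size = math.floor(num_items / num_parts)
    extra_layers = num_items % num_parts
    for i in range(1, num_parts):
        # The last `extra_layers` stages each take one additional layer.
        offset = 1 if i > (num_parts - extra_layers) else 0
        result[i] = int(min(result[i - 1] + part_size + offset, num_items))
    result[num_parts] = num_items
    return result

# 10 layers over 4 stages: the 2 extra layers land on the last two stages.
print(uniform_segment(10, 4))  # [0, 2, 4, 7, 10] -> sizes 2, 2, 3, 3
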
- num_stages(int, optional): pp degree, if not specified, 'topology' parameter must be given. + num_stages(int, optional): pp degree, if not specified, 'topology' parameter must be given. topology(CommunicateTopology, optional): topo of hybrid parallel, if it is None, 'num_stages' parameters must be given. loss_fn(callable, optional): Loss function. seg_method(str, optional): the method of splitting pp layer, default 'uniform', or use specific layer to split, method's name must be start with 'layer:'. - recompute_interval(int, optional): the number of layers to be used recompute, the value of 0 represents no recompute. default 0. + recompute_interval(int, optional): the number of layers to be used recompute, the value of 0 represents no recompute. default 0. recompute_ctx(dict,optional): the context of recompute, when 'recompute_interval' > 0, the context must be given. num_virtual_pipeline_stages(int, optional): the num of virtual pipeline stages for interleave pp. Examples: @@ -212,7 +226,7 @@ class PipelineLayer(Layer): from paddle.fluid.dygraph.layers import Layer import paddle.nn.functional as F from paddle.distributed.fleet.meta_parallel import LayerDesc, PipelineLayer - + pipeline_parallel_size = 2 strategy = fleet.DistributedStrategy() strategy.hybrid_configs = { @@ -224,19 +238,19 @@ class PipelineLayer(Layer): "accumulate_steps": 4, "micro_batch_size": 2 } - + fleet.init(is_collective=True, strategy=strategy) - + hcg = fleet.get_hybrid_communicate_group() - + class ReshapeHelp(Layer): def __init__(self, shape): super(ReshapeHelp, self).__init__() self.shape = shape - + def forward(self, x): return x.reshape(shape=self.shape) - + class AlexNetPipeDesc(PipelineLayer): def __init__(self, num_classes=10, **kwargs): self.num_classes = num_classes @@ -268,37 +282,46 @@ def __init__(self, num_classes=10, **kwargs): ] super(AlexNetPipeDesc, self).__init__( layers=decs, loss_fn=nn.CrossEntropyLoss(), **kwargs) - + model = AlexNetPipeDesc(num_stages=pipeline_parallel_size, topology=hcg._topo) """ - def __init__(self, - layers, - num_stages=None, - topology=None, - loss_fn=None, - seg_method="uniform", - recompute_interval=0, - recompute_ctx=None, - num_virtual_pipeline_stages=None): + def __init__( + self, + layers, + num_stages=None, + topology=None, + loss_fn=None, + seg_method="uniform", + recompute_interval=0, + recompute_ctx=None, + num_virtual_pipeline_stages=None, + ): super(PipelineLayer, self).__init__() if num_stages is None and topology is None: raise ValueError("should provide num_stages or topology") if num_virtual_pipeline_stages: - assert isinstance(num_virtual_pipeline_stages, int), \ - "virtual_pipeline_stage should be None or an int" + assert isinstance( + num_virtual_pipeline_stages, int + ), "virtual_pipeline_stage should be None or an int" if num_virtual_pipeline_stages > 1: logger.info( "set num_virtual_pipeline_stages > 1 means using interleave scheduler instead of 1f1b scheduler" ) - assert isinstance(seg_method, str), \ - "seg_method should be a str for interleave scheduler" - assert seg_method.startswith('layer:'), \ - "seg_method shoud be start with layer: for interleave scheduler" - - self._num_virtual_pipeline_stages = 1 if num_virtual_pipeline_stages is None else num_virtual_pipeline_stages + assert isinstance( + seg_method, str + ), "seg_method should be a str for interleave scheduler" + assert seg_method.startswith( + 'layer:' + ), "seg_method shoud be start with layer: for interleave scheduler" + + self._num_virtual_pipeline_stages = ( + 1 + if 
num_virtual_pipeline_stages is None + else num_virtual_pipeline_stages + ) # lazy import import paddle.distributed as dist @@ -309,16 +332,20 @@ def __init__(self, self._loss_fn = loss_fn self._topo = topology self._recompute_interval = recompute_interval + self.recompute_ctx = recompute_ctx if recompute_interval > 0: - assert recompute_ctx is not None, "recompute_ctx must be not None for recompute." + assert ( + recompute_ctx is not None + ), "recompute_ctx must be not None for recompute." offload = recompute_ctx.get('offload', False) partition = recompute_ctx.get('partition', False) - _initialize_recompute_setting(offload, partition) logger.info( - "Start Recompute for PipeLineParallel. recompute_offload: {}, recompute_partition: {}" - .format(offload, partition)) + "Start Recompute for PipeLineParallel. recompute_offload: {}, recompute_partition: {}".format( + offload, partition + ) + ) world_size = dist.get_world_size() self.global_rank = dist.get_rank() @@ -327,22 +354,28 @@ def __init__(self, self._stage_id = self._topo.get_coord(self.global_rank).pipe self._num_stages = self._topo.get_dim_size("pipe") if num_stages: - assert self._num_stages == num_stages, "num_stages should be equal to be %d" % ( - self._num_stages) + assert ( + self._num_stages == num_stages + ), "num_stages should be equal to be %d" % (self._num_stages) else: # construct default topology if world_size % num_stages != 0: raise ValueError( "should provide correct num_stages({}) " "which can be divided by world_size({})".format( - num_stages, world_size)) + num_stages, world_size + ) + ) dp_num = world_size // num_stages - self._topo = fleet.CommunicateTopology(["data", "pipe", "model"], - [dp_num, num_stages, 1]) + self._topo = fleet.CommunicateTopology( + ["data", "pipe", "model"], [dp_num, num_stages, 1] + ) self._stage_id = self._topo.get_coord(self.global_rank).pipe self._num_stages = self._topo.get_dim_size("pipe") - self._total_stages_with_virtual_stages = self._num_stages * self._num_virtual_pipeline_stages + self._total_stages_with_virtual_stages = ( + self._num_stages * self._num_virtual_pipeline_stages + ) # initialize segment self._layers_desc = list(self.layers) @@ -377,19 +410,25 @@ def get_stage_from_index(self, layer_idx): for virtual_pp_rank in range(self._num_virtual_pipeline_stages): # Mapping the virtual pipeline stage to the real pipeline stage. # start_idx marks the start of a new virtual pp stage. 
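
The change just below corrects the stride used when mapping a layer index back to its real pipeline stage under interleave scheduling: each virtual pass over the pipeline advances segment_parts by num_stages entries, not by num_virtual_pipeline_stages. A small sketch of the corrected lookup, assuming the same cumulative segment_parts layout (get_stage is an illustrative free function, not the class method):

def get_stage(layer_idx, segment_parts, num_stages, num_virtual_stages):
    # segment_parts holds num_stages * num_virtual_stages + 1 cumulative boundaries.
    for virtual_pp_rank in range(num_virtual_stages):
        # A new virtual pass over the real stages starts every num_stages entries.
        start_idx = virtual_pp_rank * num_stages
        for stage in range(num_stages):
            if (segment_parts[start_idx + stage]
                    <= layer_idx
                    < segment_parts[start_idx + stage + 1]):
                return stage

# 8 layers, 2 real stages, 2 virtual stages -> boundaries [0, 2, 4, 6, 8].
# Layers 0-1 and 4-5 live on real stage 0; layers 2-3 and 6-7 on real stage 1.
parts = [0, 2, 4, 6, 8]
assert [get_stage(i, parts, 2, 2) for i in range(8)] == [0, 0, 1, 1, 0, 0, 1, 1]
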
- start_idx = virtual_pp_rank * self._num_virtual_pipeline_stages + start_idx = virtual_pp_rank * self._num_stages for stage in range(self._num_stages): # stage mark the real pp stage - if self.segment_parts[start_idx + - stage] <= layer_idx < self.segment_parts[ - start_idx + stage + 1]: + if ( + self.segment_parts[start_idx + stage] + <= layer_idx + < self.segment_parts[start_idx + stage + 1] + ): return stage def get_num_virtual_stages(self): return self._num_virtual_pipeline_stages def get_model_chunks(self): - return None if self._num_virtual_pipeline_stages == 1 else self._model_chunks + return ( + None + if self._num_virtual_pipeline_stages == 1 + else self._model_chunks + ) def _construct_shared_comm(self): shared_comm = {} @@ -397,17 +436,21 @@ def _construct_shared_comm(self): return layers_desc = self._layers_desc - shared_layer_names = set(s.layer_name for s in layers_desc - if isinstance(s, SharedLayerDesc)) + shared_layer_names = set( + s.layer_name for s in layers_desc if isinstance(s, SharedLayerDesc) + ) for key in shared_layer_names: shared_layers = [] for idx, layer in enumerate(layers_desc): - if isinstance(layer, - SharedLayerDesc) and layer.layer_name == key: + if ( + isinstance(layer, SharedLayerDesc) + and layer.layer_name == key + ): shared_layers.append(idx) shared_stages = set( - self.get_stage_from_index(idx) for idx in shared_layers) + self.get_stage_from_index(idx) for idx in shared_layers + ) self._dp_degree = self._topo.get_dim('data') self._mp_degree = self._topo.get_dim('model') self._sharding_degree = self._topo.get_dim('sharding') @@ -424,7 +467,9 @@ def _construct_shared_comm(self): pipe=s, data=dp, sharding=sharding, - model=mp)) + model=mp, + ) + ) group = paddle.distributed.new_group(ranks=shared_ranks) if self.global_rank in shared_ranks: @@ -433,8 +478,9 @@ def _construct_shared_comm(self): shared_comm[key] = { 'ranks': shared_ranks, 'group': group, - 'weight_attr': - self.shared_weight_attrs[key], + 'weight_attr': self.shared_weight_attrs[ + key + ], 'layer': self.shared_layers[key], } return shared_comm @@ -442,10 +488,11 @@ def _construct_shared_comm(self): def _synchronize_shared_weights(self): for key, comm in self.shared_comm.items(): with paddle.framework.no_grad(): - paddle.distributed.broadcast(getattr(comm['layer'], - comm['weight_attr']), - src=min(comm['ranks']), - group=comm['group']) + paddle.distributed.broadcast( + getattr(comm['layer'], comm['weight_attr']), + src=min(comm['ranks']), + group=comm['group'], + ) for param in comm['layer'].parameters(): if self.global_rank != min(comm['ranks']): @@ -457,8 +504,9 @@ def allreduce_shared_weight_gradients(self): # need use trace_op to allreduce weight if in_dygraph_mode(): with paddle.framework.no_grad(): - paddle.distributed.all_reduce(param.grad, - group=comm['group']) + paddle.distributed.all_reduce( + param.grad, group=comm['group'] + ) else: with paddle.framework.no_grad(): paddle.fluid.framework._dygraph_tracer().trace_op( @@ -467,8 +515,9 @@ def allreduce_shared_weight_gradients(self): outputs={'Out': param._grad_ivar()}, attrs={ 'ring_id': comm['group'].id, - 'use_calc_stream': True - }) + 'use_calc_stream': True, + }, + ) def _segment_network_for_interleave(self, seg_method): logger.info("start segment network for interleave scheduler") @@ -476,14 +525,20 @@ def _segment_network_for_interleave(self, seg_method): self._layers_desc, num_parts=self._num_stages, method=seg_method, - num_virtual_pipeline_stage=self._num_virtual_pipeline_stages) + 
num_virtual_pipeline_stage=self._num_virtual_pipeline_stages, + ) self.segment_parts = seg.do_segment() - logger.info("segment result:" + - ", ".join(str(arg) for arg in self.segment_parts)) + logger.info( + "segment result:" + + ", ".join(str(arg) for arg in self.segment_parts) + ) - for i in range(self._stage_id, self._total_stages_with_virtual_stages, - self._num_virtual_pipeline_stages): + for i in range( + self._stage_id, + self._total_stages_with_virtual_stages, + self._num_stages, + ): # If there are 2 real pp stages and 2 virtual pp stages, and the model has 8 layers. # Layers [0, 1], [4, 5] will be assigned to the first real pp stage. # Layers [2, 3], [6, 7] will be assigned to the second real pp stage. @@ -499,13 +554,15 @@ def _segment_network_for_interleave(self, seg_method): def _segment_network(self, seg_method): logger.info("start segment network..") - seg = SegmentLayers(self._layers_desc, - num_parts=self._num_stages, - method=seg_method) + seg = SegmentLayers( + self._layers_desc, num_parts=self._num_stages, method=seg_method + ) self.segment_parts = seg.do_segment() - logger.info("segment result:" + - ", ".join(str(arg) for arg in self.segment_parts)) + logger.info( + "segment result:" + + ", ".join(str(arg) for arg in self.segment_parts) + ) self._start_pos = self.segment_parts[self._stage_id] self._end_pos = self.segment_parts[self._stage_id + 1] @@ -513,22 +570,30 @@ def _segment_network(self, seg_method): def _print_segmentation_for_debug(self): # print information for debug - for stage in range(self._num_stages * - self._num_virtual_pipeline_stages): + for stage in range( + self._num_stages * self._num_virtual_pipeline_stages + ): start = self.segment_parts[stage] end = self.segment_parts[stage + 1] - logger.info("stage={}, global_rank={} ,layer_number={}".format( - stage, self.global_rank, end - start)) + logger.info( + "stage={}, global_rank={} ,layer_number={}".format( + stage, self.global_rank, end - start + ) + ) for index, layer in enumerate(self._layers_desc[start:end]): logger.info("{}: {}".format(index + start, str(layer))) if self._num_virtual_pipeline_stages > 1: for stage in range(self._num_stages): - stage_to_virtual_stage_info = "stage {} contains virtual stages: ".format( - stage) - for i in range(stage, self._total_stages_with_virtual_stages, - self._num_virtual_pipeline_stages): + stage_to_virtual_stage_info = ( + "stage {} contains virtual stages: ".format(stage) + ) + for i in range( + stage, + self._total_stages_with_virtual_stages, + self._num_stages, + ): stage_to_virtual_stage_info += " {},".format(i) logger.info(stage_to_virtual_stage_info) @@ -574,9 +639,11 @@ def _build_layer_impl(self, start, end): if layer.layer_name not in self.shared_layers: self.shared_layers[layer.layer_name] = layer.build_layer() self.shared_weight_attrs[ - layer.layer_name] = layer.shared_weight_attr + layer.layer_name + ] = layer.shared_weight_attr for param in self.shared_layers[ - layer.layer_name].parameters(): + layer.layer_name + ].parameters(): setattr(param, "is_firstly_shared", True) if layer.forward_func is None: @@ -584,8 +651,11 @@ def _build_layer_impl(self, start, end): else: run_function.append( - partial(layer.forward_func, - self.shared_layers[layer.layer_name])) + partial( + layer.forward_func, + self.shared_layers[layer.layer_name], + ) + ) elif isinstance(layer, LayerDesc): model = layer.build_layer() @@ -600,11 +670,12 @@ def _build_layer_impl(self, start, end): return run_function def forward_function(self, start, end): + run_function = 
self.run_function def execute_func(*x): if len(x) == 1: x = x[0] - for idx, layer in enumerate(self.run_function[start:end]): + for idx, layer in enumerate(run_function[start:end]): x = layer(x) return x @@ -613,11 +684,15 @@ def execute_func(*x): def forward(self, input, chunk_id=None): if chunk_id is not None: assert isinstance(chunk_id, int), "chunk_id should be an int" - assert self._num_virtual_pipeline_stages > 1, \ - "chunk_id is only valid when using virtual pipeline stage" - assert chunk_id < len(self._model_chunks), \ - "The virtual pipeline only has {} chunks, " \ - "but received chunk_id {}.".format(len(self._model_chunks), chunk_id) + assert ( + self._num_virtual_pipeline_stages > 1 + ), "chunk_id is only valid when using virtual pipeline stage" + assert chunk_id < len(self._model_chunks), ( + "The virtual pipeline only has {} chunks, " + "but received chunk_id {}.".format( + len(self._model_chunks), chunk_id + ) + ) # Get the target model chunk. model_chunk = self._model_chunks[chunk_id] # Update the self.run_function to the target run functions. @@ -635,19 +710,25 @@ def forward(self, input, chunk_id=None): funcs = self.run_function[start_idx:end_idx] if not isinstance(input, tuple): - input = (input, ) + input = (input,) if self._need_recompute(funcs, input): - input = _hp_recompute( - self.forward_function(start_idx, end_idx), *input) + input = recompute_hybrid( + self.recompute_ctx, + self.forward_function(start_idx, end_idx), + *input + ) else: input = self.forward_function(start_idx, end_idx)(*input) return input def _need_recompute(self, funcs, inputs): - if not any(input_.stop_gradient == False - for input_ in inputs if isinstance(input_, paddle.Tensor)): + if not any( + input_.stop_gradient == False + for input_ in inputs + if isinstance(input_, paddle.Tensor) + ): return False params = [f.parameters() for f in funcs if isinstance(f, Layer)] @@ -671,11 +752,18 @@ def _offset_dirname(ckpt_dir, local_layer_idx, local_chunk_id=None): if self._num_virtual_pipeline_stages > 1: # add virtual pipeline info to the save path assert local_chunk_id is not None - virtual_pipeline_stage_message = "-virtual_pp_stage_{:0>2d}".format( - local_chunk_id) - layer_save_path = os.path.join(ckpt_dir, - 'layer_{:0>2d}'.format(idx)) - layer_save_path = layer_save_path + virtual_pipeline_stage_message + rank_message + '-model_states.pdparams' + virtual_pipeline_stage_message = ( + "-virtual_pp_stage_{:0>2d}".format(local_chunk_id) + ) + layer_save_path = os.path.join( + ckpt_dir, 'layer_{:0>2d}'.format(idx) + ) + layer_save_path = ( + layer_save_path + + virtual_pipeline_stage_message + + rank_message + + '-model_states.pdparams' + ) return layer_save_path def _save_model(run_functions, local_chunk_id=None): @@ -698,7 +786,8 @@ def _save_model(run_functions, local_chunk_id=None): def set_state_dir(self, path): assert os.path.exists( - path), "{} not found, please check the path".format(path) + path + ), "{} not found, please check the path".format(path) def _load_model(run_functions, local_chunk_id=None): for idx, layer in enumerate(run_functions): @@ -712,21 +801,26 @@ def _load_model(run_functions, local_chunk_id=None): pos_offset = self._start_poss[local_chunk_id] layer_idx = idx + pos_offset layer_save_path = os.path.join( - path, 'layer_{0:0>2d}'.format(layer_idx)) + path, 'layer_{0:0>2d}'.format(layer_idx) + ) if self._num_virtual_pipeline_stages > 1: # add virtual pipeline info to the path assert local_chunk_id is not None - layer_save_path = layer_save_path + 
"-virtual_pp_stage_{:0>2d}".format( - local_chunk_id) - model_files = glob.glob(layer_save_path + - "*model_states.pdparams") + layer_save_path = ( + layer_save_path + + "-virtual_pp_stage_{:0>2d}".format(local_chunk_id) + ) + model_files = glob.glob( + layer_save_path + "*model_states.pdparams" + ) model_files.sort() mp_rank = self._topo.get_coord(self.global_rank).model mp_world_size = self._topo.get_dim('model') num_files = len(model_files) - load_param_path = model_files[mp_rank * num_files // - mp_world_size] + load_param_path = model_files[ + mp_rank * num_files // mp_world_size + ] model_state_dict = paddle.load(load_param_path) layer.set_state_dict(model_state_dict) diff --git a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py index 900c0f79798fcd..9deed30db66f5c 100644 --- a/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py +++ b/python/paddle/distributed/fleet/meta_parallel/parallel_layers/random.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,232 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. -import paddle -import contextlib -import numpy as np -from paddle import _C_ops, _legacy_C_ops -from paddle.fluid import core -from paddle.fluid.data_feeder import check_variable_and_dtype -from paddle.fluid.framework import _non_static_mode, default_main_program, Variable -from paddle.fluid.layer_helper import LayerHelper +from ...layers.mpu.random import RNGStatesTracker # noqa: F401 +from ...layers.mpu.random import get_rng_state_tracker # noqa: F401 +from ...layers.mpu.random import model_parallel_random_seed # noqa: F401 +from ...layers.mpu.random import determinate_seed # noqa: F401 +from ...layers.mpu.random import dropout # noqa: F401 __all__ = [] - -MODEL_PARALLEL_RNG = 'model_parallel_rng' - -# This file is inspired by Megatron to control random states for MP: -# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/mpu/random.py - - -class RNGStatesTracker: - """ - Tracker the RNG states. - """ - - def __init__(self): - # Map from name to the rng state. 
- self.states_ = {} - self.seeds_ = set() - - def reset(self): - self.states_ = {} - self.seeds_ = set() - - def add(self, name, seed): - if seed in self.seeds_: - raise ValueError('seed {} already exists'.format(seed)) - self.seeds_.add(seed) - if name in self.states_: - raise ValueError('state {} already exists'.format(name)) - orig_rng_state = paddle.get_cuda_rng_state() - paddle.seed(seed) - self.states_[name] = paddle.get_cuda_rng_state() - paddle.set_cuda_rng_state(orig_rng_state) - - def get_states_tracker(self): - states = {} - for name in self.states_: - states[name] = self.states_[name] - return states - - def set_states_tracker(self, states): - self.states_ = states - - @contextlib.contextmanager - def rng_state(self, name=MODEL_PARALLEL_RNG): - if name not in self.states_: - raise ValueError('state {} does not exist'.format(name)) - orig_cuda_rng_state = paddle.get_cuda_rng_state() - paddle.set_cuda_rng_state(self.states_[name]) - try: - yield - finally: - self.states_[name] = paddle.get_cuda_rng_state() - paddle.set_cuda_rng_state(orig_cuda_rng_state) - - -RNG_STATE_TRACKER = RNGStatesTracker() - - -def get_rng_state_tracker(): - return RNG_STATE_TRACKER - - -def model_parallel_random_seed(seed=None): - import paddle.distributed.fleet as fleet - hcg = fleet.get_hybrid_communicate_group() - rank = hcg.get_model_parallel_rank() - - if seed: - global_seed = seed - local_seed = seed * 1024 + rank * 100 - else: - global_seed = np.random.randint(0, 655350) - local_seed = np.random.randint(rank * 10000, (rank + 1) * 10000 - 1) - - RNG_STATE_TRACKER.reset() - RNG_STATE_TRACKER.add(MODEL_PARALLEL_RNG, local_seed) - paddle.seed(global_seed) - - -def determinate_seed(rng_name): - assert rng_name is not None and rng_name != "" - helper = LayerHelper('seed', **locals()) - out = helper.create_variable_for_type_inference(dtype=paddle.int32) - # set force_cpu to reduce sync copy from CPU->GPU->CPU, and reduce pipeline hang - helper.append_op(type='seed', - outputs={'Out': out}, - attrs={ - 'deterministic': True, - 'rng_name': rng_name, - 'force_cpu': True - }) - return out - - -def dropout(x, - p=0.5, - axis=None, - rng_name=None, - training=True, - mode="upscale_in_train", - name=None): - """ - Dropout is a regularization technique for reducing overfitting by preventing - neuron co-adaption during training. The dropout operator randomly sets the - outputs of some units to zero, while upscale others according to the given - dropout probability. - - Args: - x (Tensor): The input tensor. The data type is float32 or float64. - p (float|int): Probability of setting units to zero. Default 0.5. - axis (int|list|tuple): The axis along which the dropout is performed. Default None. - rng_name (str): The random seed generator name, which used to obtain deterministic results. - training (bool): A flag indicating whether it is in train phrase or not. Default True. - mode(str): ['upscale_in_train'(default) | 'downscale_in_infer']. - - 1. upscale_in_train(default), upscale the output at training time - - - train: out = input * mask / ( 1.0 - dropout_prob ) - - inference: out = input - - 2. downscale_in_infer, downscale the output at inference - - - train: out = input * mask - - inference: out = input * (1.0 - dropout_prob) - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - - Returns: - A Tensor representing the dropout, has same shape and data type as `x` . 
- - - Examples: - We use ``p=0.5`` in the following description for simplicity. - - 1. When ``axis=None`` , this is commonly used dropout, which dropout each element of x randomly. - - .. code-block:: text - - Let's see a simple case when x is a 2d tensor with shape 2*3: - [[1 2 3] - [4 5 6]] - we generate mask with the same shape as x, which is 2*3. The value of mask is - sampled from a Bernoulli distribution randomly. For example, we may get such mask: - [[0 1 0] - [1 0 1]] - So the output is obtained from elementwise multiply of x and mask: - [[0 2 0] - [4 0 6]] - Using default setting, i.e. ``mode='upscale_in_train'`` , - if in training phase, the final upscale output is: - [[0 4 0 ] - [8 0 12]] - if in test phase, the output is the same as input: - [[1 2 3] - [4 5 6]] - we can also set ``mode='downscale_in_infer'`` , then - if in training phase, the final output is: - [[0 2 0] - [4 0 6]] - if in test phase, the scale output is: - [[0.5 1. 1.5] - [2. 2.5 3. ]] - - """ - if rng_name is None: - return paddle.nn.functional.dropout(x, p, axis, training, mode, name) - - if not isinstance(p, (float, int, Variable)): - raise TypeError("p argument should be a number(int|float) or Variable") - - # fast return for p == 0 - if isinstance(p, (int, float)) and p == 0: return x - - assert 0 <= p <= 1, ValueError("p argument should between 0 and 1") - assert mode in ('downscale_in_infer', 'upscale_in_train'), \ - ValueError( - "mode argument should be 'downscale_in_infer' or 'upscale_in_train'") - - assert axis is None, \ - TypeError("unsupport axis when using random seed generator") - - mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer - - # dygraph using tracker, doesn't need determinate seed - if _non_static_mode(): - out, mask = _legacy_C_ops.dropout(x, 'dropout_prob', p, 'is_test', - not training, 'fix_seed', False, - 'seed', 0, 'dropout_implementation', - mode) - return out - - seed = determinate_seed(rng_name) - - if isinstance(p, Variable) and not p.shape != [1]: - raise TypeError( - "Required p.shape == [1] if type(p) is Variable, but received p.shape = {}" - .format(p.shape)) - - helper = LayerHelper('dropout', **locals()) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'dropout') - - out = helper.create_variable_for_type_inference(dtype=x.dtype) - mask = helper.create_variable_for_type_inference( - dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) - - helper.append_op(type='dropout', - inputs={ - 'X': [x], - 'Seed': seed - }, - outputs={ - 'Out': [out], - 'Mask': [mask] - }, - attrs={ - 'dropout_prob': p, - 'is_test': not training, - 'dropout_implementation': mode, - }) - return out diff --git a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py index 876f9ffaed32bb..77570407335167 100755 --- a/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py +++ b/python/paddle/distributed/fleet/meta_parallel/pipeline_parallel.py @@ -14,14 +14,16 @@ import paddle import paddle.fluid as fluid from .meta_parallel_base import MetaParallelBase -from .pp_utils.utils import is_float_tensor, _initialize_recompute_hcg from .parallel_layers.pp_layers import PipelineLayer from ..utils.hybrid_parallel_util import broadcast_mp_parameters from ..utils.hybrid_parallel_util import broadcast_dp_parameters from ..utils.hybrid_parallel_util import broadcast_sharding_parameters from ..utils.log_util import logger -from ..meta_optimizers.dygraph_optimizer 
import HybridParallelOptimizer, HybridParallelGradScaler +from ..meta_optimizers.dygraph_optimizer import ( + HybridParallelOptimizer, + HybridParallelGradScaler, +) import paddle.fluid.framework as framework from .pp_utils import p2p_communication as p2p import paddle.fluid.core as core @@ -30,24 +32,31 @@ class PipelineParallel(MetaParallelBase): - def __init__(self, layers, hcg, strategy): if not isinstance(layers, PipelineLayer): raise TypeError( - "The Layer should be a derived class of PipelineLayer.") + "The Layer should be a derived class of PipelineLayer." + ) super(PipelineParallel, self).__init__(layers, hcg, strategy) self.use_data_parallel = self._hcg.get_data_parallel_world_size() > 1 self.use_model_parallel = self._hcg.get_model_parallel_world_size() > 1 - self.use_sharding_parallel = self._hcg.get_sharding_parallel_world_size( - ) > 1 + self.use_sharding_parallel = ( + self._hcg.get_sharding_parallel_world_size() > 1 + ) self.total_loss = None self.micro_batch_size = self._strategy.pipeline_configs[ - 'micro_batch_size'] + 'micro_batch_size' + ] self.accumulate_steps = self._strategy.pipeline_configs[ - 'accumulate_steps'] - + 'accumulate_steps' + ] + # If sent tensor are not the same from different hosts, + # they shouldn't been sent partially and then concated as a whole tensor. + self._enable_partial_send_recv = self._strategy.pipeline_configs[ + 'enable_partial_send_recv' + ] self._using_cache = self._strategy.pipeline_configs['p2p_cache_shape'] self.num_stages = self._hcg.get_pipe_parallel_world_size() @@ -59,17 +68,20 @@ def __init__(self, layers, hcg, strategy): self._real_pp_world_size = self.num_stages self._real_pp_rank = self.stage_id - p2p.initialize_p2p_groups(hcg, self._using_cache) - - _initialize_recompute_hcg(hcg) + p2p.initialize_p2p_groups( + hcg, self._using_cache, self._enable_partial_send_recv + ) self.global_rank = self._hcg.get_global_rank() self.micro_batch_id = 0 self._compute_loss = True - logger.info("Pipeline Info -- num_stages: {}, stage_id: {}".format( - self.num_stages, self.stage_id)) + logger.info( + "Pipeline Info -- num_stages: {}, stage_id: {}".format( + self.num_stages, self.stage_id + ) + ) if self.use_model_parallel: logger.info("start broadcast mp parameters") @@ -121,7 +133,7 @@ def forward_backward_pipeline(self, data, scaler=None): # store data id for micro_batch self.micro_batch_id = 0 - startup_steps = (self.num_stages - self.stage_id - 1) + startup_steps = self.num_stages - self.stage_id - 1 startup_steps = min(startup_steps, self.accumulate_steps) steady_steps = self.accumulate_steps - startup_steps @@ -141,39 +153,46 @@ def forward_backward_pipeline(self, data, scaler=None): input_tensor = p2p.recv_forward(self.is_pipeline_first_stage()) for i in range(steady_steps): - last_iter = (i == (steady_steps - 1)) + last_iter = i == (steady_steps - 1) output_tensor = self._forward_step(input_tensor) output_tensor_grad = p2p.send_forward_recv_backward( - output_tensor, self.is_pipeline_last_stage()) + output_tensor, self.is_pipeline_last_stage() + ) input_buffers.append(input_tensor) output_buffers.append(output_tensor) input_tensor, output_tensor = input_buffers.pop( - 0), output_buffers.pop(0) + 0 + ), output_buffers.pop(0) - input_tensor_grad = self._backward_step(input_tensor, output_tensor, - output_tensor_grad) + input_tensor_grad = self._backward_step( + input_tensor, output_tensor, output_tensor_grad + ) if last_iter: input_tensor = None - p2p.send_backward(input_tensor_grad, - self.is_pipeline_first_stage()) + 
p2p.send_backward( + input_tensor_grad, self.is_pipeline_first_stage() + ) else: input_tensor = p2p.send_backward_recv_forward( - input_tensor_grad, self.is_pipeline_first_stage()) + input_tensor_grad, self.is_pipeline_first_stage() + ) for i in range(startup_steps): input_tensor = input_buffers.pop(0) output_tensor = output_buffers.pop(0) output_tensor_grad = p2p.recv_backward( - self.is_pipeline_last_stage()) + self.is_pipeline_last_stage() + ) - input_tensor_grad = self._backward_step(input_tensor, output_tensor, - output_tensor_grad) + input_tensor_grad = self._backward_step( + input_tensor, output_tensor, output_tensor_grad + ) p2p.send_backward(input_tensor_grad, self.is_pipeline_first_stage()) self._layers.allreduce_shared_weight_gradients() @@ -185,17 +204,20 @@ def _prepare_training(self, data, optimizer, lr_scheduler): # reset the virtual pp rank for each run self.set_virtual_pipeline_rank(0) - assert isinstance(optimizer, HybridParallelOptimizer), ( - 'optimizer should be HybridParallelOptimizer subclass.') + assert isinstance( + optimizer, HybridParallelOptimizer + ), 'optimizer should be HybridParallelOptimizer subclass.' - assert fluid.framework._dygraph_tracer()._has_grad, ( - 'Please enable the generation of gradients.') + assert ( + fluid.framework._dygraph_tracer()._has_grad + ), 'Please enable the generation of gradients.' if self.is_pipeline_first_stage( - ignore_virtual=True) or self.is_pipeline_last_stage( - ignore_virtual=True): - assert data is not None, ( - "For the first and the last stage, the data must be set.") + ignore_virtual=True + ) or self.is_pipeline_last_stage(ignore_virtual=True): + assert ( + data is not None + ), "For the first and the last stage, the data must be set." else: data = None @@ -232,7 +254,7 @@ def eval_batch(self, data, compute_loss=False): # store total loss of entire batch self.total_loss = None - startup_steps = (self.num_stages - self.stage_id - 1) + startup_steps = self.num_stages - self.stage_id - 1 startup_steps = min(startup_steps, self.accumulate_steps) steady_steps = self.accumulate_steps - startup_steps @@ -252,7 +274,7 @@ def eval_batch(self, data, compute_loss=False): input_tensor = p2p.recv_forward(self.is_pipeline_first_stage()) for i in range(steady_steps): - last_iter = (i == (steady_steps - 1)) + last_iter = i == (steady_steps - 1) output_tensor = self._forward_step(input_tensor) p2p.send_forward(output_tensor, self.is_pipeline_last_stage()) @@ -281,13 +303,14 @@ def _forward_step(self, input_tensor, chunk_id=None): if self.is_pipeline_last_stage(): # train calculate loss for train if self._compute_loss: - assert self._layers._loss_fn is not None, "loss function should exist to compute loss" + assert ( + self._layers._loss_fn is not None + ), "loss function should exist to compute loss" labels = self._load_micro_batch(self.micro_batch_id) output_tensor = self._layers._loss_fn(output_tensor, labels) assert isinstance( - output_tensor, - (paddle.Tensor, core.eager.Tensor - )), "Currently, loss_fn should obtain Paddle.Tensor dtype" + output_tensor, (paddle.Tensor, core.eager.Tensor) + ), "Currently, loss_fn should obtain Paddle.Tensor dtype" with paddle.amp.auto_cast(enable=False): if self.accumulate_steps > 1: @@ -317,91 +340,113 @@ def _backward_step(self, input_tensor, output_tensor, output_tensor_grad): assert len(outputs) == len(output_tensor_grad) paddle.autograd.backward( tensors=outputs, - grad_tensors=[t for t in output_tensor_grad]) + grad_tensors=[t for t in output_tensor_grad], + ) else: - 
paddle.autograd.backward(tensors=[output_tensor], - grad_tensors=[output_tensor_grad]) + paddle.autograd.backward( + tensors=[output_tensor], + grad_tensors=[output_tensor_grad], + ) input_tensor_grad = None if input_tensor is not None: if isinstance(input_tensor, tuple): input_tensor_grad = tuple( - [t.grad for t in input_tensor if not t.stop_gradient]) + [t.grad for t in input_tensor if not t.stop_gradient] + ) else: input_tensor_grad = input_tensor.grad return input_tensor_grad - def _load_micro_batch(self, cache_id): - inputs = self.data + def _check_data_vaild(self, data): + batch_size = data.shape[0] + assert self.micro_batch_size * self.accumulate_steps == batch_size, ( + "batch_size needs to be divisible by micro_batch_size. Currently, " + "batch_size = %d, micro_batch_size = %d, accumulate_steps = %d." + % (batch_size, self.micro_batch_size, self.accumulate_steps) + ) + + def _load_micro_batch_impl(self, inputs, cache_id): begin = cache_id * self.micro_batch_size end = begin + self.micro_batch_size - # The virtual first and last pipeline stage need data, all others don't need. + if isinstance(inputs, tuple): + output = [] + for data in inputs: + if isinstance(data, list): + assert ( + len(data) == self.accumulate_steps + ), "length of data should be %d, but it is %d" % ( + self.accumulate_steps, + len(data), + ) + output.append(data[cache_id].detach()) + else: + self._check_data_vaild(data) + output.append(data[begin:end, :].detach()) + return tuple(output) + + elif isinstance(inputs, list): + assert ( + len(inputs) == self.accumulate_steps + ), "length of data should be %d, but it is %d" % ( + self.accumulate_steps, + len(inputs), + ) + return inputs[cache_id].detach() + else: + self._check_data_vaild(inputs) + return inputs[begin:end, :].detach() + + def _load_micro_batch(self, cache_id): + inputs = self.data if self.is_pipeline_first_stage(): assert len(inputs) == 2, "length of input should be 2" - if isinstance(inputs[0], tuple): - assert len( - inputs[0] - ) > 1, "If you use tuple for input data, it should have at least two inputs." - batch_size = inputs[0][0].shape[0] - assert self.micro_batch_size * self.accumulate_steps == batch_size, ( - "batch_size needs to be divisible by micro_batch_size. Currently, " - "batch_size = %d, micro_batch_size = %d, accumulate_steps = %d." - % - (batch_size, self.micro_batch_size, self.accumulate_steps)) - data = [input[begin:end, :].detach() for input in inputs[0]] - return tuple(data) - else: - batch_size = inputs[0].shape[0] - assert self.micro_batch_size * self.accumulate_steps == batch_size - return inputs[0][begin:end, :].detach() + return self._load_micro_batch_impl(inputs[0], cache_id) elif self.is_pipeline_last_stage(): assert len(inputs) == 2, "length of input should be 2" - if isinstance(inputs[1], tuple): - batch_size = inputs[1][0].shape[0] - assert self.micro_batch_size * self.accumulate_steps == batch_size - data = [input[begin:end, :].detach() for input in inputs[1]] - return tuple(data) - else: - batch_size = inputs[1].shape[0] - assert self.micro_batch_size * self.accumulate_steps == batch_size - return inputs[1][begin:end, :].detach() + return self._load_micro_batch_impl(inputs[1], cache_id) else: - # No data input is required for other stages inputs = None def _broadcast_final_loss(self): # Since the last backward run in interleave will set the virtual rank to 0, # here we need to check last stage ignoring virtual stage. 
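To make the slicing arithmetic in _load_micro_batch_impl above concrete: when micro_batch_size * accumulate_steps equals the global batch size, micro step cache_id consumes the contiguous rows [cache_id * micro_batch_size, (cache_id + 1) * micro_batch_size). A framework-free sketch with arbitrary numbers:

# Standalone illustration of the micro-batch slicing; shapes are chosen arbitrarily.
import numpy as np

micro_batch_size, accumulate_steps = 2, 4
data = np.arange(8 * 3).reshape(8, 3)       # global batch: 8 samples, 3 features
assert data.shape[0] == micro_batch_size * accumulate_steps

for cache_id in range(accumulate_steps):
    begin = cache_id * micro_batch_size
    end = begin + micro_batch_size
    micro_batch = data[begin:end, :]        # rows consumed by micro step `cache_id`
    print(cache_id, micro_batch[:, 0])      # 0 -> [0 3], 1 -> [6 9], 2 -> [12 15], 3 -> [18 21]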
if self.is_pipeline_last_stage(ignore_virtual=True): - assert self.total_loss is not None, "train_batch() in last stage should obtain vaild loss" + assert ( + self.total_loss is not None + ), "train_batch() in last stage should obtain vaild loss" loss = self.total_loss.detach() - is_fp32 = paddle.to_tensor( - 1) if loss.dtype == paddle.float32 else paddle.to_tensor(0) - paddle.distributed.broadcast(is_fp32, - src=self.global_rank, - use_calc_stream=True, - group=self.pp_group) - paddle.distributed.broadcast(loss, - src=self.global_rank, - use_calc_stream=True, - group=self.pp_group) + is_fp32 = ( + paddle.to_tensor(1) + if loss.dtype == paddle.float32 + else paddle.to_tensor(0) + ) + paddle.distributed.broadcast( + is_fp32, src=self.global_rank, sync_op=True, group=self.pp_group + ) + paddle.distributed.broadcast( + loss, src=self.global_rank, sync_op=True, group=self.pp_group + ) else: is_fp32 = paddle.to_tensor(1) paddle.distributed.broadcast( is_fp32, src=self._hcg.get_rank_from_stage(self.num_stages - 1), - use_calc_stream=True, - group=self.pp_group) - loss = paddle.zeros(shape=[ - 1 - ], dtype="float32") if is_fp32.numpy()[0] else paddle.zeros( - shape=[1], dtype="float16") + sync_op=True, + group=self.pp_group, + ) + loss = ( + paddle.zeros(shape=[1], dtype="float32") + if is_fp32.numpy()[0] + else paddle.zeros(shape=[1], dtype="float16") + ) paddle.distributed.broadcast( loss, src=self._hcg.get_rank_from_stage(self.num_stages - 1), - use_calc_stream=True, - group=self.pp_group) + sync_op=True, + group=self.pp_group, + ) return loss def _optimizer_step(self): @@ -420,11 +465,12 @@ class PipelineParallelWithInterleave(PipelineParallel): # pipeline parallel with interleave scheduler def __init__(self, layers, hcg, strategy): - super(PipelineParallelWithInterleave, self).__init__(layers=layers, - hcg=hcg, - strategy=strategy) + super(PipelineParallelWithInterleave, self).__init__( + layers=layers, hcg=hcg, strategy=strategy + ) assert layers.get_num_virtual_stages() > 1 - assert framework.in_dygraph_mode( + assert ( + framework.in_dygraph_mode() ), "virtual pipeline stage with interleave only support eager dygraph mode" # setup for interleave scheduler self.num_model_chunks = layers.get_num_virtual_stages() @@ -435,11 +481,12 @@ def __init__(self, layers, hcg, strategy): self._virtual_pp_rank = 0 def _get_virtual_pp_rank(self, micro_step, forward): - virtual_pp_stage = micro_step % (self.num_stages * - self.num_model_chunks) + virtual_pp_stage = micro_step % ( + self.num_stages * self.num_model_chunks + ) virtual_pp_stage = virtual_pp_stage // self.num_stages if not forward: - virtual_pp_stage = (self.num_model_chunks - virtual_pp_stage - 1) + virtual_pp_stage = self.num_model_chunks - virtual_pp_stage - 1 return virtual_pp_stage def _forward_step_helper(self, micro_step): @@ -454,7 +501,8 @@ def _forward_step_helper(self, micro_step): if self.is_pipeline_first_stage(): if len(self.input_tensors[virtual_pp_rank]) == len( - self.output_tensors[virtual_pp_rank]): + self.output_tensors[virtual_pp_rank] + ): self.input_tensors[virtual_pp_rank].append(None) input_tensor = self.input_tensors[virtual_pp_rank][-1] output_tensor = self._forward_step(input_tensor, virtual_pp_rank) @@ -483,21 +531,22 @@ def _backward_step_helper(self, micro_step): input_tensor = self.input_tensors[virtual_pp_rank].pop(0) output_tensor = self.output_tensors[virtual_pp_rank].pop(0) output_tensor_grad = self.output_tensor_grads[virtual_pp_rank].pop(0) - input_tensor_grad = self._backward_step(input_tensor, 
output_tensor, - output_tensor_grad) + input_tensor_grad = self._backward_step( + input_tensor, output_tensor, output_tensor_grad + ) return input_tensor_grad - def interleave_pipeline(self, - data, - scaler, - forward_only=False, - compute_loss=True): + def forward_backward_pipeline( + self, data, scaler, forward_only=False, compute_loss=True + ): # use interleave scheduling strategy. # this strategy is inspired by: # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/schedules.py if not compute_loss: - assert not forward_only, "compute_loss can only be set to False when forward_only is set to True" + assert ( + not forward_only + ), "compute_loss can only be set to False when forward_only is set to True" # init some attributes for this batch run self.scaler = scaler @@ -529,15 +578,17 @@ def interleave_pipeline(self, self.set_virtual_pipeline_rank(0) self.input_tensors[0].append( - p2p.recv_forward(self.is_pipeline_first_stage())) + p2p.recv_forward(self.is_pipeline_first_stage(), sync_recv=False) + ) # run startup steps for micro_step in range(startup_steps): output_tensor = self._forward_step_helper(micro_step) # determine whether recv forward tensor or not - next_virtual_pp_rank = self._get_virtual_pp_rank(micro_step + 1, - forward=True) + next_virtual_pp_rank = self._get_virtual_pp_rank( + micro_step + 1, forward=True + ) recv_prev = True if self.is_pipeline_first_stage(ignore_virtual=True): if next_virtual_pp_rank == 0: @@ -551,24 +602,33 @@ def interleave_pipeline(self, if self.is_pipeline_last_stage(): output_tensor = None - if micro_step == (startup_steps - - 1) and not forward_only and not all_startup_steps: + if ( + micro_step == (startup_steps - 1) + and not forward_only + and not all_startup_steps + ): input_tensor_grad = None recv_next = True if self.is_pipeline_last_stage(ignore_virtual=True): recv_next = False # the last startup step needs on four direction comm to set up for steady 1f1b - input_tensor, output_tensor_grad = p2p.send_forward_backward_recv_forward_backward( + ( + input_tensor, + output_tensor_grad, + ) = p2p.send_forward_backward_recv_forward_backward( output_tensor, input_tensor_grad, recv_prev=recv_prev, - recv_next=recv_next) - self.output_tensor_grads[self.num_model_chunks - - 1].append(output_tensor_grad) + recv_next=recv_next, + ) + self.output_tensor_grads[self.num_model_chunks - 1].append( + output_tensor_grad + ) else: input_tensor = p2p.send_forward_recv_forward( - output_tensor, recv_prev=recv_prev) + output_tensor, recv_prev=recv_prev + ) self.input_tensors[next_virtual_pp_rank].append(input_tensor) # run 1f1b steady steps @@ -580,7 +640,8 @@ def interleave_pipeline(self, # backward backward_micro_step_id = micro_step input_tensor_grad = self._backward_step_helper( - backward_micro_step_id) + backward_micro_step_id + ) # four directions comm # send output tensor to downstream @@ -590,14 +651,16 @@ def interleave_pipeline(self, # last stage doesn't send rst to downstream forward_virtual_pp_rank = self._get_virtual_pp_rank( - forward_micro_step_id, forward=True) + forward_micro_step_id, forward=True + ) self.set_virtual_pipeline_rank(forward_virtual_pp_rank) if self.is_pipeline_last_stage(): output_tensor = None # first stage doesn't send grad to upstream backward_virtual_pp_rank = self._get_virtual_pp_rank( - backward_micro_step_id, forward=False) + backward_micro_step_id, forward=False + ) self.set_virtual_pipeline_rank(backward_virtual_pp_rank) if self.is_pipeline_first_stage(): input_tensor_grad = None @@ -606,14 +669,16 
@@ def interleave_pipeline(self, recv_prev = True if self.is_pipeline_first_stage(ignore_virtual=True): next_forward_virtual_pp_rank = self._get_virtual_pp_rank( - forward_micro_step_id - (self.num_stages - 1), forward=True) + forward_micro_step_id - (self.num_stages - 1), forward=True + ) if next_forward_virtual_pp_rank == (self.num_model_chunks - 1): # first pp stage and first virtual stage recv_prev = False next_forward_virtual_pp_rank += 1 else: next_forward_virtual_pp_rank = self._get_virtual_pp_rank( - forward_micro_step_id + 1, forward=True) + forward_micro_step_id + 1, forward=True + ) # last iteration doesn't need recv from upstream if micro_step == (steady_steps - 1): @@ -624,52 +689,67 @@ def interleave_pipeline(self, if self.is_pipeline_last_stage(ignore_virtual=True): next_backward_virtual_pp_rank = self._get_virtual_pp_rank( backward_micro_step_id - (self.num_stages - 1), - forward=False) + forward=False, + ) if next_backward_virtual_pp_rank == 0: # last pp stage and last virtual stage recv_next = False next_backward_virtual_pp_rank -= 1 else: next_backward_virtual_pp_rank = self._get_virtual_pp_rank( - backward_micro_step_id + 1, forward=False) + backward_micro_step_id + 1, forward=False + ) - input_tensor, output_tensor_grad = p2p.send_forward_backward_recv_forward_backward( + ( + input_tensor, + output_tensor_grad, + ) = p2p.send_forward_backward_recv_forward_backward( output_tensor, input_tensor_grad, recv_prev=recv_prev, - recv_next=recv_next) + recv_next=recv_next, + ) if recv_prev: self.input_tensors[next_forward_virtual_pp_rank].append( - input_tensor) + input_tensor + ) if recv_next: self.output_tensor_grads[next_backward_virtual_pp_rank].append( - output_tensor_grad) + output_tensor_grad + ) # remaining backward steps if not forward_only: if all_startup_steps: self.output_tensor_grads[self.num_model_chunks - 1].append( - p2p.recv_backward(self.is_pipeline_last_stage())) + p2p.recv_backward( + self.is_pipeline_last_stage(), sync_recv=False + ) + ) for micro_step in range(steady_steps, num_steps): # cooldown loop input_tensor_grad = self._backward_step_helper(micro_step) next_backward_virtual_pp_rank = self._get_virtual_pp_rank( - micro_step + 1, forward=False) + micro_step + 1, forward=False + ) recv_next = True if self.is_pipeline_last_stage(ignore_virtual=True): - if next_backward_virtual_pp_rank == (self.num_model_chunks - - 1): + if next_backward_virtual_pp_rank == ( + self.num_model_chunks - 1 + ): recv_next = False if micro_step == (num_steps - 1): recv_next = False self.output_tensor_grads[next_backward_virtual_pp_rank].append( - p2p.send_backward_recv_backward(input_tensor_grad, - recv_next=recv_next)) + p2p.send_backward_recv_backward( + input_tensor_grad, recv_next=recv_next + ) + ) self._layers.allreduce_shared_weight_gradients() @@ -686,7 +766,7 @@ def interleave_pipeline(self, def train_batch(self, data, optimizer, lr_scheduler=None, scaler=None): data = self._prepare_training(data, optimizer, lr_scheduler) # interleave scheduler for pipeline parallel - train_loss = self.interleave_pipeline(data, scaler) + train_loss = self.forward_backward_pipeline(data, scaler) # optimizer with paddle.amp.auto_cast(enable=False): @@ -701,4 +781,4 @@ def eval_batch(self, data, compute_loss=False): self._layers.eval() self._compute_loss = compute_loss - return self.interleave_pipeline(data, None, forward_only=True) + return self.forward_backward_pipeline(data, None, forward_only=True) diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/__init__.py 
b/python/paddle/distributed/fleet/meta_parallel/pp_utils/__init__.py index 786eb20487a52e..04575bfb231946 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/__init__.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/__init__.py @@ -12,6 +12,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .utils import get_tensor_bytes - __all__ = [] diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py index 91136033761913..b41fef05660b2f 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/p2p_communication.py @@ -13,21 +13,23 @@ # limitations under the License. import paddle -from .utils import paddle_2_number, number_2_dtype from ...utils.log_util import logger import numpy as np from paddle import _C_ops, _legacy_C_ops import paddle.fluid.core as core from paddle.fluid.framework import _in_legacy_dygraph, _non_static_mode, in_dygraph_mode +from .utils import paddle_2_number, paddle_2_number, number_2_dtype _hcg = None _use_cache = False +_enable_partial_send_recv = True -def initialize_p2p_groups(hcg, use_cache=True): - global _hcg, _use_cache +def initialize_p2p_groups(hcg, use_cache=True, enable_partial_send_recv=True): + global _hcg, _use_cache, _enable_partial_send_recv _hcg = hcg _use_cache = use_cache + _enable_partial_send_recv = enable_partial_send_recv send_next_group, send_prev_group, recv_next_group, recv_prev_group = _hcg.get_p2p_groups( ) @@ -157,7 +159,8 @@ def set_send_message(self, tensor): def _is_valid_send_recv_partial(tensor, mp_degree): - + if not _enable_partial_send_recv: + return False tensor_numel = np.prod(tensor.shape) assert tensor_numel != 0, "can't send/recv zero element" return mp_degree > 1 and tensor_numel % mp_degree == 0 @@ -174,8 +177,9 @@ def _partial_send_op(tensor, group, use_calc_stream, ring_id, dst, nranks, elif in_dygraph_mode(): group = paddle.distributed.collective._get_default_group( ) if group is None else group - return group.process_group.send_partial(tensor, dst_rank_in_group, - nranks, rank_id) + comm_op = group.process_group.send_partial_on_calc_stream \ + if use_calc_stream else group.process_group.send_partial + return comm_op(tensor, dst_rank_in_group, nranks, rank_id) def send_partial(tensor, @@ -197,7 +201,8 @@ def send_partial(tensor, dst_rank, nranks, rank_id) else: if _in_legacy_dygraph(): - send_op = paddle.distributed.send + send_op = lambda x, dst, group: \ + paddle.distributed.send(x, dst, group, use_calc_stream) elif in_dygraph_mode(): send_op = paddle.distributed.isend return send_op(tensor.detach(), dst=dst_rank, group=group) @@ -207,6 +212,7 @@ def _partial_recv_op(tensor, group, use_calc_stream, ring_id, src, nranks, rank_id): src_rank_in_group = src if group is None else group.get_group_rank(src) if _in_legacy_dygraph(): + assert use_calc_stream return _legacy_C_ops.partial_recv(tensor.detach(), 'use_calc_stream', use_calc_stream, 'ring_id', ring_id, 'peer', src_rank_in_group, 'num', @@ -216,8 +222,9 @@ def _partial_recv_op(tensor, group, use_calc_stream, ring_id, src, nranks, elif in_dygraph_mode(): group = paddle.distributed.collective._get_default_group( ) if group is None else group - return group.process_group.recv_partial(tensor, src_rank_in_group, - nranks, rank_id) + comm_op = group.process_group.recv_partial_on_calc_stream 
\ + if use_calc_stream else group.process_group.recv_partial + return comm_op(tensor, src_rank_in_group, nranks, rank_id) def recv_partial(tensor, @@ -238,7 +245,7 @@ def recv_partial(tensor, return _partial_recv_op(tensor, group, use_calc_stream, ring_id, src_rank, nranks, rank_id) else: - if _in_legacy_dygraph(): + if _in_legacy_dygraph() or use_calc_stream: recv_op = paddle.distributed.recv elif in_dygraph_mode(): recv_op = paddle.distributed.irecv @@ -256,8 +263,9 @@ def _partial_allgather_op(tensor, group, use_calc_stream, ring_id, nranks, elif in_dygraph_mode(): group = paddle.distributed.collective._get_default_group( ) if group is None else group - return group.process_group.all_gather_partial(tensor, tensor, nranks, - rank_id) + comm_op = group.process_group.all_gather_partial_on_calc_stream \ + if use_calc_stream else group.process_group.all_gather_partial + return comm_op(tensor, tensor, nranks, rank_id) def allgather_partial(tensor, @@ -266,16 +274,20 @@ def allgather_partial(tensor, group=None, use_calc_stream=True): if not _is_valid_send_recv_partial(tensor, nranks): - return None + return tensor if group is not None and not group.is_member(): - return None + return ring_id = 0 if group is None else group.id return _partial_allgather_op(tensor, group, use_calc_stream, ring_id, nranks, rank_id) -def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next): +def _p2p_helper(tensor_send_next, + tensor_send_prev, + recv_prev, + recv_next, + sync_recv=True): global _hcg tensor_recv_prev = None @@ -328,116 +340,139 @@ def _p2p_helper(tensor_send_next, tensor_send_prev, recv_prev, recv_next): if isinstance(tensor_send_prev, tuple): for d in tensor_send_prev: paddle.distributed.wait(d, use_calc_stream=True) - tasks.append( - send_partial(d, - dst=0, - nranks=mp_degree, - rank_id=mp_rank, - group=_hcg.send_prev_group, - use_calc_stream=False)) - else: - paddle.distributed.wait(tensor_send_prev, use_calc_stream=True) - tasks.append( - send_partial(tensor_send_prev, + send_partial(d, dst=0, nranks=mp_degree, rank_id=mp_rank, group=_hcg.send_prev_group, - use_calc_stream=False)) + use_calc_stream=False) + else: + paddle.distributed.wait(tensor_send_prev, use_calc_stream=True) + send_partial(tensor_send_prev, + dst=0, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.send_prev_group, + use_calc_stream=False) if tensor_recv_prev is not None: if isinstance(tensor_recv_prev, tuple): for d in tensor_recv_prev: - tasks.append( - recv_partial(d, - src=0, - nranks=mp_degree, - rank_id=mp_rank, - group=_hcg.recv_prev_group, - use_calc_stream=True)) - tasks.append( + task = recv_partial(d, + src=0, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.recv_prev_group, + use_calc_stream=sync_recv) + if sync_recv: allgather_partial(d, nranks=mp_degree, rank_id=mp_rank, group=mp_group, - use_calc_stream=True)) + use_calc_stream=True) + else: + tasks.append(task) else: - tasks.append( - recv_partial(tensor_recv_prev, - src=0, - nranks=mp_degree, - rank_id=mp_rank, - group=_hcg.recv_prev_group, - use_calc_stream=True)) - tasks.append( + task = recv_partial(tensor_recv_prev, + src=0, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.recv_prev_group, + use_calc_stream=sync_recv) + if sync_recv: allgather_partial(tensor_recv_prev, nranks=mp_degree, rank_id=mp_rank, group=mp_group, - use_calc_stream=True)) + use_calc_stream=True) + else: + tasks.append(task) if tensor_send_next is not None: if isinstance(tensor_send_next, tuple): for d in tensor_send_next: paddle.distributed.wait(d, 
use_calc_stream=True) - tasks.append( - send_partial(d, - dst=1, - nranks=mp_degree, - rank_id=mp_rank, - group=_hcg.send_next_group, - use_calc_stream=False)) - else: - paddle.distributed.wait(tensor_send_next, use_calc_stream=True) - tasks.append( - send_partial(tensor_send_next, + send_partial(d, dst=1, nranks=mp_degree, rank_id=mp_rank, group=_hcg.send_next_group, - use_calc_stream=False)) + use_calc_stream=False) + else: + paddle.distributed.wait(tensor_send_next, use_calc_stream=True) + send_partial(tensor_send_next, + dst=1, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.send_next_group, + use_calc_stream=False) if tensor_recv_next is not None: if isinstance(tensor_recv_next, tuple): for d in tensor_recv_next: - tasks.append( - recv_partial(d, - src=1, - nranks=mp_degree, - rank_id=mp_rank, - group=_hcg.recv_next_group, - use_calc_stream=True)) - tasks.append( + task = recv_partial(d, + src=1, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.recv_next_group, + use_calc_stream=sync_recv) + if sync_recv: allgather_partial(d, nranks=mp_degree, rank_id=mp_rank, group=mp_group, - use_calc_stream=True)) + use_calc_stream=True) + else: + tasks.append(task) else: - tasks.append( - recv_partial(tensor_recv_next, - src=1, - nranks=mp_degree, - rank_id=mp_rank, - group=_hcg.recv_next_group, - use_calc_stream=True)) - - tasks.append( + task = recv_partial(tensor_recv_next, + src=1, + nranks=mp_degree, + rank_id=mp_rank, + group=_hcg.recv_next_group, + use_calc_stream=sync_recv) + if sync_recv: allgather_partial(tensor_recv_next, nranks=mp_degree, rank_id=mp_rank, group=mp_group, - use_calc_stream=True)) - if in_dygraph_mode(): - # wait tasks in new dygraph mode with new comm library - for task in tasks: - if task is not None: + use_calc_stream=True) + else: + tasks.append(task) + + if not sync_recv: + if in_dygraph_mode(): + # wait irecv tasks in eager dygraph mode with new comm library + for task in tasks: + assert task is not None task.wait() + + tensors_for_all_gather = [] + if tensor_recv_prev is not None: + if isinstance(tensor_recv_prev, tuple): + for d in tensor_recv_prev: + tensors_for_all_gather.append(d) + else: + tensors_for_all_gather.append(tensor_recv_prev) + if tensor_recv_next is not None: + if isinstance(tensor_recv_next, tuple): + for d in tensor_recv_next: + tensors_for_all_gather.append(d) + else: + tensors_for_all_gather.append(tensor_recv_next) + + for tensor in tensors_for_all_gather: + allgather_partial(tensor, + nranks=mp_degree, + rank_id=mp_rank, + group=mp_group, + use_calc_stream=True) + return tensor_recv_prev, tensor_recv_next -def recv_forward(pp_first_stage): +def recv_forward(pp_first_stage, sync_recv=True): if pp_first_stage: input_tensor = None else: @@ -448,18 +483,20 @@ def recv_forward(pp_first_stage): input_tensor, _ = _p2p_helper(tensor_send_next=None, tensor_send_prev=None, recv_prev=True, - recv_next=False) + recv_next=False, + sync_recv=sync_recv) return input_tensor -def recv_backward(pp_last_stage): +def recv_backward(pp_last_stage, sync_recv=True): if pp_last_stage: output_tensor_grad = None else: _, output_tensor_grad = _p2p_helper(tensor_send_next=None, tensor_send_prev=None, recv_prev=False, - recv_next=True) + recv_next=True, + sync_recv=sync_recv) return output_tensor_grad @@ -521,7 +558,8 @@ def send_forward_backward_recv_forward_backward(output_tensor, tensor_send_next=output_tensor, tensor_send_prev=input_tensor_grad, recv_prev=recv_prev, - recv_next=recv_next) + recv_next=recv_next, + sync_recv=False) return input_tensor, 
output_tensor_grad @@ -538,7 +576,8 @@ def send_forward_recv_forward(output_tensor, recv_prev): input_tensor, _ = _p2p_helper(tensor_send_next=output_tensor, tensor_send_prev=None, recv_prev=recv_prev, - recv_next=False) + recv_next=False, + sync_recv=False) return input_tensor @@ -547,5 +586,6 @@ def send_backward_recv_backward(input_tensor_grad, recv_next): _, output_tensor_grad = _p2p_helper(tensor_send_next=None, tensor_send_prev=input_tensor_grad, recv_prev=False, - recv_next=recv_next) + recv_next=recv_next, + sync_recv=False) return output_tensor_grad diff --git a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py index bb774b8a0e5f89..c2008abb71c537 100644 --- a/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/pp_utils/utils.py @@ -12,16 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import contextlib - import paddle from paddle.fluid import core from paddle import _C_ops, _legacy_C_ops -from paddle.autograd import PyLayer -from paddle.fluid import framework -from ...utils.recompute import check_recompute_necessary, detach_variable, swith_rng_state_tracker -from ..parallel_layers.random import get_rng_state_tracker -from paddle.fluid.framework import in_dygraph_mode __all__ = [] @@ -88,23 +81,6 @@ def get_tensor_bytes(tensor): return tensor.numel() * elem_size -_hcg = None -_recompute_offload = False -_recompute_partition = False - - -def _initialize_recompute_setting(is_offload, is_partition): - global _recompute_offload, _recompute_partition - - _recompute_offload = is_offload - _recompute_partition = is_partition - - -def _initialize_recompute_hcg(hcg): - global _hcg - _hcg = hcg - - def _all_gather(tensor, group=None, use_calc_stream=True): """ The main difference with paddle.distributed.all_gather: @@ -117,187 +93,3 @@ def _all_gather(tensor, group=None, use_calc_stream=True): ).nranks if group is None else group.nranks return _legacy_C_ops.c_allgather(tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id, 'nranks', nranks) - - -def _split_activation(tensor): - global _hcg - - mp_degree = _hcg.get_model_parallel_world_size() - mp_rank = _hcg.get_model_parallel_rank() - if mp_degree < 2: - return tensor - - tensor_numel = paddle.numel(tensor) - assert tensor_numel != 0, "can't recompute zero element" - assert tensor_numel % mp_degree == 0, "The capacity of the activation () cannot be divisible by mp_degree()".format( - tensor_numel, mp_degree) - - # use inplace operation to save memory - data = tensor.flatten_() - - part_size = tensor_numel // mp_degree - start = part_size * mp_rank - end = start + part_size - return data[start:end] - - -def _merge_activation(tensor): - global _hcg - mp_degree = _hcg.get_model_parallel_world_size() - mp_rank = _hcg.get_model_parallel_rank() - mp_group = _hcg.get_model_parallel_group() - if mp_degree < 2: - return tensor - return _all_gather(tensor, group=mp_group) - - -class _HPRecomputeFunction(PyLayer): - """ - Compared with paddle.distributed.fleet.utils.recompute, there are the following differences: - 1. In order to support PipeLineParallel, the input of recompute is modified to ensure that the input can be tuple type. - 2. Offload support for activation - 3. Support MP segmentation of activation to further reduce cuda memory - 4. 
Adapt to the random state of MP - """ - - @staticmethod - def forward(ctx, run_function, all_outputs, *args): - check_recompute_necessary(args) - - # store for recomputing - ctx.run_function = run_function - - # store the rng states - ctx.fwd_cuda_rng_state = paddle.get_cuda_rng_state() - ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker( - ).get_states_tracker() - - # save input for backward - ctx.inputs = [] - ctx.tensor_indices = [] - ctx.tensor_shapes = [] - tensor_inputs = [] - - cur_device = paddle.get_device() - assert 'gpu:' in paddle.get_device( - ), "Recompute with RNG is not support current device: {}.".format( - cur_device) - - # TODO support AMP - tracer = framework._dygraph_tracer() - ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True - if tracer._amp_level == core.AmpLevel.O2: - ctx.amp_level = 'O2' - elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0): - ctx.amp_level = 'O1' - else: - raise ValueError("unsupported amp level: {}".format( - tracer._amp_level)) - ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() - - with paddle.no_grad(): - outputs = run_function(*args) - - for i, arg in enumerate(args): - if paddle.is_tensor(arg): - state = arg.stop_gradient - if _recompute_partition: - ctx.tensor_shapes.append(arg.shape) - partition = _split_activation(arg.detach()).clone() - # TODO(shenliang03) not use calculate stream to D2H to speed - arg = partition.cpu() if _recompute_offload else partition - else: - arg = arg.cpu() if _recompute_offload else arg - arg.stop_gradient = state - tensor_inputs.append(arg) - ctx.tensor_indices.append(i) - ctx.inputs.append(None) - else: - ctx.inputs.append(arg) - - ctx.save_for_backward(*tensor_inputs) - - if paddle.is_tensor(outputs): - all_outputs += [outputs] - return outputs - else: - all_outputs += outputs - return tuple(outputs) - - @staticmethod - def backward(ctx, *args): - with paddle.fluid.dygraph.guard(): - # Restore inputs - inputs = list(ctx.inputs) - tensor_indices = ctx.tensor_indices - tensor_shapes = ctx.tensor_shapes - tensors = list(ctx.saved_tensor()) - - device_id = paddle.distributed.ParallelEnv().device_id - for i, idx in enumerate(tensor_indices): - if _recompute_partition: - state = tensors[i].stop_gradient - tensors[i] = _merge_activation( - tensors[i]).detach().reshape_(tensor_shapes[i]) - tensors[i].stop_gradient = state - inputs[idx] = tensors[i].cuda( - device_id) if _recompute_offload else tensors[i] - - tracer = framework._dygraph_tracer() - tracer._has_grad = True - - # need restore auto_cast state as well as w/b list - with swith_rng_state_tracker(ctx.fwd_cuda_rng_state, - ctx.fwd_cuda_rng_state_tracker): - with paddle.amp.auto_cast(enable=ctx.is_fw_autocast, - custom_white_list=ctx.amp_white_list, - custom_black_list=ctx.amp_black_list, - level=ctx.amp_level): - detached_inputs = detach_variable(tuple(inputs)) - outputs = ctx.run_function(*detached_inputs) - - if isinstance(outputs, (core.VarBase, core.eager.Tensor)): - outputs = (outputs, ) - assert len(outputs) == len(args) - - forward_outputs_with_grad = [] - backward_inputs = [] - - for i in range(len(outputs)): - if isinstance( - outputs[i], - (core.VarBase, - core.eager.Tensor)) and not outputs[i].stop_gradient: - forward_outputs_with_grad.append(outputs[i]) - backward_inputs.append(args[i]) - - if len(forward_outputs_with_grad) == 0: - raise RuntimeError( - "none of output has stop_gradient=False, this recompute() is not necessary" - ) - - # actually backward - 
paddle.autograd.backward(forward_outputs_with_grad, backward_inputs) - grads = tuple(inp._grad_ivar() for inp in detached_inputs - if isinstance(inp, (core.VarBase, core.eager.Tensor))) - return grads - - -def _hp_recompute(function, *args): - # NODTE(shenliang03)The current hybrid parallel recompute has limitations. - # It cannot handle the following situations: - # 1. The calculation output of recompute, there are tensors that do not require gradients. - # 2. The forward output tensor has no gradient. This problem can be solved temporarily by detach(). - # 3. Here, we only use float dtype to distinguish whether a gradient is needed in output tensor - - all_outputs = [] - _HPRecomputeFunction.apply(function, all_outputs, *args) - - if len(all_outputs) == 1: - return all_outputs[0] - else: - for output in all_outputs: - if paddle.is_tensor(output) and not is_float_tensor(output): - output.stop_gradient = True - - return tuple(all_outputs) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py index 7bdbe2ce32e47b..beda2401b7573e 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py @@ -24,6 +24,8 @@ import copy import logging +import warnings + import numpy as np from collections import OrderedDict @@ -41,6 +43,7 @@ alignment = {"gpu": 256, "cpu": 4096} align = { Type.fp16.value: 2, + Type.bf16.value: 2, Type.fp32.value: 4, } @@ -86,6 +89,11 @@ def __init__(self, # Default information self._optim = optim + # sharing stage 2 comm overlap flag + self._reduce_overlap = False + # record the last task used for comm overlap for sharding stage 2 + self._comm_task = None + assert hasattr(self._optim, "_master_weights" ), "Must use optimizer with _master_weights attribute" @@ -103,6 +111,17 @@ def __init__(self, filter(lambda x: x.trainable and x.dtype == Type.fp16.value, self._local_params))) > 0 + self._broadcast_overlap = False + self._forward_pre_hook_remove_helper = [] + try: + # The fp32 params such as layer_norm_0.w_0 will be at the end of param_list. + # Have to sort the params to make sure all params are in the forward using order. + self._broadcast_order_params = sorted( + self.local_params, + key=lambda x: int(x.name.split('.')[0].split('_')[-1])) + except ValueError: + self._broadcast_order_params = None + self._group = new_group( _get_global_group().ranks) if group is None else group @@ -155,7 +174,61 @@ def _sync_params_and_buffers(self): broadcast(p, src=self._global_root_rank, group=self._group, - use_calc_stream=True) + sync_op=True) + + def _update_task(self, task): + if self._reduce_overlap: + assert task is not None + # Only track of the last reduce task. + # Since all tasks are on the same stream, only need to wait the last one. + # After waiting for the last reduce task, all reduce tasks before have already finished. + self._comm_task = task + + def _set_reduce_overlap(self, reduce_overlap): + # Enable gradients' reduces overlap with backward calculation. + self._reduce_overlap = reduce_overlap + + def _set_broadcast_overlap(self, + broadcast_overlap, + layers=None, + num_groups=None): + # Enable post optimizer broadcasts overlap with the forward calculation of next batch. 
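The _broadcast_order_params ordering above relies on Paddle's auto-generated parameter names carrying a creation index. A standalone illustration of that sort key is below; the names are made up to follow the <prefix>_<index>.<suffix> pattern the code expects.

# Illustration of the name-based sort key used for _broadcast_order_params; names are made up.
names = ["column_linear_32.w_0", "embedding_0.w_0", "row_linear_7.b_0", "layer_norm_2.w_0"]

def creation_index(name):
    # "column_linear_32.w_0" -> "column_linear_32" -> "32" -> 32
    return int(name.split('.')[0].split('_')[-1])

print(sorted(names, key=creation_index))
# ['embedding_0.w_0', 'layer_norm_2.w_0', 'row_linear_7.b_0', 'column_linear_32.w_0']

# A name without a trailing index, e.g. "myweight.w_0", makes int() raise ValueError,
# which is why the try/except above falls back to the unsorted parameter list.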
+ self._broadcast_overlap = broadcast_overlap + if self._broadcast_overlap: + assert layers is not None, \ + "To enable broadcast overlap forward, please pass the module to the function." + self._layers = layers + warnings.warn( + "Setting overlap broadcast means the `paddle.device.cuda.synchronize()` " + "must be called manually before calling `paddle.save()` and before and inference." + ) + if self._broadcast_order_params is None: + # Params' names should be like column_linear_32.w_0 patter to get the best performance. + warnings.warn( + "The param name passed to the optimizer doesn't follow .+_[0-9]+\..+ patter, " + "overlap broadcast may harm the performance.") + self._broadcast_order_params = self._local_params + + if num_groups is None or num_groups > len(self._broadcast_order_params): + warnings.warn( + "The num_groups for broadcast is larger than the number of params to be broadcast. " + "It will set to default value: 1 (use the default sharding group)." + ) + num_groups = 1 + + assert isinstance( + num_groups, + int) and num_groups > 0, "num_groups should be a positive integer" + + self._number_of_broadcast_groups = num_groups + self._broadcast_groups = [ + None for _ in range(self._number_of_broadcast_groups) + ] + self._broadcast_groups[0] = self._group + + ranks = self._group.ranks + for i in range(1, self._number_of_broadcast_groups): + self._broadcast_groups[i] = new_group(ranks) def _generate_master_params(self, trainable_params): if self.offload: @@ -222,7 +295,9 @@ def dtype_rank_params(self): """ if len(self._dtype_rank_params) == 0: # Assign the parameters of each rank according to the type - for param in self._local_params: + trainable_params = list( + filter(lambda x: x.trainable, self._local_params)) + for param in trainable_params: if param.dtype not in self._dtype_rank_params.keys(): self._dtype_rank_params[param.dtype] = [ [] for _ in range(self.world_size) @@ -364,6 +439,13 @@ def step(self): """ A wrapper for Optimizer's step function to finish the update operation of the optimizer. """ + # This method won't be called directly by opt.step()! + # The _redefine_opt_step() in class GroupShardedStage2 will wrap this function. + if self._broadcast_overlap: + # Clear the pre forward hook in the optimizer step. + for hook_remove in self._forward_pre_hook_remove_helper: + hook_remove.remove() + self._forward_pre_hook_remove_helper = [] if self.offload: params_list = [self.offload_params.buffer] @@ -408,9 +490,52 @@ def _broadcast_params(self): """Broadcast the parameters of the current rank to each rank""" # Exchange all the shards with the other ranks - for dtype_per_rank in self.param_storages.values(): - for dst_rank, internal_storage in dtype_per_rank.items(): - broadcast(tensor=internal_storage.buffer, - src=self._group.ranks[dst_rank], - group=self._group, - use_calc_stream=True) + if self._broadcast_overlap: + self._broadcast_params_overlap_forward() + else: + for dtype_per_rank in self.param_storages.values(): + for dst_rank, internal_storage in dtype_per_rank.items(): + broadcast(tensor=internal_storage.buffer, + src=self._group.ranks[dst_rank], + group=self._group, + sync_op=True) + + def _forward_pre_hook_function(self, tasks): + # Since the layers will call pre hook by `forward_pre_hook(self, inputs)`, + # the helper functions needs the x and y to take those params. + def __impl__(x, y): + for task in tasks: + # Wait for broadcast task before using the result of the broadcast. 
+ task.wait() + + return __impl__ + + @paddle.autograd.no_grad() + def _broadcast_params_overlap_forward(self): + # Exchange all the shards with the other ranks, + # but overlap the broadcast with next batch's calculation. + group_idx = 0 + + param2task = {} + for x in self._broadcast_order_params: + if x.trainable: + group = self._broadcast_groups[group_idx] + group_idx = (group_idx + 1) % self._number_of_broadcast_groups + task = broadcast(tensor=x, + src=group.ranks[self._param2rank[x.name]], + group=group, + sync_op=False) + assert x.name not in param2task + param2task[x.name] = task + + for layer in self._layers.sublayers(): + if len(layer.sublayers()) == 0: + # Register forward pre hood for leaf layers. This will get the best performance. + tasks = [] + for param in layer.parameters(): + if param.trainable: + if param.name in param2task: + tasks.append(param2task[param.name]) + self._forward_pre_hook_remove_helper.append( + layer.register_forward_pre_hook( + self._forward_pre_hook_function(tasks))) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py index 905af0487ba67f..3f3ab817e91461 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage2.py @@ -33,7 +33,7 @@ import paddle from paddle import nn from paddle.distributed import collective -from paddle.distributed.utils import get_logger +from paddle.distributed.utils.log_utils import get_logger from .group_sharded_storage import GradStorage from .group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 @@ -100,11 +100,15 @@ def __init__( for optim in self._sharding_optimizers: self._all_params.extend(list(optim.local_params)) - self._trainable_params = [] + # sharing stage 2 comm overlap flag + self._reduce_overlap = False + self._grad_reduced = [] self._trainable_param2rank = {} self._trainable_param2align = {} - self._trainable_mask = list(map(_trainable, self._all_params)) + self._trainable_params = list( + filter(lambda x: x.trainable, self._all_params)) + self._trainable_mask = list(map(_trainable, self._trainable_params)) self._param_grads = [] # Set grad storage size & Display param sizes and model sizes @@ -287,7 +291,7 @@ def __sync_buffers(self): collective.broadcast(buffer, self._global_root_rank, self._group, - use_calc_stream=True) + sync_op=True) def __getattr__(self, name): """Forward missing attributes to wrapped layer.""" @@ -306,6 +310,18 @@ def _clear_counters(self): for grad_storage in self._grad_storage_list: grad_storage.reset_checked_in() + def _set_reduce_overlap(self, reduce_overlap): + # Hacky way to not add an extra parameter to the `group_sharded_parallel` funct. + # User should use this like: + # model, optimizer, scaler = group_sharded_parallel(...) + # model._set_reduce_overlap(True) + self._reduce_overlap = reduce_overlap + if self._reduce_overlap: + assert len( + self._sharding_optimizers + ) == 1, "Only support comm overlap strategy for single optimizer" + self._sharding_optimizers[0]._set_reduce_overlap(reduce_overlap) + def _get_reduce_fn(self, index, param, dst_rank): """ There are two ways to reduce gradient. 
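The sync_op=not self._reduce_overlap pattern introduced in this file boils down to: launch each gradient reduce asynchronously as backward produces it, remember only the most recent task, and wait on that task once before the optimizer step (all reduces are issued on one communication stream, so the last one finishing implies the earlier ones have finished too). A condensed sketch, assuming paddle.distributed is already initialized and `group` is the sharding group:

# Condensed sketch of the reduce/backward overlap; assumes an initialized process group.
import paddle.distributed as dist

last_task = None

def reduce_grad_async(grad, dst_rank, group):
    # Called as each parameter's gradient becomes ready during backward.
    global last_task
    last_task = dist.reduce(tensor=grad, dst=dst_rank, group=group, sync_op=False)

def step_with_wait(optimizer):
    # Waiting on the most recent task is enough, since tasks share one comm stream.
    if last_task is not None:
        last_task.wait()
    optimizer.step()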
@@ -337,11 +353,12 @@ def cleanup(): del tmp_grad param.clear_gradient(False) - # Synchronize the reduce parameter gradient - collective.reduce(tensor=param.grad, - dst=self._group.ranks[dst_rank], - group=self._group) - # TODO (Baibaifan) Asynchronous the reduce parameter gradient + # Synchronize the reduce parameter gradient asynchronize + self._sharding_optimizers[0]._update_task( + collective.reduce(tensor=param.grad, + dst=self._group.ranks[dst_rank], + group=self._group, + sync_op=not self._reduce_overlap)) # Clear the task flow and trigger callback to clear the redundant gradient # self._clear_task_flow() @@ -385,12 +402,13 @@ def cleanup(): # Reduce the bucket grad_storage.sent = True - # Synchronize the reduce parameter gradient - collective.reduce( - tensor=grad_storage.buffer, - dst=self._group.ranks[grad_storage.destination], - group=self._group) - # TODO (Baibaifan) Asynchronous the reduce parameter gradient + # Synchronize the reduce parameter gradient asynchronize + self._sharding_optimizers[0]._update_task( + collective.reduce( + tensor=grad_storage.buffer, + dst=self._group.ranks[grad_storage.destination], + group=self._group, + sync_op=not self._reduce_overlap)) cleanup() @@ -471,7 +489,7 @@ def _setup_use_grad_storage(self): def _detect_train_change(self): # Current trainable parameters - trainable_mask = list(map(_trainable, self._all_params)) + trainable_mask = list(map(_trainable, self._trainable_params)) # Whether parameters trainability changed trainability_changed = trainable_mask != self._trainable_mask @@ -514,6 +532,12 @@ def _rank_buffer_size(self, buffer_max_size, model_size): "====== FP16 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======" .format(rank_buffer_size[Type.fp16.value] / 2**19, model_size / 2**19)) + if Type.bf16.value in rank_buffer_size.keys(): + # FP16 GradStorage and model size + logger_.info( + "====== BF16 GradStorage size: {:.2f}M parameters, Model size {:.2f}M parameters ======" + .format(rank_buffer_size[Type.bf16.value] / 2**19, + model_size / 2**19)) if Type.fp32.value in rank_buffer_size.keys(): # FP32 GradStorage and model size logger_.info( @@ -528,6 +552,10 @@ def _redefine_opt_step(self): opt_step = opt.step def _opt_step(self): + if self._reduce_overlap: + # Wait for the last reduce task. This wait must before grad scale function. 
+ assert self._comm_task is not None + self._comm_task.wait() grad_func() opt_step() diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py index abc5e0549ae668..b628378140f785 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_stage3.py @@ -181,7 +181,7 @@ def _sync_params_and_buffers(self): collective.broadcast(p, src=self._global_root_rank, group=self._group, - use_calc_stream=True) + sync_op=True) def _clear_gradients(self): assert len(self._trainable_params.keys()) > 0 @@ -446,7 +446,7 @@ def _sync_buffers(self): collective.broadcast(buffer, self._global_root_rank, self._group, - use_calc_stream=True) + sync_op=True) def __getattr__(self, name): """Forward missing attributes to wrapped layer.""" diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py index c44872491093ec..5b9ab7343f08ca 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_storage.py @@ -53,6 +53,8 @@ def __init__(self, size, dtype, device, convert_cpu=False): dtype=np.float16) if Type.fp16.value == dtype else np.zeros( size, dtype=np.float32) self.buffer = core.eager.Tensor(value=value, place=core.CPUPlace()) + if dtype == Type.bf16.value: + self.buffer = paddle.cast(self.buffer, dtype=paddle.bfloat16) else: self.buffer = paddle.zeros(size, dtype=dtype) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py index 8cff407363a3b7..7eb7b1e8784aa9 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py @@ -41,6 +41,7 @@ class Type(Enum): Type of trainable parameters """ fp16 = paddle.float16 + bf16 = paddle.bfloat16 fp32 = paddle.float32 diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py index 7834e6d93984e3..a08e67456e5e6f 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage2.py @@ -285,7 +285,7 @@ def __sync_buffers(self): dist.broadcast(buffer, self._global_root_rank, self._group, - use_calc_stream=True) + sync_op=True) # Multi stream operation will be supported later dist.wait(tensor=buffer, group=self._group, use_calc_stream=True) @@ -340,7 +340,7 @@ def cleanup(): tensor=param.grad, dst=self._group.ranks[dst_rank], group=self._group, - use_calc_stream=True), + sync_op=True), callback=cleanup)) # Multi stream operation will be supported later @@ -396,7 +396,7 @@ def cleanup(): tensor=grad_storage.buffer, dst=self._group.ranks[grad_storage.destination], group=self._group, - use_calc_stream=True), + sync_op=True), callback=cleanup)) # Multi stream operation will be supported later diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py index 67d48c8abba1b4..5e0c3743dd3f88 100644 --- 
a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_stage3.py @@ -170,7 +170,7 @@ def _sync_params_and_buffers(self): dist.broadcast(p, src=self._global_root_rank, group=self._group, - use_calc_stream=True) + sync_op=True) # Multi stream operation will be supported later dist.wait(tensor=p, group=self._group, use_calc_stream=True) @@ -435,7 +435,7 @@ def _sync_buffers(self): dist.broadcast(buffer, self._global_root_rank, self._group, - use_calc_stream=True) + sync_op=True) # Multi stream operation will be supported later dist.wait(tensor=buffer, group=self._group, use_calc_stream=True) @@ -478,7 +478,7 @@ def _update_params(self): grad_storage.buffer.scale_(scale=self._world_size_scaling) dist.all_reduce(tensor=grad_storage.buffer, group=self._group, - use_calc_stream=True) + sync_op=True) dist.wait(tensor=grad_storage.buffer, group=self._group, use_calc_stream=True) @@ -541,7 +541,7 @@ def allreduce_(*_): # Only support sync allreduce current rank's layer now dist.all_reduce(tensor=full_grad, group=self._group, - use_calc_stream=True) + sync_op=True) dist.wait(tensor=full_grad, group=self._group, use_calc_stream=True) diff --git a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py index d21502bcc16b88..42f43ce5377484 100644 --- a/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py +++ b/python/paddle/distributed/fleet/meta_parallel/sharding/sharding_utils.py @@ -45,6 +45,7 @@ class Type(Enum): Type of trainable parameters """ fp16 = paddle.float16 + bf16 = paddle.bfloat16 fp32 = paddle.float32 diff --git a/python/paddle/distributed/fleet/model.py b/python/paddle/distributed/fleet/model.py index fea2614fe84c3f..40633788f12d45 100644 --- a/python/paddle/distributed/fleet/model.py +++ b/python/paddle/distributed/fleet/model.py @@ -20,46 +20,9 @@ from .meta_parallel import TensorParallel, model_parallel_random_seed from .meta_parallel import PipelineParallel, ShardingParallel, PipelineParallelWithInterleave, PipelineLayer from paddle.fluid import core -from paddle.distributed.fleet.utils.recompute import LegacyRecomputeFunction from paddle.fluid.dygraph.varbase_patch_methods import _grad_scalar from paddle.distributed import fleet - -class _RecomputeModelWrapper(paddle.nn.Layer): - - def __init__(self, model, segments=2, preserve_rng_state=True): - super(_RecomputeModelWrapper, self).__init__() - assert isinstance(model, paddle.nn.Sequential), ( - "The model passed to RecomputeModelWrapper must be of type " - "paddle.nn.Sequential.") - self._model = model - self._segments = segments - self._preserve_rng_state = preserve_rng_state - self._layers = list(model.children()) - self._segment_size = len(self._layers) // segments - - def _run_func(self, begin, end): - - def do_run(input): - for i in range(begin, end): - input = self._layers[i](input) - return input - - return do_run - - def _checkpoint(self, func, *args, **kwargs): - return LegacyRecomputeFunction.apply(func, self._preserve_rng_state, - *args) - - def forward(self, input): - end = 0 - for begin in range(0, self._segment_size * (self._segments - 1), - self._segment_size): - end = begin + self._segment_size - input = self._checkpoint(self._run_func(begin, end), input) - return self._run_func(end, len(self._layers))(input) - - _grad_scalar = None @@ -125,7 +88,6 @@ def forward(self, x): return model amp_enable = False - 
recompute_enable = False strategy = fleet_env._user_defined_strategy if strategy.amp == True: amp_enable = True @@ -154,10 +116,6 @@ def forward(self, x): decr_every_n_nan_or_inf=decr_every_n_nan_or_inf, use_dynamic_loss_scaling=use_dynamic_loss_scaling) - if strategy.recompute == True: - recompute_enable = True - model = _RecomputeModelWrapper(model) - if strategy.heter_ccl_mode == True: distributed_model = paddle.DataParallel( model, diff --git a/python/paddle/distributed/fleet/optimizer.py b/python/paddle/distributed/fleet/optimizer.py index bfc3d737f99341..ddad6511a0a645 100644 --- a/python/paddle/distributed/fleet/optimizer.py +++ b/python/paddle/distributed/fleet/optimizer.py @@ -13,7 +13,6 @@ # limitations under the License. import copy -import warnings import paddle import os import numpy as np @@ -22,6 +21,7 @@ from .meta_optimizers import HybridParallelOptimizer, HeterParallelOptimizer from paddle.fluid import core from paddle.distributed import fleet +from .utils.log_util import logger def _dygraph_distributed_optimizer(optimizer, strategy=None): @@ -52,7 +52,7 @@ def _dygraph_distributed_optimizer(optimizer, strategy=None): if strategy is not None: if fleet_env._is_collective: - warnings.warn( + logger.warning( "It is recommended to use DistributedStrategy " "in fleet_env.init(). The strategy here is only for compatibility. " "If the strategy in fleet_env.distributed_optimizer() is " diff --git a/python/paddle/distributed/fleet/recompute/__init__.py b/python/paddle/distributed/fleet/recompute/__init__.py new file mode 100644 index 00000000000000..7e5bcdb1db2776 --- /dev/null +++ b/python/paddle/distributed/fleet/recompute/__init__.py @@ -0,0 +1,18 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
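The _RecomputeModelWrapper removed from python/paddle/distributed/fleet/model.py above is superseded by the recompute_sequential API re-exported from the new paddle.distributed.fleet.recompute package created below. A minimal usage sketch, assuming a GPU runtime (the default preserve_rng_state=True path requires one) and a purely illustrative paddle.nn.Sequential model:

import paddle
from paddle.distributed.fleet.recompute import recompute_sequential

# Five sub-layers, so {'segments': 2} splits the layer list into two chunks.
model = paddle.nn.Sequential(
    paddle.nn.Linear(10, 10),
    paddle.nn.ReLU(),
    paddle.nn.Linear(10, 10),
    paddle.nn.ReLU(),
    paddle.nn.Linear(10, 1),
)

x = paddle.rand([4, 10])
# Activations inside each checkpointed chunk are dropped in the forward pass
# and recomputed during backward to save memory.
out = recompute_sequential({'segments': 2, 'preserve_rng_state': True}, model, x)
out.mean().backward()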
+ +from .recompute import recompute, recompute_sequential +from .recompute_hybrid import recompute_hybrid + +__all__ = [] diff --git a/python/paddle/distributed/fleet/utils/recompute.py b/python/paddle/distributed/fleet/recompute/recompute.py similarity index 67% rename from python/paddle/distributed/fleet/utils/recompute.py rename to python/paddle/distributed/fleet/recompute/recompute.py index f0c74159488a78..e1d2db328d1796 100755 --- a/python/paddle/distributed/fleet/utils/recompute.py +++ b/python/paddle/distributed/fleet/recompute/recompute.py @@ -22,13 +22,7 @@ from paddle.fluid.framework import in_dygraph_mode import logging - -logger = logging.getLogger(__name__) -formatter = logging.Formatter(fmt='%(asctime)s %(levelname)-8s %(message)s', - datefmt='%Y-%m-%d %H:%M:%S') -ch = logging.StreamHandler() -ch.setFormatter(formatter) -logger.addHandler(ch) +from ..utils.log_util import logger __all__ = [] @@ -47,16 +41,23 @@ def detach_variable(inputs): def check_recompute_necessary(inputs): - if not any(input_.stop_gradient == False for input_ in inputs - if isinstance(input_, (core.eager.Tensor, paddle.Tensor))): - logger.warn( + if not any( + input_.stop_gradient == False + for input_ in inputs + if isinstance(input_, (core.eager.Tensor, paddle.Tensor)) + ): + logger.warning( "[Recompute]: None of the inputs to current recompute block need grad, " - "therefore there is NO need to recompute this block in backward !") + "therefore there is NO need to recompute this block in backward !" + ) @contextlib.contextmanager def swith_rng_state_tracker(rng_state, tracker): - from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import ( + get_rng_state_tracker, + ) + orig_cuda_rng_state = paddle.get_cuda_rng_state() orig_cuda_rng_tracker = get_rng_state_tracker().get_states_tracker() @@ -70,10 +71,11 @@ def swith_rng_state_tracker(rng_state, tracker): class LegacyRecomputeFunction(LegacyPyLayer): - @staticmethod def forward(ctx, run_function, preserve_rng_state, *args): - from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import ( + get_rng_state_tracker, + ) # store for recomputing ctx.run_function = run_function @@ -102,30 +104,37 @@ def forward(ctx, run_function, preserve_rng_state, *args): cur_device = paddle.get_device() if 'gpu:' not in cur_device: raise RuntimeError( - "Recompute with RNG perserve is not support current device: {}." 
- .format(cur_device)) + "Recompute with RNG perserve is not support current device: {}.".format( + cur_device + ) + ) ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state() - ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker( - ).get_states_tracker() + ctx.fwd_cuda_rng_state_tracker = ( + get_rng_state_tracker().get_states_tracker() + ) # TODO support AMP tracer = framework._dygraph_tracer() - ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True + ctx.is_fw_autocast = ( + False if tracer._amp_level == core.AmpLevel.O0 else True + ) if tracer._amp_level == core.AmpLevel.O2: ctx.amp_level = 'O2' elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0): ctx.amp_level = 'O1' else: - raise ValueError("unsupported amp level: {}".format( - tracer._amp_level)) + raise ValueError( + "unsupported amp level: {}".format(tracer._amp_level) + ) if tracer._amp_dtype == 'float16': ctx.amp_dtype = 'float16' elif tracer._amp_dtype in ('bfloat16', 'float32'): ctx.amp_dtype = 'bfloat16' else: - raise ValueError("unsupported amp dtype: {}".format( - tracer._amp_dtype)) + raise ValueError( + "unsupported amp dtype: {}".format(tracer._amp_dtype) + ) ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() @@ -135,7 +144,10 @@ def forward(ctx, run_function, preserve_rng_state, *args): @staticmethod def backward(ctx, *args): - from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import ( + get_rng_state_tracker, + ) + with paddle.fluid.dygraph.guard(): # TODO need to check the recompute calling is vaild or not @@ -153,27 +165,31 @@ def backward(ctx, *args): # NOTE support AMP # need restore auto_cast state as well as w/b list if ctx.preserve_rng_state: - with swith_rng_state_tracker(ctx.fw_cuda_rng_state, - ctx.fwd_cuda_rng_state_tracker): + with swith_rng_state_tracker( + ctx.fw_cuda_rng_state, ctx.fwd_cuda_rng_state_tracker + ): with paddle.amp.auto_cast( - enable=ctx.is_fw_autocast, - custom_white_list=ctx.amp_white_list, - custom_black_list=ctx.amp_black_list, - level=ctx.amp_level, - dtype=ctx.amp_dtype): + enable=ctx.is_fw_autocast, + custom_white_list=ctx.amp_white_list, + custom_black_list=ctx.amp_black_list, + level=ctx.amp_level, + dtype=ctx.amp_dtype, + ): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) else: - with paddle.amp.auto_cast(enable=ctx.is_fw_autocast, - custom_white_list=ctx.amp_white_list, - custom_black_list=ctx.amp_black_list, - level=ctx.amp_level, - dtype=ctx.amp_dtype): + with paddle.amp.auto_cast( + enable=ctx.is_fw_autocast, + custom_white_list=ctx.amp_white_list, + custom_black_list=ctx.amp_black_list, + level=ctx.amp_level, + dtype=ctx.amp_dtype, + ): detached_inputs = detach_variable(tuple(inputs)) outputs = ctx.run_function(*detached_inputs) if isinstance(outputs, core.VarBase): - outputs = (outputs, ) + outputs = (outputs,) assert len(outputs) == len(args) # run backward() with only tensor that requires grad @@ -184,8 +200,10 @@ def backward(ctx, *args): # the following backward_inputs_with_grad is used to avoid this case. 
backward_inputs_with_grad = [] for i in range(len(outputs)): - if isinstance(outputs[i], - core.VarBase) and not outputs[i].stop_gradient: + if ( + isinstance(outputs[i], core.VarBase) + and not outputs[i].stop_gradient + ): forward_outputs_with_grad.append(outputs[i]) backward_inputs_with_grad.append(args[i]) @@ -196,23 +214,29 @@ def backward(ctx, *args): # actually backward with paddle.amp.auto_cast(enable=False): - paddle.autograd.backward(forward_outputs_with_grad, - backward_inputs_with_grad) + paddle.autograd.backward( + forward_outputs_with_grad, backward_inputs_with_grad + ) - grads = list(inp._grad_ivar() for inp in detached_inputs - if isinstance(inp, core.VarBase)) + grads = list( + inp._grad_ivar() + for inp in detached_inputs + if isinstance(inp, core.VarBase) + ) return grads class RecomputeFunction(PyLayer): - @staticmethod - def forward(ctx, run_function, preserve_rng_state, *args): - from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker + def forward(ctx, run_function, preserve_rng_state, *args, **kwargs): + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import ( + get_rng_state_tracker, + ) # store for recomputing ctx.run_function = run_function ctx.preserve_rng_state = preserve_rng_state + ctx.kwargs = kwargs # NOTE the number of outputs of backward() should be equal to the number of tensors in forward()'s input # the order of tensors in backward()'s output should be the same as tensors in forward()'s input @@ -237,40 +261,50 @@ def forward(ctx, run_function, preserve_rng_state, *args): cur_device = paddle.get_device() if 'gpu:' not in cur_device: raise RuntimeError( - "Recompute with RNG perserve is not support current device: {}." - .format(cur_device)) + "Recompute with RNG perserve is not support current device: {}.".format( + cur_device + ) + ) ctx.fw_cuda_rng_state = paddle.get_cuda_rng_state() - ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker( - ).get_states_tracker() + ctx.fwd_cuda_rng_state_tracker = ( + get_rng_state_tracker().get_states_tracker() + ) # TODO support AMP tracer = framework._dygraph_tracer() - ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True + ctx.is_fw_autocast = ( + False if tracer._amp_level == core.AmpLevel.O0 else True + ) if tracer._amp_level == core.AmpLevel.O2: ctx.amp_level = 'O2' elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0): ctx.amp_level = 'O1' else: - raise ValueError("unsupported amp level: {}".format( - tracer._amp_level)) + raise ValueError( + "unsupported amp level: {}".format(tracer._amp_level) + ) if tracer._amp_dtype == 'float16': ctx.amp_dtype = 'float16' elif tracer._amp_dtype in ('bfloat16', 'float32'): ctx.amp_dtype = 'bfloat16' else: - raise ValueError("unsupported amp dtype: {}".format( - tracer._amp_dtype)) + raise ValueError( + "unsupported amp dtype: {}".format(tracer._amp_dtype) + ) ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() with paddle.no_grad(): - outputs = run_function(*args) + outputs = run_function(*args, **kwargs) return outputs @staticmethod def backward(ctx, *args): - from paddle.distributed.fleet.meta_parallel.parallel_layers.random import get_rng_state_tracker + from paddle.distributed.fleet.meta_parallel.parallel_layers.random import ( + get_rng_state_tracker, + ) + with paddle.fluid.dygraph.guard(): # TODO need to check the recompute calling is vaild or not @@ -288,27 +322,33 @@ def backward(ctx, *args): # NOTE support AMP # need restore auto_cast state as well as w/b 
list if ctx.preserve_rng_state: - with swith_rng_state_tracker(ctx.fw_cuda_rng_state, - ctx.fwd_cuda_rng_state_tracker): + with swith_rng_state_tracker( + ctx.fw_cuda_rng_state, ctx.fwd_cuda_rng_state_tracker + ): with paddle.amp.auto_cast( - enable=ctx.is_fw_autocast, - custom_white_list=ctx.amp_white_list, - custom_black_list=ctx.amp_black_list, - level=ctx.amp_level, - dtype=ctx.amp_dtype): + enable=ctx.is_fw_autocast, + custom_white_list=ctx.amp_white_list, + custom_black_list=ctx.amp_black_list, + level=ctx.amp_level, + dtype=ctx.amp_dtype, + ): detached_inputs = detach_variable(tuple(inputs)) - outputs = ctx.run_function(*detached_inputs) + outputs = ctx.run_function( + *detached_inputs, **ctx.kwargs + ) else: - with paddle.amp.auto_cast(enable=ctx.is_fw_autocast, - custom_white_list=ctx.amp_white_list, - custom_black_list=ctx.amp_black_list, - level=ctx.amp_level, - dtype=ctx.amp_dtype): + with paddle.amp.auto_cast( + enable=ctx.is_fw_autocast, + custom_white_list=ctx.amp_white_list, + custom_black_list=ctx.amp_black_list, + level=ctx.amp_level, + dtype=ctx.amp_dtype, + ): detached_inputs = detach_variable(tuple(inputs)) - outputs = ctx.run_function(*detached_inputs) + outputs = ctx.run_function(*detached_inputs, **ctx.kwargs) if isinstance(outputs, (core.VarBase, core.eager.Tensor)): - outputs = (outputs, ) + outputs = (outputs,) assert len(outputs) == len(args) # run backward() with only tensor that requires grad @@ -319,10 +359,10 @@ def backward(ctx, *args): # the following backward_inputs_with_grad is used to avoid this case. backward_inputs_with_grad = [] for i in range(len(outputs)): - if isinstance( - outputs[i], - (core.VarBase, - core.eager.Tensor)) and not outputs[i].stop_gradient: + if ( + isinstance(outputs[i], (core.VarBase, core.eager.Tensor)) + and not outputs[i].stop_gradient + ): forward_outputs_with_grad.append(outputs[i]) backward_inputs_with_grad.append(args[i]) @@ -333,17 +373,22 @@ def backward(ctx, *args): # actually backward with paddle.amp.auto_cast(enable=False): - paddle.autograd.backward(forward_outputs_with_grad, - backward_inputs_with_grad) + paddle.autograd.backward( + forward_outputs_with_grad, backward_inputs_with_grad + ) if in_dygraph_mode(): grads = tuple( - inp._grad_ivar() for inp in detached_inputs - if isinstance(inp, (core.VarBase, core.eager.Tensor))) + inp._grad_ivar() + for inp in detached_inputs + if isinstance(inp, (core.VarBase, core.eager.Tensor)) + ) else: grads = list( - inp._grad_ivar() for inp in detached_inputs - if isinstance(inp, (core.VarBase, core.eager.Tensor))) + inp._grad_ivar() + for inp in detached_inputs + if isinstance(inp, (core.VarBase, core.eager.Tensor)) + ) return grads @@ -352,13 +397,13 @@ def recompute(function, *args, **kwargs): recompute intermediate activations to save then memory. Parameters: - function(paddle.nn.Sequential): layer of sequence of layers that describes part of forward pass of the model - whose intermediate activations will be released to save memory in forward stage and will be recomputed - in backward stage for gradient calculation. - *args(Tensor): inputs to the function. - **kwargs(Dict): Kwargs should only contain the key-value pair of preserve_rng_state, which is used to - indicate whether to save the forward rng. If it is True, then the last forward rng value will be - restored when the forward recalculation of backpropagation is performed. 
The default + function(paddle.nn.Layer): layer of sequence of layers that describes part of forward pass of the model + whose intermediate activations will be released to save memory in forward stage and will be recomputed + in backward stage for gradient calculation. + *args(Tensor): inputs to the function. + **kwargs(Dict): Kwargs should only contain the key-value pair of preserve_rng_state, which is used to + indicate whether to save the forward rng. If it is True, then the last forward rng value will be + restored when the forward recalculation of backpropagation is performed. The default preserve_rng_state is True. Returns: @@ -367,13 +412,10 @@ def recompute(function, *args, **kwargs): Examples: .. code-block:: python - import numpy as np import paddle from paddle.distributed.fleet.utils import recompute import random - # required: gpu - def get_fc_block(block_idx, input_size, is_last=False): block_name = "block_" + str(block_idx) block = paddle.nn.Sequential( @@ -395,10 +437,7 @@ def get_fc_block(block_idx, input_size, is_last=False): block_name + "_fc_2", paddle.nn.Linear(input_size, input_size, bias_attr=False) ) - return block - - class Naive_fc_net(paddle.nn.Layer): def __init__(self, input_size=10, recompute_blocks=[1, 3], @@ -412,7 +451,6 @@ def __init__(self, input_size=10, self.runfunc3 = get_fc_block(3, input_size, is_last=False) self.runfunc4 = get_fc_block(4, input_size, is_last=True) self.total_func = [self.runfunc0, self.runfunc1, self.runfunc2, self.runfunc3, self.runfunc4] - def forward(self, inputs): nums = len(self.total_func) for i in range(nums): @@ -421,15 +459,12 @@ def forward(self, inputs): else: inputs = self.total_func[i](inputs) return inputs - def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): gen = paddle.seed(10) gen.manual_seed(10) - np.random.seed(10) random.seed(10) if cuda_state: paddle.set_cuda_rng_state(cuda_state) - batch_size, input_size = 1, 10 model = Naive_fc_net( input_size, @@ -440,37 +475,81 @@ def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): param_ = [] grad_ = [] for _ in range(5): - x_data = np.random.randn(batch_size, input_size).astype(np.float32) - x = paddle.to_tensor(x_data) + x = paddle.rand(shape=[batch_size, input_size], dtype="float32") y_pred = model(x) loss = y_pred.mean() - loss_.append(np.asarray(loss).tolist()) + loss_.append(loss.item()) loss.backward() optimizer.step() - param_.append(np.asarray(model.parameters()[9]).tolist()) - grad_.append(np.asarray(model.parameters()[3]._grad_ivar()).tolist()) + param_.append(model.parameters()[9]) + grad_.append(model.parameters()[3]._grad_ivar()) optimizer.clear_grad() - return loss_, param_, grad_ - cuda_state = paddle.get_cuda_rng_state() # without recompute loss_ref, param_ref, grad_ref = run_model( cuda_state, recompute_block=[] ) - loss, param, grad = run_model(cuda_state, recompute_block=[1, 2]) print("normal_loss: {}, recompute_loss: {}".format(loss_ref, loss)) # The result of the recompute_loss should be the same as the normal_loss. 
- """ # Hack to mix *args with **kwargs in a python 2.7-compliant way preserve = kwargs.pop('preserve_rng_state', True) - if kwargs: - raise ValueError("Unexpected keyword arguments: " + - ",".join(arg for arg in kwargs)) if framework._dygraph_tracer()._has_grad: check_recompute_necessary(args) - return RecomputeFunction.apply(function, preserve, *args) + return RecomputeFunction.apply(function, preserve, *args, **kwargs) + + +def recompute_sequential(ctx, functions, *args, **kwargs): + """ + recompute intermediate activations to save then memory for 'Sequential' models. + + Parameters: + ctx(dict): include 'segments' and 'preserve_rng_state' keys, the key 'segments' (int, default 1), represents the number of chunks to create in the model, + the key 'preserve_rng_state' (bool, optional, default=True) indicate whether to save the forward rng. If it is True, then the last forward rng value will be + restored when the forward recalculation of backpropagation is performed. and some keys such as 'mp_group', 'offload' and 'partition' are invalid here, + they are useful in 'recompute_hybrid' API. + functions(paddle.nn.Sequential): layer of sequence of layers that describes part of forward pass of the model + whose intermediate activations will be released to save memory in forward stage and will be recomputed + in backward stage for gradient calculation. + *args(Tensor): inputs(tuple) to the function. + **kwargs(Dict): inputs(dict) to the function. + + Returns: + Output of function on args and kwargs. + + Examples: + .. code-block:: python + + model = paddle.nn.Sequential(...) + input = recompute_sequential({'segments' : 1}, model, input) + """ + segments = ctx.get('segments', 1) + preserve_rng_state = ctx.get('preserve_rng_state', True) + + def _run_func(begin, end, funcs): + def do_run(input): + for i in range(begin, end + 1): + input = funcs[i](input) + return input + + return do_run + + if isinstance(functions, paddle.nn.Sequential): + functions = list(functions.children()) + + segment_size = len(functions) // segments + + end = -1 + for begin in range(0, segment_size * (segments - 1), segment_size): + end = begin + segment_size - 1 + args = recompute( + _run_func(begin, end, functions), + *args, + preserve_rng_state=preserve_rng_state, + **kwargs + ) + return _run_func(end + 1, len(functions) - 1, functions)(args) diff --git a/python/paddle/distributed/fleet/recompute/recompute_hybrid.py b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py new file mode 100644 index 00000000000000..4883cad2511bb8 --- /dev/null +++ b/python/paddle/distributed/fleet/recompute/recompute_hybrid.py @@ -0,0 +1,250 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
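Note that the chunking loop in recompute_sequential above only wraps the first segments - 1 chunks in recompute; the final chunk, which also absorbs the remainder when len(functions) is not evenly divisible by segments, runs without checkpointing. A short sketch of the equivalent manual calls for five layers and segments=2, assuming a GPU runtime since preserve_rng_state defaults to True:

import paddle
from paddle.distributed.fleet.recompute import recompute

layers = [paddle.nn.Linear(10, 10) for _ in range(5)]

def run_range(begin, end):
    # Runs layers[begin..end] inclusive, mirroring _run_func above.
    def do_run(inp):
        for i in range(begin, end + 1):
            inp = layers[i](inp)
        return inp
    return do_run

x = paddle.rand([4, 10])
x = recompute(run_range(0, 1), x)   # segment_size = 5 // 2 = 2; chunk 0 is checkpointed
out = run_range(2, 4)(x)            # final chunk runs normally, activations are kept
out.mean().backward()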
+ +import contextlib + +import paddle +from paddle import _C_ops, _legacy_C_ops +from paddle.fluid import core +from paddle.autograd import PyLayer +from paddle.fluid import framework +from ..meta_parallel.parallel_layers.random import get_rng_state_tracker +from paddle.fluid.framework import in_dygraph_mode +from paddle.distributed import fleet +from .recompute import check_recompute_necessary, detach_variable, swith_rng_state_tracker +from ..meta_parallel.pp_utils import utils + +__all__ = [] + + +def _split_activation(tensor, mp_group): + + mp_degree = mp_group.nranks + mp_rank = mp_group.rank + if mp_degree < 2: + return tensor + + tensor_numel = paddle.numel(tensor) + assert tensor_numel != 0, "can't recompute zero element" + assert tensor_numel % mp_degree == 0, "The capacity of the activation ({}) cannot be divisible by mp_degree({})".format( + tensor_numel, mp_degree) + + # use inplace operation to save memory + data = tensor.flatten_() + + part_size = tensor_numel // mp_degree + start = part_size * mp_rank + end = start + part_size + return data[start:end] + + +def _merge_activation(tensor, mp_group): + mp_degree = mp_group.nranks + mp_rank = mp_group.rank + if mp_degree < 2: + return tensor + + # adapt to new dygraph + tensor_shape = list(tensor.shape) + tensor_shape[0] *= mp_group.nranks + out = paddle.empty(tensor_shape, tensor.dtype) + task = mp_group.process_group.all_gather(tensor.cuda(), out) + task.wait() + return out + + +class _HPRecomputeFunction(PyLayer): + """ + Compared with paddle.distributed.fleet.utils.recompute, there are the following differences: + 1. In order to support PipeLineParallel, the input of recompute is modified to ensure that the input can be tuple type. + 2. Offload support for activation + 3. Support MP segmentation of activation to further reduce cuda memory + 4. 
Adapt to the random state of MP + """ + + @staticmethod + def forward(ctx, run_function, all_outputs, mp_group, offload, partition, + *args, **kwargs): + check_recompute_necessary(args) + + # store for recomputing + ctx.run_function = run_function + + ctx.kwargs = kwargs + + # store the rng states + ctx.fwd_cuda_rng_state = paddle.get_cuda_rng_state() + ctx.fwd_cuda_rng_state_tracker = get_rng_state_tracker( + ).get_states_tracker() + + # save config info + ctx.mp_group = mp_group + ctx.offload = offload + ctx.partition = partition + + # save input for backward + ctx.inputs = [] + ctx.tensor_indices = [] + ctx.tensor_shapes = [] + tensor_inputs = [] + + cur_device = paddle.get_device() + assert 'gpu:' in paddle.get_device( + ), "Recompute with RNG is not support current device: {}.".format( + cur_device) + + # TODO support AMP + tracer = framework._dygraph_tracer() + ctx.is_fw_autocast = False if tracer._amp_level == core.AmpLevel.O0 else True + if tracer._amp_level == core.AmpLevel.O2: + ctx.amp_level = 'O2' + elif tracer._amp_level in (core.AmpLevel.O1, core.AmpLevel.O0): + ctx.amp_level = 'O1' + else: + raise ValueError("unsupported amp level: {}".format( + tracer._amp_level)) + ctx.amp_white_list, ctx.amp_black_list = tracer._get_amp_op_list() + + with paddle.no_grad(): + outputs = run_function(*args, **kwargs) + + for i, arg in enumerate(args): + if paddle.is_tensor(arg): + state = arg.stop_gradient + if partition: + ctx.tensor_shapes.append(arg.shape) + partition = _split_activation(arg.detach(), + mp_group).clone() + # TODO(shenliang03) not use calculate stream to D2H to speed + arg = partition.cpu() if offload else partition + else: + arg = arg.cpu() if offload else arg + arg.stop_gradient = state + tensor_inputs.append(arg) + ctx.tensor_indices.append(i) + ctx.inputs.append(None) + else: + ctx.inputs.append(arg) + + ctx.save_for_backward(*tensor_inputs) + + if paddle.is_tensor(outputs): + all_outputs += [outputs] + return outputs + else: + all_outputs += outputs + return tuple(outputs) + + @staticmethod + def backward(ctx, *args): + with paddle.fluid.dygraph.guard(): + # Restore inputs + inputs = list(ctx.inputs) + tensor_indices = ctx.tensor_indices + tensor_shapes = ctx.tensor_shapes + tensors = list(ctx.saved_tensor()) + + device_id = paddle.distributed.ParallelEnv().device_id + for i, idx in enumerate(tensor_indices): + if ctx.partition: + state = tensors[i].stop_gradient + tensors[i] = _merge_activation( + tensors[i], + ctx.mp_group).detach().reshape_(tensor_shapes[i]) + tensors[i].stop_gradient = state + inputs[idx] = tensors[i].cuda( + device_id) if ctx.offload else tensors[i] + + tracer = framework._dygraph_tracer() + tracer._has_grad = True + + # need restore auto_cast state as well as w/b list + with swith_rng_state_tracker(ctx.fwd_cuda_rng_state, + ctx.fwd_cuda_rng_state_tracker): + with paddle.amp.auto_cast(enable=ctx.is_fw_autocast, + custom_white_list=ctx.amp_white_list, + custom_black_list=ctx.amp_black_list, + level=ctx.amp_level): + detached_inputs = detach_variable(tuple(inputs)) + outputs = ctx.run_function(*detached_inputs, **ctx.kwargs) + + if isinstance(outputs, (core.VarBase, core.eager.Tensor)): + outputs = (outputs, ) + assert len(outputs) == len(args) + + forward_outputs_with_grad = [] + backward_inputs = [] + + for i in range(len(outputs)): + if isinstance( + outputs[i], + (core.VarBase, + core.eager.Tensor)) and not outputs[i].stop_gradient: + forward_outputs_with_grad.append(outputs[i]) + backward_inputs.append(args[i]) + + if 
len(forward_outputs_with_grad) == 0: + raise RuntimeError( + "none of output has stop_gradient=False, this recompute() is not necessary" + ) + + # actually backward + paddle.autograd.backward(forward_outputs_with_grad, backward_inputs) + grads = tuple(inp._grad_ivar() for inp in detached_inputs + if isinstance(inp, (core.VarBase, core.eager.Tensor))) + return grads + + +def recompute_hybrid(ctx, function, *args, **kwargs): + """ + # NODTE(shenliang03)The current hybrid parallel recompute has limitations. + # It cannot handle the following situations: + # 1. The calculation output of recompute, there are tensors that do not require gradients. + # 2. The forward output tensor has no gradient. This problem can be solved temporarily by detach(). + # 3. Here, we only use float dtype to distinguish whether a gradient is needed in output tensor + + Parameters: + ctx(dict): include 'mp_group', 'offload', and 'partition' keys. the key 'mp_group' (Group), represents the avtivations are splitted + in which group. the key 'offload' (bool, optional, default=False), represents whether to offload to cpu. the key 'partition' (bool, optional, default=False), + represents whether to split activations in the mp_group. and some keys such as 'segments' and 'preserve_rng_state' are invalid here, they are useful in + 'recompute_sequential' API. + function(paddle.nn.Layer): layer of sequence of layers that describes part of forward pass of the model + whose intermediate activations will be released to save memory in forward stage and will be recomputed + in backward stage for gradient calculation. + *args(Tensor): inputs(tuple) to the function. + + **kwargs(Dict): inputs(dict) to the function. + + Returns: + Output of function on args and kwargs. + + """ + mp_group = ctx.get('mp_group', None) + assert mp_group is not None, "ctx must contains mp_group and mp_group can not be None." + + offload = ctx.get('offload', False) + partition = ctx.get('partition', False) + + all_outputs = [] + _HPRecomputeFunction.apply(function, all_outputs, mp_group, offload, + partition, *args, **kwargs) + + if len(all_outputs) == 1: + return all_outputs[0] + else: + for output in all_outputs: + if paddle.is_tensor(output) and not utils.is_float_tensor(output): + output.stop_gradient = True + + return tuple(all_outputs) diff --git a/python/paddle/distributed/fleet/utils/__init__.py b/python/paddle/distributed/fleet/utils/__init__.py index 1bf90a22e375c7..30afae2b432e56 100644 --- a/python/paddle/distributed/fleet/utils/__init__.py +++ b/python/paddle/distributed/fleet/utils/__init__.py @@ -15,11 +15,115 @@ from .fs import LocalFS # noqa: F401 from .fs import HDFSClient # noqa: F401 from .ps_util import DistributedInfer # noqa: F401 -from .recompute import recompute # noqa: F401 +import paddle.utils.deprecated as deprecated +from paddle.distributed import fleet +import paddle from . import log_util # noqa: F401 from . import hybrid_parallel_util # noqa: F401 -__all__ = [ #noqa - "LocalFS", "recompute", "DistributedInfer", "HDFSClient" -] +__all__ = ["LocalFS", "recompute", "DistributedInfer", "HDFSClient"] # noqa + + +def recompute(function, *args, **kwargs): + """ + recompute intermediate activations to save then memory. + Parameters: + function(paddle.nn.Layer): layer of sequence of layers that describes part of forward pass of the model + whose intermediate activations will be released to save memory in forward stage and will be recomputed + in backward stage for gradient calculation. + *args(Tensor): inputs to the function. 
+ **kwargs(Dict): Kwargs should only contain the key-value pair of preserve_rng_state, which is used to + indicate whether to save the forward rng. If it is True, then the last forward rng value will be + restored when the forward recalculation of backpropagation is performed. The default + preserve_rng_state is True. + Returns: + Output of function on args. + + Examples: + .. code-block:: python + + import paddle + from paddle.distributed.fleet.utils import recompute + import random + # required: gpu + def get_fc_block(block_idx, input_size, is_last=False): + block_name = "block_" + str(block_idx) + block = paddle.nn.Sequential( + (block_name + "_fc_0", paddle.nn.Linear(input_size, input_size, bias_attr=False)), + (block_name + "_dropout", paddle.nn.Dropout(p=0.5)), + (block_name + "_relu_1", paddle.nn.ReLU()), + (block_name + "_fc_1", paddle.nn.Linear(input_size, input_size, bias_attr=False)), + (block_name + "_relu_2", paddle.nn.ReLU()), + ) + if is_last: + block.add_sublayer( + block_name + "_fc_2", + paddle.nn.Linear( + input_size, 1, bias_attr=False + ) + ) + else: + block.add_sublayer( + block_name + "_fc_2", + paddle.nn.Linear(input_size, input_size, bias_attr=False) + ) + return block + class Naive_fc_net(paddle.nn.Layer): + def __init__(self, input_size=10, + recompute_blocks=[1, 3], + recompute_kwargs={}): + super(Naive_fc_net, self).__init__() + self.recompute_blocks = recompute_blocks + self.recompute_kwargs = recompute_kwargs + self.runfunc0 = get_fc_block(0, input_size, is_last=False) + self.runfunc1 = get_fc_block(1, input_size, is_last=False) + self.runfunc2 = get_fc_block(2, input_size, is_last=False) + self.runfunc3 = get_fc_block(3, input_size, is_last=False) + self.runfunc4 = get_fc_block(4, input_size, is_last=True) + self.total_func = [self.runfunc0, self.runfunc1, self.runfunc2, self.runfunc3, self.runfunc4] + def forward(self, inputs): + nums = len(self.total_func) + for i in range(nums): + if i in self.recompute_blocks: + inputs = recompute(self.total_func[i], inputs, **{"preserve_rng_state": True}) + else: + inputs = self.total_func[i](inputs) + return inputs + def run_model(cuda_state, recompute_block=[], recompute_kwargs={}): + gen = paddle.seed(10) + gen.manual_seed(10) + random.seed(10) + if cuda_state: + paddle.set_cuda_rng_state(cuda_state) + batch_size, input_size = 1, 10 + model = Naive_fc_net( + input_size, + recompute_blocks=recompute_block, + recompute_kwargs=recompute_kwargs) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + loss_ = [] + param_ = [] + grad_ = [] + for _ in range(5): + x = paddle.rand(shape=[batch_size, input_size], dtype="float32") + y_pred = model(x) + loss = y_pred.mean() + loss_.append(loss.item()) + loss.backward() + optimizer.step() + param_.append(model.parameters()[9]) + grad_.append(model.parameters()[3]._grad_ivar()) + optimizer.clear_grad() + return loss_, param_, grad_ + cuda_state = paddle.get_cuda_rng_state() + # without recompute + loss_ref, param_ref, grad_ref = run_model( + cuda_state, recompute_block=[] + ) + loss, param, grad = run_model(cuda_state, recompute_block=[1, 2]) + print("normal_loss: {}, recompute_loss: {}".format(loss_ref, loss)) + # The result of the recompute_loss should be the same as the normal_loss. 
+ """ + + return fleet.recompute.recompute(function, *args, **kwargs) diff --git a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py index c3b0693d7ebd0b..7e527eced3f041 100644 --- a/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py +++ b/python/paddle/distributed/fleet/utils/hybrid_parallel_util.py @@ -14,12 +14,15 @@ import os import six import numpy as np -import warnings from paddle import framework import paddle from paddle.fluid import core -from paddle.fluid.dygraph.parallel import _split_tensors, sync_params_buffers, build_groups +from paddle.fluid.dygraph.parallel import ( + _split_tensors, + sync_params_buffers, + build_groups, +) from paddle.fluid.framework import in_dygraph_mode, _in_legacy_dygraph from collections import OrderedDict from .log_util import logger @@ -27,7 +30,7 @@ __all__ = [] -def _apply_collective_grads(parameters, comm_group): +def _apply_collective_grads(parameters, comm_group, bucket_size, scale=None): grad_var_set = set() grad_vars = [] sparse_grad_vars = [] @@ -35,52 +38,70 @@ def _apply_collective_grads(parameters, comm_group): for param in parameters: if param.trainable and (param._grad_ivar() is not None): g_var = param._grad_ivar() - assert not g_var._is_sparse( + assert ( + not g_var._is_sparse() ), "Now, it doesn't support sparse parameters" grad_vars.append(g_var) assert g_var not in grad_var_set grad_var_set.add(g_var) - coalesced_grads_and_vars = build_groups(grad_vars, 128 * 1024 * 1024) + coalesced_grads_and_vars = build_groups(grad_vars, bucket_size) + + nranks = ( + paddle.distributed.get_world_size() + if comm_group is None + else comm_group.nranks + ) + + scale = nranks if scale is None else 1.0 / scale + scale = None if scale == 1.0 else scale - nranks = paddle.distributed.get_world_size( - ) if comm_group is None else comm_group.nranks for coalesced_grad, _, _ in coalesced_grads_and_vars: # need to div nranks - div_factor = paddle.to_tensor(nranks, dtype=coalesced_grad.dtype) - paddle.fluid.framework._dygraph_tracer().trace_op( - type="elementwise_div", - inputs={ - 'X': coalesced_grad, - 'Y': div_factor - }, - outputs={'Out': coalesced_grad}, - attrs={'axis': -1}) + if scale is not None: + div_factor = paddle.to_tensor(scale, dtype=coalesced_grad.dtype) + paddle.fluid.framework._dygraph_tracer().trace_op( + type="elementwise_div", + inputs={'X': coalesced_grad, 'Y': div_factor}, + outputs={'Out': coalesced_grad}, + attrs={'axis': -1}, + ) paddle.distributed.all_reduce(coalesced_grad, group=comm_group) _split_tensors(coalesced_grads_and_vars) -def _apply_collective_grads_eager(parameters, comm_group): +def _apply_collective_grads_eager( + parameters, comm_group, bucket_size, scale=None +): grad_var_set = set() grad_vars = [] for param in parameters: if param.trainable and (param._grad_ivar() is not None): g_var = param._grad_ivar() - assert not g_var.is_sparse( + assert ( + not g_var.is_sparse() ), "Now, it doesn't support sparse parameters" grad_vars.append(g_var) assert g_var not in grad_var_set grad_var_set.add(g_var) - coalesced_grads_and_vars = build_groups(grad_vars, 128 * 1024 * 1024) + coalesced_grads_and_vars = build_groups(grad_vars, bucket_size) + + nranks = ( + paddle.distributed.get_world_size() + if comm_group is None + else comm_group.nranks + ) + + scale = 1.0 / nranks if scale is None else scale + scale = None if scale == 1.0 else scale - nranks = paddle.distributed.get_world_size( - ) if comm_group is None else comm_group.nranks for 
coalesced_grad, _, _ in coalesced_grads_and_vars: # need to div nranks - coalesced_grad.scale_(1.0 / nranks) + if scale is not None: + coalesced_grad.scale_(scale) paddle.distributed.all_reduce(coalesced_grad, group=comm_group) _split_tensors(coalesced_grads_and_vars) @@ -92,20 +113,28 @@ def _broadcast_data_help(data, shape, dtype, hcg): mp_rank = hcg.get_model_parallel_rank() shape_gpu = paddle.to_tensor(shape, dtype="int32") - paddle.distributed.broadcast(shape_gpu, - src=src_rank, - group=model_parallel_group, - use_calc_stream=True) + paddle.distributed.broadcast( + shape_gpu, src=src_rank, group=model_parallel_group, sync_op=True + ) if mp_rank != 0: input_data = paddle.zeros(shape_gpu, dtype=dtype) else: input_data = data - paddle.distributed.broadcast(input_data, - src=src_rank, - group=model_parallel_group, - use_calc_stream=True) + paddle.distributed.broadcast( + input_data, src=src_rank, group=model_parallel_group, sync_op=True + ) + + if mp_rank != 0: + if in_dygraph_mode(): + data._clear_data() + input_data._share_buffer_to(data) + else: + data.value().get_tensor()._clear() + data.value().get_tensor()._share_data_with( + input_data.value().get_tensor() + ) def broadcast_input_data(hcg, *inputs, **kwargs): @@ -113,7 +142,14 @@ def broadcast_input_data(hcg, *inputs, **kwargs): for v in inputs: if isinstance(v, (core.VarBase, core.eager.Tensor)): with framework.no_grad(): - v = v.cuda() if "gpu" in cur_device else v + if ( + "gpu" in cur_device + and in_dygraph_mode() + and not v.place.is_gpu_place() + ): + v_gpu = v.cuda(int(cur_device.split(":")[1])) + v._clear_data() + v_gpu._share_buffer_to(v) _broadcast_data_help(v, v.shape, v.dtype, hcg) else: logger.error("it doesn't support data type {}".format(type(v))) @@ -121,7 +157,14 @@ def broadcast_input_data(hcg, *inputs, **kwargs): for k, v in kwargs.items(): if isinstance(v, (core.VarBase, core.eager.Tensor)): with framework.no_grad(): - v = v.cuda() if "gpu" in cur_device else v + if ( + "gpu" in cur_device + and in_dygraph_mode() + and not v.place.is_gpu_place() + ): + v_gpu = v.cuda(int(cur_device.split(":")[1])) + v._clear_data() + v_gpu._share_buffer_to(v) _broadcast_data_help(v, v.shape, v.dtype, hcg) kwargs[k] = v else: @@ -132,28 +175,35 @@ def broadcast_input_data(hcg, *inputs, **kwargs): def broadcast_mp_parameters(model, hcg): model_parallel_group = hcg.get_model_parallel_group() src_rank = hcg.get_model_parallel_group_src_rank() - sync_params_buffers(model, - model_parallel_group, - src_rank, - is_model_parallel=True) + sync_params_buffers( + model, model_parallel_group, src_rank, is_model_parallel=True + ) def broadcast_dp_parameters(model, hcg): data_parallel_group = hcg.get_data_parallel_group() src_rank = hcg.get_data_parallel_group_src_rank() - sync_params_buffers(model, - data_parallel_group, - src_rank, - is_model_parallel=False) + sync_params_buffers( + model, data_parallel_group, src_rank, is_model_parallel=False + ) + + +def fused_allreduce_gradients_with_group( + parameter_list, group, bucket_size=128 * 1024 * 1024, scale=None +): + apply_func = ( + _apply_collective_grads_eager + if in_dygraph_mode() + else _apply_collective_grads + ) + with framework.no_grad(): + apply_func(parameter_list, group, bucket_size, scale) def fused_allreduce_gradients(parameter_list, hcg): data_parallel_group = None if hcg is None else hcg.get_data_parallel_group() logger.debug("dp start fuse allreduce gradients") - apply_func = _apply_collective_grads_eager if in_dygraph_mode( - ) else _apply_collective_grads - with 
framework.no_grad(): - apply_func(parameter_list, data_parallel_group) + fused_allreduce_gradients_with_group(parameter_list, data_parallel_group) def sharding_reduce_gradients(parameter_list, hcg): @@ -170,7 +220,8 @@ def sharding_reduce_gradients(parameter_list, hcg): paddle.distributed.all_reduce( param.grad, group=hcg.get_sharding_parallel_group(), - use_calc_stream=True) + sync_op=True, + ) elif _in_legacy_dygraph(): g_var = param._grad_ivar() @@ -183,20 +234,20 @@ def sharding_reduce_gradients(parameter_list, hcg): outputs={'Out': g_var}, attrs={ 'ring_id': hcg.get_sharding_parallel_group().id, - 'use_calc_stream': True - }) + 'use_calc_stream': True, + }, + ) # grad / sharding_rank - div_factor = paddle.to_tensor(sharding_nrank, - dtype=g_var.dtype) + div_factor = paddle.to_tensor( + sharding_nrank, dtype=g_var.dtype + ) paddle.fluid.framework._dygraph_tracer().trace_op( type="elementwise_div", - inputs={ - 'X': g_var, - 'Y': div_factor - }, + inputs={'X': g_var, 'Y': div_factor}, outputs={'Out': g_var}, - attrs={'axis': -1}) + attrs={'axis': -1}, + ) def broadcast_sharding_parameters(model, hcg): @@ -204,7 +255,6 @@ def broadcast_sharding_parameters(model, hcg): logger.debug("sharding start init parameters sync") sharding_parallel_group = hcg.get_sharding_parallel_group() src_rank = hcg.get_sharding_parallel_group_src_rank() - sync_params_buffers(model, - sharding_parallel_group, - src_rank, - is_model_parallel=False) + sync_params_buffers( + model, sharding_parallel_group, src_rank, is_model_parallel=False + ) diff --git a/python/paddle/distributed/fleet/utils/log_util.py b/python/paddle/distributed/fleet/utils/log_util.py index cf90527c07fe49..6118d0264478b1 100644 --- a/python/paddle/distributed/fleet/utils/log_util.py +++ b/python/paddle/distributed/fleet/utils/log_util.py @@ -15,30 +15,50 @@ import logging import sys -__all__ = [] +from paddle.distributed.utils.log_utils import get_logger +logger = get_logger("INFO", __name__) -class LoggerFactory: - @staticmethod - def build_logger(name=None, level=logging.INFO): - assert name is not None, "name for logger should not be None" +def set_log_level(level): + """ + Set log level - formatter = logging.Formatter( - "%(asctime)s-%(levelname)s: " - "[%(filename)s:%(lineno)d:%(funcName)s] %(message)s") + Args: + level (str|int): a specified level - _logger = logging.getLogger(name) - _logger.setLevel(level) - _logger.propagate = False - handler = logging.StreamHandler(stream=sys.stderr) - handler.setFormatter(formatter) - handler.setLevel(level) - _logger.addHandler(handler) - return _logger + Example 1: + import paddle + import paddle.distributed.fleet as fleet + fleet.init() + fleet.setLogLevel("DEBUG") + Example 2: + import paddle + import paddle.distributed.fleet as fleet + fleet.init() + fleet.setLogLevel(1) -logger = LoggerFactory.build_logger(name="HybridParallel", level=logging.INFO) + """ + assert isinstance(level, (str, int)), "level's type must be str or int" + if isinstance(level, int): + logger.setLevel(level) + else: + logger.setLevel(level.upper()) + + +def get_log_level_code(): + """ + Return current log level code + """ + return logger.getEffectiveLevel() + + +def get_log_level_name(): + """ + Return current log level name + """ + return logging.getLevelName(get_log_level_code()) def layer_to_str(base, *args, **kwargs): diff --git a/python/paddle/distributed/metric/metrics.py b/python/paddle/distributed/metric/metrics.py index 08d185efd971aa..4029734545f9d4 100644 --- a/python/paddle/distributed/metric/metrics.py +++ 
b/python/paddle/distributed/metric/metrics.py @@ -16,7 +16,7 @@ import yaml import paddle.fluid as fluid import logging -from paddle.distributed.utils import get_logger +from paddle.distributed.utils.log_utils import get_logger __all__ = [] logger = get_logger(logging.INFO, name="metrics") diff --git a/python/paddle/distributed/parallel.py b/python/paddle/distributed/parallel.py index cb8baa220c7650..8c7187236d47c3 100644 --- a/python/paddle/distributed/parallel.py +++ b/python/paddle/distributed/parallel.py @@ -30,7 +30,9 @@ from paddle.fluid.dygraph import parallel_helper from paddle.distributed.fleet.launch_utils import check_backend from paddle.fluid.dygraph.parallel import ParallelEnv -from paddle.distributed.fleet.base.private_helper_function import wait_server_ready # noqa: F401 +from paddle.distributed.fleet.base.private_helper_function import ( + wait_server_ready, +) # noqa: F401 from paddle.distributed import collective from paddle.distributed.collective import _set_group_map from paddle.distributed.collective import _set_group_map_by_name @@ -43,6 +45,7 @@ from paddle.distributed.collective import _new_process_group_impl from paddle.distributed.collective import Group from paddle.distributed.collective import _set_group_map_backend +from paddle.distributed.communication.group import _add_new_group __all__ = [] @@ -62,6 +65,7 @@ def _get_global_parallel_env(): def _start_kv_server(port, http_server_d, size): from paddle.distributed.fleet.utils.http_server import KVServer + http_server = KVServer(int(port), size=size) http_server.start() wait_seconds = 3 @@ -72,10 +76,15 @@ def _start_kv_server(port, http_server_d, size): def _is_cpuonly(backend): check_backend(backend) - if (backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl'] and - (core.is_compiled_with_cuda() or core.is_compiled_with_xpu() - or core.is_compiled_with_npu() - or core.is_compiled_with_mlu())) or backend is 'xccl': + if ( + backend in ['auto', 'nccl', 'bkcl', 'hccl', 'heter', 'cncl'] + and ( + core.is_compiled_with_cuda() + or core.is_compiled_with_xpu() + or core.is_compiled_with_npu() + or core.is_compiled_with_mlu() + ) + ) or backend is 'xccl': # passes 'auto' and can use cuda or xpu, use the default logics. so return False return False @@ -86,16 +95,18 @@ def _is_cpuonly(backend): def _check_var_exists(var_name): var = os.environ.get(var_name, None) if var is None: - raise ValueError("paddle.distributed initialize error, " - "environment variable %s is needed, but not set." % - var_name) + raise ValueError( + "paddle.distributed initialize error, " + "environment variable %s is needed, but not set." % var_name + ) def init_parallel_env(): """ + Initialize parallel training environment in dynamic graph mode. - .. note:: + Note: Now initialize both `NCCL` and `GLOO` contexts for communication. Args: @@ -105,9 +116,10 @@ def init_parallel_env(): Returns: None - + Examples: .. code-block:: python + # required: gpu import paddle import paddle.nn as nn @@ -119,7 +131,7 @@ def __init__(self): super(LinearNet, self).__init__() self._linear1 = nn.Linear(10, 10) self._linear2 = nn.Linear(10, 1) - + def forward(self, x): return self._linear2(self._linear1(x)) @@ -140,7 +152,7 @@ def train(): outputs = dp_layer(inputs) labels = paddle.randn([10, 1], 'float32') loss = loss_fn(outputs, labels) - + loss.backward() adam.step() @@ -148,6 +160,7 @@ def train(): if __name__ == '__main__': dist.spawn(train) + """ # 0. 
get env & check world size @@ -166,15 +179,21 @@ def train(): backend = os.environ.get('PADDLE_DISTRI_BACKEND', 'auto') is_cpu_only = _is_cpuonly(backend) # 1. gpu xpu check, must be gpu or xpu, - if not (is_cpu_only or core.is_compiled_with_cuda() - or core.is_compiled_with_xpu() or core.is_compiled_with_npu() - or core.is_compiled_with_mlu()): + if not ( + is_cpu_only + or core.is_compiled_with_cuda() + or core.is_compiled_with_xpu() + or core.is_compiled_with_npu() + or core.is_compiled_with_mlu() + ): raise NotImplementedError( - "If you want to use CPU-only version, please use 'gloo' as backend") + "If you want to use CPU-only version, please use 'gloo' as backend" + ) if backend == "xccl": FLAGS_selected_custom_devices = 'FLAGS_selected_{}s'.format( - parallel_env.device_type) + parallel_env.device_type + ) _check_var_exists(FLAGS_selected_custom_devices) else: if not is_cpu_only and core.is_compiled_with_cuda(): @@ -202,8 +221,9 @@ def train(): # they need to call a function to change default place, # here just set correctly place to users if backend == "xccl": - place = core.CustomPlace(parallel_env.device_type, - parallel_env.device_id) + place = core.CustomPlace( + parallel_env.device_type, parallel_env.device_id + ) elif is_cpu_only: place = core.CPUPlace() elif core.is_compiled_with_cuda(): @@ -227,11 +247,15 @@ def train(): assert rank >= 0 and world_size > rank and world_size > 1, ( "rank must be non-negative and world_size must be the " "maximum rank plus one. Moreover, at least two processes are " - "required to create a process group.") + "required to create a process group." + ) master_addr = os.getenv("MASTER_ADDR", None) master_port = os.getenv("MASTER_PORT", None) - endpoints = ":".join([master_addr, master_port - ]) if master_addr and master_port else None + endpoints = ( + ":".join([master_addr, master_port]) + if master_addr and master_port + else None + ) if endpoints is None: endpoints = os.getenv("PADDLE_MASTER", None) if endpoints is None: @@ -240,33 +264,34 @@ def train(): "The environment variable 'MASTER_ADDR' and 'MASTER_PORT' " "must be specified, for example 'export MASTER_ADDR=127.0.0.1' " "and 'export MASTER_ADDR=54612'. Or you can start your training" - "with paddle.distributed.run module.") + "with paddle.distributed.run module." 
+ ) master_addr, master_port = endpoints.split(":") master_port = int(master_port) is_master = rank == 0 stop_check_timeout = int(os.getenv("FLAGS_stop_check_timeout", "900")) - default_store = core.TCPStore(master_addr, - master_port, - is_master, - world_size, - timeout=stop_check_timeout) + default_store = core.TCPStore( + master_addr, + master_port, + is_master, + world_size, + timeout=stop_check_timeout, + ) _set_default_store(default_store) - pg = _new_process_group_impl(backend, - default_store, - rank, - world_size, - _default_group_name, - pg_options=None) + pg = _new_process_group_impl( + backend, + default_store, + rank, + world_size, + _default_group_name, + pg_options=None, + ) ranks = list(range(world_size)) - group = Group(rank, - world_size, - id=0, - ranks=ranks, - pg=pg, - name=_default_group_name) + group = Group(rank, 0, ranks, pg=pg, name=_default_group_name) _set_group_map_by_name(_default_group_name, group) _set_group_map(0, group) _set_group_map_backend(group, backend) + _add_new_group(group) parallel_helper._set_parallel_ctx(True) paddle.distributed.barrier(group=group) @@ -286,8 +311,10 @@ def train(): size = {'_worker': parallel_env.world_size} if backend == "heter": size = {'_worker': len(node_num)} - http_server = Process(target=_start_kv_server, - args=(int(ep_rank_0[1]), http_server_d, size)) + http_server = Process( + target=_start_kv_server, + args=(int(ep_rank_0[1]), http_server_d, size), + ) http_server.daemon = True http_server_d["running"] = True http_server.start() @@ -305,22 +332,28 @@ def train(): # init nccl or hccl or bkcl or heter context if is_cpu_only: parallel_helper._set_parallel_ctx( - core.GLOOParallelContext(strategy, place)) - elif (backend == "heter"): + core.GLOOParallelContext(strategy, place) + ) + elif backend == "heter": parallel_helper._set_parallel_ctx( - core.HeterParallelContext(strategy, parallel_env.device_id)) + core.HeterParallelContext(strategy, parallel_env.device_id) + ) elif core.is_compiled_with_cuda(): parallel_helper._set_parallel_ctx( - core.NCCLParallelContext(strategy, place)) + core.NCCLParallelContext(strategy, place) + ) elif core.is_compiled_with_xpu(): parallel_helper._set_parallel_ctx( - core.BKCLParallelContext(strategy, place)) + core.BKCLParallelContext(strategy, place) + ) elif core.is_compiled_with_npu(): parallel_helper._set_parallel_ctx( - core.HCCLParallelContext(strategy, place)) + core.HCCLParallelContext(strategy, place) + ) elif core.is_compiled_with_mlu(): parallel_helper._set_parallel_ctx( - core.CNCLParallelContext(strategy, place)) + core.CNCLParallelContext(strategy, place) + ) if backend != "heter": other_endpoints = strategy.trainer_endpoints[:] @@ -359,47 +392,65 @@ def train(): return group -def get_rank(): +def get_rank(group=None): """ - Returns the rank of current trainer. + Returns the rank of current trainer in the given group, ranks are consecutive integers in [0, ``world_size``). + If none of the group is given, the global group will be used as default. - Its value is equal to the value of the environment variable ``PADDLE_TRAINER_ID`` . - The default value is 0. + Args: + group (Group, optional): The communication group you want to get rank of current trainer, use global group as default if group is None. Returns: - (int) The rank of current trainer. + (int) The rank of current trainer in the given group. Return -1 if the process is not part of the given group. + + Warning: + Argument ``group`` only supports in dygraph mode. Examples: .. 
code-block:: python + # Execute this script using distributed launch with one card configs. import paddle import paddle.distributed as dist - # execute this command in terminal: export PADDLE_TRAINER_ID=0 + dist.init_parallel_env() print("The rank is %d" % dist.get_rank()) # The rank is 0 """ + if in_dygraph_mode() and group: + return group.rank + + assert group is None, "Only support group argument in eager mode." return _get_global_parallel_env().rank -def get_world_size(): +def get_world_size(group=None): """ - Returns the number of trainers (number of processes participating in current job). + Returns the number of trainers (number of processes participating in current job) in the given group. + If none of the group is given, the global group will be used as default. - Its value is equal to the value of the environment variable ``PADDLE_TRAINERS_NUM`` . - The default value is 1. + Args: + group (Group, optional): The communication group you want to check world size, use global group as default if group is None. Returns: - (int) The number of trainers. + (int) The number of trainers in the given group. Return -1 if the process if not part of the given group. + + Warning: + Argument ``group`` only supports in dygraph mode. Examples: .. code-block:: python + # Execute this script using distributed launch with one card configs. import paddle import paddle.distributed as dist - # execute this command in terminal: export PADDLE_TRAINERS_NUM=4 + dist.init_parallel_env() print("The world_size is %d" % dist.get_world_size()) - # The world_size is 4 + # The world_size is 1 """ + if in_dygraph_mode() and group: + return group.world_size + + assert group is None, "Only support group argument in eager mode." return _get_global_parallel_env().world_size diff --git a/python/paddle/distributed/passes/__init__.py b/python/paddle/distributed/passes/__init__.py index 5f721a1df50df6..1d1dab90cf31ad 100644 --- a/python/paddle/distributed/passes/__init__.py +++ b/python/paddle/distributed/passes/__init__.py @@ -22,6 +22,7 @@ from .auto_parallel_quantization import * from .auto_parallel_data_parallel_optimization import * from .auto_parallel_grad_clip import * +from .auto_parallel_pipeline import * from .cpp_pass import * import os from .ps_trainer_pass import * diff --git a/python/paddle/distributed/passes/auto_parallel_amp.py b/python/paddle/distributed/passes/auto_parallel_amp.py index d97209f7fe5c51..9e0aaa64485548 100644 --- a/python/paddle/distributed/passes/auto_parallel_amp.py +++ b/python/paddle/distributed/passes/auto_parallel_amp.py @@ -26,6 +26,7 @@ from paddle.fluid.contrib.mixed_precision.fp16_utils import _valid_types, find_true_post_op, find_true_prev_op from paddle.fluid.contrib.mixed_precision.fp16_utils import _is_in_black_varnames, _dtype_to_str, _rename_arg from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute +from ..auto_parallel.utils import is_forward_op, is_backward_op, is_loss_op world_process_group = get_world_process_group() @@ -37,14 +38,18 @@ def __init__(self, block): self._op_fp16_dict = { } # op_id --> True/False. 'True' means that the current op is in fp16 mode. 
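# --- Illustrative sketch (not part of this patch). It shows the tri-state
# convention behind _op_fp16_dict / _is_fp16_op in this class: True means the
# op runs in fp16, False means it stays in fp32, and a missing entry (None)
# means the op was never marked, so no cast is inserted for it. The op ids and
# helper names below are made up for illustration.
_example_op_fp16_dict = {101: True, 102: False}

def _example_is_fp16_op(op_id):
    return _example_op_fp16_dict.get(op_id, None)

for op_id in (101, 102, 103):
    flag = _example_is_fp16_op(op_id)
    if flag is True:
        print(op_id, "-> cast fp32 inputs to fp16")
    elif flag is False:
        print(op_id, "-> keep fp32, cast fp16 inputs back")
    else:
        print(op_id, "-> unmarked, leave untouched")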
self._var_name_dict = {} # fwd_op_id --> {old_name: cast_name} + self.is_train = False def _is_fp16_op(self, op_id): return self._op_fp16_dict.get(op_id, None) - def _build_stats(self, amp_lists, dist_context): + def _build_state(self, amp_lists, dist_context): ops = self._block.ops dist_op_context = dist_context.dist_op_context for op in ops: + if int(op.attr('op_role')) == 257: + self.is_train = True + if int(op.attr('op_role')) == int(OpRole.Forward): self._mark_black_white_ops(amp_lists) elif int(op.attr('op_role')) == int(OpRole.Backward): @@ -58,6 +63,8 @@ def _build_stats(self, amp_lists, dist_context): elif int(op.attr('op_role')) == int(OpRole.Optimize): break + return self.is_train + def _mark_black_white_ops(self, amp_lists): """ this function is modified from paddle.fluid.contrib.mixed_precision @@ -222,21 +229,33 @@ def cast_backward_program(self, params_grads, dist_context): loss_op = get_loss_op(self._block) loss_op_index = find_op_index(self._block.desc, loss_op.desc) + appended_grad_times = 0 idx = loss_op_index + 1 while idx < len(ops): num_cast_ops = 0 grad_op = ops[idx] + + # NOTE: the map in `grad_var_to_var` may be changed when the var is casted, + # which will affect the dist_op to insert allreduce_sum op. + op_dist_attr = dist_context.get_op_dist_attr_for_program(grad_op) + if is_backward_op(grad_op) and (is_forward_op(ops[idx - 1]) + or is_loss_op(ops[idx - 1])): + if not op_dist_attr.is_recompute: + appended_grad_times += 1 + grad_op_orig_id = grad_op.desc.original_id() dist_op_context = dist_context.dist_op_context if grad_op_orig_id in dist_op_context.grad_op_id_to_op_id: if self._is_fp16_op(grad_op_orig_id) == False: # fp32 num_cast_ops = self._insert_cast_op_backward( grad_op, idx, core.VarDesc.VarType.FP16, - core.VarDesc.VarType.FP32, dist_context) + core.VarDesc.VarType.FP32, dist_context, + appended_grad_times) elif self._is_fp16_op(grad_op_orig_id) == True: # fp16 num_cast_ops = self._insert_cast_op_backward( grad_op, idx, core.VarDesc.VarType.FP32, - core.VarDesc.VarType.FP16, dist_context) + core.VarDesc.VarType.FP16, dist_context, + appended_grad_times) elif grad_op.type == "sum": in_var_name = grad_op.desc.input_arg_names()[0] src_dtype = self._block.var(in_var_name).dtype @@ -258,7 +277,7 @@ def cast_backward_program(self, params_grads, dist_context): _update_backward_cast_ops(params_grads, dist_context) def _insert_cast_op_backward(self, grad_op, idx, src_dtype, dst_dtype, - dist_context): + dist_context, appended_grad_times): """ only for backward cast """ def _keep_fp32_input(op, in_name): @@ -301,7 +320,9 @@ def _keep_fp32_output(op, out_name): consume_op_attr.set_input_dist_attr( cast_name, in_var_dist_attr) else: - assert in_var.dtype == dst_dtype + assert in_var.dtype == dst_dtype, "op [{}] expect input [{}] to be dtype [{}] BUT got [{}]. 
{}".format( + grad_op.type, in_name, dst_dtype, in_var.dtype, + str(grad_op)) for out_name in grad_op.output_names: if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_output( @@ -328,7 +349,10 @@ def _keep_fp32_output(op, out_name): grad_op) fwd_cast_name = self._var_name_dict[fwd_op_id][ out_var_name_prefix] - cast_name = fwd_cast_name + "@GRAD" + suffix = "" + if "@RENAME" in out_var_name: + suffix = out_var_name[out_var_name.find("@RENAME"):] + cast_name = fwd_cast_name + "@GRAD" + suffix cast_var = self._block.vars.get(cast_name) if cast_var is None or cast_var.dtype != dst_dtype: grad_op.desc._rename_output(out_var_name, cast_name) @@ -347,6 +371,8 @@ def _keep_fp32_output(op, out_name): stop_gradient=out_var.stop_gradient) set_var_dist_attr(dist_context, cast_var, ref_mapping, ref_mesh) + dist_op_context.grad_var_to_var[ + appended_grad_times][cast_name] = fwd_cast_name cast_op = self._block._insert_op( idx + 1, @@ -491,9 +517,11 @@ def __init__(self): self.set_attr("use_dynamic_loss_scaling", False) self.set_attr("input_data", []) self.set_attr("params_grads", []) + self._loss = None self._loss_scaling = None self._num_good_steps = None self._num_bad_steps = None + self._loss = None def _check_self(self): if self.get_attr("init_loss_scaling") < 0: @@ -526,23 +554,25 @@ def _apply_single_impl(self, main_program, startup_program, context): set(self.get_attr("custom_black_list")), set(self.get_attr("custom_black_varnames"))) - amp_state = AMPState(main_program.global_block()) - amp_state._build_stats(amp_lists, self.dist_context) - with paddle.static.program_guard(main_program, startup_program): + amp_state = AMPState(main_program.global_block()) + is_train = amp_state._build_state(amp_lists, self.dist_context) + amp_state.cast_forward_program(self.dist_context) - amp_state.cast_backward_program(params_grads, self.dist_context) - # TODO (JZ-LIANG)support cast forward program only when inference - self._init_amp_var() - self._scale_loss() - - if self.get_attr("use_dynamic_loss_scaling" - ) or self.get_attr("init_loss_scaling") != 1.0: - grads, found_inf = _check_and_update_gradient( - params_grads, self._loss_scaling, self.dist_context) - - if self.get_attr("use_dynamic_loss_scaling"): - self._update_loss_scaling(grads, found_inf) + + if is_train: + with paddle.static.program_guard(main_program, startup_program): + amp_state.cast_backward_program(params_grads, self.dist_context) + self._init_amp_var() + self._scale_loss() + + if self.get_attr("use_dynamic_loss_scaling" + ) or self.get_attr("init_loss_scaling") != 1.0: + grads, found_inf = _check_and_update_gradient( + params_grads, self._loss_scaling, self.dist_context) + + if self.get_attr("use_dynamic_loss_scaling"): + self._update_loss_scaling(grads, found_inf) def _init_amp_var(self): self._loss_scaling = paddle.static.create_global_var( diff --git a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py index 3c5403c8254b9a..70592e8b38037d 100644 --- a/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py +++ b/python/paddle/distributed/passes/auto_parallel_data_parallel_optimization.py @@ -13,12 +13,14 @@ # limitations under the License. 
from collections import OrderedDict +import numpy as np import paddle +from paddle.fluid import core, unique_name from paddle.fluid.framework import default_main_program -from paddle.distributed.fleet.meta_optimizers.common import OpRole +from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY from paddle.distributed.auto_parallel.operators.common import is_data_parallel_scale_op, is_data_parallel_reduce_op -from paddle.distributed.auto_parallel.utils import is_loss_grad_op, is_optimize_op, ring_id_to_process_group +from paddle.distributed.auto_parallel.utils import is_loss_grad_op, is_optimize_op, is_backward_op, ring_id_to_process_group, find_higher_order_backward_op from .pass_base import PassBase, PassType, register_pass # add new optimizers supporting rescale_grad here @@ -31,6 +33,10 @@ __max_stream_num_allow__ = 16 +def numel(var): + return np.prod(list(var.shape)) + + @register_pass("auto_parallel_data_parallel_optimization") class DataParallelOptimizationPass(PassBase): """ @@ -45,6 +51,7 @@ def __init__(self): # NOTE not use depence on loss and param_grads self.set_attr("dist_context", None) self.set_attr("global_rank", -1) + self.set_attr("use_sharding", False) # {grad1: group1, grad2: group1, grad3: group2} # record the order for fuse grad data memory self._grad_name_to_group_map = OrderedDict() @@ -71,12 +78,17 @@ def _apply_single_impl(self, main_program, startup_program, context): self.dist_context = self.get_attr("dist_context") self.global_rank = int(self.get_attr("global_rank")) + self.use_sharding = self.get_attr("use_sharding") with paddle.static.program_guard(main_program, startup_program): self._analyze_program() - self._prune_grad_scaling() - self._calc_comm_overlap() - self._fuse_allreduce() + + if self.is_data_parallel_applied(): + self._prune_grad_scaling() + self._calc_comm_overlap() + grad_group = self._fuse_allreduce() + + # self.summary(grad_group) def _prune_grad_scaling(self): @@ -97,7 +109,14 @@ def _calc_comm_overlap(self): self._calc_wait_comms() def _fuse_allreduce(self): - pass + + if not self._could_be_fuse(): + return [] + + grad_group = self._group_grads() + self._update_program(grad_group) + + return grad_group def _analyze_program(self): """ @@ -150,9 +169,12 @@ def _analyze_program(self): ) == 0, "Unexception: gradients [{}] is scaled BUT NOT synchronized.".format( not_synchronized_grads) + def is_data_parallel_applied(self): + return len(self._group_to_grad_name_map) > 0 + def _could_be_prune(self): - return self.dist_context._gradient_scale and ( + return self.dist_context.gradient_scale and ( self._support_rescale_grad or self._all_dp_groups_same_degree()) def _all_dp_groups_same_degree(self): @@ -224,7 +246,8 @@ def _could_be_overlap(self): num_dp_comm_stream = len(set(self._group_to_grad_name_map.keys())) if num_dp_comm_stream > __max_stream_num_allow__: return False - + if self.use_sharding: + return False return True def _comms_overlap_calc(self): @@ -313,3 +336,252 @@ def _calc_wait_comms(self): 'op_role': OpRole.Backward, 'ring_id': ring_id }) + + def _could_be_fuse(self): + # TODO support gradient fuse higher order gradient. + # should analyse the dependencies of gradient in backward. + if find_higher_order_backward_op(default_main_program()): + return False + if self.use_sharding: + return False + return True + + def _group_grads(self): + """ + conditions for gradients to be grouped: + 1. group size < max_fuse_numel + 2. same dp group + 3. same dtype + 4. 
dependency: grad would NOT be used by other ops within group segment + + gradients inside same group would be fuse into one coalesce tensor + """ + + block = default_main_program().global_block() + ops = block.ops + + # group individual grad vars + # TODO consider fuse gradient for sharding reduce + # TODO let user to set fuse_grad_size + # emb = 50000 * h, ffn = 8 * h * h, mha = 4 * h * h + h = 2048 + ffn_numel = 2 * (4 * h) * h + mha_numel = 3 * h * h + h * h + max_fuse_numel = ffn_numel + mha_numel + grad_groups = [] + cur_group = GradientsGroup(ops, max_fuse_numel) + grouped_grad_names = set() + + def collect_group(cur_group, grad_var, ring_id, i): + if len(cur_group.gradients) == 0: + cur_group = None + elif len(cur_group.gradients) == 1: + grouped_grad_names.remove(cur_group.gradients[0].name) + else: + cur_group.finalize() + grad_groups.append(cur_group) + + new_group = GradientsGroup(ops, max_fuse_numel) + if grad_var: + new_group.add(grad_var, ring_id, i) + grouped_grad_names.add(grad_var.name) + return new_group + + def op_depend_on_group(op, group): + vars_ = set(op.input_arg_names + op.output_arg_names) + grad_names = set([grad.name for grad in group.gradients]) + return len(vars_.intersection(grad_names)) > 0 + + for i, op in enumerate(ops): + if is_data_parallel_reduce_op(op): + ring_id = op.attr("ring_id") + grad_name = op.output_arg_names[0] + grad_var = block.var(grad_name) + grad_numel = numel(grad_var) + + if cur_group.acceptable(grad_var, ring_id): + assert grad_name not in grouped_grad_names + grouped_grad_names.add(grad_name) + cur_group.add(grad_var, ring_id, i) + else: + cur_group = collect_group(cur_group, grad_var, ring_id, i) + else: + if op_depend_on_group(op, cur_group): + cur_group = collect_group(cur_group, None, None, None) + + # collect last group + collect_group(cur_group, None, None, None) + + return grad_groups + + def _update_program(self, grad_groups): + + block = default_main_program().global_block() + + remove_op_types = ['scale', 'c_allreduce_sum', 'c_wait_compute'] + + for i, group in enumerate(grad_groups[::-1]): + + # create coalecse tensor + group.coalesce_var = block.create_var(name=unique_name.generate( + 'coalecse_grad_{}'.format(i)), + dtype=group.dtype, + persistable=False, + stop_gradient=True) + + # update allreduce & scale op + if group.scale_op_idx != -1: + scale_op = block.ops[group.scale_op_idx] + assert scale_op.type == 'scale', "should found scale op but found {}".format( + str(scale_op)) + scale_op._rename_input(scale_op.input_arg_names[0], + group.coalesce_var.name) + scale_op._rename_output(scale_op.output_arg_names[0], + group.coalesce_var.name) + + allreduce_op = block.ops[group.allreduce_op_idx] + assert allreduce_op.type == 'c_allreduce_sum', "should found c_allreduce_sum op but found {}".format( + str(allreduce_op)) + allreduce_op._rename_input(allreduce_op.input_arg_names[0], + group.coalesce_var.name) + allreduce_op._rename_output(allreduce_op.output_arg_names[0], + group.coalesce_var.name) + + # remvoe un-used op + remove_op_indices = group.remove_wait_op_indices + group.remove_allreduce_op_indices + group.remove_scale_op_indices + for idx in sorted(remove_op_indices, reverse=True): + assert block.ops[ + idx].type in remove_op_types, "Unexception: try to remove op {}".format( + str(op)) + block._remove_op(idx) + + # insert coalecse op + concated_shapes = [] + concated_ranks = [] + for grad_ in group.gradients: + shape = grad_.shape + concated_shapes.extend(shape) + concated_ranks.append(len(shape)) + + grad_names = 
[grad.name for grad in group.gradients] + block._insert_op_without_sync(group.coalesce_op_idx, + type="coalesce_tensor", + inputs={"Input": grad_names}, + outputs={ + "Output": grad_names, + "FusedOutput": group.coalesce_var + }, + attrs={ + "copy_data": False, + "use_align": True, + "dtype": group.dtype, + "concated_shapes": + concated_shapes, + "concated_ranks": concated_ranks, + OP_ROLE_KEY: OpRole.Backward + }) + + block._sync_with_cpp() + # TODO update dist attr + + def summary(self, grad_groups=[]): + # TODO: add logger module + import logging + self._logger = logging.getLogger() + self._logger.propagate = False + if not self._logger.handlers: + self._logger.setLevel(logging.INFO) + log_handler = logging.StreamHandler() + log_format = logging.Formatter( + '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s' + ) + log_handler.setFormatter(log_format) + self._logger.addHandler(log_handler) + + if len(grad_groups) > 0: + self._logger.info( + "origin {} allreduce ops are fused into {} coalecse allreduce ops." + .format(len(self._grad_name_to_group_map.keys()), + len(grad_groups))) + self._logger.info("gradient fusing group are following: ") + fused_grads = set() + for i, group in enumerate(grad_groups): + self._logger.info( + "coalecse gradient [{}] is composed by: {}".format( + i, [grad.name for grad in group.gradients])) + fused_grads.update([grad.name for grad in group.gradients]) + individual_grads = set( + self._grad_name_to_group_map.keys()) - set(fused_grads) + self._logger.info( + "the following [{}] gradients are not fused: ".format( + len(individual_grads))) + self._logger.info("individual gradient {}".format(individual_grads)) + + +class GradientsGroup(object): + + def __init__(self, ops, max_group_size): + self.max_group_size = max_group_size + self.ops = ops + + self.gradients = [] + self.numel = 0 + self.dtype = None + self.ring_id = None + self.coalesce_var = None + self.coalesce_op_idx = -1 + self.allreduce_op_idx = -1 + self.scale_op_idx = -1 + self.remove_wait_op_indices = [] + self.remove_allreduce_op_indices = [] + self.remove_scale_op_indices = [] + + def acceptable(self, grad_var, ring_id): + if len(self.gradients) == 0: + return True + if ring_id != self.ring_id: + return False + if numel(grad_var) + self.numel > self.max_group_size: + return False + if grad_var.dtype != self.dtype: + return False + + return True + + def add(self, grad_var, ring_id, i): + self.gradients.append(grad_var) + self.ring_id = ring_id + self.dtype = grad_var.dtype + self.numel += numel(grad_var) + + # remove auxiliary ops in non-fuse dp allreduce + self.remove_allreduce_op_indices.append(i) + + # NOTE this pass rely on the original synchronization add in previous passes + # (same stream or calc_wait_comm & comm_wait_calc) + # to guarantee the correctness of comm_calc execution order. + # so the calc_wait_comm should be keep. + grad_op_idx = i - 1 + if i > 0 and self.ops[i - 1].type == 'c_wait_compute': + self.remove_wait_op_indices.append(i - 1) + grad_op_idx -= 1 + if i + 1 < len(self.ops) and is_data_parallel_scale_op(self.ops[i - 1]): + self.remove_scale_op_indices.append(i + 1) + + if len(self.gradients) == 1: + # TODO Remove this is a temporary hack for Tensor Parallel. the logic + # for find grad_op should be more general. 
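# --- Illustrative sketch (not part of this patch). It replays the grouping
# rules listed in the _group_grads docstring on toy data: gradients are packed
# into a group while they share the same ring_id (dp group) and dtype and the
# summed numel stays under the cap. The dependency check (rule 4) is omitted
# here, and the ExampleGrad records are hypothetical.
from collections import namedtuple
import numpy as np

ExampleGrad = namedtuple("ExampleGrad", ["name", "shape", "dtype", "ring_id"])

def _example_numel(g):
    return int(np.prod(g.shape))

def _example_group_grads(grads, max_fuse_numel):
    groups, cur, cur_numel = [], [], 0
    for g in grads:
        fits = (not cur
                or (g.ring_id == cur[-1].ring_id
                    and g.dtype == cur[-1].dtype
                    and cur_numel + _example_numel(g) <= max_fuse_numel))
        if fits:
            cur.append(g)
            cur_numel += _example_numel(g)
        else:
            groups.append(cur)
            cur, cur_numel = [g], _example_numel(g)
    if cur:
        groups.append(cur)
    return groups

example_grads = [ExampleGrad("w0@GRAD", (1024, 1024), "fp16", 0),
                 ExampleGrad("w1@GRAD", (1024, 1024), "fp16", 0),
                 ExampleGrad("b0@GRAD", (1024,), "fp32", 0)]
print([[g.name for g in grp]
       for grp in _example_group_grads(example_grads, 4 * 1024 * 1024)])
# [['w0@GRAD', 'w1@GRAD'], ['b0@GRAD']]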
+ if self.ops[grad_op_idx].type == "c_allreduce_sum": + grad_op_idx -= 1 + + grad_op = self.ops[grad_op_idx] + assert grad_var.name in grad_op.output_arg_names, "grad [{}] should be output of {}".format( + grad_var.name, str(grad_op)) + self.coalesce_op_idx = grad_op_idx + + def finalize(self): + self.allreduce_op_idx = self.remove_allreduce_op_indices.pop() + if len(self.remove_wait_op_indices) > 1: + self.remove_wait_op_indices.pop() + if len(self.remove_scale_op_indices) > 1: + self.scale_op_idx = self.remove_scale_op_indices.pop() diff --git a/python/paddle/distributed/passes/auto_parallel_fp16.py b/python/paddle/distributed/passes/auto_parallel_fp16.py index f65b7591e59727..34684c6ca41800 100644 --- a/python/paddle/distributed/passes/auto_parallel_fp16.py +++ b/python/paddle/distributed/passes/auto_parallel_fp16.py @@ -16,15 +16,35 @@ import paddle from paddle.framework import core +from paddle.fluid.framework import default_main_program, default_startup_program from paddle.fluid import unique_name from .pass_base import register_pass from paddle.fluid.data_feeder import check_variable_and_dtype, check_type -from paddle.distributed.auto_parallel.utils import set_var_dist_attr, naive_set_dist_op_attr_for_program_by_mesh_and_mapping -from paddle.distributed.auto_parallel.process_group import get_world_process_group -from paddle.fluid.contrib.mixed_precision.fp16_utils import AutoMixedPrecisionLists -from paddle.fluid.contrib.mixed_precision.fp16_utils import _keep_layer_norm_scale_bias_to_fp32, _need_keep_fp32, _valid_types, _dtype_to_str -from paddle.distributed.auto_parallel.dist_attribute import OperatorDistributedAttribute -from paddle.distributed.auto_parallel.utils import is_forward_op, is_backward_op, OP_ROLE_KEY, OpRole +from paddle.distributed.auto_parallel.utils import ( + set_var_dist_attr, + naive_set_dist_op_attr_for_program_by_mesh_and_mapping, +) +from paddle.distributed.auto_parallel.process_group import ( + get_world_process_group, +) +from paddle.fluid.contrib.mixed_precision.fp16_utils import ( + AutoMixedPrecisionLists, +) +from paddle.fluid.contrib.mixed_precision.fp16_utils import ( + _keep_layer_norm_scale_bias_to_fp32, + _need_keep_fp32, + _valid_types, + _dtype_to_str, +) +from paddle.distributed.auto_parallel.dist_attribute import ( + OperatorDistributedAttribute, +) +from paddle.distributed.auto_parallel.utils import ( + is_forward_op, + is_backward_op, + OP_ROLE_KEY, + OpRole, +) from .auto_parallel_amp import AMPPass world_process_group = get_world_process_group() @@ -38,11 +58,15 @@ def set_op_dtype_to_fp16(op): - if op.has_attr('in_dtype') and op.attr( - 'in_dtype') == core.VarDesc.VarType.FP32: + if ( + op.has_attr('in_dtype') + and op.attr('in_dtype') == core.VarDesc.VarType.FP32 + ): op._set_attr('in_dtype', core.VarDesc.VarType.FP16) - if op.has_attr('out_dtype') and op.attr( - 'out_dtype') == core.VarDesc.VarType.FP32: + if ( + op.has_attr('out_dtype') + and op.attr('out_dtype') == core.VarDesc.VarType.FP32 + ): op._set_attr('out_dtype', core.VarDesc.VarType.FP16) if op.has_attr('dtype') and op.attr('dtype') == core.VarDesc.VarType.FP32: op._set_attr('dtype', core.VarDesc.VarType.FP16) @@ -62,7 +86,12 @@ def _keep_fp32_input(op, in_name): return in_name not in {'X', 'FilterX', 'Z', 'FilterZ'} if op_type in ['fused_attention', 'fused_feedforward']: return in_name in { - 'LnScale', 'LnBias', 'Ln2Scale', 'Ln2Bias', "Ln1Scale", "Ln1Bias" + 'LnScale', + 'LnBias', + 'Ln2Scale', + 'Ln2Bias', + "Ln1Scale", + "Ln1Bias", } # backward if op_type in 
['batch_norm_grad']: @@ -82,8 +111,12 @@ def _keep_fp32_output(op, out_name): return out_name not in {'Y', 'ConvX', 'ConvZ'} if op_type in ['fused_attention', 'fused_feedforward']: return out_name in { - 'LnMean', 'LnVariance', 'Ln2Mean', 'Ln2Variance', 'Ln1Mean', - 'Ln1Variance' + 'LnMean', + 'LnVariance', + 'Ln2Mean', + 'Ln2Variance', + 'Ln1Mean', + 'Ln1Variance', } # backward if op_type in ['layer_norm_grad']: @@ -94,24 +127,28 @@ def _keep_fp32_output(op, out_name): class FP16State(object): - - def __init__(self, - program, - amp_list, - dist_context, - use_fp16_guard, - input_data_var_names=None): + def __init__( + self, + program, + amp_list, + dist_context, + use_fp16_guard, + input_data_var_names=None, + ): self.program = program self.amp_list = amp_list self.use_fp16_guard = use_fp16_guard self.dist_context = dist_context - self.grad_op_to_op_map = self.dist_context.dist_op_context.grad_op_id_to_op_id + self.grad_op_to_op_map = ( + self.dist_context.dist_op_context.grad_op_id_to_op_id + ) if input_data_var_names: self.input_data_var_names = input_data_var_names else: self.input_data_var_names = [] - self._op_fp16_dict = { - } # op_id --> True/False. 'True' means that the op is should run in fp16 mode. + self._op_fp16_dict = ( + {} + ) # op_id --> True/False. 'True' means that the op is should run in fp16 mode. # a trick to determine leaf tensor node in program {varname: generator_op_id} self.forward_non_leaf_tensors = {} # record the cast ops that are inserted for a forward @@ -125,7 +162,7 @@ def _is_fp16_op(self, op_id): def _build_state(self): """ - mark the execution mode (fp16 or fp32) for ops in all blocks + mark the execution mode (fp16 or fp32) for ops in all blocks include forward ops & backward ops """ # mark op dtype @@ -155,8 +192,9 @@ def _mark_op(self, op): if op.type == "assign" and "array_" in op.input_arg_names[0]: self._op_fp16_dict[op.desc.original_id()] = False return - if _need_keep_fp32(op, self.amp_list.unsupported_list, - self.use_fp16_guard): + if _need_keep_fp32( + op, self.amp_list.unsupported_list, self.use_fp16_guard + ): self._op_fp16_dict[op.desc.original_id()] = False else: self._op_fp16_dict[op.desc.original_id()] = True @@ -169,8 +207,9 @@ def _mark_op(self, op): if op.desc.original_id() in self.grad_op_to_op_map: fwd_op_id = self.grad_op_to_op_map[op.desc.original_id()] assert fwd_op_id in self._op_fp16_dict, "{}".format(str(op)) - self._op_fp16_dict[ - op.desc.original_id()] = self._op_fp16_dict[fwd_op_id] + self._op_fp16_dict[op.desc.original_id()] = self._op_fp16_dict[ + fwd_op_id + ] if int(op.attr('op_role')) == 257: self.is_train = True @@ -180,7 +219,8 @@ def set_var_to_fp16(self, var_name, block): try: var = block.var(var_name) except ValueError as e: - var = self.program.global_block().var(var_name) + var = block._var_recursive(var_name) + # var = self.program.global_block().var(var_name) # NOTE(JZ-LIANG) "array_" is a hack to adopt for ernie3.0 inference, since there is # a trick which make the LOD_TENSOR_ARRAY to the float32 in while block to reset the LOD_TENSOR_ARRAY @@ -195,13 +235,18 @@ def resolute_tensor_dtype(self, block): for op in block.ops: if is_forward_op(op): # NOTE (JZ-LIANG) un-expected cast op when user call "+, -, *, /" in python - if self._is_fp16_op(op.desc.original_id()) == True \ - or op.type == "cast": + if ( + self._is_fp16_op(op.desc.original_id()) == True + or op.type == "cast" + ): for in_name in op.input_names: if _keep_fp32_input(op, in_name): continue for in_var_name in op.input(in_name): - if in_var_name 
not in self.forward_non_leaf_tensors and in_var_name not in self.input_data_var_names: + if ( + in_var_name not in self.forward_non_leaf_tensors + and in_var_name not in self.input_data_var_names + ): self.set_var_to_fp16(in_var_name, block) for out_name in op.output_names: if _keep_fp32_output(op, out_name): @@ -247,22 +292,42 @@ def cast_block(self, block): elif is_forward_op(op): if self._is_fp16_op(op.desc.original_id()) == False: num_cast_ops = self._insert_forward_cast_ops( - op, idx, block, core.VarDesc.VarType.FP16, - core.VarDesc.VarType.FP32, self.dist_context) + op, + idx, + block, + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.FP32, + self.dist_context, + ) elif self._is_fp16_op(op.desc.original_id()) == True: num_cast_ops = self._insert_forward_cast_ops( - op, idx, block, core.VarDesc.VarType.FP32, - core.VarDesc.VarType.FP16, self.dist_context) + op, + idx, + block, + core.VarDesc.VarType.FP32, + core.VarDesc.VarType.FP16, + self.dist_context, + ) elif is_backward_op(op): if op.desc.original_id() in dist_op_context.grad_op_id_to_op_id: if self._is_fp16_op(op.desc.original_id()) == False: num_cast_ops = self._insert_backward_cast_ops( - op, idx, block, core.VarDesc.VarType.FP16, - core.VarDesc.VarType.FP32, self.dist_context) + op, + idx, + block, + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.FP32, + self.dist_context, + ) elif self._is_fp16_op(op.desc.original_id()) == True: num_cast_ops = self._insert_backward_cast_ops( - op, idx, block, core.VarDesc.VarType.FP32, - core.VarDesc.VarType.FP16, self.dist_context) + op, + idx, + block, + core.VarDesc.VarType.FP32, + core.VarDesc.VarType.FP16, + self.dist_context, + ) elif op.type == "sum": # all inputs dtype of sum should be equal and output dtype should follow input out_var_name = op.output_arg_names[0] @@ -270,41 +335,51 @@ def cast_block(self, block): out_var = block.var(out_var_name) in_var = block._find_var_recursive(in_var_name) for in_var_name in op.input_arg_names: - assert in_var.dtype == block.var( - in_var_name).dtype, "{}, {}, {}".format( - in_var, block.var(in_var_name), str(op)) + assert ( + in_var.dtype == block.var(in_var_name).dtype + ), "{}, {}, {}".format( + in_var, block.var(in_var_name), str(op) + ) out_var.desc.set_dtype(in_var.dtype) idx += num_cast_ops + 1 block._sync_with_cpp() - def _insert_forward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, - dist_context): + def _insert_forward_cast_ops( + self, op, idx, block, src_dtype, dst_dtype, dist_context + ): num_cast_ops = 0 for in_name in op.input_names: if src_dtype == core.VarDesc.VarType.FP32 and _keep_fp32_input( - op, in_name): + op, in_name + ): continue consume_op_attr = dist_context.get_op_dist_attr_for_program(op) assert consume_op_attr is not None for in_var_name in op.input(in_name): in_var = block._find_var_recursive(in_var_name) - if in_var is None or in_var.type not in _valid_types or in_var.dtype == dst_dtype: + if ( + in_var is None + or in_var.type not in _valid_types + or in_var.dtype == dst_dtype + ): continue if in_var.dtype == src_dtype: - cast_name = in_var.name + '.cast_' + _dtype_to_str( - dst_dtype) + cast_name = ( + in_var.name + '.cast_' + _dtype_to_str(dst_dtype) + ) cast_var = block.vars.get(cast_name) self.forward_input_cast_ops[op.desc.original_id()] += [ (cast_name, in_var.name, dst_dtype, src_dtype, in_name) ] in_var_dist_attr = consume_op_attr.get_input_dist_attr( - in_var.name) + in_var.name + ) assert in_var_dist_attr is not None # truly insert cast op if cast_var is None or cast_var.dtype != 
dst_dtype: @@ -318,9 +393,11 @@ def _insert_forward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, name=cast_name, dtype=dst_dtype, persistable=False, - stop_gradient=in_var.stop_gradient) - set_var_dist_attr(dist_context, cast_var, ref_mapping, - ref_mesh) + stop_gradient=in_var.stop_gradient, + ) + set_var_dist_attr( + dist_context, cast_var, ref_mapping, ref_mesh + ) cast_op = block._insert_op_without_sync( idx, @@ -330,23 +407,27 @@ def _insert_forward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, attrs={ "in_dtype": in_var.dtype, "out_dtype": cast_var.dtype, - OP_ROLE_KEY: OpRole.Forward - }) + OP_ROLE_KEY: OpRole.Forward, + }, + ) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( - cast_op, ref_mesh, ref_mapping, dist_context) + cast_op, ref_mesh, ref_mapping, dist_context + ) num_cast_ops += 1 op._rename_input(in_var.name, cast_name) - consume_op_attr.set_input_dist_attr(cast_name, - in_var_dist_attr) + consume_op_attr.set_input_dist_attr( + cast_name, in_var_dist_attr + ) if op.has_attr('out_dtype') and op.attr('out_dtype') != -1: assert op.attr('out_dtype') == dst_dtype return num_cast_ops - def _insert_backward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, - dist_context): + def _insert_backward_cast_ops( + self, op, idx, block, src_dtype, dst_dtype, dist_context + ): num_cast_ops = 0 op_id = op.desc.id() @@ -362,15 +443,21 @@ def _insert_backward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, if _keep_fp32_output(op, out_var.name): continue assert out_var.dtype == dst_dtype, "{}, {}".format( - str(out_var), dst_dtype) + str(out_var), dst_dtype + ) - for cast_name, src_name, dst_dtype, src_dtype, slot_name in self.forward_input_cast_ops[ - forward_op_id]: + for ( + cast_name, + src_name, + dst_dtype, + src_dtype, + slot_name, + ) in self.forward_input_cast_ops[forward_op_id]: # rename input assert src_name in op.input( - slot_name), "var: {} not in op's {}. {}".format( - src_name, slot_name, str(op)) + slot_name + ), "var: {} not in op's {}. 
{}".format(src_name, slot_name, str(op)) src_var_dist_attr = grad_op_attr.get_input_dist_attr(src_name) assert src_var_dist_attr is not None op._rename_input(src_name, cast_name) @@ -379,6 +466,10 @@ def _insert_backward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, # create cast grad grad_slot_name = slot_name + "@GRAD" assert grad_slot_name in op.output_names + if len(op.output(grad_slot_name)) == 0: + var = block.var(src_name) + assert var.stop_gradient is True + continue assert len(op.output(grad_slot_name)) == 1 grad_name = op.output(grad_slot_name)[0] grad = block.var(grad_name) @@ -388,15 +479,18 @@ def _insert_backward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, ref_mapping = grad_dist_attr.dims_mapping cast_grad = block.create_var( - name=unique_name.generate_with_ignorable_key("".join( - [cast_name, '@GRAD'])), + name=unique_name.generate_with_ignorable_key( + "".join([cast_name, '@GRAD']) + ), dtype=dst_dtype, shape=grad.shape, type=grad.type, persistable=grad.persistable, - stop_gradient=grad.stop_gradient) + stop_gradient=grad.stop_gradient, + ) dist_context.set_tensor_dist_attr_for_program( - cast_grad, grad_dist_attr) + cast_grad, grad_dist_attr + ) op._rename_output(grad_name, cast_grad.name) grad_op_attr.set_output_dist_attr(cast_grad.name, grad_dist_attr) @@ -409,12 +503,14 @@ def _insert_backward_cast_ops(self, op, idx, block, src_dtype, dst_dtype, attrs={ "in_dtype": dst_dtype, "out_dtype": src_dtype, - OP_ROLE_KEY: OpRole.Backward - }) + OP_ROLE_KEY: OpRole.Backward, + }, + ) grad.desc.set_dtype(src_dtype) naive_set_dist_op_attr_for_program_by_mesh_and_mapping( - cast_op, ref_mesh, ref_mapping, dist_context) + cast_op, ref_mesh, ref_mapping, dist_context + ) num_cast_ops += 1 return num_cast_ops @@ -427,26 +523,34 @@ def _check_and_update_gradient(grads, loss_scaling, name, dist_context): check_type(grads, 'x', (tuple, list), 'check_finite_and_unscale') for e in grads: - check_variable_and_dtype(e, "x", ['float16', 'float32', 'float64'], - 'check_finite_and_unscale') + check_variable_and_dtype( + e, + "x", + ['float16', 'float32', 'float64'], + 'check_finite_and_unscale', + ) found_inf = main_block.create_var( - name=unique_name.generate_with_ignorable_key(".".join( - ['find_infinite_scale', name])), + name=unique_name.generate_with_ignorable_key( + ".".join(['find_infinite_scale', name]) + ), shape=[1], dtype='bool', type=core.VarDesc.VarType.LOD_TENSOR, persistable=False, - stop_gradient=False) + stop_gradient=False, + ) set_var_dist_attr(dist_context, found_inf, [-1], world_process_group.ranks) inputs = {'X': grads, 'Scale': loss_scaling} outputs = {'Out': grads, 'FoundInfinite': found_inf} - attrs = {'op_role': OpRole.Backward} - new_op = main_block.append_op(type='check_finite_and_unscale', - inputs=inputs, - outputs=outputs, - attrs=attrs) + attrs = {'op_role': OpRole.Optimize} + new_op = main_block.append_op( + type='check_finite_and_unscale', + inputs=inputs, + outputs=outputs, + attrs=attrs, + ) new_op_dist_attr = OperatorDistributedAttribute() new_op_dist_attr.process_mesh = world_process_group.ranks @@ -456,10 +560,12 @@ def _check_and_update_gradient(grads, loss_scaling, name, dist_context): for g in grads: g_dist_attr = dist_context.get_tensor_dist_attr_for_program(g) assert g_dist_attr is not None - new_op_dist_attr.set_input_dims_mapping(g.name, - g_dist_attr.dims_mapping) - new_op_dist_attr.set_output_dims_mapping(g.name, - g_dist_attr.dims_mapping) + new_op_dist_attr.set_input_dims_mapping( + g.name, g_dist_attr.dims_mapping + ) + 
new_op_dist_attr.set_output_dims_mapping( + g.name, g_dist_attr.dims_mapping + ) dist_context.set_op_dist_attr_for_program(new_op, new_op_dist_attr) return grads, found_inf @@ -468,8 +574,9 @@ def _split_grads(params_grads): grads = [g for _, g in params_grads] fp32_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP32] fp16_grads = [g for g in grads if g.dtype == core.VarDesc.VarType.FP16] - assert len(fp32_grads) + len(fp16_grads) == len(grads), \ - "Data types of all grads must be either fp16 or fp32." + assert len(fp32_grads) + len(fp16_grads) == len( + grads + ), "Data types of all grads must be either fp16 or fp32." return grads, fp32_grads, fp16_grads @@ -481,37 +588,45 @@ def _set_op_dist_attr_with_ranks(new_op, ranks, block, dist_context): var = block.var(var_name) var_dist_attr = dist_context.get_tensor_dist_attr_for_program(var) assert var_dist_attr is not None - new_op_dist_attr.set_input_dims_mapping(var_name, - var_dist_attr.dims_mapping) + new_op_dist_attr.set_input_dims_mapping( + var_name, var_dist_attr.dims_mapping + ) for var_name in new_op.output_arg_names: var = block.var(var_name) var_dist_attr = dist_context.get_tensor_dist_attr_for_program(var) assert var_dist_attr is not None - new_op_dist_attr.set_output_dims_mapping(var_name, - var_dist_attr.dims_mapping) + new_op_dist_attr.set_output_dims_mapping( + var_name, var_dist_attr.dims_mapping + ) dist_context.set_op_dist_attr_for_program(new_op, new_op_dist_attr) def _get_memcopy_idx(block, found_inf_var): # use reduce_any op for check_nan_inf as the anchor for now for idx, op in enumerate(block.ops): - if op.type == 'reduce_any' and op.output_arg_names[ - 0] == found_inf_var.name: + if ( + op.type == 'reduce_any' + and op.output_arg_names[0] == found_inf_var.name + ): return idx + 1 raise RuntimeError( - "not found the correct location for memcopy for found_inf_var.") + "not found the correct location for memcopy for found_inf_var." 
+ ) def _insert_memcopy(block, idx, src_var, dist_context, direction="D2H"): src_name = src_var.name - output_var = block.create_var(name=unique_name.generate_with_ignorable_key( - src_name.join(['memcopy_'])), - dtype=src_var.dtype, - shape=src_var.shape, - type=core.VarDesc.VarType.LOD_TENSOR, - persistable=False, - stop_gradient=src_var.stop_gradient) + output_var = block.create_var( + name=unique_name.generate_with_ignorable_key( + src_name.join(['memcopy_']) + ), + dtype=src_var.dtype, + shape=src_var.shape, + type=core.VarDesc.VarType.LOD_TENSOR, + persistable=False, + stop_gradient=src_var.stop_gradient, + ) set_var_dist_attr(dist_context, output_var, [-1], world_process_group.ranks) @@ -522,23 +637,62 @@ def _insert_memcopy(block, idx, src_var, dist_context, direction="D2H"): dst_place_type = 1 else: raise NotImplementedError( - "direction [{}] is not supported yet.".format(direction)) + "direction [{}] is not supported yet.".format(direction) + ) attrs = {'dst_place_type': dst_place_type} - new_op = block._insert_op_without_sync(index=idx, - type='memcpy', - inputs={'X': [src_var]}, - outputs={'Out': [output_var]}, - attrs=attrs) - _set_op_dist_attr_with_ranks(new_op, world_process_group.ranks, block, - dist_context) + new_op = block._insert_op_without_sync( + index=idx, + type='memcpy', + inputs={'X': [src_var]}, + outputs={'Out': [output_var]}, + attrs=attrs, + ) + _set_op_dist_attr_with_ranks( + new_op, world_process_group.ranks, block, dist_context + ) block._sync_with_cpp() return output_var +def cast_startup_program(): + main_program = default_main_program() + startup_program = default_startup_program() + + param_to_dtype = {} + for block in main_program.blocks: + for p in block.all_parameters(): + param_to_dtype[p.name] = p.dtype + + def is_initialization_op(op): + comm_op_prefix = "c_" + op_type = op.type + if op_type.startswith(comm_op_prefix): + return False + + if len(op.output_arg_names) != 1 and len(op.input_arg_names) != 0: + return False + + return True + + for op in startup_program.global_block().ops: + if is_initialization_op(op): + output_name = op.output_arg_names[0] + if ( + param_to_dtype.get(output_name, None) + == core.VarDesc.VarType.FP16 + ): + assert op.has_attr( + 'dtype' + ), "initialization op is supported to has dtype attribute but got {}.".format( + str(op) + ) + if op.attr('dtype') == core.VarDesc.VarType.FP32: + op._set_attr('dtype', core.VarDesc.VarType.FP16) + + @register_pass("auto_parallel_fp16") class FP16Pass(AMPPass): - def __init__(self): super(FP16Pass, self).__init__() @@ -551,18 +705,26 @@ def _apply_single_impl(self, main_program, startup_program, context): amp_list = AutoMixedPrecisionLists( set(self.get_attr("custom_white_list")), - set(self.get_attr("custom_black_list")), None) + set(self.get_attr("custom_black_list")), + None, + ) # NOTE don't not change input data dtype, since it is controled by dataloader # and which is out of control of FP16 Pass input_data_var_names = [var.name for var in self.get_attr("input_data")] with paddle.static.program_guard(main_program, startup_program): - fp16_state = FP16State(main_program, amp_list, self.dist_context, - self.get_attr("use_fp16_guard"), - input_data_var_names) + fp16_state = FP16State( + main_program, + amp_list, + self.dist_context, + self.get_attr("use_fp16_guard"), + input_data_var_names, + ) is_train = fp16_state._build_state() + cast_startup_program() + if is_train: with paddle.static.program_guard(main_program, startup_program): # TODO (JZ-LIANG)support cast forward program 
only when inference @@ -571,44 +733,66 @@ def _apply_single_impl(self, main_program, startup_program, context): grads, fp32_grads, fp16_grads = _split_grads(params_grads) - if self.get_attr("use_dynamic_loss_scaling" - ) or self.get_attr("init_loss_scaling") != 1.0: + if ( + self.get_attr("use_dynamic_loss_scaling") + or self.get_attr("init_loss_scaling") != 1.0 + ): found_infs = [] if fp32_grads: - with main_program._backward_role_guard(): + with main_program._optimized_guard([]): _, found_inf_fp32 = _check_and_update_gradient( - fp32_grads, self._loss_scaling, "@fp32", - self.dist_context) + fp32_grads, + self._loss_scaling, + "@fp32", + self.dist_context, + ) found_infs.append(found_inf_fp32) if fp16_grads: - with main_program._backward_role_guard(): + with main_program._optimized_guard([]): _, found_inf_fp16 = _check_and_update_gradient( - fp16_grads, self._loss_scaling, "@fp16", - self.dist_context) + fp16_grads, + self._loss_scaling, + "@fp16", + self.dist_context, + ) found_infs.append(found_inf_fp16) - with main_program._backward_role_guard(): + with main_program._optimized_guard([]): block = main_program.global_block() all_infs = paddle.fluid.layers.concat(found_infs) - set_var_dist_attr(self.dist_context, all_infs, [-1], - world_process_group.ranks) + set_var_dist_attr( + self.dist_context, + all_infs, + [-1], + world_process_group.ranks, + ) new_op = block.ops[-1] assert new_op.type == "concat" - _set_op_dist_attr_with_ranks(new_op, - world_process_group.ranks, - block, self.dist_context) + _set_op_dist_attr_with_ranks( + new_op, + world_process_group.ranks, + block, + self.dist_context, + ) found_inf = paddle.fluid.layers.reduce_any(all_infs) - set_var_dist_attr(self.dist_context, found_inf, [-1], - world_process_group.ranks) + set_var_dist_attr( + self.dist_context, + found_inf, + [-1], + world_process_group.ranks, + ) new_op = block.ops[-1] assert new_op.type == "reduce_any" - _set_op_dist_attr_with_ranks(new_op, - world_process_group.ranks, - block, self.dist_context) + _set_op_dist_attr_with_ranks( + new_op, + world_process_group.ranks, + block, + self.dist_context, + ) if self.get_attr("use_dynamic_loss_scaling"): - with main_program._backward_role_guard(): + with main_program._optimized_guard([]): if fp32_grads: self._update_loss_scaling(fp32_grads, found_inf) if fp16_grads: @@ -620,14 +804,15 @@ def _apply_single_impl(self, main_program, startup_program, context): if self.get_attr("use_optimizer_fp16"): base_opt._multi_precision = False if isinstance( - base_opt, - (paddle.fluid.optimizer.Adam, paddle.optimizer.AdamW)): + base_opt, (paddle.fluid.optimizer.Adam, paddle.optimizer.AdamW) + ): with main_program._optimized_guard([]): # found_inf = paddle.tensor.creation._memcpy( # found_inf, paddle.CPUPlace()) insert_idx = _get_memcopy_idx(block, found_inf) - found_inf = _insert_memcopy(block, insert_idx, found_inf, - self.dist_context) + found_inf = _insert_memcopy( + block, insert_idx, found_inf, self.dist_context + ) base_opt._set_auxiliary_var('found_inf', found_inf.name) elif hasattr(base_opt, "_set_auxiliary_var"): base_opt._set_auxiliary_var('found_inf', found_inf.name) diff --git a/python/paddle/distributed/passes/auto_parallel_grad_clip.py b/python/paddle/distributed/passes/auto_parallel_grad_clip.py index 6fba98ce752079..34c0b7d56a0381 100644 --- a/python/paddle/distributed/passes/auto_parallel_grad_clip.py +++ b/python/paddle/distributed/passes/auto_parallel_grad_clip.py @@ -207,12 +207,15 @@ def __init__(self): super(ClipGradByGloblNormPass, self).__init__() 
self.set_attr("rank_id", None) self.set_attr("dist_context", None) + self.set_attr("params_grads", None) def _check_self(self): if self.get_attr("dist_context") is None: return False dist_context = self.get_attr("dist_context") - if dist_context._lr_optimizer._grad_clip is None: + if dist_context._serial_optimizer._grad_clip is None: + return False + if self.get_attr("params_grads") is None: return False return True @@ -223,7 +226,8 @@ def _apply_single_impl(self, main_program, startup_program, context): dist_context = self.get_attr("dist_context", None) rank_id = self.get_attr("rank_id", None) block = main_program.global_block() - dist_params_grads = _get_params_grads(block) + dist_params_grads = self.get_attr("params_grads", None) + # dist_params_grads = _get_params_grads(block) self.clip_helper = ClipHelper(dist_params_grads, rank_id, block, dist_context) diff --git a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py index 717f8fa27f2df0..c61d944400d665 100644 --- a/python/paddle/distributed/passes/auto_parallel_gradient_merge.py +++ b/python/paddle/distributed/passes/auto_parallel_gradient_merge.py @@ -55,13 +55,6 @@ def _remove_and_get_optimizer_op(main_program, dist_context): return optimize_ops_desc -def _remove_op_role_var(param, grad): - op_maker = core.op_proto_and_checker_maker - op = grad.op - if op and op.has_attr(op_maker.kOpRoleVarAttrName()): - op._remove_attr(op_maker.kOpRoleVarAttrName()) - - def _get_gm_cond_var(main_program, k_steps, dist_context): main_block = main_program.global_block() # Add const var @@ -147,8 +140,6 @@ def _append_gradient_merge_backward_op( param.type != core.VarDesc.VarType.SELECTED_ROWS ), "SELECTED_ROWS is not supported in GradientMergeOptimizer for now" - _remove_op_role_var(param, grad) - # {grad.name: gradient_merge_var.name} to rename opt inputs grad_to_gradient_merge = {} # {param: gradient_merge_var} to insert scale op and fill_constant op diff --git a/python/paddle/distributed/passes/auto_parallel_pipeline.py b/python/paddle/distributed/passes/auto_parallel_pipeline.py new file mode 100644 index 00000000000000..e5a97f75acc6c8 --- /dev/null +++ b/python/paddle/distributed/passes/auto_parallel_pipeline.py @@ -0,0 +1,672 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from logging import exception +import os + +from paddle.fluid import core +from .pass_base import PassBase, register_pass +from paddle.fluid.framework import Program, Parameter +from paddle.distributed.fleet.fleet_executor_utils import TaskNode +from paddle.distributed.fleet.meta_optimizers.common import OpRole + +from paddle.distributed.auto_parallel.utils import ( + is_forward_op, + is_backward_op, + is_optimize_op, + is_lr_sched_op, +) + + +__not_shape_var_type__ = [ + core.VarDesc.VarType.READER, + core.VarDesc.VarType.STEP_SCOPES, + core.VarDesc.VarType.LOD_TENSOR_ARRAY, + core.VarDesc.VarType.FEED_MINIBATCH, + core.VarDesc.VarType.FETCH_LIST, +] + + +@register_pass("auto_parallel_pipeline") +class PipelinePass(PassBase): + def __init__(self): + super(PipelinePass, self).__init__() + self.set_attr("dist_context", None) + + def _check_self(self): + if self.get_attr("dist_context") is None: + return False + return True + + def _check_conflict(self, other_pass): + return True + + def _apply_single_impl(self, main_program, startup_program, context): + self._dist_context = self.get_attr("dist_context") + self._acc_steps = self.get_attr("accumulate_steps") + self._mode = self.get_attr("schedule_mode") + self._gen_bsz = self.get_attr("generation_batch_size") + self._program = main_program + + if self._mode == "1F1B": + raise NotImplementedError("1F1B has not been implemented") + elif self._mode == "F-Then-B": + raise NotImplementedError("F-Then-B has not been implemented") + elif self._mode == "stream": + self._insert_sync_ops_for_stream() + self._task_stream() + else: + raise ValueError( + "Now only 'F-then-B', '1F1B' and 'stream' are supported." + "The given value is {}.".format(self._mode) + ) + + def _insert_sync_ops_for_stream(self): + + for block in self._program.blocks: + offset = 0 + send_vars = [] + # insert sync ops + for index, op in enumerate(list(block.ops)): + if op.type == 'send_v2': + # step1: set 'use_calc_stream' False + op._set_attr("use_calc_stream", False) + op_role = op.attr('op_role') + # step2: insert 'c_sync_calc_stream' op before 'send_v2' op + var_name = op.input_arg_names[0] + var = block.var(var_name) + block._insert_op_without_sync( + index=index + offset, + type="c_sync_calc_stream", + inputs={'X': [var]}, + outputs={'Out': [var]}, + attrs={'op_role': op_role}, + ) + offset += 1 + send_vars.append(var_name) + + for var_name in send_vars: + nop_op = block.append_op(type='nop') + nop_op.desc.set_input('X', [var_name]) + nop_op.desc.set_output('Out', [var_name]) + + block._sync_with_cpp() + + def _create_param(self, dst_block, src_var): + copied_kwargs = {} + copied_kwargs['trainable'] = src_var.trainable + copied_kwargs['optimize_attr'] = src_var.optimize_attr + copied_kwargs['regularizer'] = src_var.regularizer + copied_kwargs['do_model_average'] = src_var.do_model_average + copied_kwargs['need_clip'] = src_var.need_clip + + Parameter( + block=dst_block, + type=src_var.type, + name=src_var.name, + shape=src_var.shape, + dtype=src_var.dtype, + lod_level=src_var.lod_level, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + **copied_kwargs + ) + + def _create_inter(self, dst_block, src_var): + dst_block.create_var( + type=src_var.type, + name=src_var.name, + shape=src_var.shape, + dtype=src_var.dtype, + lod_level=src_var.lod_level, + persistable=src_var.persistable, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, 
+ belong_to_optimizer=src_var.belong_to_optimizer, + ) + + def _create_var( + self, src_block, dst_block, src_varname, force_create=False + ): + + if not force_create: + src_var = src_block.var(src_varname) + else: + src_var = src_block._var_recursive(src_varname) + if src_var.type in __not_shape_var_type__: + persist = getattr(src_var, 'persistable', False) + dst_block.create_var( + type=src_var.type, + name=src_var.name, + persistable=persist, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + ) + else: + if isinstance(src_var, Parameter): + self._create_param(dst_block, src_var) + else: + self._create_inter(dst_block, src_var) + + def _create_program(self, src_block, dst_block, src_op, force_create=False): + dst_op_desc = dst_block.desc.append_op() + dst_op_desc.copy_from(src_op.desc) + for input_varname in src_op.input_arg_names: + if src_block.has_var(input_varname) or ( + force_create and src_block._find_var_recursive(input_varname) + ): + self._create_var( + src_block, dst_block, input_varname, force_create + ) + for output_varname in src_op.output_arg_names: + if src_block.has_var(output_varname) or ( + force_create and src_block._find_var_recursive(output_varname) + ): + self._create_var( + src_block, dst_block, output_varname, force_create + ) + + def _get_pp_stage(self, rank): + pp_idx = None + for idx, process_mesh in enumerate(self._dist_context.process_meshes): + if rank in process_mesh.processes: + pp_idx = idx + break + return pp_idx + + def _task_stream(self): + cur_rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) + trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(',') + nrank = len(trainer_endpoints) + num_of_functionality = 5 + + # compute current pp stage + pp_stages = len(self._dist_context.process_meshes) + cur_pp_stage = self._get_pp_stage(cur_rank) + + print("pp_stages:", pp_stages) + print("cur_rank:", cur_rank) + print("cur_pp_stage:", cur_pp_stage) + print("process_meshes:", self._dist_context.process_meshes) + for process_mesh in self._dist_context.process_meshes: + print("--> processes:", process_mesh.processes) + + start_prog = Program() + cond_prog = Program() + end_prog = Program() + send_prog = Program() + recv_prog = Program() + + # print("=" * 20) + # print("src_prog:") + # print(self._program) + + cond_var_name = None + send_vars_name = set() + recv_vars_name = dict() + for ib, src_block in enumerate(self._program.blocks): + if ib == 0: + strat_block = start_prog.block(0) + end_block = end_prog.block(0) + + is_after_while_op = False + for op in src_block.ops: + if op.type == "while": + assert len(op.input('Condition')) == 1 + cond_var_name = op.input('Condition')[0] + is_after_while_op = True + continue + + if not is_after_while_op: + self._create_program( + src_block, strat_block, op, force_create=True + ) + else: + self._create_program( + src_block, end_block, op, force_create=True + ) + elif ib == 1: + send_block = send_prog.block(0) + recv_block = recv_prog.block(0) + + is_after_send_op = False + is_after_recv_op = False + for op in src_block.ops: + if op.type == "send_v2" and not is_after_send_op: + is_after_send_op = True + if cur_pp_stage == pp_stages - 1: + if op.type in ["c_sync_calc_stream", "nop"]: + continue + if ( + op.type not in ["recv_2", "assign"] + and op.has_attr('op_namescope') + and "/auto_parallel/reshard" + in op.attr('op_namescope') + ): + if ( + len(op.desc.input_arg_names()) > 0 + and "@RESHARD" + not in 
op.desc.input_arg_names()[0] + ): + send_vars_name.add( + op.desc.input_arg_names()[0] + ) + continue + if op.type == "send_v2": + continue + self._create_program( + src_block, send_block, op, force_create=True + ) + continue + + if ( + is_after_send_op + and not is_after_recv_op + and op.type == "recv_v2" + ): + is_after_recv_op = True + if op.has_attr( + 'op_namescope' + ) and "/auto_parallel/reshard" in op.attr( + 'op_namescope' + ): + var_name = op.desc.output_arg_names()[0] + index = var_name.find("@") + if index > 0: + old_var_name = var_name[:index] + else: + old_var_name = var_name + recv_vars_name[var_name] = old_var_name + if not src_block._find_var_recursive(old_var_name): + src_var = src_block._var_recursive(var_name) + recv_block.create_var( + type=src_var.type, + name=old_var_name, + shape=src_var.shape, + dtype=src_var.dtype, + lod_level=src_var.lod_level, + persistable=src_var.persistable, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + ) + continue + + self._create_program( + src_block, recv_block, op, force_create=True + ) + continue + + if not is_after_send_op or not is_after_recv_op: + if cur_pp_stage == pp_stages - 1: + if op.type in ["c_sync_calc_stream", "nop"]: + continue + if ( + op.type not in ["recv_2", "assign"] + and op.has_attr('op_namescope') + and "/auto_parallel/reshard" + in op.attr('op_namescope') + ): + if ( + len(op.desc.input_arg_names()) > 0 + and "@RESHARD" + not in op.desc.input_arg_names()[0] + ): + send_vars_name.add( + op.desc.input_arg_names()[0] + ) + continue + if op.type == "send_v2": + continue + self._create_program( + src_block, send_block, op, force_create=True + ) + + if is_after_send_op and is_after_recv_op: + if op.has_attr( + 'op_namescope' + ) and "/auto_parallel/reshard" in op.attr( + 'op_namescope' + ): + var_name = op.desc.output_arg_names()[0] + index = var_name.find("@") + if index > 0: + old_var_name = var_name[:index] + else: + old_var_name = var_name + recv_vars_name[var_name] = old_var_name + if not src_block._find_var_recursive(old_var_name): + src_var = src_block._var_recursive(var_name) + recv_block.create_var( + type=src_var.type, + name=old_var_name, + shape=src_var.shape, + dtype=src_var.dtype, + lod_level=src_var.lod_level, + persistable=src_var.persistable, + error_clip=src_var.error_clip, + stop_gradient=src_var.stop_gradient, + is_data=src_var.is_data, + belong_to_optimizer=src_var.belong_to_optimizer, + ) + continue + + for in_name in op.desc.input_arg_names(): + if in_name in recv_vars_name: + op.desc._rename_input( + in_name, recv_vars_name[in_name] + ) + self._create_program( + src_block, recv_block, op, force_create=True + ) + else: + raise Exception("Only support generation condition.") + + start_prog._sync_with_cpp() + end_prog._sync_with_cpp() + send_prog._sync_with_cpp() + recv_prog._sync_with_cpp() + + print("send_vars_name:", list(send_vars_name)) + print("recv_vars_name:", list(set(recv_vars_name.values()))) + + # print("=" * 20) + # print("start_prog:") + # print(start_prog) + + # print("=" * 20) + # print("cond_prog:") + # print(cond_prog) + + # print("=" * 20) + # print("send_prog:") + # print(send_prog) + + # print("=" * 20) + # print("recv_prog:") + # print(recv_prog) + + # print("=" * 20) + # print("end_prog:") + # print(end_prog) + + assert cond_var_name is not None + + send_task_node_var_dtype = dict() + send_task_node_var_shape = dict() + recv_task_node_var_dtype = dict() + 
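# --- Illustrative sketch (not part of this patch). It isolates the renaming
# trick used above when recording recv_vars_name: everything from the first '@'
# onward is dropped, so the receiving block re-creates the tensor under its
# original name. The example variable names are made up.
def _example_strip_suffix(var_name):
    index = var_name.find("@")
    return var_name[:index] if index > 0 else var_name

print(_example_strip_suffix("embedding_0.tmp_0@RESHARD_0"))  # embedding_0.tmp_0
print(_example_strip_suffix("hidden_1.tmp_2"))               # hidden_1.tmp_2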
recv_task_node_var_shape = dict() + for var_name in list(send_vars_name): + var = send_prog.global_block().vars[var_name] + dtype = str(var.dtype) + send_task_node_var_dtype[var_name] = dtype[ + dtype.find("paddle.") + len("paddle.") : + ] + send_task_node_var_shape[var_name] = var.shape + for var_name in list(list(set(recv_vars_name.values()))): + var = recv_prog.global_block().vars[var_name] + dtype = str(var.dtype) + recv_task_node_var_dtype[var_name] = dtype[ + dtype.find("paddle.") + len("paddle.") : + ] + recv_task_node_var_shape[var_name] = var.shape + + vars_to_dtype = [] + vars_to_shape = [] + if len(send_task_node_var_dtype) > 0: + assert len(recv_task_node_var_dtype) == 0 + vars_to_dtype = send_task_node_var_dtype + vars_to_shape = send_task_node_var_shape + if len(recv_task_node_var_dtype) > 0: + assert len(send_task_node_var_dtype) == 0 + vars_to_dtype = recv_task_node_var_dtype + vars_to_shape = recv_task_node_var_shape + + print("vars_to_dtype:", vars_to_dtype) + print("vars_to_shape:", vars_to_shape) + + start_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Start", + task_id=int(cur_rank * num_of_functionality + 0), + program=start_prog, + lazy_initialize=True, + ) + cond_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Cond", + task_id=int(cur_rank * num_of_functionality + 1), + program=cond_prog, + cond_var_name=cond_var_name, + lazy_initialize=True, + ) + send_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Compute", + task_id=int(cur_rank * num_of_functionality + 2), + program=send_prog, + lazy_initialize=True, + ) + recv_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Compute", + task_id=int(cur_rank * num_of_functionality + 3), + program=recv_prog, + lazy_initialize=True, + vars_to_dtype=vars_to_dtype, + vars_to_shape=vars_to_shape, + ) + end_task_node = TaskNode( + rank=cur_rank, + max_run_times=self._acc_steps, + node_type="Compute", + task_id=int(cur_rank * num_of_functionality + 4), + program=end_prog, + lazy_initialize=True, + ) + + # add dependencies for task nodes intra stage + inf = -1 + pp_buff_size = int(pp_stages - cur_pp_stage) + start_task_node.add_downstream_task( + cond_task_node.task_id(), self._gen_bsz + ) + print( + "Task ", + start_task_node.task_id(), + "'s downstream is:", + cond_task_node.task_id(), + ", buffer size is:", + self._gen_bsz, + ) + cond_task_node.add_upstream_task( + start_task_node.task_id(), self._gen_bsz + ) + print( + "Task ", + cond_task_node.task_id(), + "'s upstream is:", + start_task_node.task_id(), + ", buffer size is:", + self._gen_bsz, + ) + cond_task_node.add_downstream_task(send_task_node.task_id(), inf) + print( + "Task ", + cond_task_node.task_id(), + "'s downstream is:", + send_task_node.task_id(), + ", buffer size is:", + inf, + ) + send_task_node.add_upstream_task(cond_task_node.task_id(), inf) + print( + "Task ", + send_task_node.task_id(), + "'s upstream is:", + cond_task_node.task_id(), + ", buffer size is:", + inf, + ) + send_task_node.add_downstream_task( + recv_task_node.task_id(), pp_buff_size + ) + print( + "Task ", + send_task_node.task_id(), + "'s downstream is:", + recv_task_node.task_id(), + ", buffer size is:", + pp_buff_size, + ) + recv_task_node.add_upstream_task(send_task_node.task_id(), pp_buff_size) + print( + "Task ", + recv_task_node.task_id(), + "'s upstream is:", + send_task_node.task_id(), + ", buffer size is:", + pp_buff_size, + ) + 
recv_task_node.add_downstream_task( + cond_task_node.task_id(), inf, core.DependType.LOOP + ) + print( + "Task ", + recv_task_node.task_id(), + "'s downstream is:", + cond_task_node.task_id(), + ", buffer size is:", + inf, + ) + cond_task_node.add_upstream_task( + recv_task_node.task_id(), inf, core.DependType.LOOP + ) + print( + "Task ", + cond_task_node.task_id(), + "'s upstream is:", + recv_task_node.task_id(), + ", buffer size is:", + inf, + ) + cond_task_node.add_downstream_task( + end_task_node.task_id(), inf, core.DependType.STOP_LOOP + ) + print( + "Task ", + cond_task_node.task_id(), + "'s downstream is:", + end_task_node.task_id(), + ", buffer size is:", + inf, + ) + end_task_node.add_upstream_task( + cond_task_node.task_id(), inf, core.DependType.STOP_LOOP + ) + print( + "Task ", + end_task_node.task_id(), + "'s upstream is:", + cond_task_node.task_id(), + ", buffer size is:", + inf, + ) + + # add dependencies for task nodes inter stage + # get upstream ranks and downstream ranks of cur_rank + up_down_streams = self._dist_context.up_down_streams + pp_upstream_ranks = up_down_streams.ups(cur_rank) + pp_downstream_ranks = up_down_streams.downs(cur_rank) + + for upstream_rank in pp_upstream_ranks: + upstream_pp_stage = self._get_pp_stage(upstream_rank) + if upstream_pp_stage < pp_stages - 1: + upstream_task_id = int(upstream_rank * num_of_functionality + 2) + send_task_node.add_upstream_task(upstream_task_id) + print( + "Task ", + send_task_node.task_id(), + "'s upstream is:", + upstream_task_id, + ", buffer size is:", + 2, + ) + else: + upstream_task_id = int(upstream_rank * num_of_functionality + 3) + recv_task_node.add_upstream_task(upstream_task_id) + print( + "Task ", + recv_task_node.task_id(), + "'s upstream is:", + upstream_task_id, + ", buffer size is:", + 2, + ) + for downstream_rank in pp_downstream_ranks: + if cur_pp_stage < pp_stages - 1: + downstream_task_id = int( + downstream_rank * num_of_functionality + 2 + ) + send_task_node.add_downstream_task(downstream_task_id) + print( + "Task ", + send_task_node.task_id(), + "'s downstream is:", + downstream_task_id, + ", buffer size is:", + 2, + ) + else: + downstream_task_id = int( + downstream_rank * num_of_functionality + 3 + ) + recv_task_node.add_downstream_task(downstream_task_id) + print( + "Task ", + recv_task_node.task_id(), + "'s downstream is:", + downstream_task_id, + ", buffer size is:", + 2, + ) + + task_id_to_rank = {} + for i in range(nrank): + for j in range(num_of_functionality): + task_id_to_rank[int(i * num_of_functionality + j)] = i + self._program._pipeline_opt = { + "fleet_opt": { + 'tasks': [ + start_task_node, + cond_task_node, + send_task_node, + recv_task_node, + end_task_node, + ], + 'task_id_to_rank': task_id_to_rank, + 'num_micro_batches': self._acc_steps, + 'inference_generation': True, + } + } diff --git a/python/paddle/distributed/passes/auto_parallel_recompute.py b/python/paddle/distributed/passes/auto_parallel_recompute.py index 80edec82fd7de2..a9c83a98c19fcb 100644 --- a/python/paddle/distributed/passes/auto_parallel_recompute.py +++ b/python/paddle/distributed/passes/auto_parallel_recompute.py @@ -151,13 +151,14 @@ def modify_forward_desc_for_recompute(self, dist_context): # modify dropout op's desc self._ops.insert(op_idx, seed_op) cur_op.desc.set_input("Seed", [var_unique_name]) - cur_op.desc.remove_attr("fix_seed") - cur_op.desc.remove_attr("seed") + cur_op._remove_attr("fix_seed") + cur_op._remove_attr("seed") cur_op_dist_attr.set_input_dist_attr(seed_var.name, seed_var_dist_attr) - 
self._block._sync_with_cpp() op_idx += 2 + self._block._sync_with_cpp() + def _find_op_index(block, cur_op): for idx in range(block.desc.op_size()): @@ -339,12 +340,13 @@ def _apply_single_impl(self, main_program, startup_program, context): grad_op = ops[i] # remove some attrs of dropout_grad op's desc if grad_op.type == "dropout_grad": - grad_op.desc.remove_attr("fix_seed") - grad_op.desc.remove_attr("seed") - main_block._sync_with_cpp() + grad_op._remove_attr("fix_seed") + grad_op._remove_attr("seed") # rename grad op's var_name which is not in 'vars_in_memory' for key in var_name_dict: + if key not in grad_op.input_arg_names + grad_op.output_arg_names: + continue self.reset_op_dist_attr(grad_op, var_name_dict) _rename_arg_([grad_op.desc], key, var_name_dict[key]) @@ -358,11 +360,11 @@ def _apply_single_impl(self, main_program, startup_program, context): idx -= 1 segment_descs = ckpt_ops_dict[fwd_op_id][1] for _, op_desc in reversed(list(enumerate(segment_descs))): - rc_desc = main_block.desc._insert_op(idx) + rc_op = main_block._insert_op_without_sync(idx, + type='nop') + rc_desc = rc_op.desc rc_desc.copy_from(op_desc) rc_desc.set_original_id(rc_desc.id()) - rc_op = Operator(main_block, rc_desc) - main_block.ops.insert(idx, rc_op) # set recomputed ops' dist attr fwd_op_dist_attr = self._dist_context.get_op_dist_attr_for_program_with_id( op_desc.original_id()) @@ -371,7 +373,6 @@ def _apply_single_impl(self, main_program, startup_program, context): var_name_dict) ckpt_ops_dict[fwd_op_id][0] = False - main_block._sync_with_cpp() main_program._sync_with_cpp() diff --git a/python/paddle/distributed/passes/auto_parallel_sharding.py b/python/paddle/distributed/passes/auto_parallel_sharding.py index e414a235b59565..636b3218c8a0b5 100644 --- a/python/paddle/distributed/passes/auto_parallel_sharding.py +++ b/python/paddle/distributed/passes/auto_parallel_sharding.py @@ -22,7 +22,7 @@ from .pass_base import PassBase, register_pass from paddle.distributed.fleet.meta_optimizers.common import is_backward_op, is_optimizer_op from paddle.distributed.auto_parallel.process_group import new_process_group -from paddle.distributed.auto_parallel.operators.common import is_parameter_related +from paddle.distributed.auto_parallel.operators.common import is_parameter_related, is_data_parallel_reduce_op from paddle.distributed.auto_parallel.utils import _get_comm_group, naive_set_dist_op_attr_for_program_by_mesh_and_mapping, set_var_dist_attr OpRole = core.op_proto_and_checker_maker.OpRole @@ -38,6 +38,11 @@ ] +def _is_reshard_op(op): + return op.desc.has_attr("op_namescope") and \ + "/auto_parallel/reshard" in op.desc.attr('op_namescope') + + # NOTE we add the "auto_parallel" prefix to the pass in order to # indicate that this pass should obey some constrains by auto_parallel # for example all ops and vars should has dist attr before and after pass @@ -49,7 +54,8 @@ def __init__(self): super(ShardingPass, self).__init__() self.set_attr("dist_context", None) self.set_attr("stage", None) - self.set_attr("sharding_degree", None) + self.set_attr("sharding_degree", None) # for parallelizer + self.set_attr("degree", None) # for parallelizer_v2 self.set_attr("params_grads", []) self.set_attr("global_rank", -1) self.dp_groups = set() @@ -57,6 +63,7 @@ def __init__(self): self.varname_to_sharding_info = {} self.partial_sharding = False self.outer_dp_group = None + self.shared_params_grads = [] def _check_self(self): if self.get_attr("dist_context") is None: @@ -64,8 +71,15 @@ def _check_self(self): if 
self.get_attr("stage") not in [1, 2, 3]: return False - if (not isinstance(self.get_attr("sharding_degree"), - int)) or self.get_attr("sharding_degree") <= 1: + if self.get_attr("sharding_degree") is not None: + if (not isinstance(self.get_attr("sharding_degree"), int)) \ + or self.get_attr("sharding_degree") <= 1: + return False + elif self.get_attr("degree") is not None: + if (not isinstance(self.get_attr("degree"), int)) \ + or self.get_attr("degree") <= 1: + return False + else: return False if len(self.get_attr("params_grads")) <= 0: return False @@ -80,7 +94,8 @@ def _check_conflict(self, other_pass): def _apply_single_impl(self, main_program, startup_program, context): self._dist_context = self.get_attr("dist_context") - self.sharding_world_size = int(self.get_attr("sharding_degree")) + self.sharding_world_size = int( + self.get_attr("sharding_degree") or self.get_attr("degree")) self.stage = int(self.get_attr("stage")) self.global_rank = int(self.get_attr("global_rank")) params_grads = self.get_attr("params_grads") @@ -92,6 +107,8 @@ def _apply_single_impl(self, main_program, startup_program, context): self._shard_gradient_synchronization(main_block) self._shard_parameter(main_block, startup_block) + context.set_attr("params_grads", self.shared_params_grads) + def _build_sharding_groups(self, main_block, params_grads): self._collective_data_parallel_groups(main_block) self._build_sharding_infos(params_grads) @@ -100,6 +117,10 @@ def _collective_data_parallel_groups(self, main_block): for op in main_block.ops: if not _is_forward_op(op) or op.type in _skip_ops: continue + # NOTE: there aren't dist_attr in the ops which reshard insert, + # and should be skip in sharding. + if _is_reshard_op(op): + continue group = _inference_data_parallel_group_for_operator( self.global_rank, op, self._dist_context) if group is not None: @@ -142,13 +163,10 @@ def _build_sharding_infos(self, params_grads): self._dist_context._sharding_group = sharding_group # TODO(JZ-LIANG) when support multiple dp groups in future, should group param and bind them to corresponding dp group - params_in_group = [p for p, g in params_grads] - assert len(params_in_group) == len( - set(params_in_group)), "found duplicated param in params_grads" sharding_info = ShardingInfo(sharding_group, self.global_rank, - params_in_group) + params_grads) self.sharding_infos.append(sharding_info) - for param in params_in_group: + for param in sharding_info.params: self.varname_to_sharding_info[param.name] = sharding_info def _shard_optimizer(self, main_block, startup_block, params_grads, @@ -187,8 +205,30 @@ def _shard_amp_related_op_and_vars(self, main_block, pass_context): if self._is_parameter_in_local_shard(param_name): reversed_x.append(input_name) - op.desc.set_input('X', reversed_x) - op.desc.set_output('Out', reversed_x) + + # NOTE: When `reversed_x` is [], check_finite_and_unscale will be replaced by `fill_constant` op. 
+ # The output of check_finite_and_unscale is be set False + if reversed_x: + op.desc.set_input('X', reversed_x) + op.desc.set_output('Out', reversed_x) + else: + if op.type == "check_finite_and_unscale": + op_role = op.attr('op_role') + out_name = op.output_arg_names[0] + out_var = main_block.vars[out_name] + main_block._remove_op(idx, sync=False) + main_block._insert_op_without_sync( + idx, + type="fill_constant", + outputs={"Out": out_var}, + attrs={ + "shape": out_var.shape, + "dtype": out_var.dtype, + "value": 0, + OP_ROLE_KEY: op_role, + }) + else: + main_block._remove_op(idx, sync=False) main_block._sync_with_cpp() @@ -287,6 +327,9 @@ def _shard_optimizer_ops_and_states(self, main_block, startup_block): if varname != param_name ]) main_block._remove_op(idx, sync=False) + else: + self.shared_params_grads.append( + self._get_param_grad(param_name)) for idx, op in reversed(list(enumerate(startup_block.ops))): if len(op.output_arg_names) == 1 and op.output_arg_names[ @@ -339,6 +382,13 @@ def _is_parameter_in_local_shard(self, param_name): sharding_info = self.varname_to_sharding_info[param_name] return sharding_info.is_in_local_shard(param_name) + def _get_param_grad(self, param_name): + assert param_name in self.varname_to_sharding_info + sharding_info = self.varname_to_sharding_info[param_name] + p_g = sharding_info.get_param_grad(param_name) + assert p_g is not None + return p_g + def _shard_gradient_synchronization(self, main_block): if self.stage < 2: @@ -346,7 +396,7 @@ def _shard_gradient_synchronization(self, main_block): dp_ring_ids = [group.id for group in self.dp_groups] for idx, op in reversed(list(enumerate(main_block.ops))): - if _is_param_grad_allreduce_op(op, main_block, dp_ring_ids): + if _is_param_grad_allreduce_op(op, main_block): input_name = op.input_arg_names[0] base_name = _get_base_name_from_grad_name(input_name) sharding_info = self.varname_to_sharding_info[base_name] @@ -354,11 +404,23 @@ def _shard_gradient_synchronization(self, main_block): sharding_info.group.id, sharding_info.get_var_rank(base_name), self._dist_context) - if not self.partial_sharding: + if not self.partial_sharding or not sharding_info.is_in_local_shard( + base_name): main_block._remove_op(idx + 1, sync=False) else: op._set_attr("ring_id", self.outer_dp_group.id) + # NOTE: + # var@GRAD = sum(var@GRAD@RENAME@0, var@GRAD@RENAME@1) + # If the var is not in local rank and it is output of many ops, or the var is renamed in another words, + # the sum op should be removed. 
+ if _is_param_grad_sum_op(op, main_block): + out_name = op.output_arg_names[0] + base_name = _get_base_name_from_grad_name(out_name) + sharding_info = self.varname_to_sharding_info[base_name] + if not sharding_info.is_in_local_shard(base_name): + main_block._remove_op(idx, sync=False) + main_block._sync_with_cpp() def _shard_parameter(self, main_block, startup_block): @@ -381,7 +443,10 @@ def _shard_parameter(self, main_block, startup_block): continue for input_name in op.desc.input_arg_names(): - if op.type == "cast": + # NOTE hack for embedding op when AMP 02-3 + # paddle amp force embedding (lookup table) to be run on fp32 + if _is_param_fp16_cast_op(main_block, op, + sharding_info.param_names): continue if input_name not in need_broadcast_vars: continue @@ -588,13 +653,25 @@ def _get_base_name_from_grad_name(grad_name): return base_name -def _is_param_grad_allreduce_op(op, block, dp_ring_ids): +def _is_param_grad_allreduce_op(op, block): - if not is_backward_op(op): + if not is_data_parallel_reduce_op(op): return False - if op.type != "c_allreduce_sum": + + output_name = op.output_arg_names[0] + base_name = _get_base_name_from_grad_name(output_name) + + if not block.has_var(base_name): return False - if op.attr('ring_id') not in dp_ring_ids: + + return block.var(base_name).is_parameter + + +def _is_param_grad_sum_op(op, block): + + if not is_backward_op(op): + return False + if op.type != "sum": return False output_name = op.output_arg_names[0] @@ -652,9 +729,13 @@ def shard_parameters(params, group_size): class ShardingInfo(object): - def __init__(self, group, rank, params): + def __init__(self, group, rank, params_grads): self.group = group - self.params = params + self.params_grads = dict([(p.name, (p, g)) for p, g in params_grads]) + assert len(self.params_grads) == len(set( + self.params_grads)), "found duplicated param in params_grads" + + self.params = [p for p, _ in params_grads] self.param_names = [p.name for p in self.params] self.group_size = group.nranks self.global_rank = rank @@ -678,9 +759,14 @@ def get_var_rank(self, varname): return self.param_to_rank[varname] return -1 + # determine fp32 and fp16 (cast) param def is_in_local_shard(self, param_name): return self.get_var_rank(param_name) == self.local_rank + # NOTE the follwo logic is designed for supporting AMP O1 when + # the param would be cast to fp16 before used for caculation. + # and sharding should only broadcast the casted fp16 param + # instead of the origin fp32 version param. 
def get_broadcast_vars_and_param_usage(self, block): broadcast_vars = set([]) fp16_params = set([]) @@ -709,3 +795,11 @@ def get_broadcast_vars_and_param_usage(self, block): if usage > 0: broadcast_vars.add(param) return broadcast_vars, param_usage + + def get_param_grad(self, param_name): + if not self.is_in_local_shard(param_name): + raise ValueError( + "param[{}] not in current rank.".format(param_name)) + if param_name not in self.params_grads: + raise ValueError('param[{}] not in params_grads'.format(param_name)) + return self.params_grads.get(param_name, None) diff --git a/python/paddle/distributed/passes/ps_trainer_pass.py b/python/paddle/distributed/passes/ps_trainer_pass.py index cdb377a72be02d..466730ae1a56f6 100755 --- a/python/paddle/distributed/passes/ps_trainer_pass.py +++ b/python/paddle/distributed/passes/ps_trainer_pass.py @@ -617,6 +617,8 @@ def _apply_single_impl(self, main_program, startup_program, pass_ctx): for var in remote_optimize_vars: if var in local_optimize_vars: continue + if 'learning_rate_0' == var: + continue if var not in remote_optimize_op_role_vars: optimize_need_delete_vars.append(var) need_delete_optimize_vars = list(set(optimize_need_delete_vars)) diff --git a/python/paddle/distributed/ps/the_one_ps.py b/python/paddle/distributed/ps/the_one_ps.py index 77a0ab0a6595b4..0ce5e70788e72b 100755 --- a/python/paddle/distributed/ps/the_one_ps.py +++ b/python/paddle/distributed/ps/the_one_ps.py @@ -192,6 +192,8 @@ def _set(self, accessor_proto, varname, program_id, context, sgd_param.name = "SparseNaiveSGDRule" if common_accessor.accessor_class == "adam": sgd_param.name = "SparseAdamSGDRule" + else: # for fl-ps, because geo accessor is 'sum' + sgd_param.name = "SparseAdamSGDRule" if sgd_param.name == "SparseAdaGradSGDRule" or sgd_param.name == "StdAdaGradSGDRule": if not sgd_param.adagrad.HasField("learning_rate"): @@ -1090,8 +1092,9 @@ def sync_strategy_envs(): print("communicator config:", trainer_config.get_communicator_flags()) self._worker.init_worker(worker_desc, self.string_hosts, self.role_id) - self.trainer_endpoint = get_trainer_endpoint(self.role_maker) - print("fl-ps > trainer_endpoint: {}".format(self.trainer_endpoint)) + if not self.is_heter_ps_mode: + self.trainer_endpoint = get_trainer_endpoint(self.role_maker) + print("fl-ps > trainer_endpoint: {}".format(self.trainer_endpoint)) print("fl-ps > with_coordinator? 
{}".format(self.with_coordinator)) print("fl-ps > coordinator addr: {}".format(self.coordinator_hosts)) if self.with_coordinator: diff --git a/python/paddle/distributed/ps/utils/ps_program_builder.py b/python/paddle/distributed/ps/utils/ps_program_builder.py index 53771b05cbf671..0bd870ffee5d94 100755 --- a/python/paddle/distributed/ps/utils/ps_program_builder.py +++ b/python/paddle/distributed/ps/utils/ps_program_builder.py @@ -373,8 +373,8 @@ def _build_trainer_programs(self): _main_file = ps_log_root_dir + '4_fl_worker_main_program.prototxt' #debug_program(_main_file, self.cloned_main) - fake_init_ops_pass = new_pass("fake_init_ops_pass", self.attrs) - fake_init_ops_pass.apply([None], [self.cloned_startup], self.pass_ctx) + #fake_init_ops_pass = new_pass("fake_init_ops_pass", self.attrs) + #fake_init_ops_pass.apply([None], [self.cloned_startup], self.pass_ctx) _main_file = ps_log_root_dir + '5_fl_worker_main_program.prototxt' #debug_program(_main_file, self.cloned_main) diff --git a/python/paddle/distributed/sharding/group_sharded.py b/python/paddle/distributed/sharding/group_sharded.py index 9ebe7fd60310f6..e0a5bb69f962a4 100644 --- a/python/paddle/distributed/sharding/group_sharded.py +++ b/python/paddle/distributed/sharding/group_sharded.py @@ -19,34 +19,52 @@ import paddle from paddle.optimizer import Optimizer -from paddle.distributed.utils import get_logger +from paddle.distributed.utils.log_utils import get_logger from paddle.fluid.framework import in_dygraph_mode # Old version -from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ShardingOptimizerStage2 -from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ShardingStage2 -from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ShardingStage3 -from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ShardingScaler +from paddle.distributed.fleet.meta_optimizers.dygraph_optimizer.sharding_optimizer_stage2 import ( + ShardingOptimizerStage2, +) +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage2 import ( + ShardingStage2, +) +from paddle.distributed.fleet.meta_parallel.sharding.sharding_stage3 import ( + ShardingStage3, +) +from paddle.distributed.fleet.meta_parallel.sharding.sharding_utils import ( + ShardingScaler, +) # New version -from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 -from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 -from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import GroupShardedStage3 -from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import GroupShardedScaler +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import ( + GroupShardedOptimizerStage2, +) +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import ( + GroupShardedStage2, +) +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage3 import ( + GroupShardedStage3, +) +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_utils import ( + GroupShardedScaler, +) logger_ = get_logger(logging.WARNING) -def group_sharded_parallel(model, - optimizer, - level, - scaler=None, - group=None, - offload=False, - sync_buffers=False, - buffer_max_size=2**23, - segment_size=2**20, - sync_comm=False): +def group_sharded_parallel( + model, + optimizer, + level, + scaler=None, + group=None, + 
offload=False, + sync_buffers=False, + buffer_max_size=2**23, + segment_size=2**20, + sync_comm=False, +): """ Use group_sharded_parallel can perform group shared configuration on the model, optimizer and GradScaler. Level has three string options, 'os', 'os_g' and 'p_g_os' corresponds to three different usage scenarios: optimizer state segmentation, optimizer state + gradient segmentation, and parameter + gradient + optimizer state segmentation. Usually, optimizer state + gradient segmentation is actually a re optimization of optimizer state segmentation, so optimizer state + gradient segmentation can be used to realize optimizer state segmentation. @@ -62,12 +80,12 @@ def group_sharded_parallel(model, buffer_max_size (int, optional): The max size of the buffer used to integrate gradient in `os_g`. The larger the size, the more GPU memory will be used. Defaults to 2**23, which means that the dimension of the buffer is 2**23. segment_size (int, optional): The smallest size of parameter to be sharded in `p_g_os`. Defaults to 2**20, indicating that the dimension of the minimum segmented parameter is 2**20. sync_comm (bool, optional): Whether to use synchronous communication, only in `p_g_os` used. Defaults to False, indicating that asynchronous communication is used. - + Returns: model: A wrapper for group sharded given model. optimizer: A wrapper for group sharded given optimizer. scaler: A wrapper for group sharded given scaler. - + Examples: .. code-block:: python @@ -100,13 +118,16 @@ def group_sharded_parallel(model, """ # check optition type assert isinstance( - model, - paddle.nn.Layer), "The model must be the instance of paddle.nn.Layer." + model, paddle.nn.Layer + ), "The model must be the instance of paddle.nn.Layer." assert isinstance( optimizer, Optimizer ), "The optimizer must be the instance of paddle.optimizer.Optimizer." - assert level in ['os', 'os_g', - 'p_g_os'], "The level must be os, os_g or p_g_os." + assert level in [ + 'os', + 'os_g', + 'p_g_os', + ], "The level must be os, os_g or p_g_os." 
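# A minimal usage sketch of the group_sharded_parallel API documented above
# (hedged: the Linear model, AdamW optimizer and GradScaler settings are
# illustrative placeholders, not part of this patch; only the call signature
# and the 'os'/'os_g'/'p_g_os' levels come from the surrounding code).
import paddle
from paddle.distributed import fleet
from paddle.distributed.sharding import group_sharded_parallel

fleet.init(is_collective=True)
model = paddle.nn.Linear(1000, 1000)
optimizer = paddle.optimizer.AdamW(
    learning_rate=0.001, parameters=model.parameters()
)
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
# level must be one of 'os', 'os_g' or 'p_g_os', as the assert above enforces
model, optimizer, scaler = group_sharded_parallel(
    model, optimizer, "os_g", scaler=scaler
)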
def check_dtype(param): return param.dtype == paddle.float16 @@ -124,39 +145,50 @@ def check_dtype(param): params=optimizer._parameter_list, optim=optimizer, group=group, - offload=offload) - model = GroupShardedStage2(model, - optimizer, - group=group, - sync_buffers=sync_buffers, - buffer_max_size=buffer_max_size) + offload=offload, + ) + model = GroupShardedStage2( + model, + optimizer, + group=group, + sync_buffers=sync_buffers, + buffer_max_size=buffer_max_size, + ) else: - optimizer = ShardingOptimizerStage2(params=model.parameters(), - optim=optimizer, - group=group, - offload=offload) - model = ShardingStage2(model, - optimizer, - group=group, - sync_buffers=sync_buffers, - buffer_max_size=buffer_max_size) + optimizer = ShardingOptimizerStage2( + params=model.parameters(), + optim=optimizer, + group=group, + offload=offload, + ) + model = ShardingStage2( + model, + optimizer, + group=group, + sync_buffers=sync_buffers, + buffer_max_size=buffer_max_size, + ) elif level == 'p_g_os': if in_dygraph_mode(): - model = GroupShardedStage3(model, - optimizer=optimizer, - group=group, - sync_buffers=sync_buffers, - segment_size=segment_size, - offload=offload, - sync_comm=sync_comm) + model = GroupShardedStage3( + model, + optimizer=optimizer, + group=group, + sync_buffers=sync_buffers, + segment_size=segment_size, + offload=offload, + sync_comm=sync_comm, + ) else: - model = ShardingStage3(model, - optimizer=optimizer, - group=group, - sync_buffers=sync_buffers, - segment_size=segment_size, - offload=offload, - sync_comm=sync_comm) + model = ShardingStage3( + model, + optimizer=optimizer, + group=group, + sync_buffers=sync_buffers, + segment_size=segment_size, + offload=offload, + sync_comm=sync_comm, + ) else: raise ValueError("Please enter the correct level.") if isinstance(scaler, paddle.amp.GradScaler): @@ -177,14 +209,14 @@ def save_group_sharded_model(model, output, optimizer=None): """ Group sharded encapsulated model and optimizer state saving module. - .. note:: + Note: If using save_group_sharded_model saves the model. When loading again, you need to set the model or optimizer state before using group_sharded_parallel. Args: model (Layer): A wrapper for group sharded given model. output (str): Save directory. optimizer (Optimizer, optional): Group sharded encapsulated optimizer. Defaults to None, indicating that the optimizer state is not saved. - + Examples: .. 
code-block:: python @@ -219,7 +251,8 @@ def save_group_sharded_model(model, output, optimizer=None): save_group_sharded_model(model, optimizer, output=output_dir) """ logger_.info( - "==========Begin to save group sharded model and optimizer==========") + "==========Begin to save group sharded model and optimizer==========" + ) assert not os.path.isfile( output ), "Saving directory ({}) should be a directory, not a file".format(output) @@ -243,4 +276,5 @@ def save_group_sharded_model(model, output, optimizer=None): output_opt = os.path.join(output, "model.pdopt") paddle.save(optimizer._optim.state_dict(), output_opt) logger_.info( - "==========End to save group sharded model and optimizer==========") + "==========End to save group sharded model and optimizer==========" + ) diff --git a/python/paddle/distributed/spawn.py b/python/paddle/distributed/spawn.py index c0ff2bc273dc57..b7908213c9b51e 100644 --- a/python/paddle/distributed/spawn.py +++ b/python/paddle/distributed/spawn.py @@ -21,9 +21,7 @@ import sys import warnings -from paddle.distributed.utils import _print_arguments -from paddle.distributed.utils import _prepare_trainer_env -from paddle.distributed.utils import get_host_name_ip +from paddle.distributed.utils.launch_utils import _print_arguments, _prepare_trainer_env, get_host_name_ip from paddle.distributed.cloud_utils import get_cluster_and_pod, _get_trainers_num from paddle.distributed.fleet.launch import get_cluster_from_args from paddle.distributed.fleet.cloud_utils import use_paddlecloud diff --git a/python/paddle/distributed/utils/__init__.py b/python/paddle/distributed/utils/__init__.py new file mode 100644 index 00000000000000..4ce89fa36b06b2 --- /dev/null +++ b/python/paddle/distributed/utils/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +__all__ = [] diff --git a/python/paddle/distributed/utils.py b/python/paddle/distributed/utils/launch_utils.py similarity index 55% rename from python/paddle/distributed/utils.py rename to python/paddle/distributed/utils/launch_utils.py index 6d8454a6e9ed97..3282b5f58bc1a6 100644 --- a/python/paddle/distributed/utils.py +++ b/python/paddle/distributed/utils/launch_utils.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,287 +12,22 @@ # See the License for the specific language governing permissions and # limitations under the License. 
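# The rename hunks below split the old paddle/distributed/utils.py module into
# a utils/ package (launch_utils.py here, with log_utils.py and moe_utils.py
# added later in this patch). A rough migration sketch for call sites, assuming
# they previously imported these helpers from the flat utils module (the exact
# downstream call sites are not shown here):
#
#   before: from paddle.distributed.utils import get_logger, global_scatter
#   after:
from paddle.distributed.utils.log_utils import get_logger
from paddle.distributed.utils.launch_utils import get_cluster, get_host_name_ip
from paddle.distributed.utils.moe_utils import global_scatter, global_gather

logger = get_logger("INFO", "root")  # mirrors the logger set up in launch_utils.py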
-import functools -import logging -import socket import time import os import signal import copy import sys -import six import subprocess from contextlib import closing import socket from paddle.fluid import core -from paddle.distributed.fleet.launch_utils import get_backend_by_compile_flag from distutils.util import strtobool +import six -from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.framework import _non_static_mode -from paddle.fluid.data_feeder import check_variable_and_dtype -from paddle import _C_ops, _legacy_C_ops - -__all__ = [ #noqa - 'get_host_name_ip', - 'Trainer', - 'get_cluster', - 'start_local_trainers', - 'watch_local_trainers', - 'find_free_ports', - 'JobServer', - 'Cluster', - 'Pod', - 'Hdfs', - 'add_arguments', - 'terminate_local_procs', - 'TrainerProc', - 'get_logger', - 'pull_worker_log', - 'global_scatter', - 'global_gather', -] - - -def global_scatter(x, - local_count, - global_count, - group=None, - use_calc_stream=True): - """ - The global_scatter operator distributes the data of x to n_expert * world_size experts according to local_count, - and then receives data according to global_count. The expert refers to a user-defined expert network, - n_expert refers to the number of expert networks owned by each card, and world_size refers to the number of graphics cards running the network. - - As shown below, the value of the world size is 2, n_expert 2, the batch size of the x 4 and local_count is [2, 0, 2, 0]. - The global_count of the rank 0 is [2, 0, , ], rank 1 is [2, 0, ,](Due to the limited space, only the data calculated on rank 0 is shown here). - In the global_scatter operator, local_count[i] represents sending local_count[i] data to the (i % n_expert)th expert of the (i // n_expert)th card, - global_count[i] represents receiving global_count[i] data from the (i // n_expert)th card to the (i % n_expert)th expert of this card. The rank in the - figure respresent the rank of the current card in all cards. - - The process of global_scatter sending data is as follows: - - local_count[0] represents taking out 2 batches from x and sending 2 batches to the 0th expert of the 0th card; - - local_count[1] represents taking out 0 batches from x and sending 0 batches to the 1th expert of the 0th card; - - local_count[2] represents taking out 2 batches from x and sending 2 batches to the 0th expert of the 1th card; - - local_count[3] represents taking out 0 batches from x and sending 0 batches to the 1th expert of the 1th card; - - Therefore, the global_count[0] of the 0th card is equal to 2, which means that 2 batches of data are received from the 0th card to the 0th expert; - - the global_count[1] of the 0th card is equal to 0, which means that 0 batches of data are received from the 0th card to the 1th expert; - - the global_count[0] of the 1th card is equal to 2, which means that 2 batches of data are received from the 0th card to the 0th expert; - - the global_count[1] of the 1th card is equal to 0, which means that 0 batches of data are received from the 0th card to the 1th expert. - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/global_scatter_gather.png - :width: 800 - :alt: global_scatter_gather - :align: center - - Args: - x (Tensor): Tensor. The tensor data type should be float16, float32, float64, int32 or int64. - local_count (Tensor): Tensor which have n_expert * world_size elements that indicates - how many data needed to be sent. The tensor data type should be int64. 
- global_count (Tensor): Tensor which have n_expert * world_size elements that indicates - how many data needed to be received. The tensor data type should be int64. - group (Group, optional): The group instance return by new_group or None for global default group. Default: None. - use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream. Default: True. - - Returns: - out (Tensor): The data received from all experts. - - Examples: - .. code-block:: python - - # required: distributed - import numpy as np - import paddle - from paddle.distributed import init_parallel_env - init_parallel_env() - n_expert = 2 - world_size = 2 - d_model = 2 - in_feat = d_model - local_input_buf = np.array([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]], \ - dtype=np.float32) - if paddle.distributed.ParallelEnv().local_rank == 0: - local_count = np.array([2, 1, 1, 1]) - global_count = np.array([2, 1, 1, 1]) - else: - local_count = np.array([1, 1, 2, 1]) - global_count = np.array([1, 1, 2, 1]) - local_input_buf = paddle.to_tensor(local_input_buf, dtype="float32", stop_gradient=False) - local_count = paddle.to_tensor(local_count, dtype="int64") - global_count = paddle.to_tensor(global_count, dtype="int64") - a = paddle.distributed.utils.global_scatter(local_input_buf, \ - local_count, global_count) - a.stop_gradient = False - print(a) - # out for rank 0: [[1, 2], [3, 4], [1, 2], [5, 6], [3, 4]] - # out for rank 1: [[7, 8], [5, 6], [7, 8], [9, 10], [9, 10]] - # backward test - c = a * a - c.backward() - print("local_input_buf.grad: ", local_input_buf.grad) - # out for rank 0: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]] - # out for rank 1: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]] - """ - if group is not None and not group.is_member(): - return - - ring_id = 0 if group is None else group.id - if _non_static_mode(): - return _legacy_C_ops.global_scatter(x, local_count, \ - global_count, \ - 'use_calc_stream', use_calc_stream, \ - 'ring_id', ring_id) - else: - op_type = 'global_scatter' - check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'global_scatter') - check_variable_and_dtype(local_count, 'local_count', ['int64'], - 'global_scatter') - check_variable_and_dtype(global_count, 'global_count', ['int64'], - 'global_scatter') - - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - helper.append_op(type=op_type, - inputs={ - 'X': [x], - 'local_count': [local_count], - 'global_count': [global_count], - }, - outputs={'Out': [out]}, - attrs={ - 'ring_id': ring_id, - 'use_calc_stream': use_calc_stream - }) - return out - - -def global_gather(x, - local_count, - global_count, - group=None, - use_calc_stream=True): - """ - The global_gather operator gathers the data of x into n_expert * world_size experts according to global_count, and then receives data according to local_count. - The expert refers to a user-defined expert network, n_expert refers to the number of expert networks owned by each card, and world_size refers to the number of graphics cards running the network. - - As shown below, the value of the world size is 2, n_expert 2, the batch size of the x 4 and local_count is [2, 0, 2, 0]. - The global_count of the rank 0 is [2, 0, , ], rank 1 is [2, 0, ,](Due to the limited space, only the data calculated on rank 0 is shown here). 
- In the global_gather operator, the meaning of the global_count and local_count is opposed to global_scatter, global_count[i] represents sending global_count[i] data to the (i % n_expert)th expert of the (i // n_expert)th card, - local_count[i] represents receiving local_count[i] data from the (i // n_expert)th card to the (i % n_expert)th expert of this card. The data sent will be arranged according to the experts of each card. - The rank in the figure respresent the rank of the current card in all cards. - - The process of global_gather sending data is as follows: - - The global_count[0] of the 0th card represents sending 2 data to the 0th expert of the 0th card; - - The global_count[1] of the 0th card represents sending 0 data to the 1th expert of the 0th card; - - The global_count[0] of the 1th card represents sending 2 data to the 0th expert of the 0th card; - - The global_count[1] of the 1th card represents sending 0 data to the 1th expert of the 0th card. - - .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/global_scatter_gather.png - :width: 800 - :alt: global_scatter_gather - :align: center - - - Args: - x (Tensor): Tensor. Tensor whose data type should be float16, float32, float64, int32 or int64. - local_count (Tensor): Tensor which have n_expert * world_size elements that indicates - how many data needed to be received. Tensor data type should be int64. - global_count (Tensor): Tensor which have n_expert * world_size elements that indicates - how many data needed to be sent. Tensor data type should be int64. - group (Group, optional): The group instance return by new_group or None for global default group. Default: None. - use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream. Default: True. - - Returns: - out (Tensor): The data received from all experts. - - Examples: - .. 
code-block:: python - - # required: distributed - import numpy as np - import paddle - from paddle.distributed import init_parallel_env - init_parallel_env() - n_expert = 2 - world_size = 2 - d_model = 2 - in_feat = d_model - local_input_buf = np.array([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]],\ - dtype=np.float32) - if paddle.distributed.ParallelEnv().local_rank == 0: - local_count = np.array([2, 1, 1, 1]) - global_count = np.array([2, 1, 1, 1]) - else: - local_count = np.array([1, 1, 2, 1]) - global_count = np.array([1, 1, 2, 1]) - local_input_buf = paddle.to_tensor(local_input_buf, dtype="float32", stop_gradient=False) - local_count = paddle.to_tensor(local_count, dtype="int64") - global_count = paddle.to_tensor(global_count, dtype="int64") - a = paddle.distributed.utils.global_gather(local_input_buf, local_count, global_count) - print(a) - # out for rank 0: [[1, 2], [3, 4], [7, 8], [1, 2], [7, 8]] - # out for rank 1: [[5, 6], [9, 10], [3, 4], [5, 6], [9, 10]] - a.stop_gradient = False - c = a * a - c.backward() - print("local_input_buf.grad", local_input_buf.grad) - # out for rank 0: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]] - # out for rank 1: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]] - """ - if group is not None and not group.is_member(): - return - - ring_id = 0 if group is None else group.id - if _non_static_mode(): - return _legacy_C_ops.global_gather(x, local_count, \ - global_count, \ - 'use_calc_stream', use_calc_stream, \ - 'ring_id', ring_id) - else: - op_type = 'global_gather' - check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'global_gather') - - check_variable_and_dtype(local_count, 'local_count', ['int64'], - 'global_gather') - - check_variable_and_dtype(global_count, 'global_count', ['int64'], - 'global_gather') - helper = LayerHelper(op_type, **locals()) - out = helper.create_variable_for_type_inference(dtype=x.dtype) - - helper.append_op(type=op_type, - inputs={ - 'X': [x], - 'local_count': [local_count], - 'global_count': [global_count] - }, - outputs={'Out': [out]}, - attrs={ - 'ring_id': group, - 'use_calc_stream': use_calc_stream, - }) - return out - +from paddle.distributed.fleet.launch_utils import get_backend_by_compile_flag +from ..utils.log_utils import get_logger -logger = logging.getLogger("root") -logger.propagate = False +logger = get_logger("INFO", "root") def get_cluster_from_args(args, selected_gpus): @@ -354,13 +89,6 @@ def get_gpus(selected_gpus): return gpus -def _print_arguments(args): - print("----------- Configuration Arguments -----------") - for arg, value in sorted(six.iteritems(vars(args))): - print("%s: %s" % (arg, value)) - print("------------------------------------------------") - - class Hdfs(object): def __init__(self): @@ -549,21 +277,6 @@ def get_visible_gpus(self): return r -def get_logger(log_level, name="root"): - logger = logging.getLogger(name) - # Avoid printing multiple logs - if not logger.handlers: - logger.setLevel(log_level) - - log_handler = logging.StreamHandler() - log_format = logging.Formatter( - '%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s') - log_handler.setFormatter(log_format) - logger.addHandler(log_handler) - - return logger - - def get_cluster(node_ips, node_ip, trainer_endpoints, selected_gpus): assert type(trainer_endpoints) is list, "trainer_endpoints must be list" cluster = Cluster(hdfs=None) @@ -826,3 +539,10 @@ def watch_local_trainers(procs, nranks): raise return alive + + +def _print_arguments(args): + print("----------- Configuration 
Arguments -----------") + for arg, value in sorted(six.iteritems(vars(args))): + print("%s: %s" % (arg, value)) + print("------------------------------------------------") diff --git a/python/paddle/distributed/utils/log_utils.py b/python/paddle/distributed/utils/log_utils.py new file mode 100644 index 00000000000000..c00ce9b82786f5 --- /dev/null +++ b/python/paddle/distributed/utils/log_utils.py @@ -0,0 +1,35 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + + +def get_logger(log_level, name="root"): + + logger = logging.getLogger(name) + + # Avoid printing multiple logs + logger.propagate = False + + if not logger.handlers: + log_handler = logging.StreamHandler() + logger.setLevel(log_level) + log_format = logging.Formatter( + '[%(asctime)-15s] [%(levelname)8s] %(filename)s:%(lineno)s - %(message)s' + ) + log_handler.setFormatter(log_format) + logger.addHandler(log_handler) + else: + logger.setLevel(log_level) + return logger diff --git a/python/paddle/distributed/utils/moe_utils.py b/python/paddle/distributed/utils/moe_utils.py new file mode 100644 index 00000000000000..d6dbfdfab58c02 --- /dev/null +++ b/python/paddle/distributed/utils/moe_utils.py @@ -0,0 +1,255 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.fluid.layer_helper import LayerHelper +from paddle.fluid.framework import _non_static_mode +from paddle.fluid.data_feeder import check_variable_and_dtype +from paddle import _legacy_C_ops + + +def global_scatter(x, + local_count, + global_count, + group=None, + use_calc_stream=True): + """ + The global_scatter operator distributes the data of x to n_expert * world_size experts according to local_count, + and then receives data according to global_count. The expert refers to a user-defined expert network, + n_expert refers to the number of expert networks owned by each card, and world_size refers to the number of graphics cards running the network. + + As shown below, the value of the world size is 2, n_expert 2, the batch size of the x 4 and local_count is [2, 0, 2, 0]. + The global_count of the rank 0 is [2, 0, , ], rank 1 is [2, 0, ,](Due to the limited space, only the data calculated on rank 0 is shown here). 
+ In the global_scatter operator, local_count[i] represents sending local_count[i] data to the (i % n_expert)th expert of the (i // n_expert)th card, + global_count[i] represents receiving global_count[i] data from the (i // n_expert)th card to the (i % n_expert)th expert of this card. The rank in the + figure respresent the rank of the current card in all cards. + + The process of global_scatter sending data is as follows: + + local_count[0] represents taking out 2 batches from x and sending 2 batches to the 0th expert of the 0th card; + + local_count[1] represents taking out 0 batches from x and sending 0 batches to the 1th expert of the 0th card; + + local_count[2] represents taking out 2 batches from x and sending 2 batches to the 0th expert of the 1th card; + + local_count[3] represents taking out 0 batches from x and sending 0 batches to the 1th expert of the 1th card; + + Therefore, the global_count[0] of the 0th card is equal to 2, which means that 2 batches of data are received from the 0th card to the 0th expert; + + the global_count[1] of the 0th card is equal to 0, which means that 0 batches of data are received from the 0th card to the 1th expert; + + the global_count[0] of the 1th card is equal to 2, which means that 2 batches of data are received from the 0th card to the 0th expert; + + the global_count[1] of the 1th card is equal to 0, which means that 0 batches of data are received from the 0th card to the 1th expert. + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/global_scatter_gather.png + :width: 800 + :alt: global_scatter_gather + :align: center + + Args: + x (Tensor): Tensor. The tensor data type should be float16, float32, float64, int32 or int64. + local_count (Tensor): Tensor which have n_expert * world_size elements that indicates + how many data needed to be sent. The tensor data type should be int64. + global_count (Tensor): Tensor which have n_expert * world_size elements that indicates + how many data needed to be received. The tensor data type should be int64. + group (Group, optional): The group instance return by new_group or None for global default group. Default: None. + use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream. Default: True. + + Returns: + out (Tensor): The data received from all experts. + + Examples: + .. 
code-block:: python + + # required: distributed + import numpy as np + import paddle + from paddle.distributed import init_parallel_env + init_parallel_env() + n_expert = 2 + world_size = 2 + d_model = 2 + in_feat = d_model + local_input_buf = np.array([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]], \ + dtype=np.float32) + if paddle.distributed.ParallelEnv().local_rank == 0: + local_count = np.array([2, 1, 1, 1]) + global_count = np.array([2, 1, 1, 1]) + else: + local_count = np.array([1, 1, 2, 1]) + global_count = np.array([1, 1, 2, 1]) + local_input_buf = paddle.to_tensor(local_input_buf, dtype="float32", stop_gradient=False) + local_count = paddle.to_tensor(local_count, dtype="int64") + global_count = paddle.to_tensor(global_count, dtype="int64") + a = paddle.distributed.utils.global_scatter(local_input_buf, \ + local_count, global_count) + a.stop_gradient = False + print(a) + # out for rank 0: [[1, 2], [3, 4], [1, 2], [5, 6], [3, 4]] + # out for rank 1: [[7, 8], [5, 6], [7, 8], [9, 10], [9, 10]] + # backward test + c = a * a + c.backward() + print("local_input_buf.grad: ", local_input_buf.grad) + # out for rank 0: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]] + # out for rank 1: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]] + """ + if group is not None and not group.is_member(): + return + + ring_id = 0 if group is None else group.id + if _non_static_mode(): + return _legacy_C_ops.global_scatter(x, local_count, \ + global_count, \ + 'use_calc_stream', use_calc_stream, \ + 'ring_id', ring_id) + else: + op_type = 'global_scatter' + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], + 'global_scatter') + check_variable_and_dtype(local_count, 'local_count', ['int64'], + 'global_scatter') + check_variable_and_dtype(global_count, 'global_count', ['int64'], + 'global_scatter') + + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op(type=op_type, + inputs={ + 'X': [x], + 'local_count': [local_count], + 'global_count': [global_count], + }, + outputs={'Out': [out]}, + attrs={ + 'ring_id': ring_id, + 'use_calc_stream': use_calc_stream + }) + return out + + +def global_gather(x, + local_count, + global_count, + group=None, + use_calc_stream=True): + """ + The global_gather operator gathers the data of x into n_expert * world_size experts according to global_count, and then receives data according to local_count. + The expert refers to a user-defined expert network, n_expert refers to the number of expert networks owned by each card, and world_size refers to the number of graphics cards running the network. + + As shown below, the value of the world size is 2, n_expert 2, the batch size of the x 4 and local_count is [2, 0, 2, 0]. + The global_count of the rank 0 is [2, 0, , ], rank 1 is [2, 0, ,](Due to the limited space, only the data calculated on rank 0 is shown here). + In the global_gather operator, the meaning of the global_count and local_count is opposed to global_scatter, global_count[i] represents sending global_count[i] data to the (i % n_expert)th expert of the (i // n_expert)th card, + local_count[i] represents receiving local_count[i] data from the (i // n_expert)th card to the (i % n_expert)th expert of this card. The data sent will be arranged according to the experts of each card. + The rank in the figure respresent the rank of the current card in all cards. 
+ + The process of global_gather sending data is as follows: + + The global_count[0] of the 0th card represents sending 2 data to the 0th expert of the 0th card; + + The global_count[1] of the 0th card represents sending 0 data to the 1th expert of the 0th card; + + The global_count[0] of the 1th card represents sending 2 data to the 0th expert of the 0th card; + + The global_count[1] of the 1th card represents sending 0 data to the 1th expert of the 0th card. + + .. image:: https://githubraw.cdn.bcebos.com/PaddlePaddle/docs/develop/docs/api/paddle/distributed/img/global_scatter_gather.png + :width: 800 + :alt: global_scatter_gather + :align: center + + + Args: + x (Tensor): Tensor. Tensor whose data type should be float16, float32, float64, int32 or int64. + local_count (Tensor): Tensor which have n_expert * world_size elements that indicates + how many data needed to be received. Tensor data type should be int64. + global_count (Tensor): Tensor which have n_expert * world_size elements that indicates + how many data needed to be sent. Tensor data type should be int64. + group (Group, optional): The group instance return by new_group or None for global default group. Default: None. + use_calc_stream (bool, optional): Wether to use calculation stream (True) or communication stream. Default: True. + + Returns: + out (Tensor): The data received from all experts. + + Examples: + .. code-block:: python + + # required: distributed + import numpy as np + import paddle + from paddle.distributed import init_parallel_env + init_parallel_env() + n_expert = 2 + world_size = 2 + d_model = 2 + in_feat = d_model + local_input_buf = np.array([[1, 2],[3, 4],[5, 6],[7, 8],[9, 10]],\ + dtype=np.float32) + if paddle.distributed.ParallelEnv().local_rank == 0: + local_count = np.array([2, 1, 1, 1]) + global_count = np.array([2, 1, 1, 1]) + else: + local_count = np.array([1, 1, 2, 1]) + global_count = np.array([1, 1, 2, 1]) + local_input_buf = paddle.to_tensor(local_input_buf, dtype="float32", stop_gradient=False) + local_count = paddle.to_tensor(local_count, dtype="int64") + global_count = paddle.to_tensor(global_count, dtype="int64") + a = paddle.distributed.utils.global_gather(local_input_buf, local_count, global_count) + print(a) + # out for rank 0: [[1, 2], [3, 4], [7, 8], [1, 2], [7, 8]] + # out for rank 1: [[5, 6], [9, 10], [3, 4], [5, 6], [9, 10]] + a.stop_gradient = False + c = a * a + c.backward() + print("local_input_buf.grad", local_input_buf.grad) + # out for rank 0: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]] + # out for rank 1: [[2, 4], [6, 8], [10, 12], [14, 16], [18, 20]] + """ + if group is not None and not group.is_member(): + return + + ring_id = 0 if group is None else group.id + if _non_static_mode(): + return _legacy_C_ops.global_gather(x, local_count, \ + global_count, \ + 'use_calc_stream', use_calc_stream, \ + 'ring_id', ring_id) + else: + op_type = 'global_gather' + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], + 'global_gather') + + check_variable_and_dtype(local_count, 'local_count', ['int64'], + 'global_gather') + + check_variable_and_dtype(global_count, 'global_count', ['int64'], + 'global_gather') + helper = LayerHelper(op_type, **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + + helper.append_op(type=op_type, + inputs={ + 'X': [x], + 'local_count': [local_count], + 'global_count': [global_count] + }, + outputs={'Out': [out]}, + attrs={ + 'ring_id': group, + 'use_calc_stream': use_calc_stream, + }) + 
return out diff --git a/python/paddle/distribution/distribution.py b/python/paddle/distribution/distribution.py index 8c5843521b0173..75934052da98ff 100644 --- a/python/paddle/distribution/distribution.py +++ b/python/paddle/distribution/distribution.py @@ -28,35 +28,56 @@ import paddle from paddle import _C_ops, _legacy_C_ops from paddle.fluid import core -from paddle.fluid.data_feeder import (check_dtype, check_type, - check_variable_and_dtype, convert_dtype) -from paddle.fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph -from paddle.fluid.layers import (control_flow, elementwise_add, elementwise_div, - elementwise_mul, elementwise_sub, nn, ops, - tensor) +from paddle.fluid.data_feeder import ( + check_dtype, + check_type, + check_variable_and_dtype, + convert_dtype, +) +from paddle.fluid.framework import ( + _non_static_mode, + in_dygraph_mode, + _in_legacy_dygraph, +) +from paddle.fluid.layers import ( + control_flow, + elementwise_add, + elementwise_div, + elementwise_mul, + elementwise_sub, + nn, + ops, + tensor, +) from paddle.tensor import arange, concat, gather_nd, multinomial class Distribution(object): """ - The abstract base class for probability distributions. Functions are + The abstract base class for probability distributions. Functions are implemented in specific distributions. Args: - batch_shape(Sequence[int], optional): independent, not identically + batch_shape(Sequence[int], optional): independent, not identically distributed draws, aka a "collection" or "bunch" of distributions. - event_shape(Sequence[int], optional): the shape of a single - draw from the distribution; it may be dependent across dimensions. - For scalar distributions, the event shape is []. For n-dimension + event_shape(Sequence[int], optional): the shape of a single + draw from the distribution; it may be dependent across dimensions. + For scalar distributions, the event shape is []. For n-dimension multivariate distribution, the event shape is [n]. """ def __init__(self, batch_shape=(), event_shape=()): - self._batch_shape = batch_shape if isinstance( - batch_shape, tuple) else tuple(batch_shape) - self._event_shape = event_shape if isinstance( - event_shape, tuple) else tuple(event_shape) + self._batch_shape = ( + batch_shape + if isinstance(batch_shape, tuple) + else tuple(batch_shape) + ) + self._event_shape = ( + event_shape + if isinstance(event_shape, tuple) + else tuple(event_shape) + ) super(Distribution, self).__init__() @@ -118,16 +139,16 @@ def log_prob(self, value): def probs(self, value): """Probability density/mass function. - - .. note:: - - This method will be deprecated in the future, please use `prob` + + Note: + + This method will be deprecated in the future, please use `prob` instead. """ raise NotImplementedError def _extend_shape(self, sample_shape): - """compute shape of the sample + """compute shape of the sample Args: sample_shape (Tensor): sample shape @@ -155,7 +176,8 @@ def _validate_args(self, *args): if is_variable and is_number: raise ValueError( - 'if one argument is Tensor, all arguments should be Tensor') + 'if one argument is Tensor, all arguments should be Tensor' + ) return is_variable @@ -170,15 +192,17 @@ def _to_tensor(self, *args): """ numpy_args = [] variable_args = [] - tmp = 0. 
+ tmp = 0.0 for arg in args: if isinstance(arg, float): arg = [arg] if not isinstance(arg, (list, tuple, np.ndarray, tensor.Variable)): raise TypeError( - "Type of input args must be float, list, numpy.ndarray or Tensor, but received type {}" - .format(type(arg))) + "Type of input args must be float, list, numpy.ndarray or Tensor, but received type {}".format( + type(arg) + ) + ) arg_np = np.array(arg) arg_dtype = arg_np.dtype @@ -216,20 +240,24 @@ def _check_values_dtype_in_probs(self, param, value): value (Tensor): Change value's dtype if value's dtype is different from param. """ if _non_static_mode(): - if value.dtype != param.dtype and convert_dtype( - value.dtype) in ['float32', 'float64']: + if value.dtype != param.dtype and convert_dtype(value.dtype) in [ + 'float32', + 'float64', + ]: warnings.warn( "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted." ) if in_dygraph_mode(): return _C_ops.cast(value, param.dtype) if _in_legacy_dygraph(): - return _legacy_C_ops.cast(value, 'in_dtype', value.dtype, - 'out_dtype', param.dtype) + return _legacy_C_ops.cast( + value, 'in_dtype', value.dtype, 'out_dtype', param.dtype + ) return value - check_variable_and_dtype(value, 'value', ['float32', 'float64'], - 'log_prob') + check_variable_and_dtype( + value, 'value', ['float32', 'float64'], 'log_prob' + ) if value.dtype != param.dtype: warnings.warn( "dtype of input 'value' needs to be the same as parameters of distribution class. dtype of 'value' will be converted." @@ -239,19 +267,25 @@ def _check_values_dtype_in_probs(self, param, value): def _probs_to_logits(self, probs, is_binary=False): r""" - Converts probabilities into logits. For the binary, probs denotes the - probability of occurrence of the event indexed by `1`. For the - multi-dimensional, values of last axis denote the probabilities of + Converts probabilities into logits. For the binary, probs denotes the + probability of occurrence of the event indexed by `1`. For the + multi-dimensional, values of last axis denote the probabilities of occurrence of each of the events. """ - return (paddle.log(probs) - paddle.log1p(-probs)) \ - if is_binary else paddle.log(probs) + return ( + (paddle.log(probs) - paddle.log1p(-probs)) + if is_binary + else paddle.log(probs) + ) def _logits_to_probs(self, logits, is_binary=False): r""" - Converts logits into probabilities. For the binary, each value denotes - log odds, whereas for the multi-dimensional case, the values along the + Converts logits into probabilities. For the binary, each value denotes + log odds, whereas for the multi-dimensional case, the values along the last dimension denote the log probabilities of the events. """ - return paddle.nn.functional.sigmoid(logits) \ - if is_binary else paddle.nn.functional.softmax(logits, axis=-1) + return ( + paddle.nn.functional.sigmoid(logits) + if is_binary + else paddle.nn.functional.softmax(logits, axis=-1) + ) diff --git a/python/paddle/distribution/kl.py b/python/paddle/distribution/kl.py index c5ad3f04358dc6..34b18dd06b00b1 100644 --- a/python/paddle/distribution/kl.py +++ b/python/paddle/distribution/kl.py @@ -35,14 +35,14 @@ def kl_divergence(p, q): .. math:: - KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x + KL(p||q) = \int p(x)log\frac{p(x)}{q(x)} \mathrm{d}x Args: - p (Distribution): ``Distribution`` object. - q (Distribution): ``Distribution`` object. + p (Distribution): ``Distribution`` object. Inherits from the Distribution Base class. 
+ q (Distribution): ``Distribution`` object. Inherits from the Distribution Base class. Returns: - Tensor: Batchwise KL-divergence between distribution p and q. + Tensor, Batchwise KL-divergence between distribution p and q. Examples: @@ -64,15 +64,15 @@ def kl_divergence(p, q): def register_kl(cls_p, cls_q): """Decorator for register a KL divergence implemention function. - The ``kl_divergence(p, q)`` function will search concrete implemention - functions registered by ``register_kl``, according to multi-dispatch pattern. - If an implemention function is found, it will return the result, otherwise, - it will raise ``NotImplementError`` exception. Users can register - implemention funciton by the decorator. + The ``kl_divergence(p, q)`` function will search concrete implemention + functions registered by ``register_kl``, according to multi-dispatch pattern. + If an implemention function is found, it will return the result, otherwise, + it will raise ``NotImplementError`` exception. Users can register + implemention funciton by the decorator. Args: - cls_p(Distribution): Subclass derived from ``Distribution``. - cls_q(Distribution): Subclass derived from ``Distribution``. + cls_p (Distribution): The Distribution type of Instance p. Subclass derived from ``Distribution``. + cls_q (Distribution): The Distribution type of Instance q. Subclass derived from ``Distribution``. Examples: .. code-block:: python @@ -83,8 +83,9 @@ def register_kl(cls_p, cls_q): def kl_beta_beta(): pass # insert implementation here """ - if (not issubclass(cls_p, Distribution) - or not issubclass(cls_q, Distribution)): + if not issubclass(cls_p, Distribution) or not issubclass( + cls_q, Distribution + ): raise TypeError('cls_p and cls_q must be subclass of Distribution') def decorator(f): @@ -98,8 +99,11 @@ def _dispatch(cls_p, cls_q): """Multiple dispatch into concrete implement function""" # find all matched super class pair of p and q - matchs = [(super_p, super_q) for super_p, super_q in _REGISTER_TABLE - if issubclass(cls_p, super_p) and issubclass(cls_q, super_q)] + matchs = [ + (super_p, super_q) + for super_p, super_q in _REGISTER_TABLE + if issubclass(cls_p, super_p) and issubclass(cls_q, super_q) + ] if not matchs: raise NotImplementedError @@ -108,16 +112,20 @@ def _dispatch(cls_p, cls_q): if _REGISTER_TABLE[left_p, left_q] is not _REGISTER_TABLE[right_p, right_q]: warnings.warn( - 'Ambiguous kl_divergence({}, {}). Please register_kl({}, {})'. - format(cls_p.__name__, cls_q.__name__, left_p.__name__, - right_q.__name__), RuntimeWarning) + 'Ambiguous kl_divergence({}, {}). 
Please register_kl({}, {})'.format( + cls_p.__name__, + cls_q.__name__, + left_p.__name__, + right_q.__name__, + ), + RuntimeWarning, + ) return _REGISTER_TABLE[left_p, left_q] @functools.total_ordering class _Compare(object): - def __init__(self, *classes): self.classes = classes @@ -135,22 +143,33 @@ def __le__(self, other): @register_kl(Beta, Beta) def _kl_beta_beta(p, q): - return ((q.alpha.lgamma() + q.beta.lgamma() + (p.alpha + p.beta).lgamma()) - - (p.alpha.lgamma() + p.beta.lgamma() + (q.alpha + q.beta).lgamma()) + - ((p.alpha - q.alpha) * p.alpha.digamma()) + - ((p.beta - q.beta) * p.beta.digamma()) + - (((q.alpha + q.beta) - (p.alpha + p.beta)) * - (p.alpha + p.beta).digamma())) + return ( + (q.alpha.lgamma() + q.beta.lgamma() + (p.alpha + p.beta).lgamma()) + - (p.alpha.lgamma() + p.beta.lgamma() + (q.alpha + q.beta).lgamma()) + + ((p.alpha - q.alpha) * p.alpha.digamma()) + + ((p.beta - q.beta) * p.beta.digamma()) + + ( + ((q.alpha + q.beta) - (p.alpha + p.beta)) + * (p.alpha + p.beta).digamma() + ) + ) @register_kl(Dirichlet, Dirichlet) def _kl_dirichlet_dirichlet(p, q): return ( - (p.concentration.sum(-1).lgamma() - q.concentration.sum(-1).lgamma()) - - ((p.concentration.lgamma() - q.concentration.lgamma()).sum(-1)) + - (((p.concentration - q.concentration) * - (p.concentration.digamma() - - p.concentration.sum(-1).digamma().unsqueeze(-1))).sum(-1))) + (p.concentration.sum(-1).lgamma() - q.concentration.sum(-1).lgamma()) + - ((p.concentration.lgamma() - q.concentration.lgamma()).sum(-1)) + + ( + ( + (p.concentration - q.concentration) + * ( + p.concentration.digamma() + - p.concentration.sum(-1).digamma().unsqueeze(-1) + ) + ).sum(-1) + ) + ) @register_kl(Categorical, Categorical) @@ -170,8 +189,7 @@ def _kl_uniform_uniform(p, q): @register_kl(ExponentialFamily, ExponentialFamily) def _kl_expfamily_expfamily(p, q): - """Compute kl-divergence using `Bregman divergences `_ - """ + """Compute kl-divergence using `Bregman divergences `_""" if not type(p) == type(q): raise NotImplementedError @@ -187,19 +205,22 @@ def _kl_expfamily_expfamily(p, q): try: if _non_static_mode(): - p_grads = paddle.grad(p_log_norm, - p_natural_params, - create_graph=True) + p_grads = paddle.grad( + p_log_norm, p_natural_params, create_graph=True + ) else: p_grads = paddle.static.gradients(p_log_norm, p_natural_params) except RuntimeError as e: raise TypeError( - "Cann't compute kl_divergence({cls_p}, {cls_q}) use bregman divergence. Please register_kl({cls_p}, {cls_q})." - .format(cls_p=type(p).__name__, cls_q=type(q).__name__)) from e + "Cann't compute kl_divergence({cls_p}, {cls_q}) use bregman divergence. 
Please register_kl({cls_p}, {cls_q}).".format( + cls_p=type(p).__name__, cls_q=type(q).__name__ + ) + ) from e kl = q._log_normalizer(*q_natural_params) - p_log_norm - for p_param, q_param, p_grad in zip(p_natural_params, q_natural_params, - p_grads): + for p_param, q_param, p_grad in zip( + p_natural_params, q_natural_params, p_grads + ): term = (q_param - p_param) * p_grad kl -= _sum_rightmost(term, len(q.event_shape)) diff --git a/python/paddle/distribution/normal.py b/python/paddle/distribution/normal.py index f248e1a09273dc..91c795f3349041 100644 --- a/python/paddle/distribution/normal.py +++ b/python/paddle/distribution/normal.py @@ -19,12 +19,23 @@ from paddle import _C_ops, _legacy_C_ops from paddle.distribution import distribution from paddle.fluid import core -from paddle.fluid.data_feeder import (check_dtype, check_type, - check_variable_and_dtype, convert_dtype) +from paddle.fluid.data_feeder import ( + check_dtype, + check_type, + check_variable_and_dtype, + convert_dtype, +) from paddle.fluid.framework import _non_static_mode, in_dygraph_mode -from paddle.fluid.layers import (control_flow, elementwise_add, elementwise_div, - elementwise_mul, elementwise_sub, nn, ops, - tensor) +from paddle.fluid.layers import ( + control_flow, + elementwise_add, + elementwise_div, + elementwise_mul, + elementwise_sub, + nn, + ops, + tensor, +) class Normal(distribution.Distribution): @@ -36,7 +47,7 @@ class Normal(distribution.Distribution): .. math:: - pdf(x; \mu, \sigma) = \\frac{1}{Z}e^{\\frac {-0.5 (x - \mu)^2} {\sigma^2} } + pdf(x; \mu, \sigma) = \frac{1}{Z}e^{\frac {-0.5 (x - \mu)^2} {\sigma^2} } .. math:: @@ -49,53 +60,59 @@ class Normal(distribution.Distribution): * :math:`Z`: is the normalization constant. Args: - loc(int|float|list|tuple|numpy.ndarray|Tensor): The mean of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor. - scale(int|float|list|tuple|numpy.ndarray|Tensor): The std of normal distribution.The data type is int, float, list, numpy.ndarray or Tensor. + loc(int|float|list|tuple|numpy.ndarray|Tensor): The mean of normal distribution.The data type is float32 and float64. + scale(int|float|list|tuple|numpy.ndarray|Tensor): The std of normal distribution.The data type is float32 and float64. name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Examples: .. code-block:: python - - import paddle - from paddle.distribution import Normal - - # Define a single scalar Normal distribution. - dist = Normal(loc=0., scale=3.) - # Define a batch of two scalar valued Normals. - # The first has mean 1 and standard deviation 11, the second 2 and 22. - dist = Normal(loc=[1., 2.], scale=[11., 22.]) - # Get 3 samples, returning a 3 x 2 tensor. - dist.sample([3]) - - # Define a batch of two scalar valued Normals. - # Both have mean 1, but different standard deviations. 
- dist = Normal(loc=1., scale=[11., 22.]) - - # Complete example - value_tensor = paddle.to_tensor([0.8], dtype="float32") - - normal_a = Normal([0.], [1.]) - normal_b = Normal([0.5], [2.]) - sample = normal_a.sample([2]) - # a random tensor created by normal distribution with shape: [2, 1] - entropy = normal_a.entropy() - # [1.4189385] with shape: [1] - lp = normal_a.log_prob(value_tensor) - # [-1.2389386] with shape: [1] - p = normal_a.probs(value_tensor) - # [0.28969154] with shape: [1] - kl = normal_a.kl_divergence(normal_b) - # [0.34939718] with shape: [1] + + import paddle + from paddle.distribution import Normal + + # Define a single scalar Normal distribution. + dist = Normal(loc=0., scale=3.) + # Define a batch of two scalar valued Normals. + # The first has mean 1 and standard deviation 11, the second 2 and 22. + dist = Normal(loc=[1., 2.], scale=[11., 22.]) + # Get 3 samples, returning a 3 x 2 tensor. + dist.sample([3]) + + # Define a batch of two scalar valued Normals. + # Both have mean 1, but different standard deviations. + dist = Normal(loc=1., scale=[11., 22.]) + + # Complete example + value_tensor = paddle.to_tensor([0.8], dtype="float32") + + normal_a = Normal([0.], [1.]) + normal_b = Normal([0.5], [2.]) + sample = normal_a.sample([2]) + # a random tensor created by normal distribution with shape: [2, 1] + entropy = normal_a.entropy() + # [1.4189385] with shape: [1] + lp = normal_a.log_prob(value_tensor) + # [-1.2389386] with shape: [1] + p = normal_a.probs(value_tensor) + # [0.28969154] with shape: [1] + kl = normal_a.kl_divergence(normal_b) + # [0.34939718] with shape: [1] """ def __init__(self, loc, scale, name=None): if not _non_static_mode(): - check_type(loc, 'loc', - (int, float, np.ndarray, tensor.Variable, list, tuple), - 'Normal') - check_type(scale, 'scale', - (int, float, np.ndarray, tensor.Variable, list, tuple), - 'Normal') + check_type( + loc, + 'loc', + (int, float, np.ndarray, tensor.Variable, list, tuple), + 'Normal', + ) + check_type( + scale, + 'scale', + (int, float, np.ndarray, tensor.Variable, list, tuple), + 'Normal', + ) self.batch_size_unknown = False self.all_arg_is_float = False @@ -115,11 +132,15 @@ def __init__(self, loc, scale, name=None): else: if isinstance(loc, float) and isinstance(scale, float): self.all_arg_is_float = True - if isinstance(loc, np.ndarray) and str( - loc.dtype) in ['float32', 'float64']: + if isinstance(loc, np.ndarray) and str(loc.dtype) in [ + 'float32', + 'float64', + ]: self.dtype = loc.dtype - elif isinstance(scale, np.ndarray) and str( - scale.dtype) in ['float32', 'float64']: + elif isinstance(scale, np.ndarray) and str(scale.dtype) in [ + 'float32', + 'float64', + ]: self.dtype = scale.dtype # pylint: disable=unbalanced-tuple-unpacking self.loc, self.scale = self._to_tensor(loc, scale) @@ -132,11 +153,11 @@ def sample(self, shape, seed=0): """Generate samples of the specified shape. Args: - shape (list): 1D `int32`. Shape of the generated samples. - seed (int): Python integer number. + shape (list): 1D `int32`. Shape of the generated samples. + seed (int): Python integer number. Returns: - Tensor: A tensor with prepended dimensions shape.The data type is float32. + Tensor, A tensor with prepended dimensions shape.The data type is float32. """ if not _non_static_mode(): @@ -149,21 +170,21 @@ def sample(self, shape, seed=0): if self.batch_size_unknown: output_shape = shape + batch_shape zero_tmp = tensor.fill_constant_batch_size_like( - self.loc + self.scale, batch_shape + shape, self.dtype, 0.) 
+ self.loc + self.scale, batch_shape + shape, self.dtype, 0.0 + ) zero_tmp_reshape = nn.reshape(zero_tmp, output_shape) zero_tmp_shape = nn.shape(zero_tmp_reshape) - normal_random_tmp = nn.gaussian_random(zero_tmp_shape, - mean=0., - std=1., - seed=seed, - dtype=self.dtype) + normal_random_tmp = nn.gaussian_random( + zero_tmp_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype + ) output = normal_random_tmp * (zero_tmp_reshape + self.scale) output = elementwise_add(output, self.loc, name=name) return output else: output_shape = shape + batch_shape - output = nn.gaussian_random(output_shape, mean=0., std=1., seed=seed, dtype=self.dtype) * \ - (tensor.zeros(output_shape, dtype=self.dtype) + self.scale) + output = nn.gaussian_random( + output_shape, mean=0.0, std=1.0, seed=seed, dtype=self.dtype + ) * (tensor.zeros(output_shape, dtype=self.dtype) + self.scale) output = elementwise_add(output, self.loc, name=name) if self.all_arg_is_float: return nn.reshape(output, shape, name=name) @@ -177,25 +198,26 @@ def entropy(self): .. math:: - entropy(\sigma) = 0.5 \\log (2 \pi e \sigma^2) + entropy(\sigma) = 0.5 \log (2 \pi e \sigma^2) In the above equation: * :math:`scale = \sigma`: is the std. Returns: - Tensor: Shannon entropy of normal distribution.The data type is float32. + Tensor, Shannon entropy of normal distribution.The data type is float32. """ name = self.name + '_entropy' batch_shape = list((self.loc + self.scale).shape) - zero_tmp = tensor.fill_constant_batch_size_like(self.loc + self.scale, - batch_shape, self.dtype, - 0.) - return elementwise_add(0.5 + zero_tmp, - 0.5 * math.log(2 * math.pi) + nn.log( - (self.scale + zero_tmp)), - name=name) + zero_tmp = tensor.fill_constant_batch_size_like( + self.loc + self.scale, batch_shape, self.dtype, 0.0 + ) + return elementwise_add( + 0.5 + zero_tmp, + 0.5 * math.log(2 * math.pi) + nn.log((self.scale + zero_tmp)), + name=name, + ) def log_prob(self, value): """Log probability density/mass function. @@ -212,29 +234,33 @@ def log_prob(self, value): var = self.scale * self.scale log_scale = nn.log(self.scale) - return elementwise_sub(-1. * ((value - self.loc) * (value - self.loc)) / - (2. * var), - log_scale + math.log(math.sqrt(2. * math.pi)), - name=name) + return elementwise_sub( + -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var), + log_scale + math.log(math.sqrt(2.0 * math.pi)), + name=name, + ) def probs(self, value): """Probability density/mass function. Args: - value (Tensor): The input tensor. + value (Tensor): The input tensor. Returns: - Tensor: probability.The data type is same with value. + Tensor, probability. The data type is same with value. """ name = self.name + '_probs' value = self._check_values_dtype_in_probs(self.loc, value) var = self.scale * self.scale - return elementwise_div(ops.exp(-1. * ((value - self.loc) * - (value - self.loc)) / (2. * var)), - (math.sqrt(2 * math.pi) * self.scale), - name=name) + return elementwise_div( + ops.exp( + -1.0 * ((value - self.loc) * (value - self.loc)) / (2.0 * var) + ), + (math.sqrt(2 * math.pi) * self.scale), + name=name, + ) def kl_divergence(self, other): r"""The KL-divergence between two normal distributions. @@ -243,12 +269,12 @@ def kl_divergence(self, other): .. math:: - KL\_divergence(\mu_0, \sigma_0; \mu_1, \sigma_1) = 0.5 (ratio^2 + (\\frac{diff}{\sigma_1})^2 - 1 - 2 \\ln {ratio}) + KL\_divergence(\mu_0, \sigma_0; \mu_1, \sigma_1) = 0.5 (ratio^2 + (\frac{diff}{\sigma_1})^2 - 1 - 2 \ln {ratio}) .. 
math:: - ratio = \\frac{\sigma_0}{\sigma_1} - + ratio = \frac{\sigma_0}{\sigma_1} + .. math:: diff = \mu_1 - \mu_0 @@ -266,7 +292,7 @@ def kl_divergence(self, other): other (Normal): instance of Normal. Returns: - Tensor: kl-divergence between two normal distributions.The data type is float32. + Tensor, kl-divergence between two normal distributions.The data type is float32. """ if not _non_static_mode(): @@ -274,9 +300,9 @@ def kl_divergence(self, other): name = self.name + '_kl_divergence' var_ratio = self.scale / other.scale - var_ratio = (var_ratio * var_ratio) + var_ratio = var_ratio * var_ratio t1 = (self.loc - other.loc) / other.scale - t1 = (t1 * t1) - return elementwise_add(0.5 * var_ratio, - 0.5 * (t1 - 1. - nn.log(var_ratio)), - name=name) + t1 = t1 * t1 + return elementwise_add( + 0.5 * var_ratio, 0.5 * (t1 - 1.0 - nn.log(var_ratio)), name=name + ) diff --git a/python/paddle/distribution/transform.py b/python/paddle/distribution/transform.py index d7a512aade2e58..986416e3c59555 100644 --- a/python/paddle/distribution/transform.py +++ b/python/paddle/distribution/transform.py @@ -21,20 +21,33 @@ import paddle import paddle.nn.functional as F -from paddle.distribution import (constraint, distribution, - transformed_distribution, variable) +from paddle.distribution import ( + constraint, + distribution, + transformed_distribution, + variable, +) __all__ = [ # noqa - 'Transform', 'AbsTransform', 'AffineTransform', 'ChainTransform', - 'ExpTransform', 'IndependentTransform', 'PowerTransform', - 'ReshapeTransform', 'SigmoidTransform', 'SoftmaxTransform', - 'StackTransform', 'StickBreakingTransform', 'TanhTransform' + 'Transform', + 'AbsTransform', + 'AffineTransform', + 'ChainTransform', + 'ExpTransform', + 'IndependentTransform', + 'PowerTransform', + 'ReshapeTransform', + 'SigmoidTransform', + 'SoftmaxTransform', + 'StackTransform', + 'StickBreakingTransform', + 'TanhTransform', ] class Type(enum.Enum): - """Mapping type of a transformation. - """ + """Mapping type of a transformation.""" + BIJECTION = 'bijection' # bijective(injective and surjective) INJECTION = 'injection' # injective-only SURJECTION = 'surjection' # surjective-only @@ -42,8 +55,7 @@ class Type(enum.Enum): @classmethod def is_injective(cls, _type): - """Both bijection and injection are injective mapping. - """ + """Both bijection and injection are injective mapping.""" return _type in (cls.BIJECTION, cls.INJECTION) @@ -55,11 +67,11 @@ class Transform(object): used for transforming a random sample generated by ``Distribution`` instance. - Suppose :math:`X` is a K-dimensional random variable with probability - density function :math:`p_X(x)`. A new random variable :math:`Y = f(X)` may - be defined by transforming :math:`X` with a suitably well-behaved funciton - :math:`f`. It suffices for what follows to note that if f is one-to-one and - its inverse :math:`f^{-1}` have a well-defined Jacobian, then the density of + Suppose :math:`X` is a K-dimensional random variable with probability + density function :math:`p_X(x)`. A new random variable :math:`Y = f(X)` may + be defined by transforming :math:`X` with a suitably well-behaved funciton + :math:`f`. It suffices for what follows to note that if `f` is one-to-one and + its inverse :math:`f^{-1}` have a well-defined Jacobian, then the density of :math:`Y` is .. math:: @@ -121,14 +133,14 @@ def _is_injective(cls): return Type.is_injective(cls._type) def __call__(self, input): - """Make this instance as a callable object. The return value is - depening on the input type. 
+ """Make this instance as a callable object. The return value is + depening on the input type. - * If the input is a ``Tensor`` instance, return + * If the input is a ``Tensor`` instance, return ``self.forward(input)`` . - * If the input is a ``Distribution`` instance, return + * If the input is a ``Distribution`` instance, return ``TransformedDistribution(base=input, transforms=[self])`` . - * If the input is a ``Transform`` instance, return + * If the input is a ``Transform`` instance, return ``ChainTransform([self, input])`` . Args: @@ -139,18 +151,19 @@ def __call__(self, input): """ if isinstance(input, distribution.Distribution): return transformed_distribution.TransformedDistribution( - input, [self]) + input, [self] + ) if isinstance(input, Transform): return ChainTransform([self, input]) return self.forward(x) def forward(self, x): - """Forward transformation with mapping :math:`y = f(x)`. + """Forward transformation with mapping :math:`y = f(x)`. Useful for turning one random outcome into another. Args: - x (Tensos): Input parameter, generally is a sample generated + x (Tensos): Input parameter, generally is a sample generated from ``Distribution``. Returns: @@ -158,15 +171,17 @@ def forward(self, x): """ if not isinstance(x, paddle.fluid.framework.Variable): raise TypeError( - f"Expected 'x' is a Tensor or Real, but got {type(x)}.") + f"Expected 'x' is a Tensor or Real, but got {type(x)}." + ) if x.dim() < self._domain.event_rank: raise ValueError( f'The dimensions of x({x.dim()}) should be ' - f'grater than or equal to {self._domain.event_rank}') + f'grater than or equal to {self._domain.event_rank}' + ) return self._forward(x) def inverse(self, y): - """Inverse transformation :math:`x = f^{-1}(y)`. It's useful for "reversing" + """Inverse transformation :math:`x = f^{-1}(y)`. It's useful for "reversing" a transformation to compute one probability in terms of another. Args: @@ -177,46 +192,53 @@ def inverse(self, y): """ if not isinstance(y, paddle.fluid.framework.Variable): raise TypeError( - f"Expected 'y' is a Tensor or Real, but got {type(y)}.") + f"Expected 'y' is a Tensor or Real, but got {type(y)}." + ) if y.dim() < self._codomain.event_rank: raise ValueError( f'The dimensions of y({y.dim()}) should be ' - f'grater than or equal to {self._codomain.event_rank}') + f'grater than or equal to {self._codomain.event_rank}' + ) return self._inverse(y) def forward_log_det_jacobian(self, x): - """The log of the absolute value of the determinant of the matrix of all + """The log of the absolute value of the determinant of the matrix of all first-order partial derivatives of the inverse function. Args: - x (Tensor): Input tensor, generally is a sample generated from + x (Tensor): Input tensor, generally is a sample generated from ``Distribution`` Returns: - Tensor: The log of the absolute value of Jacobian determinant. + Tensor: The log of the absolute value of Jacobian determinant. """ if not isinstance(x, paddle.fluid.framework.Variable): raise TypeError( - f"Expected 'y' is a Tensor or Real, but got {type(x)}.") - if isinstance(x, paddle.fluid.framework.Variable - ) and x.dim() < self._domain.event_rank: + f"Expected 'y' is a Tensor or Real, but got {type(x)}." 
+ ) + if ( + isinstance(x, paddle.fluid.framework.Variable) + and x.dim() < self._domain.event_rank + ): raise ValueError( f'The dimensions of x({x.dim()}) should be ' - f'grater than or equal to {self._domain.event_rank}') + f'grater than or equal to {self._domain.event_rank}' + ) if not self._is_injective(): raise NotImplementedError( "forward_log_det_jacobian can't be implemented for non-injective" - "transforms.") + "transforms." + ) return self._call_forward_log_det_jacobian(x) def inverse_log_det_jacobian(self, y): """Compute :math:`log|det J_{f^{-1}}(y)|`. - Note that ``forward_log_det_jacobian`` is the negative of this function, + Note that ``forward_log_det_jacobian`` is the negative of this function, evaluated at :math:`f^{-1}(y)`. Args: - y (Tensor): The input to the ``inverse`` Jacobian determinant + y (Tensor): The input to the ``inverse`` Jacobian determinant evaluation. Returns: @@ -227,7 +249,8 @@ def inverse_log_det_jacobian(self, y): if y.dim() < self._codomain.event_rank: raise ValueError( f'The dimensions of y({y.dim()}) should be ' - f'grater than or equal to {self._codomain.event_rank}') + f'grater than or equal to {self._codomain.event_rank}' + ) return self._call_inverse_log_det_jacobian(y) def forward_shape(self, shape): @@ -241,7 +264,8 @@ def forward_shape(self, shape): """ if not isinstance(shape, typing.Sequence): raise TypeError( - f"Expected shape is Sequence[int] type, but got {type(shape)}.") + f"Expected shape is Sequence[int] type, but got {type(shape)}." + ) return self._forward_shape(shape) def inverse_shape(self, shape): @@ -255,7 +279,8 @@ def inverse_shape(self, shape): """ if not isinstance(shape, typing.Sequence): raise TypeError( - f"Expected shape is Sequence[int] type, but got {type(shape)}.") + f"Expected shape is Sequence[int] type, but got {type(shape)}." + ) return self._inverse_shape(shape) @property @@ -269,13 +294,13 @@ def _codomain(self): return variable.real def _forward(self, x): - """Inner method for publid API ``forward``, subclass should + """Inner method for publid API ``forward``, subclass should overwrite this method for supporting forward transformation. """ raise NotImplementedError('Forward not implemented') def _inverse(self, y): - """Inner method of public API ``inverse``, subclass should + """Inner method of public API ``inverse``, subclass should overwrite this method for supporting inverse transformation. """ raise NotImplementedError('Inverse not implemented') @@ -288,7 +313,8 @@ def _call_forward_log_det_jacobian(self, x): return -self._inverse_log_det_jacobian(self.forward(y)) raise NotImplementedError( 'Neither _forward_log_det_jacobian nor _inverse_log_det_jacobian' - 'is implemented. One of them is required.') + 'is implemented. One of them is required.' + ) def _call_inverse_log_det_jacobian(self, y): """Inner method called by ``inverse_log_det_jacobian``""" @@ -298,38 +324,39 @@ def _call_inverse_log_det_jacobian(self, y): return -self._forward_log_det_jacobian(self._inverse(y)) raise NotImplementedError( 'Neither _forward_log_det_jacobian nor _inverse_log_det_jacobian ' - 'is implemented. One of them is required') + 'is implemented. One of them is required' + ) def _forward_shape(self, shape): - """Inner method called by ``forward_shape``, which is used to infer the - forward shape. Subclass should overwrite this method for supporting + """Inner method called by ``forward_shape``, which is used to infer the + forward shape. Subclass should overwrite this method for supporting ``forward_shape``. 
""" return shape def _inverse_shape(self, shape): - """Inner method called by ``inverse_shape``, whic is used to infer the - invese shape. Subclass should overwrite this method for supporting + """Inner method called by ``inverse_shape``, whic is used to infer the + invese shape. Subclass should overwrite this method for supporting ``inverse_shape``. """ return shape class AbsTransform(Transform): - r"""Absolute transformation with formula :math:`y = f(x) = abs(x)`, + r"""Absolute transformation with formula :math:`y = f(x) = abs(x)`, element-wise. - This non-injective transformation allows for transformations of scalar - distributions with the absolute value function, which maps ``(-inf, inf)`` + This non-injective transformation allows for transformations of scalar + distributions with the absolute value function, which maps ``(-inf, inf)`` to ``[0, inf)`` . - * For ``y`` in ``(0, inf)`` , ``AbsTransform.inverse(y)`` returns the set invese + * For ``y`` in ``(0, inf)`` , ``AbsTransform.inverse(y)`` returns the set invese ``{x in (-inf, inf) : |x| = y}`` as a tuple, ``-y, y`` . - * For ``y`` equal ``0`` , ``AbsTransform.inverse(0)`` returns ``0, 0``, which is not - the set inverse (the set inverse is the singleton {0}), but "works" in - conjunction with ``TransformedDistribution`` to produce a left + * For ``y`` equal ``0`` , ``AbsTransform.inverse(0)`` returns ``0, 0``, which is not + the set inverse (the set inverse is the singleton {0}), but "works" in + conjunction with ``TransformedDistribution`` to produce a left semi-continuous pdf. - * For ``y`` in ``(-inf, 0)`` , ``AbsTransform.inverse(y)`` returns the + * For ``y`` in ``(-inf, 0)`` , ``AbsTransform.inverse(y)`` returns the wrong thing ``-y, y``. This is done for efficiency. Examples: @@ -388,7 +415,7 @@ def _codomain(self): class AffineTransform(Transform): - r"""Affine transformation with mapping + r"""Affine transformation with mapping :math:`y = \text{loc} + \text{scale} \times x`. Args: @@ -421,7 +448,8 @@ def __init__(self, loc, scale): raise TypeError(f"Expected 'loc' is a Tensor, but got {type(loc)}") if not isinstance(scale, paddle.fluid.framework.Variable): raise TypeError( - f"Expected scale is a Tensor, but got {type(scale)}") + f"Expected scale is a Tensor, but got {type(scale)}" + ) self._loc = loc self._scale = scale super(AffineTransform, self).__init__() @@ -447,13 +475,17 @@ def _forward_shape(self, shape): return tuple( paddle.broadcast_shape( paddle.broadcast_shape(shape, self._loc.shape), - self._scale.shape)) + self._scale.shape, + ) + ) def _inverse_shape(self, shape): return tuple( paddle.broadcast_shape( paddle.broadcast_shape(shape, self._loc.shape), - self._scale.shape)) + self._scale.shape, + ) + ) @property def _domain(self): @@ -505,7 +537,8 @@ def __init__(self, transforms): ) if not all(isinstance(t, Transform) for t in transforms): raise TypeError( - "All elements of transforms should be Transform type.") + "All elements of transforms should be Transform type." + ) self.transforms = transforms super(ChainTransform, self).__init__() @@ -524,11 +557,12 @@ def _inverse(self, y): return y def _forward_log_det_jacobian(self, x): - value = 0. 
+ value = 0.0 event_rank = self._domain.event_rank for t in self.transforms: - value += self._sum_rightmost(t.forward_log_det_jacobian(x), - event_rank - t._domain.event_rank) + value += self._sum_rightmost( + t.forward_log_det_jacobian(x), event_rank - t._domain.event_rank + ) x = t.forward(x) event_rank += t._codomain.event_rank - t._domain.event_rank return value @@ -638,26 +672,26 @@ def _forward_log_det_jacobian(self, x): class IndependentTransform(Transform): r""" - ``IndependentTransform`` wraps a base transformation, reinterprets + ``IndependentTransform`` wraps a base transformation, reinterprets some of the rightmost batch axes as event axes. Generally, it is used to expand the event axes. This has no effect on the - forward or inverse transformaion, but does sum out the - ``reinterpretd_bach_rank`` rightmost dimensions in computing the determinant + forward or inverse transformaion, but does sum out the + ``reinterpretd_bach_rank`` rightmost dimensions in computing the determinant of Jacobian matrix. - To see this, consider the ``ExpTransform`` applied to a Tensor which has - sample, batch, and event ``(S,B,E)`` shape semantics. Suppose the Tensor's + To see this, consider the ``ExpTransform`` applied to a Tensor which has + sample, batch, and event ``(S,B,E)`` shape semantics. Suppose the Tensor's paritioned-shape is ``(S=[4], B=[2, 2], E=[3])`` , reinterpreted_batch_rank is 1. Then the reinterpreted Tensor's shape is ``(S=[4], B=[2], E=[2, 3])`` . - The shape returned by ``forward`` and ``inverse`` is unchanged, ie, - ``[4,2,2,3]`` . However the shape returned by ``inverse_log_det_jacobian`` - is ``[4,2]``, because the Jacobian determinant is a reduction over the + The shape returned by ``forward`` and ``inverse`` is unchanged, ie, + ``[4,2,2,3]`` . However the shape returned by ``inverse_log_det_jacobian`` + is ``[4,2]``, because the Jacobian determinant is a reduction over the event dimensions. Args: base (Transform): The base transformation. - reinterpreted_batch_rank (int): The num of rightmost batch rank that + reinterpreted_batch_rank (int): The num of rightmost batch rank that will be reinterpreted as event rank. 
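        A quick shape check of the behaviour described above, as a sketch that assumes only the
        ``ExpTransform`` and ``IndependentTransform`` APIs defined in this module:

        .. code-block:: python

            import paddle
            from paddle.distribution import ExpTransform, IndependentTransform

            x = paddle.rand([4, 2, 3])   # think of it as (S=[4], B=[2], E=[3])
            t = IndependentTransform(ExpTransform(), reinterpreted_batch_rank=1)
            print(t.forward(x).shape)                   # [4, 2, 3], unchanged
            print(t.forward_log_det_jacobian(x).shape)  # [4, 2], rightmost dim summed out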
Examples: @@ -683,7 +717,8 @@ class IndependentTransform(Transform): def __init__(self, base, reinterpreted_batch_rank): if not isinstance(base, Transform): raise TypeError( - f"Expected 'base' is Transform type, but get {type(base)}") + f"Expected 'base' is Transform type, but get {type(base)}" + ) if reinterpreted_batch_rank <= 0: raise ValueError( f"Expected 'reinterpreted_batch_rank' is grater than zero, but got {reinterpreted_batch_rank}" @@ -708,7 +743,8 @@ def _inverse(self, y): def _forward_log_det_jacobian(self, x): return self._base.forward_log_det_jacobian(x).sum( - list(range(-self._reinterpreted_batch_rank, 0))) + list(range(-self._reinterpreted_batch_rank, 0)) + ) def _forward_shape(self, shape): return self._base.forward_shape(shape) @@ -718,13 +754,15 @@ def _inverse_shape(self, shape): @property def _domain(self): - return variable.Independent(self._base._domain, - self._reinterpreted_batch_rank) + return variable.Independent( + self._base._domain, self._reinterpreted_batch_rank + ) @property def _codomain(self): - return variable.Independent(self._base._codomain, - self._reinterpreted_batch_rank) + return variable.Independent( + self._base._codomain, self._reinterpreted_batch_rank + ) class PowerTransform(Transform): @@ -758,7 +796,8 @@ class PowerTransform(Transform): def __init__(self, power): if not isinstance(power, paddle.fluid.framework.Variable): raise TypeError( - f"Expected 'power' is a tensor, but got {type(power)}") + f"Expected 'power' is a tensor, but got {type(power)}" + ) self._power = power super(PowerTransform, self).__init__() @@ -793,7 +832,7 @@ def _inverse_shape(self, shape): class ReshapeTransform(Transform): r"""Reshape the event shape of a tensor. - Note that ``in_event_shape`` and ``out_event_shape`` must have the same + Note that ``in_event_shape`` and ``out_event_shape`` must have the same number of elements. 
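        For example, an event of shape ``(2, 3)`` can be re-viewed as an event of shape ``(3, 2)``
        while the batch dimensions stay untouched. A small sketch using only ``ReshapeTransform`` itself:

        .. code-block:: python

            import paddle
            from paddle.distribution import ReshapeTransform

            t = ReshapeTransform(in_event_shape=(2, 3), out_event_shape=(3, 2))
            x = paddle.rand([5, 2, 3])         # batch [5], event (2, 3)
            print(t.forward(x).shape)          # [5, 3, 2]
            print(t.inverse_shape((5, 3, 2)))  # (5, 2, 3)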
Args: @@ -827,13 +866,16 @@ class ReshapeTransform(Transform): def __init__(self, in_event_shape, out_event_shape): if not isinstance(in_event_shape, typing.Sequence) or not isinstance( - out_event_shape, typing.Sequence): + out_event_shape, typing.Sequence + ): raise TypeError( f"Expected type of 'in_event_shape' and 'out_event_shape' is " f"Squence[int], but got 'in_event_shape': {in_event_shape}, " - f"'out_event_shape': {out_event_shape}") + f"'out_event_shape': {out_event_shape}" + ) if functools.reduce(operator.mul, in_event_shape) != functools.reduce( - operator.mul, out_event_shape): + operator.mul, out_event_shape + ): raise ValueError( f"The numel of 'in_event_shape' should be 'out_event_shape', " f"but got {functools.reduce(operator.mul, in_event_shape)}!={functools.reduce(operator.mul, out_event_shape)}" @@ -861,39 +903,45 @@ def _codomain(self): def _forward(self, x): return x.reshape( - tuple(x.shape)[:x.dim() - len(self._in_event_shape)] + - self._out_event_shape) + tuple(x.shape)[: x.dim() - len(self._in_event_shape)] + + self._out_event_shape + ) def _inverse(self, y): return y.reshape( - tuple(y.shape)[:y.dim() - len(self._out_event_shape)] + - self._in_event_shape) + tuple(y.shape)[: y.dim() - len(self._out_event_shape)] + + self._in_event_shape + ) def _forward_shape(self, shape): if len(shape) < len(self._in_event_shape): raise ValueError( f"Expected length of 'shape' is not less than {len(self._in_event_shape)}, but got {len(shape)}" ) - if shape[-len(self._in_event_shape):] != self._in_event_shape: + if shape[-len(self._in_event_shape) :] != self._in_event_shape: raise ValueError( f"Event shape mismatch, expected: {self._in_event_shape}, but got {shape[-len(self._in_event_shape):]}" ) - return tuple(shape[:-len(self._in_event_shape)]) + self._out_event_shape + return ( + tuple(shape[: -len(self._in_event_shape)]) + self._out_event_shape + ) def _inverse_shape(self, shape): if len(shape) < len(self._out_event_shape): raise ValueError( f"Expected 'shape' length is not less than {len(self._out_event_shape)}, but got {len(shape)}" ) - if shape[-len(self._out_event_shape):] != self._out_event_shape: + if shape[-len(self._out_event_shape) :] != self._out_event_shape: raise ValueError( f"Event shape mismatch, expected: {self._out_event_shape}, but got {shape[-len(self._out_event_shape):]}" ) - return tuple(shape[:-len(self._out_event_shape)]) + self._in_event_shape + return ( + tuple(shape[: -len(self._out_event_shape)]) + self._in_event_shape + ) def _forward_log_det_jacobian(self, x): # paddle.zeros not support zero dimension Tensor. - shape = x.shape[:x.dim() - len(self._in_event_shape)] or [1] + shape = x.shape[: x.dim() - len(self._in_event_shape)] or [1] return paddle.zeros(shape, dtype=x.dtype) @@ -928,7 +976,7 @@ def _domain(self): @property def _codomain(self): - return variable.Variable(False, 0, constraint.Range(0., 1.)) + return variable.Variable(False, 0, constraint.Range(0.0, 1.0)) def _forward(self, x): return F.sigmoid(x) @@ -943,8 +991,8 @@ def _forward_log_det_jacobian(self, x): class SoftmaxTransform(Transform): r"""Softmax transformation with mapping :math:`y=\exp(x)` then normalizing. - It's generally used to convert unconstrained space to simplex. This mapping - is not injective, so ``forward_log_det_jacobian`` and + It's generally used to convert unconstrained space to simplex. This mapping + is not injective, so ``forward_log_det_jacobian`` and ``inverse_log_det_jacobian`` are not implemented. 
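        Concretely, ``forward`` maps any real vector to a point on the probability simplex; because the
        normalization discards one degree of freedom, the original input cannot be recovered exactly.
        A minimal sketch:

        .. code-block:: python

            import paddle
            from paddle.distribution import SoftmaxTransform

            t = SoftmaxTransform()
            x = paddle.to_tensor([[1.0, 2.0, 3.0]])
            y = t.forward(x)
            print(y.sum(-1))   # ~[1.], every row lies on the simplex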
Examples: @@ -997,12 +1045,13 @@ def _inverse_shape(self, shape): class StackTransform(Transform): - r""" ``StackTransform`` applies a sequence of transformations along the + r"""``StackTransform`` applies a sequence of transformations along the specific axis. Args: - transforms(Sequence[Transform]): The sequence of transformations. - axis(int): The axis along which will be transformed. + transforms (Sequence[Transform]): The sequence of transformations. + axis (int, optional): The axis along which will be transformed. default + value is 0. Examples: @@ -1010,7 +1059,6 @@ class StackTransform(Transform): import paddle - x = paddle.stack( (paddle.to_tensor([1., 2., 3.]), paddle.to_tensor([1, 2., 3.])), 1) t = paddle.distribution.StackTransform( @@ -1023,11 +1071,13 @@ class StackTransform(Transform): # [[2.71828175 , 1. ], # [7.38905621 , 4. ], # [20.08553696, 9. ]]) + print(t.inverse(t.forward(x))) # Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, # [[1., 1.], # [2., 2.], # [3., 3.]]) + print(t.forward_log_det_jacobian(x)) # Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, # [[1. , 0.69314718], @@ -1042,7 +1092,8 @@ def __init__(self, transforms, axis=0): ) if not all(isinstance(t, Transform) for t in transforms): raise TypeError( - 'Expected all element in transforms is Transform Type.') + 'Expected all element in transforms is Transform Type.' + ) if not isinstance(axis, int): raise TypeError(f"Expected 'axis' is int, but got{type(axis)}.") @@ -1062,34 +1113,45 @@ def axis(self): def _forward(self, x): self._check_size(x) - return paddle.stack([ - t.forward(v) - for v, t in zip(paddle.unstack(x, self._axis), self._transforms) - ], self._axis) + return paddle.stack( + [ + t.forward(v) + for v, t in zip(paddle.unstack(x, self._axis), self._transforms) + ], + self._axis, + ) def _inverse(self, y): self._check_size(y) - return paddle.stack([ - t.inverse(v) - for v, t in zip(paddle.unstack(y, self._axis), self._transforms) - ], self._axis) + return paddle.stack( + [ + t.inverse(v) + for v, t in zip(paddle.unstack(y, self._axis), self._transforms) + ], + self._axis, + ) def _forward_log_det_jacobian(self, x): self._check_size(x) - return paddle.stack([ - t.forward_log_det_jacobian(v) - for v, t in zip(paddle.unstack(x, self._axis), self._transforms) - ], self._axis) + return paddle.stack( + [ + t.forward_log_det_jacobian(v) + for v, t in zip(paddle.unstack(x, self._axis), self._transforms) + ], + self._axis, + ) def _check_size(self, v): if not (-v.dim() <= self._axis < v.dim()): raise ValueError( f'Input dimensions {v.dim()} should be grater than stack ' - f'transform axis {self._axis}.') + f'transform axis {self._axis}.' + ) if v.shape[self._axis] != len(self._transforms): raise ValueError( f'Input size along {self._axis} should be equal to the ' - f'length of transforms.') + f'length of transforms.' + ) @property def _domain(self): @@ -1097,12 +1159,13 @@ def _domain(self): @property def _codomain(self): - return variable.Stack([t._codomain for t in self._transforms], - self._axis) + return variable.Stack( + [t._codomain for t in self._transforms], self._axis + ) class StickBreakingTransform(Transform): - r"""Convert an unconstrained vector to the simplex with one additional + r"""Convert an unconstrained vector to the simplex with one additional dimension by the stick-breaking construction. 
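        In other words, an unconstrained vector with ``K`` entries is mapped to a simplex vector with
        ``K + 1`` entries. A small shape sketch:

        .. code-block:: python

            import paddle
            from paddle.distribution import StickBreakingTransform

            t = StickBreakingTransform()
            x = paddle.to_tensor([0.3, -0.1, 2.0])   # unconstrained, length 3
            y = t.forward(x)
            print(y.shape)                 # [4], one extra dimension
            print(y.sum())                 # ~1.0, a point on the simplex
            print(t.forward_shape((3,)))   # (4,)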
Examples: @@ -1131,8 +1194,9 @@ def _forward(self, x): offset = x.shape[-1] + 1 - paddle.ones([x.shape[-1]]).cumsum(-1) z = F.sigmoid(x - offset.log()) z_cumprod = (1 - z).cumprod(-1) - return F.pad(z, [0]*2*(len(x.shape)-1) + [0, 1], value=1) * \ - F.pad(z_cumprod, [0]*2*(len(x.shape)-1) + [1, 0], value=1) + return F.pad(z, [0] * 2 * (len(x.shape) - 1) + [0, 1], value=1) * F.pad( + z_cumprod, [0] * 2 * (len(x.shape) - 1) + [1, 0], value=1 + ) def _inverse(self, y): y_crop = y[..., :-1] @@ -1150,12 +1214,12 @@ def _forward_log_det_jacobian(self, x): def _forward_shape(self, shape): if not shape: raise ValueError(f"Expected 'shape' is not empty, but got {shape}") - return shape[:-1] + (shape[-1] + 1, ) + return shape[:-1] + (shape[-1] + 1,) def _inverse_shape(self, shape): if not shape: raise ValueError(f"Expected 'shape' is not empty, but got {shape}") - return shape[:-1] + (shape[-1] - 1, ) + return shape[:-1] + (shape[-1] - 1,) @property def _domain(self): @@ -1213,10 +1277,10 @@ def _inverse(self, y): return y.atanh() def _forward_log_det_jacobian(self, x): - """We implicitly rely on _forward_log_det_jacobian rather than - explicitly implement ``_inverse_log_det_jacobian`` since directly using + """We implicitly rely on _forward_log_det_jacobian rather than + explicitly implement ``_inverse_log_det_jacobian`` since directly using ``-tf.math.log1p(-tf.square(y))`` has lower numerical precision. See details: https://github.com/tensorflow/probability/blob/master/tensorflow_probability/python/bijectors/tanh.py#L69-L80 """ - return 2. * (math.log(2.) - x - F.softplus(-2. * x)) + return 2.0 * (math.log(2.0) - x - F.softplus(-2.0 * x)) diff --git a/python/paddle/distribution/transformed_distribution.py b/python/paddle/distribution/transformed_distribution.py index ce386971e5fcce..160af5e4870af4 100644 --- a/python/paddle/distribution/transformed_distribution.py +++ b/python/paddle/distribution/transformed_distribution.py @@ -77,7 +77,7 @@ def __init__(self, base, transforms): max(len(base.event_shape)-chain._domain.event_rank, 0) super(TransformedDistribution, self).__init__( transformed_shape[:len(transformed_shape) - transformed_event_rank], - transformed_shape[:len(transformed_shape) - transformed_event_rank]) + transformed_shape[len(transformed_shape) - transformed_event_rank:]) def sample(self, shape=()): """Sample from ``TransformedDistribution``. 
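The corrected ``super().__init__`` call above now passes the batch part and the event part of
``transformed_shape`` separately instead of passing the batch slice twice. A plain-Python sketch of
that split, using made-up shapes:

    .. code-block:: python

        transformed_shape = (4, 2, 3)      # batch dims followed by event dims
        transformed_event_rank = 1
        batch_shape = transformed_shape[:len(transformed_shape) - transformed_event_rank]
        event_shape = transformed_shape[len(transformed_shape) - transformed_event_rank:]
        print(batch_shape, event_shape)    # (4, 2) (3,)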
diff --git a/python/paddle/distribution/uniform.py b/python/paddle/distribution/uniform.py index 7c085da3156866..aca3608ba6283d 100644 --- a/python/paddle/distribution/uniform.py +++ b/python/paddle/distribution/uniform.py @@ -19,12 +19,27 @@ from paddle import _C_ops, _legacy_C_ops from paddle.distribution import distribution from paddle.fluid import core -from paddle.fluid.data_feeder import (check_dtype, check_type, - check_variable_and_dtype, convert_dtype) -from paddle.fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph -from paddle.fluid.layers import (control_flow, elementwise_add, elementwise_div, - elementwise_mul, elementwise_sub, nn, ops, - tensor) +from paddle.fluid.data_feeder import ( + check_dtype, + check_type, + check_variable_and_dtype, + convert_dtype, +) +from paddle.fluid.framework import ( + _non_static_mode, + in_dygraph_mode, + _in_legacy_dygraph, +) +from paddle.fluid.layers import ( + control_flow, + elementwise_add, + elementwise_div, + elementwise_mul, + elementwise_sub, + nn, + ops, + tensor, +) from paddle.tensor import arange, concat, gather_nd, multinomial @@ -37,7 +52,7 @@ class Uniform(distribution.Distribution): .. math:: - pdf(x; a, b) = \\frac{1}{Z}, \ a <=x ndim: raise ValueError( "Length of FFT argument s should not be larger than the rank of input. " - "Received s: {}, rank of x: {}".format(s, ndim)) + "Received s: {}, rank of x: {}".format(s, ndim) + ) for size in s: if not isinstance(size, int) or size <= 0: - raise ValueError("FFT sizes {} contains invalid value ({})".format( - s, size)) + raise ValueError( + "FFT sizes {} contains invalid value ({})".format(s, size) + ) def _check_fft_axis(x, axis): ndim = x.ndim if not isinstance(axis, int): raise ValueError( - "Invalid FFT axis ({}), it shoule be an integer.".format(axis)) + "Invalid FFT axis ({}), it shoule be an integer.".format(axis) + ) if axis < -ndim or axis >= ndim: raise ValueError( "Invalid FFT axis ({}), it should be in range [-{}, {})".format( - axis, ndim, ndim)) + axis, ndim, ndim + ) + ) def _check_fft_axes(x, axes): ndim = x.ndim if not isinstance(axes, Sequence): raise ValueError( - "Invalid FFT axes ({}), it should be a sequence of integers.". - format(axes)) + "Invalid FFT axes ({}), it should be a sequence of integers.".format( + axes + ) + ) if len(axes) > ndim: raise ValueError( "Length of fft axes should not be larger than the rank of input. 
" - "Received, len of axes: {}, rank of x: {}".format(len(axes), ndim)) + "Received, len of axes: {}, rank of x: {}".format(len(axes), ndim) + ) for axis in axes: if not isinstance(axis, int) or axis < -ndim or axis >= ndim: raise ValueError( - "FFT axes {} contains invalid value ({}), it should be in range [-{}, {})" - .format(axes, axis, ndim, ndim)) + "FFT axes {} contains invalid value ({}), it should be in range [-{}, {})".format( + axes, axis, ndim, ndim + ) + ) def _resize_fft_input(x, s, axes): @@ -127,10 +142,12 @@ def _resize_fft_input(x, s, axes): slices.append((0, s[i])) if axes_to_slice: - x = paddle.slice(x, - axes_to_slice, - starts=[item[0] for item in slices], - ends=[item[1] for item in slices]) + x = paddle.slice( + x, + axes_to_slice, + starts=[item[0] for item in slices], + ends=[item[1] for item in slices], + ) if axes_to_pad: padding_widths = [0] * (2 * ndim) for axis, pad in zip(axes_to_pad, paddings): @@ -146,8 +163,9 @@ def _normalize_axes(x, axes): def _check_at_least_ndim(x, rank): if x.ndim < rank: - raise ValueError("The rank of the input ({}) should >= {}".format( - x.ndim, rank)) + raise ValueError( + "The rank of the input ({}) should >= {}".format(x.ndim, rank) + ) # public APIs 1d @@ -155,30 +173,30 @@ def fft(x, n=None, axis=-1, norm="backward", name=None): """ Calculate one-dimensional discrete Fourier transform. - This function uses the efficient fast Fourier transform (FFT) algorithm [1] to + This function uses the efficient fast Fourier transform (FFT) algorithm [1] to calculate the 1-D * n * point discrete Fourier transform (DFT). Args: x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. If `n` is less than - the length input, the input will be cropped. If larger, the input is filled - with zeros. If `n` is not given, the input length along the axis specified + n (int, optional): The length of the output transform axis. If `n` is less than + the length input, the input will be cropped. If larger, the input is filled + with zeros. If `n` is not given, the input length along the axis specified by `axis` is used. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - complex tensor. The truncated or zero-padded input, transformed along the axis indicated + complex tensor. 
The truncated or zero-padded input, transformed along the axis indicated by `axis`, or the last one if `axis` is not specified. - + Examples: .. code-block:: python @@ -197,13 +215,9 @@ def fft(x, n=None, axis=-1, norm="backward", name=None): """ if is_integer(x) or is_floating_point(x): - return fft_r2c(x, - n, - axis, - norm, - forward=True, - onesided=False, - name=name) + return fft_r2c( + x, n, axis, norm, forward=True, onesided=False, name=name + ) else: return fft_c2c(x, n, axis, norm, forward=True, name=name) @@ -212,7 +226,7 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None): """ Compute the 1-D inverse discrete Fourier Transform. - This function computes the inverse of the 1-D *n*-point discrete Fourier transform + This function computes the inverse of the 1-D *n*-point discrete Fourier transform computed by `fft`. In other words, ``ifft(fft(x)) == x`` to within numerical accuracy. The input should be ordered in the same way as is returned by `fft`, @@ -225,27 +239,27 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None): For an even number of input points, ``x[n//2]`` represents the sum of the values at the positive and negative Nyquist frequencies, as the two - are aliased together. + are aliased together. Args: x (Tensor): The input data. It's a Tensor type. It's a complex. - n (int, optional): The length of the output transform axis. If `n` is less than - the length input, the input will be cropped. If larger, the input is filled - with zeros. If `n` is not given, the input length along the axis specified + n (int, optional): The length of the output transform axis. If `n` is less than + the length input, the input will be cropped. If larger, the input is filled + with zeros. If `n` is not given, the input length along the axis specified by `axis` is used. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - + Returns: - complex tensor. The truncated or zero-padded input, transformed along the axis indicated + complex tensor. The truncated or zero-padded input, transformed along the axis indicated by `axis`, or the last one if `axis` is not specified. 
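    Since ``ifft`` undoes ``fft`` up to numerical precision, a quick round-trip check on a small real
    signal looks like this (a minimal sketch):

    .. code-block:: python

        import numpy as np
        import paddle

        x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0])
        y = paddle.fft.fft(x)        # complex spectrum
        x_rec = paddle.fft.ifft(y)   # complex, imaginary part ~0
        print(np.allclose(paddle.real(x_rec).numpy(), x.numpy()))   # True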
Examples: @@ -266,13 +280,9 @@ def ifft(x, n=None, axis=-1, norm="backward", name=None): """ if is_integer(x) or is_floating_point(x): - return fft_r2c(x, - n, - axis, - norm, - forward=False, - onesided=False, - name=name) + return fft_r2c( + x, n, axis, norm, forward=False, onesided=False, name=name + ) else: return fft_c2c(x, n, axis, norm, forward=False, name=name) @@ -286,40 +296,40 @@ def rfft(x, n=None, axis=-1, norm="backward", name=None): called the Fast Fourier Transform (FFT). When the DFT is computed for purely real input, the output is - Hermitian-symmetric. This function does not compute the negative frequency - terms, and the length of the transformed axis of the output is therefore + Hermitian-symmetric. This function does not compute the negative frequency + terms, and the length of the transformed axis of the output is therefore ``n//2 + 1``. Args: - x(Tensor) : Real-valued input tensor - n(int, optional): Number of points along transformation axis in the - input to use. If `n` is smaller than the length of the input, the - input is cropped. If it is larger, the input is padded with zeros. - If `n` is not given, the length of the input along the axis + x(Tensor) : Real-valued input tensor + n(int, optional): Number of points along transformation axis in the + input to use. If `n` is smaller than the length of the input, the + input is cropped. If it is larger, the input is padded with zeros. + If `n` is not given, the length of the input along the axis specified by `axis` is used. - axis(int, optional): Axis over which to compute the FFT. Default value + axis(int, optional): Axis over which to compute the FFT. Default value is last axis. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, default value is "backward". - + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . Returns: out(Tensor) : complex tensor Examples: - + .. code-block:: python - + import paddle x = paddle.to_tensor([0.0, 1.0, 0.0, 0.0]) @@ -334,38 +344,38 @@ def irfft(x, n=None, axis=-1, norm="backward", name=None): """ Computes the inverse of `rfft`. - This function calculates the inverse of the one-dimensional *n* point discrete - Fourier transform of the actual input calculated by "rfft". In other words, + This function calculates the inverse of the one-dimensional *n* point discrete + Fourier transform of the actual input calculated by "rfft". In other words, ``irfft(rfft(a),len(a)) == a`` is within the numerical accuracy range. - The input shall be in the form of "rfft", i.e. 
the actual zero frequency term, - followed by the complex positive frequency term, in the order of increasing frequency. - Because the discrete Fourier transform of the actual input is Hermite symmetric, - the negative frequency term is regarded as the complex conjugate term of the corresponding + The input shall be in the form of "rfft", i.e. the actual zero frequency term, + followed by the complex positive frequency term, in the order of increasing frequency. + Because the discrete Fourier transform of the actual input is Hermite symmetric, + the negative frequency term is regarded as the complex conjugate term of the corresponding positive frequency term. Args: x (Tensor): The input data. It's a Tensor type. It's a complex. n (int, optional): The length of the output transform axis. For `n` output - points, ``n//2 + 1``input points are necessary. If the length of the input tensor is greater - than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, - it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified + points, ``n//2 + 1``input points are necessary. If the length of the input tensor is greater + than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, + it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified along the ` axis'. - axis (int, optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. + axis (int, optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . Returns: - Real tensor. Truncated or zero fill input for the transformation along the axis indicated by - `axis`, or the last input if `axis` is not specified. The length of the conversion axis - is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. + Real tensor. Truncated or zero fill input for the transformation along the axis indicated by + `axis`, or the last input if `axis` is not specified. The length of the conversion axis + is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in some cases. - + Examples: .. code-block:: python @@ -389,25 +399,25 @@ def hfft(x, n=None, axis=-1, norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. It's a complex. n (int, optional): The length of the output transform axis. For `n` output - points, ``n//2 + 1`` input points are necessary. If the length of the input tensor is greater - than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, - it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified + points, ``n//2 + 1`` input points are necessary. 
If the length of the input tensor is greater + than `n`, it will be cropped, if it is shorter than this, fill in zero. If `n` is not given, + it is considered to be ``2 * (k-1)``, where ``k`` is the length of the input axis specified along the ` axis'. - axis (int,optional): Axis used to calculate FFT. If not specified, the last axis - is used by default. + axis (int,optional): Axis used to calculate FFT. If not specified, the last axis + is used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . Returns: - Real tensor. Truncated or zero fill input for the transformation along the axis indicated by - `axis`, or the last input if `axis` is not specified. The length of the conversion axis - is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. - If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in + Real tensor. Truncated or zero fill input for the transformation along the axis indicated by + `axis`, or the last input if `axis` is not specified. The length of the conversion axis + is `n`, or ``2 * k-2``, if `k` is None, where `k` is the length of the input conversion axis. + If the output is an odd number, you need to specify the value of 'n', such as ``2 * k-1`` in some cases. - + Examples: .. code-block:: python @@ -428,40 +438,40 @@ def ihfft(x, n=None, axis=-1, norm="backward", name=None): """ The inverse FFT of a signal that has Hermitian symmetry. - This function computes the one dimensional *n*-point inverse FFT of a signal - that has Hermitian symmetry by means of an efficient algorithm called + This function computes the one dimensional *n*-point inverse FFT of a signal + that has Hermitian symmetry by means of an efficient algorithm called the Fast Fourier Transform (FFT). When the DFT is computed for purely real input, the output is - Hermitian-symmetric. This function does not compute the negative frequency - terms, and the length of the transformed axis of the output is therefore + Hermitian-symmetric. This function does not compute the negative frequency + terms, and the length of the transformed axis of the output is therefore ``n//2 + 1``. Args: x(Tensor): Input tensor. - n(int, optional): The number of points along transformation axis in the - input to use. If `n` is smaller than the length of the input, the - input is cropped. If it is larger, the input is padded with zeros. - If `n` is not given, the length of the input along the axis + n(int, optional): The number of points along transformation axis in the + input to use. If `n` is smaller than the length of the input, the + input is cropped. If it is larger, the input is padded with zeros. + If `n` is not given, the length of the input along the axis specified by `axis` is used. axis(int, optional) : Axis over which to compute the inverse FFT. If not given, the last axis is used. 
- norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, default value is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . Returns: out(Tensor) : complex tensor. Examples: - + .. code-block:: python - - import paddle + + import paddle spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) print(paddle.fft.ifft(spectrum)) @@ -480,7 +490,7 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): """ Compute the N-D discrete Fourier Transform. - This function calculates the n-D discrete Fourier transform on any number of axes + This function calculates the n-D discrete Fourier transform on any number of axes in the M-D array by fast Fourier transform (FFT). Args: @@ -493,20 +503,20 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): if `s` is not given, the shape of the input along the axes specified by `axes` is used. axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` - axes are used, or all axes if `s` is also not specified. + axes are used, or all axes if `s` is also not specified. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - complex tensor. The truncated or zero-padded input, transformed along the axes indicated by + complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. - + Examples: .. 
code-block:: python @@ -536,13 +546,9 @@ def fftn(x, s=None, axes=None, norm="backward", name=None): # [-8.-8.j 0.+0.j 0.+0.j 0.-0.j]]] """ if is_integer(x) or is_floating_point(x): - return fftn_r2c(x, - s, - axes, - norm, - forward=True, - onesided=False, - name=name) + return fftn_r2c( + x, s, axes, norm, forward=True, onesided=False, name=name + ) else: return fftn_c2c(x, s, axes, norm, forward=True, name=name) @@ -573,20 +579,20 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None): if `s` is not given, the shape of the input along the axes specified by `axes` is used. axes (sequence of ints, optional): Axes used to calculate FFT. If not given, the last ``len(s)`` - axes are used, or all axes if `s` is also not specified. + axes are used, or all axes if `s` is also not specified. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward", meaning no normalization on - the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies - the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are + the forward transforms and scaling by ``1/n`` on the `ifft`. "forward" instead applies + the ``1/n`` factor on the forward tranform. For ``norm="ortho"``, both directions are scaled by ``1/sqrt(n)``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - + Returns: - complex tensor. The truncated or zero-padded input, transformed along the axes indicated by + complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or by a combination of `s` and `x`, as explained in the parameters section above. - + Examples: .. code-block:: python @@ -608,19 +614,16 @@ def ifftn(x, s=None, axes=None, norm="backward", name=None): # (-0.1666666716337204+0.28867512941360474j)]]) """ if is_integer(x) or is_floating_point(x): - return fftn_r2c(x, - s, - axes, - norm, - forward=False, - onesided=False, - name=name) + return fftn_r2c( + x, s, axes, norm, forward=False, onesided=False, name=name + ) else: return fftn_c2c(x, s, axes, norm, forward=False, name=name) def rfftn(x, s=None, axes=None, norm="backward", name=None): """ + The N dimensional FFT for real input. This function computes the N-dimensional discrete Fourier Transform over @@ -637,64 +640,63 @@ def rfftn(x, s=None, axes=None, norm="backward", name=None): Args: x(Tensor) : Input tensor, taken to be real. - s(Sequence[int], optional) : Shape to use from the exec fft. The final element of - `s` corresponds to `n` for ``rfft(x, n)``, while for the remaining - axes, it corresponds to `n` for ``fft(x, n)``. Along any axis, if - the given shape is smaller than that of the input, the input is - cropped. If it is larger, the input is padded with zeros. if `s` is - not given, the shape of the input along the axes specified by `axes` + s(Sequence[int], optional) : Shape to use from the exec fft. The final element of + `s` corresponds to `n` for ``rfft(x, n)``, while for the remaining + axes, it corresponds to `n` for ``fft(x, n)``. 
Along any axis, if + the given shape is smaller than that of the input, the input is + cropped. If it is larger, the input is padded with zeros. if `s` is + not given, the shape of the input along the axes specified by `axes` is used. - axes(Sequence[int], optional) : Axes over which to compute the FFT. If not given, - the last ``len(s)`` axes are used, or all axes if `s` is also not + axes(Sequence[int], optional) : Axes over which to compute the FFT. If not given, + the last ``len(s)`` axes are used, or all axes if `s` is also not specified. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. Include {"backward", "ortho", "forward"}, - default value is "backward". The details of + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, + default value is "backward". The details of three operations are shown below: - - - "backward": The factor of forward direction and backward direction are ``1`` - and ``1/n`` respectively; - - "forward": The factor of forward direction and backward direction are ``1/n`` - and ``1`` respectively; + + - "backward": The factor of forward direction and backward direction are ``1`` + and ``1/n`` respectively; + - "forward": The factor of forward direction and backward direction are ``1/n`` + and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . Returns: - out(Tensor): complex tensor + out(Tensor), complex tensor Examples: - - .. code-block:: python - - import paddle + .. 
code-block:: python + + import paddle - # default, all axis will be used to exec fft - x = paddle.ones((2, 3, 4)) - print(paddle.fft.rfftn(x)) - # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[[(24+0j), 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]], - # - # [[0j , 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]]]) - - # use axes(2, 0) - print(paddle.fft.rfftn(x, axes=(2, 0))) - # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, - # [[[(8+0j), 0j , 0j ], - # [(8+0j), 0j , 0j ], - # [(8+0j), 0j , 0j ]], - # - # [[0j , 0j , 0j ], - # [0j , 0j , 0j ], - # [0j , 0j , 0j ]]]) + # default, all axis will be used to exec fft + x = paddle.ones((2, 3, 4)) + print(paddle.fft.rfftn(x)) + # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [[[(24+0j), 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]], + # + # [[0j , 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]]]) + + # use axes(2, 0) + print(paddle.fft.rfftn(x, axes=(2, 0))) + # Tensor(shape=[2, 3, 3], dtype=complex64, place=CUDAPlace(0), stop_gradient=True, + # [[[(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ], + # [(8+0j), 0j , 0j ]], + # + # [[0j , 0j , 0j ], + # [0j , 0j , 0j ], + # [0j , 0j , 0j ]]]) """ return fftn_r2c(x, s, axes, norm, forward=True, onesided=True, name=name) @@ -717,37 +719,37 @@ def irfftn(x, s=None, axes=None, norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): The length of the output transform axis. - (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). - - - `s` is also the number of input points used along this axis, except for the last axis, where ``s[-1]//2+1`` points of the input are used. - - Along any axis, if the shape indicated by `s` is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. - - If `s` is not given, the shape of the input along the axes specified by axes is used. Except for the last axis which is taken to be ``2*(k-1)`` - + s (sequence of ints, optional): The length of the output transform axis. + (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). + + - `s` is also the number of input points used along this axis, except for the last axis, where ``s[-1]//2+1`` points of the input are used. + - Along any axis, if the shape indicated by `s` is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. + - If `s` is not given, the shape of the input along the axes specified by axes is used. Except for the last axis which is taken to be ``2*(k-1)`` + where ``k`` is the length of the input along that axis. - + axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last - `len(s)` axes are used, or all axes if `s` is also not specified. + `len(s)` axes are used, or all axes if `s` is also not specified. norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". The details of + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". 
The details of three operations are shown below: - + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: - Real tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, - or by a combination of `s` or `x`, as explained in the parameters section above. The length of + Real tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + or by a combination of `s` or `x`, as explained in the parameters section above. The length of each transformed axis is as given by the corresponding element of `s`, or the length of the input in every axis except for the last one if `s` is not given. In the final transformed axis the length - of the output when `s` is not given is ``2*(m-1)``, where ``m`` is the length of the final - transformed axis of the input. To get an odd number of output points in the final axis, + of the output when `s` is not given is ``2*(m-1)``, where ``m`` is the length of the final + transformed axis of the input. To get an odd number of output points in the final axis, `s` must be specified. Examples: @@ -760,12 +762,12 @@ def irfftn(x, s=None, axes=None, norm="backward", name=None): print(x) irfftn_x = paddle.fft.irfftn(x) print(irfftn_x) - + # Tensor(shape=[3], dtype=complex128, place=Place(cpu), stop_gradient=True, # [(2+2j), (2+2j), (3+3j)]) # Tensor(shape=[4], dtype=float64, place=Place(cpu), stop_gradient=True, # [ 2.25000000, -1.25000000, 0.25000000, 0.75000000]) - + """ return fftn_c2r(x, s, axes, norm, forward=False, name=name) @@ -775,35 +777,35 @@ def hfftn(x, s=None, axes=None, norm="backward", name=None): Compute the N-D FFT of Hermitian symmetric complex input, i.e., a signal with a real spectrum. - This function calculates the n-D discrete Fourier transform of Hermite symmetric - complex input on any axis in M-D array by fast Fourier transform (FFT). - In other words, ``ihfftn(hfftn(x, s)) == x is within the numerical accuracy range. - (``s`` here are ``x.shape`` and ``s[-1] = x.shape[- 1] * 2 - 1``. This is necessary + This function calculates the n-D discrete Fourier transform of Hermite symmetric + complex input on any axis in M-D array by fast Fourier transform (FFT). + In other words, ``ihfftn(hfftn(x, s)) == x is within the numerical accuracy range. + (``s`` here are ``x.shape`` and ``s[-1] = x.shape[- 1] * 2 - 1``. This is necessary for the same reason that ``irfft` requires ``x.shape``.) Args: x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): The length of the output transform axis. + s (sequence of ints, optional): The length of the output transform axis. (``s[0]`` refers to axis 0, ``s[1]`` to axis 1, etc.). `s` is also the number of input points used along this axis, except for the last axis, - where ``s[-1]//2+1`` points of the input are used. 
Along any axis, if - the shape indicated by `s` is smaller than that of the input, the input - is cropped. If it is larger, the input is padded with zeros. - If `s` is not given, the shape of the input along the axes specified by axes - is used. Except for the last axis which is taken to be ``2*(k-1)`` where + where ``s[-1]//2+1`` points of the input are used. Along any axis, if + the shape indicated by `s` is smaller than that of the input, the input + is cropped. If it is larger, the input is padded with zeros. + If `s` is not given, the shape of the input along the axes specified by axes + is used. Except for the last axis which is taken to be ``2*(k-1)`` where ``k`` is the length of the input along that axis. axes (sequence of ints, optional): Axes over which to compute the inverse FFT. If not given, the last - `len(s)` axes are used, or all axes if `s` is also not specified. + `len(s)` axes are used, or all axes if `s` is also not specified. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: - Real tensor. Truncate or zero fill input, transforming along the axis indicated by axis or + Real tensor. Truncate or zero fill input, transforming along the axis indicated by axis or a combination of `s` or `X`. - + Examples: .. code-block:: python @@ -823,36 +825,36 @@ def ihfftn(x, s=None, axes=None, norm="backward", name=None): """ The n dimensional inverse FFT of a signal that has Hermitian symmetry. - This function computes the n dimensional inverse FFT over any number of axes - in an M-dimensional of a signal that has Hermitian symmetry by means of an + This function computes the n dimensional inverse FFT over any number of axes + in an M-dimensional of a signal that has Hermitian symmetry by means of an efficient algorithm called the Fast Fourier Transform (FFT). Args: x(Tensor): Input tensor. - s(Sequence[int], optional) : Shape (length along each transformed axis) - to use from the input. (``s[0]`` refers to axis 0, ``s[1]`` to axis - 1, etc.). Along any axis, if the given shape is smaller than that - of the input, the input is cropped. If it is larger, the input is - padded with zeros. if `s` is not given, the shape of the input + s(Sequence[int], optional) : Shape (length along each transformed axis) + to use from the input. (``s[0]`` refers to axis 0, ``s[1]`` to axis + 1, etc.). Along any axis, if the given shape is smaller than that + of the input, the input is cropped. If it is larger, the input is + padded with zeros. if `s` is not given, the shape of the input along the axes specified by `axes` is used. axes(Sequence[int], optional) : Axis over which to compute the inverse FFT. If not given, the last axis is used. - norm(str, optional) : Normalization mode, indicates which direction of - the forward/backward pair of transforms is scaled and with what - normalization factor. 
Include {"backward", "ortho", "forward"}, + norm(str, optional) : Normalization mode, indicates which direction of + the forward/backward pair of transforms is scaled and with what + normalization factor. Include {"backward", "ortho", "forward"}, default value is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . Returns: out(Tensor) : complex tensor. Examples: - + .. code-block:: python - - import paddle + + import paddle spectrum = paddle.to_tensor([10.0, -5.0, 0.0, -1.0, 0.0, -5.0]) print(paddle.fft.ifft(spectrum)) @@ -877,22 +879,22 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output. - It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output. + It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. Along each axis, if the given shape is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. if `s` is not given, the shape of the input along the axes specified by `axes` is used. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a - sequence of 2 integers. If not specified, the last two axes are used by default. + axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a + sequence of 2 integers. If not specified, the last two axes are used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: - Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or the last two axes if `axes` is not given. Examples: @@ -914,13 +916,17 @@ def fft2(x, s=None, axes=(-2, -1), norm="backward", name=None): if s is not None: if not isinstance(s, Sequence) or len(s) != 2: raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers." - .format(s)) + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format( + s + ) + ) if axes is not None: if not isinstance(axes, Sequence) or len(axes) != 2: raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers." 
- .format(axes)) + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format( + axes + ) + ) return fftn(x, s, axes, norm, name) @@ -943,22 +949,22 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. - s (sequence of ints, optional): Shape (length of each transformed axis) of the output. - It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. + s (sequence of ints, optional): Shape (length of each transformed axis) of the output. + It should be a sequence of 2 integers. This corresponds to ``n`` for ``fft(x, n)``. Along each axis, if the given shape is smaller than that of the input, the input is cropped. If it is larger, the input is padded with zeros. if `s` is not given, the shape of the input along the axes specified by `axes` is used. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a - sequence of 2 integers. If not specified, the last two axes are used by default. + axes (sequence of ints, optional): Axes over which to compute the FFT. It should be a + sequence of 2 integers. If not specified, the last two axes are used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - + Returns: - Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, + Complex tensor. The truncated or zero-padded input, transformed along the axes indicated by `axes`, or the last two axes if `axes` is not given. Examples: @@ -979,13 +985,17 @@ def ifft2(x, s=None, axes=(-2, -1), norm="backward", name=None): if s is not None: if not isinstance(s, Sequence) or len(s) != 2: raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers." - .format(s)) + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format( + s + ) + ) if axes is not None: if not isinstance(axes, Sequence) or len(axes) != 2: raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers." - .format(axes)) + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format( + axes + ) + ) return ifftn(x, s, axes, norm, name) @@ -1000,28 +1010,28 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): x(Tensor): Input tensor, taken to be real. s(Sequence[int], optional) : Shape of the FFT. axes(Sequence[int], optional): Axes over which to compute the FFT. - norm(str, optional) : {"backward", "ortho", "forward"}, - default is "backward". Indicates which direction of the - forward/backward pair of transforms is scaled and with what - normalization factor. The details of + norm(str, optional) : {"backward", "ortho", "forward"}, + default is "backward". Indicates which direction of the + forward/backward pair of transforms is scaled and with what + normalization factor. 
The details of three operations are shown below: - + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . - Returns: + Returns: out(Tensor): The result of the real 2-D FFT. Examples: .. code-block:: python - + import paddle import numpy as np @@ -1038,13 +1048,17 @@ def rfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): if s is not None: if not isinstance(s, Sequence) or len(s) != 2: raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers." - .format(s)) + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format( + s + ) + ) if axes is not None: if not isinstance(axes, Sequence) or len(axes) != 2: raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers." - .format(axes)) + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format( + axes + ) + ) return rfftn(x, s, axes, norm, name) @@ -1055,24 +1069,24 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. s (sequence of ints, optional): Shape of the real output to the inverse FFT. Default is None. - axes (sequence of ints, optional): The axes over which to compute the inverse FFT. Axes - must be two-dimensional. If not specified, the last two axes are used by default. + axes (sequence of ints, optional): The axes over which to compute the inverse FFT. Axes + must be two-dimensional. If not specified, the last two axes are used by default. norm (str, optional): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one - of "forward" or "backward" or "ortho". Default is "backward". The details of + pair and what normalization factor to use. The parameter value must be one + of "forward" or "backward" or "ortho". Default is "backward". The details of three operations are shown below: - + - "backward": The factor of forward direction and backward direction are ``1`` and ``1/n`` respectively; - "forward": The factor of forward direction and backward direction are ``1/n`` and ``1`` respectively; - "ortho": The factor of forward direction and backword direction are both ``1/sqrt(n)``. - + Where ``n`` is the multiplication of each element in ``s`` . - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name` . - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name` . + Returns: Real tensor. The result of the inverse real 2-D FFT. - + Examples: .. 
code-block:: python @@ -1090,13 +1104,17 @@ def irfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): if s is not None: if not isinstance(s, Sequence) or len(s) != 2: raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers." - .format(s)) + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format( + s + ) + ) if axes is not None: if not isinstance(axes, Sequence) or len(axes) != 2: raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers." - .format(axes)) + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format( + axes + ) + ) return irfftn(x, s, axes, norm, name) @@ -1107,17 +1125,17 @@ def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x (Tensor): The input data. It's a Tensor type. s (sequence of ints, optional): Shape of the real output. Default is None. - axes (sequence of ints, optional): Axes over which to compute the FFT. Axes must be - two-dimensional. If not specified, the last two axes are used by default. + axes (sequence of ints, optional): Axes over which to compute the FFT. Axes must be + two-dimensional. If not specified, the last two axes are used by default. norm (str): Indicates which direction to scale the `forward` or `backward` transform - pair and what normalization factor to use. The parameter value must be one + pair and what normalization factor to use. The parameter value must be one of "forward" or "backward" or "ortho". Default is "backward". - name (str, optional): The default value is None. Normally there is no need for user to set - this property. For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): The default value is None. Normally there is no need for user to set + this property. For more information, please refer to :ref:`api_guide_Name`. + Returns: Real tensor. The real result of the 2-D Hermitian complex real FFT. - + Examples: .. code-block:: python @@ -1135,13 +1153,17 @@ def hfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): if s is not None: if not isinstance(s, Sequence) or len(s) != 2: raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers." - .format(s)) + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format( + s + ) + ) if axes is not None: if not isinstance(axes, Sequence) or len(axes) != 2: raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers." - .format(axes)) + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format( + axes + ) + ) return hfftn(x, s, axes, norm, name) @@ -1155,13 +1177,13 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): Args: x(Tensor): Input tensor. s(Sequence[int], optional): Shape of the real input to the inverse FFT. - axes(Sequance[int], optional): The axes over which to compute the + axes(Sequance[int], optional): The axes over which to compute the inverse fft. Default is the last two axes. - norm(str, optional): {"backward", "ortho", "forward"}. Default is + norm(str, optional): {"backward", "ortho", "forward"}. Default is "backward". - name(str, optional): The default value is None. Normally there is no - need for user to set this property. For more information, please - refer to :ref:`api_guide_Name` . + name(str, optional): The default value is None. Normally there is no + need for user to set this property. For more information, please + refer to :ref:`api_guide_Name` . 
Returns: out(Tensor) : The result of the inverse hermitian 2-D FFT. @@ -1187,13 +1209,17 @@ def ihfft2(x, s=None, axes=(-2, -1), norm="backward", name=None): if s is not None: if not isinstance(s, Sequence) or len(s) != 2: raise ValueError( - "Invalid FFT argument s ({}), it should be a sequence of 2 integers." - .format(s)) + "Invalid FFT argument s ({}), it should be a sequence of 2 integers.".format( + s + ) + ) if axes is not None: if not isinstance(axes, Sequence) or len(axes) != 2: raise ValueError( - "Invalid FFT argument axes ({}), it should be a sequence of 2 integers." - .format(axes)) + "Invalid FFT argument axes ({}), it should be a sequence of 2 integers.".format( + axes + ) + ) return ihfftn(x, s, axes, norm, name) @@ -1214,7 +1240,7 @@ def fftfreq(n, d=1.0, dtype=None, name=None): Args: n (int): Dimension inputed. d (scalar, optional): Sample spacing (inverse of the sampling rate). Defaults is 1. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1250,8 +1276,8 @@ def rfftfreq(n, d=1.0, dtype=None, name=None): """ Return the Discrete Fourier Transform sample frequencies. - The returned floating-point array "F" contains the center of the frequency unit, - and the unit is the number of cycles of the sampling interval (the starting point is zero). + The returned floating-point array "F" contains the center of the frequency unit, + and the unit is the number of cycles of the sampling interval (the starting point is zero). Given input length `n` and a sample spacing `d`:: @@ -1263,9 +1289,9 @@ def rfftfreq(n, d=1.0, dtype=None, name=None): Args: n (int): Dimension inputed. d (scalar, optional): Sample spacing (inverse of the sampling rate). Defaults is 1. - dtype (str, optional): The data type of returns. Defaults is the data type of returns + dtype (str, optional): The data type of returns. Defaults is the data type of returns of ``paddle.get_default_dtype()``. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1307,12 +1333,12 @@ def fftshift(x, axes=None, name=None): n (int): Dimension inputed. axes (int|tuple, optional): The axis on which to move. The default is none, which moves all axes. Default is None. - name (str, optional): The default value is None. Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor. The shifted tensor. - + Examples: .. code-block:: python @@ -1343,19 +1369,19 @@ def fftshift(x, axes=None, name=None): def ifftshift(x, axes=None, name=None): """ - The inverse of `fftshift`. Although the even length 'x' is the same, the function of the + The inverse of `fftshift`. Although the even length 'x' is the same, the function of the odd length 'x' is different. An example. Args: n (int): Dimension inputed. axes (int|tuple, optional): The axis on which to move. The default is none, which moves all axes. Default is None. - name (str, optional): The default value is None. 
Normally there is no need for user to set + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor. The shifted tensor. - + Examples: .. code-block:: python @@ -1417,10 +1443,9 @@ def fft_c2c(x, n, axis, norm, forward, name): dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) outputs = {"Out": [out]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) return out @@ -1442,8 +1467,16 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): if in_dygraph_mode(): out = _C_ops.fft_r2c(x, axes, norm, forward, onesided) elif _in_legacy_dygraph(): - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'onesided', onesided) + attrs = ( + 'axes', + axes, + 'normalization', + norm, + 'forward', + forward, + 'onesided', + onesided, + ) out = getattr(_legacy_C_ops, op_type)(x, *attrs) else: inputs = { @@ -1458,12 +1491,12 @@ def fft_r2c(x, n, axis, norm, forward, onesided, name): helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference( - _real_to_complex_dtype(dtype)) + _real_to_complex_dtype(dtype) + ) outputs = {"Out": [out]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) return out @@ -1491,8 +1524,16 @@ def fft_c2r(x, n, axis, norm, forward, name): out = _C_ops.fft_c2r(x, axes, norm, forward, 0) elif _in_legacy_dygraph(): if n is not None: - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'last_dim_size', n) + attrs = ( + 'axes', + axes, + 'normalization', + norm, + 'forward', + forward, + 'last_dim_size', + n, + ) else: attrs = ('axes', axes, 'normalization', norm, 'forward', forward) out = getattr(_legacy_C_ops, op_type)(x, *attrs) @@ -1506,12 +1547,12 @@ def fft_c2r(x, n, axis, norm, forward, name): helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference( - _complex_to_real_dtype(dtype)) + _complex_to_real_dtype(dtype) + ) outputs = {"Out": [out]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) return out @@ -1539,8 +1580,10 @@ def fftn_c2c(x, s, axes, norm, forward, name): if s is not None: if len(s) != len(axes): raise ValueError( - "Length of s ({}) and length of axes ({}) does not match.". - format(len(s), len(axes))) + "Length of s ({}) and length of axes ({}) does not match.".format( + len(s), len(axes) + ) + ) s = [s[i] for i in axes_argsoft] if s is not None: @@ -1562,10 +1605,9 @@ def fftn_c2c(x, s, axes, norm, forward, name): dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) outputs = {"Out": [out]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) return out @@ -1591,8 +1633,10 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): if s is not None: if len(s) != len(axes): raise ValueError( - "Length of s ({}) and length of axes ({}) does not match.". 
- format(len(s), len(axes))) + "Length of s ({}) and length of axes ({}) does not match.".format( + len(s), len(axes) + ) + ) s = [s[i] for i in axes_argsoft] + [s[-1]] if s is not None: @@ -1604,8 +1648,16 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): if in_dygraph_mode(): out = _C_ops.fft_r2c(x, axes, norm, forward, onesided) elif _in_legacy_dygraph(): - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'onesided', onesided) + attrs = ( + 'axes', + axes, + 'normalization', + norm, + 'forward', + forward, + 'onesided', + onesided, + ) out = getattr(_legacy_C_ops, op_type)(x, *attrs) else: inputs = { @@ -1620,12 +1672,12 @@ def fftn_r2c(x, s, axes, norm, forward, onesided, name): helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference( - _real_to_complex_dtype(dtype)) + _real_to_complex_dtype(dtype) + ) outputs = {"Out": [out]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) return out @@ -1654,8 +1706,10 @@ def fftn_c2r(x, s, axes, norm, forward, name): if s is not None: if len(s) != len(axes): raise ValueError( - "Length of s ({}) and length of axes ({}) does not match.". - format(len(s), len(axes))) + "Length of s ({}) and length of axes ({}) does not match.".format( + len(s), len(axes) + ) + ) s = [s[i] for i in axes_argsoft] + [s[-1]] if s is not None: @@ -1673,8 +1727,16 @@ def fftn_c2r(x, s, axes, norm, forward, name): out = _C_ops.fft_c2r(x, axes, norm, forward, 0) elif _in_legacy_dygraph(): if s: - attrs = ('axes', axes, 'normalization', norm, 'forward', forward, - 'last_dim_size', s[-1]) + attrs = ( + 'axes', + axes, + 'normalization', + norm, + 'forward', + forward, + 'last_dim_size', + s[-1], + ) else: attrs = ('axes', axes, 'normalization', norm, 'forward', forward) out = getattr(_legacy_C_ops, op_type)(x, *attrs) @@ -1688,10 +1750,10 @@ def fftn_c2r(x, s, axes, norm, forward, name): helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference( - _complex_to_real_dtype(dtype)) + _complex_to_real_dtype(dtype) + ) outputs = {"Out": [out]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) return out diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 24f69a86662d69..ffd94d840c4d7f 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -52,8 +52,9 @@ def _clip_by_global_norm_using_mp_type(*args): def _cast_to_mp_type_if_enabled(x): - if x.dtype == core.VarDesc.VarType.FP16 and _clip_by_global_norm_using_mp_type( - ): + if (x.dtype == core.VarDesc.VarType.FP16 + or x.dtype == core.VarDesc.VarType.BF16 + ) and _clip_by_global_norm_using_mp_type(): return x.astype(core.VarDesc.VarType.FP32) else: return x @@ -65,7 +66,8 @@ def _squared_l2_norm(x): """ x = _cast_to_mp_type_if_enabled(x) - if core.is_compiled_with_xpu() or x.dtype == core.VarDesc.VarType.FP16: + if core.is_compiled_with_xpu( + ) or x.dtype == core.VarDesc.VarType.FP16 or x.dtype == core.VarDesc.VarType.BF16: square = layers.square(x) sum_square = layers.reduce_sum(square) return sum_square @@ -501,7 +503,7 @@ def _dygraph_clip(self, params_grads): merge_grad = layers.get_tensor_from_selected_rows(merge_grad) sum_square = _squared_l2_norm(merge_grad) 
- if sum_square.dtype == core.VarDesc.VarType.FP16: + if sum_square.dtype == core.VarDesc.VarType.FP16 or sum_square.dtype == core.VarDesc.VarType.BF16: sum_square_list_fp16.append(sum_square) elif sum_square.dtype == core.VarDesc.VarType.FP32: sum_square_list_fp32.append(sum_square) @@ -554,8 +556,8 @@ def _dygraph_clip(self, params_grads): continue # TODO(wangxi): use inplace elementwise_mul if need_clip: - clip_input = (clip_var.astype('float16') if g.dtype - == core.VarDesc.VarType.FP16 else clip_var) + clip_input = (clip_var.astype(g.dtype) + if clip_var.dtype != g.dtype else clip_var) new_grad = layers.elementwise_mul(g, clip_input) params_and_grads.append((p, new_grad)) else: diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py index cccc5d90fbab3e..febdacdf43eace 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq.py @@ -31,9 +31,9 @@ __all__ = ['ImperativePTQ'] -_logger = get_logger(__name__, - logging.INFO, - fmt='%(asctime)s-%(levelname)s: %(message)s') +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' +) class ImperativePTQ(object): @@ -75,17 +75,20 @@ def quantize(self, model, inplace=False, fuse=False, fuse_list=None): Return quantized_model(paddle.nn.Layer): The quantized model. """ - assert isinstance(model, paddle.nn.Layer), \ - "The model must be the instance of paddle.nn.Layer." + assert isinstance( + model, paddle.nn.Layer + ), "The model must be the instance of paddle.nn.Layer." if not inplace: model = copy.deepcopy(model) if fuse: model.eval() model = fuse_utils.fuse_layers(model, fuse_list) for name, layer in model.named_sublayers(): - if PTQRegistry.is_supported_layer(layer) \ - and utils.is_leaf_layer(layer) \ - and not self._is_skip_layer(layer): + if ( + PTQRegistry.is_supported_layer(layer) + and utils.is_leaf_layer(layer) + and not self._is_skip_layer(layer) + ): # Add quant config quant_config = copy.deepcopy(self._quant_config) @@ -98,7 +101,8 @@ def quantize(self, model, inplace=False, fuse=False, fuse_list=None): quant_hook_handle = layer.register_forward_post_hook(hook) quant_config.quant_hook_handle = quant_hook_handle layer._forward_post_hooks.move_to_end( - quant_hook_handle._hook_id, last=False) + quant_hook_handle._hook_id, last=False + ) return model @@ -110,14 +114,14 @@ def save_quantized_model(self, model, path, input_spec=None, **config): Args: model (Layer): The model to be saved. - path (str): The path prefix to save model. The format is + path (str): The path prefix to save model. The format is ``dirname/file_prefix`` or ``file_prefix``. input_spec (list[InputSpec|Tensor], optional): Describes the input of the saved model's forward method, which can be described by - InputSpec or example Tensor. If None, all input variables of + InputSpec or example Tensor. If None, all input variables of the original Layer's forward method would be the inputs of the saved model. Default None. - **configs (dict, optional): Other save configuration options for + **config (dict, optional): Other save configuration options for compatibility. We do not recommend using these configurations, they may be removed in the future. If not necessary, DO NOT use them. Default None. @@ -125,16 +129,17 @@ def save_quantized_model(self, model, path, input_spec=None, **config): (1) output_spec (list[Tensor]): Selects the output targets of the saved model. 
By default, all return variables of original Layer's forward method are kept as the output of the saved model. - If the provided ``output_spec`` list is not all output variables, + If the provided ``output_spec`` list is not all output variables, the saved model will be pruned according to the given - ``output_spec`` list. + ``output_spec`` list. Returns: None """ - assert isinstance(model, paddle.nn.Layer), \ - "The model must be the instance of paddle.nn.Layer." + assert isinstance( + model, paddle.nn.Layer + ), "The model must be the instance of paddle.nn.Layer." # Convert and save dygraph quantized model self._convert(model) @@ -156,12 +161,16 @@ def save_quantized_model(self, model, path, input_spec=None, **config): model_filename = basename + INFER_MODEL_SUFFIX params_filename = basename + INFER_PARAMS_SUFFIX - [infer_program, feed_target_names, - fetch_targets] = (paddle.fluid.io.load_inference_model( - dirname=dirname, - executor=exe, - model_filename=model_filename, - params_filename=params_filename)) + [ + infer_program, + feed_target_names, + fetch_targets, + ] = paddle.fluid.io.load_inference_model( + dirname=dirname, + executor=exe, + model_filename=model_filename, + params_filename=params_filename, + ) # Process inference program self._clean_up(infer_program) @@ -169,13 +178,15 @@ def save_quantized_model(self, model, path, input_spec=None, **config): self._remove_scale_op(infer_program) # Save final program - paddle.fluid.io.save_inference_model(dirname=dirname, - feeded_var_names=feed_target_names, - target_vars=fetch_targets, - executor=exe, - main_program=infer_program.clone(), - model_filename=model_filename, - params_filename=params_filename) + paddle.fluid.io.save_inference_model( + dirname=dirname, + feeded_var_names=feed_target_names, + target_vars=fetch_targets, + executor=exe, + main_program=infer_program.clone(), + model_filename=model_filename, + params_filename=params_filename, + ) if is_dynamic_mode: paddle.disable_static() @@ -213,8 +224,9 @@ def _cal_thresholds(self, model): Returns: None """ - assert isinstance(model, paddle.nn.Layer), \ - "The input model must be the instance of paddle.nn.Layer." + assert isinstance( + model, paddle.nn.Layer + ), "The input model must be the instance of paddle.nn.Layer." total_num = 0 cur_num = 0 @@ -226,8 +238,9 @@ def _cal_thresholds(self, model): if self._is_quant_layer(sub_layer): cur_num += 1 if cur_num % 5 == 0: - _logger.info("Process the %s / %s layer" % - (cur_num, total_num)) + _logger.info( + "Process the %s / %s layer" % (cur_num, total_num) + ) quant_config = sub_layer._quant_config @@ -236,7 +249,7 @@ def _cal_thresholds(self, model): quant_config.out_act_quantizer.cal_thresholds() if PTQRegistry.is_simulated_quant_layer(sub_layer): - weights = (sub_layer.weight, ) + weights = (sub_layer.weight,) quant_config.wt_quantizer.sample_data(sub_layer, weights) quant_config.wt_quantizer.cal_thresholds() @@ -250,18 +263,25 @@ def _save_output_thresholds(self, sub_layer, quant_config): Returns: None """ - assert isinstance(sub_layer, paddle.nn.Layer), \ - "The input model must be the instance of paddle.nn.Layer." + assert isinstance( + sub_layer, paddle.nn.Layer + ), "The input model must be the instance of paddle.nn.Layer." 
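Editor's note: the ptq.py hunks above cover the dygraph PTQ entry points (quantize, the calibration hooks, and save_quantized_model). A minimal sketch of that flow is below; MyModel and calib_loader are hypothetical placeholders, the default quant config is assumed, and the import path is inferred from the package layout in this diff, so it may differ from the public alias.

import paddle
from paddle.fluid.contrib.slim.quantization import ImperativePTQ

model = MyModel()                     # hypothetical trained paddle.nn.Layer
ptq = ImperativePTQ()                 # assumes the default PTQ quant config

# quantize() registers forward-post hooks on supported sub-layers; calibration data
# only needs to be passed through forward() to collect activation thresholds.
quant_model = ptq.quantize(model, inplace=False, fuse=False)
quant_model.eval()
for image, _ in calib_loader():       # hypothetical calibration reader
    quant_model(image)

# save_quantized_model() converts the layer, traces it to a static program and
# writes the inference model with the collected thresholds stored as op attributes.
ptq.save_quantized_model(
    quant_model,
    path="./ptq_out/model",
    input_spec=[paddle.static.InputSpec(shape=[None, 1, 28, 28], dtype='float32')])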
layer_info = PTQRegistry.layer_info(sub_layer) output_names = layer_info.output_names output_thresholds = quant_config.out_act_quantizer.thresholds assert len(output_names) == 1 - assert len(output_thresholds) == 1 - save_name = output_names[0] + str(0) + "_threshold" - sub_layer._set_op_attrs({save_name: output_thresholds[0]}) - sub_layer._set_op_attrs({"out_threshold": output_thresholds[0]}) + if len(output_thresholds) == 1: + save_name = output_names[0] + str(0) + "_threshold" + sub_layer._set_op_attrs({save_name: output_thresholds[0]}) + sub_layer._set_op_attrs({"out_threshold": output_thresholds[0]}) + else: + _logger.warning( + "output_thresholds shape of {} need to be 1, but received {}".format( + output_names[0], len(output_thresholds) + ) + ) def _wrap_simulated_layers(self, model): """ @@ -272,12 +292,14 @@ def _wrap_simulated_layers(self, model): Returns: None """ - assert isinstance(model, paddle.nn.Layer), \ - "The input model must be the instance of paddle.nn.Layer." + assert isinstance( + model, paddle.nn.Layer + ), "The input model must be the instance of paddle.nn.Layer." for name, sub_layer in model.named_sublayers(): - if self._is_quant_layer(sub_layer) \ - and PTQRegistry.is_simulated_quant_layer(sub_layer): + if self._is_quant_layer( + sub_layer + ) and PTQRegistry.is_simulated_quant_layer(sub_layer): quant_config = sub_layer._quant_config assert quant_config.enable_in_act_quantizer == True @@ -303,36 +325,44 @@ def _wrap_simulated_layers(self, model): "activation_bits": in_act_quantizer.quant_bits, } - quant_layer = quant_layers.__dict__[quant_layer_name](sub_layer, - **kwargs) + quant_layer = quant_layers.__dict__[quant_layer_name]( + sub_layer, **kwargs + ) # save the input thresholds assert hasattr(quant_layer, "_fake_quant_input") assert hasattr(quant_layer._fake_quant_input, "_scale") - assert len(in_act_quantizer.thresholds) == 1 - input_threshold = np.array([in_act_quantizer.thresholds[0]], - dtype=np.float32) - quant_layer._fake_quant_input._scale.set_value(input_threshold) + if len(in_act_quantizer.thresholds) == 1: + input_threshold = np.array( + [in_act_quantizer.thresholds[0]], dtype=np.float32 + ) + quant_layer._fake_quant_input._scale.set_value( + input_threshold + ) assert hasattr(quant_layer, "_fake_quant_weight") assert hasattr(quant_layer._fake_quant_weight, "_scale") assert len(wt_quantizer.thresholds) == 1 weight_threshold = wt_quantizer.thresholds[0] if isinstance(weight_threshold, list): - weight_threshold = np.array(weight_threshold, - dtype=np.float32) + weight_threshold = np.array( + weight_threshold, dtype=np.float32 + ) else: - weight_threshold = np.array([weight_threshold], - dtype=np.float32) + weight_threshold = np.array( + [weight_threshold], dtype=np.float32 + ) quant_layer._fake_quant_weight._scale.set_value( - weight_threshold) + weight_threshold + ) # save the output thresholds self._save_output_thresholds(quant_layer, quant_config) # replace the layer - parent_layer, sub_name = \ - utils.find_parent_layer_and_sub_name(model, name) + parent_layer, sub_name = utils.find_parent_layer_and_sub_name( + model, name + ) setattr(parent_layer, sub_name, quant_layer) def _gather_input_thresholds(self, program, scope): @@ -351,30 +381,37 @@ def _gather_input_thresholds(self, program, scope): if previous_op is None: continue - if "quantize_dequantize" in previous_op.type or \ - previous_op.type == "moving_average_abs_max_scale": + if ( + "quantize_dequantize" in previous_op.type + or previous_op.type == "moving_average_abs_max_scale" + ): 
attr_name = previous_op.output('OutScale')[0] in_threshold = utils.load_variable_data(scope, attr_name) in_threshold = utils.fp_numpy_to_naive(in_threshold) argname, index = utils._get_input_name_index( - op, in_var_name) - op._set_attr(argname + str(index) + "_threshold", - in_threshold) + op, in_var_name + ) + op._set_attr( + argname + str(index) + "_threshold", in_threshold + ) op._set_attr("with_quant_attr", True) else: for out_var_name in utils._get_op_output_var_names( - previous_op): + previous_op + ): if out_var_name != in_var_name: continue argname, index = utils._get_output_name_index( - previous_op, out_var_name) + previous_op, out_var_name + ) attr_name = argname + str(index) + "_threshold" if not previous_op.has_attr(attr_name): continue threshold = previous_op.attr(attr_name) argname, index = utils._get_input_name_index( - op, in_var_name) + op, in_var_name + ) attr_name = argname + str(index) + "_threshold" op._set_attr(attr_name, threshold) op._set_attr("with_quant_attr", True) @@ -390,8 +427,11 @@ def _clean_up(self, program): """ def _helper(op, next_op, old_attr_name, new_attr_name): - if op.has_attr(old_attr_name) and next_op.has_attr(old_attr_name) \ - and op.attr(old_attr_name) == next_op.attr(old_attr_name): + if ( + op.has_attr(old_attr_name) + and next_op.has_attr(old_attr_name) + and op.attr(old_attr_name) == next_op.attr(old_attr_name) + ): threshold = op.attr(old_attr_name) op._remove_attr(old_attr_name) next_op._remove_attr(old_attr_name) @@ -417,8 +457,8 @@ def _helper(op, next_op, old_attr_name, new_attr_name): old_attr_name = argname + str(index) + "_threshold" argname, index = utils._get_output_name_index( - next_op, - next_op.output("Out")[0]) + next_op, next_op.output("Out")[0] + ) new_attr_name = argname + str(index) + "_threshold" _helper(op, next_op, old_attr_name, new_attr_name) diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_registry.py b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_registry.py index a6b8033bc78c98..e7b6a243abece6 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_registry.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/ptq_registry.py @@ -41,6 +41,7 @@ def __init__(self, layer, input_names, weight_names, output_names): LayerInfo(paddle.nn.ReLU, ['X'], [], ['Out']), LayerInfo(paddle.nn.ReLU6, ['X'], [], ['Out']), LayerInfo(paddle.nn.Hardswish, ['X'], [], ['Out']), + LayerInfo(paddle.nn.Swish, ['X'], [], ['Out']), LayerInfo(paddle.nn.Sigmoid, ['X'], [], ['Out']), LayerInfo(paddle.nn.Softmax, ['X'], [], ['Out']), LayerInfo(paddle.nn.Tanh, ['X'], [], ['Out']), @@ -48,10 +49,15 @@ def __init__(self, layer, input_names, weight_names, output_names): ] QUANT_LAYERS_INFO = [ - LayerInfo(paddle.nn.quant.quant_layers.QuantizedConv2D, ['Input'], - ['Filter'], ['Output']), - LayerInfo(paddle.nn.quant.quant_layers.QuantizedLinear, ['X'], ['Y'], - ['Out']), + LayerInfo( + paddle.nn.quant.quant_layers.QuantizedConv2D, + ['Input'], + ['Filter'], + ['Output'], + ), + LayerInfo( + paddle.nn.quant.quant_layers.QuantizedLinear, ['X'], ['Y'], ['Out'] + ), ] SIMULATED_LAYERS = [paddle.nn.Conv2D, paddle.nn.Linear] @@ -61,6 +67,7 @@ class PTQRegistry(object): """ Register the supported layers for PTQ and provide layers info. """ + supported_layers_map = {} registered_layers_map = {} is_inited = False @@ -89,8 +96,9 @@ def is_supported_layer(cls, layer): flag(bool): Whther the layer is supported. 
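Editor's note: PTQRegistry, reformatted above, decides which sub-layers get calibration hooks and which are later swapped for simulated quant layers (Swish is newly added to the supported list). A small query sketch, assuming the module path shown in the diff:

import paddle
from paddle.fluid.contrib.slim.quantization.imperative.ptq_registry import PTQRegistry

net = paddle.nn.Sequential(
    paddle.nn.Conv2D(3, 8, 3),
    paddle.nn.Swish(),       # added to PTQ_LAYERS_INFO in this change
    paddle.nn.Linear(8, 10),
)

for name, layer in net.named_sublayers():
    print(name,
          PTQRegistry.is_supported_layer(layer),        # collects output thresholds
          PTQRegistry.is_simulated_quant_layer(layer))  # Conv2D / Linear only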
""" cls._init() - return layer in cls.supported_layers_map or \ - isinstance(layer, tuple(cls.supported_layers_map.keys())) + return layer in cls.supported_layers_map or isinstance( + layer, tuple(cls.supported_layers_map.keys()) + ) @classmethod def is_registered_layer(cls, layer): @@ -102,8 +110,9 @@ def is_registered_layer(cls, layer): flag(bool): Wether the layer is register layer_info. """ cls._init() - return layer in cls.registered_layers_map or \ - isinstance(layer, tuple(cls.registered_layers_map.keys())) + return layer in cls.registered_layers_map or isinstance( + layer, tuple(cls.registered_layers_map.keys()) + ) @classmethod def is_simulated_quant_layer(cls, layer): @@ -114,8 +123,9 @@ def is_simulated_quant_layer(cls, layer): Returns: flag(bool): Whther the layer is supported. """ - return layer in SIMULATED_LAYERS or \ - isinstance(layer, tuple(SIMULATED_LAYERS)) + return layer in SIMULATED_LAYERS or isinstance( + layer, tuple(SIMULATED_LAYERS) + ) @classmethod def layer_info(cls, layer): @@ -126,8 +136,9 @@ def layer_info(cls, layer): Returns: layer_info(LayerInfo): The layer info of the input layer. """ - assert cls.is_registered_layer(layer), \ - "The input layer is not register." + assert cls.is_registered_layer( + layer + ), "The input layer is not register." for layer_key, layer_info in cls.registered_layers_map.items(): if layer == layer_key or isinstance(layer, layer_key): diff --git a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py index 84359f711532c0..3a4b7721d55ffd 100644 --- a/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py +++ b/python/paddle/fluid/contrib/slim/quantization/imperative/qat.py @@ -32,6 +32,7 @@ from ..quantization_pass import ReplaceFakeQuantDequantPass, QuantWeightPass from paddle.fluid.log_helper import get_logger from .. import quantization_pass +from ..utils import move_persistable_var_to_global_block from . import utils from . import fuse_utils @@ -72,7 +73,8 @@ def __init__(self, weight_preprocess_layer=None, act_preprocess_layer=None, weight_quantize_layer=None, - act_quantize_layer=None): + act_quantize_layer=None, + onnx_format=False): """ The constructor for ImperativeQuantAware. @@ -124,6 +126,8 @@ def __init__(self, activation and returns dequantized activation. If None, will use quantization op defined by 'activation_quantize_type'. Default is None. + onnx_format (bool, optional): Whether to export the quantized model + with format of ONNX. Default is False. Note: If user sets attribute 'skip_quant' to a Layer that support dynamic @@ -223,7 +227,8 @@ def forward(self, inputs): self._quantize_inputs = ImperativeQuantizeInputs(**kwargs) - self._quantize_outputs = ImperativeQuantizeOutputs(moving_rate) + self._quantize_outputs = ImperativeQuantizeOutputs( + moving_rate, activation_bits, onnx_format) def quantize(self, model): """ @@ -412,16 +417,19 @@ class ImperativeQuantizeOutputs(object): Calculate the output scales for target layers. """ - def __init__(self, moving_rate=0.9): + def __init__(self, moving_rate=0.9, activation_bits=8, onnx_format=False): """ The constructor for ImperativeQuantizeOutputs. Args: moving_rate(float): The decay coefficient of moving average. The default value is 0.9. + activation_bits(int, optional): quantization bit number for activation. Default is 8. 
""" super(ImperativeQuantizeOutputs, self).__init__() self._moving_rate = moving_rate + self._activation_bits = activation_bits + self._onnx_format = onnx_format def apply(self, model): """ @@ -458,12 +466,7 @@ def apply(self, model): setattr(parent_layer, sub_name, cur_quant_layer) - def save_quantized_model(self, - model, - path, - input_spec=None, - onnx_format=False, - **config): + def save_quantized_model(self, model, path, input_spec=None, **config): """ Save the quantized model for the inference. @@ -476,9 +479,7 @@ def save_quantized_model(self, InputSpec or example Tensor. If None, all input variables of the original Layer's forward method would be the inputs of the saved model. Default None. - onnx_format (bool, optional): Whether to export the quantized model - with format of ONNX. Default is False. - **configs (dict, optional): Other save configuration options for + **config (dict, optional): Other save configuration options for compatibility. We do not recommend using these configurations, they may be removed in the future. If not necessary, DO NOT use them. Default None. @@ -518,31 +519,40 @@ def save_quantized_model(self, model_filename=model_filename, params_filename=params_filename)) - self._gather_scales(infer_program, scope, fetch_targets) + if not self._onnx_format: + self._gather_scales(infer_program, scope, fetch_targets) - # Remove `moving_average_abs_max_scale` node in sub graphs. - graph = IrGraph(core.Graph(infer_program.desc), for_test=False) - for sub_graph in graph.all_sub_graphs(): - for _op in sub_graph.all_op_nodes(): - if _op.name() == "moving_average_abs_max_scale": - sub_graph.safe_remove_nodes(_op) - sub_graph.resolve_hazard() - infer_program = graph.to_program() + # Remove `moving_average_abs_max_scale` node in sub graphs. + graph = IrGraph(core.Graph(infer_program.desc), for_test=False) + for sub_graph in graph.all_sub_graphs(): + for _op in sub_graph.all_op_nodes(): + if _op.name() == "moving_average_abs_max_scale": + sub_graph.safe_remove_nodes(_op) + sub_graph.resolve_hazard() + infer_program = graph.to_program() - self._set_skip_quant_attr(infer_program) + self._set_skip_quant_attr(infer_program) - clip_extra = False - if onnx_format: + clip_extra = False + else: graph = IrGraph(core.Graph(infer_program.desc), for_test=False) - transform_pass = ReplaceFakeQuantDequantPass(scope, place) - transform_pass.apply(graph) + transform_pass = ReplaceFakeQuantDequantPass( + scope, place, quant_bits=self._activation_bits) + for sub_graph in graph.all_sub_graphs(): + sub_graph._for_test = True + transform_pass.apply(sub_graph) quant_weight_pass = QuantWeightPass(scope, place) - quant_weight_pass.apply(graph) + for sub_graph in graph.all_sub_graphs(): + sub_graph._for_test = True + quant_weight_pass.apply(sub_graph) + infer_program = graph.to_program() clip_extra = True + move_persistable_var_to_global_block(infer_program) + save_inference_model(dirname=dirname, feeded_var_names=feed_target_names, target_vars=fetch_targets, @@ -559,18 +569,24 @@ def _is_target_layer(self, layer): """ Whether the layer needs to calculate output scales. 
""" + # exclude fake_quant ops in quant_layers file + if not isinstance(layer, dygraph.Layer): + return False + + if self._onnx_format: + return True if isinstance(layer, tuple( + utils.fake_quant_wrap_layers)) else False + flag = False - if isinstance(layer, dygraph.Layer): - # exclude fake_quant ops in quant_layers file - if utils.is_leaf_layer(layer) and \ - not isinstance(layer, tuple(utils.fake_quant_leaf_layers)): - flag = True + if utils.is_leaf_layer(layer) and \ + not isinstance(layer, tuple(utils.fake_quant_leaf_layers)): + flag = True - if isinstance(layer, tuple(utils.fake_quant_wrap_layers)): - flag = True + if isinstance(layer, tuple(utils.fake_quant_wrap_layers)): + flag = True - if isinstance(layer, paddle.nn.quant.FloatFunctionalLayer): - flag = True + if isinstance(layer, paddle.nn.quant.FloatFunctionalLayer): + flag = True return flag diff --git a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py index 4e37ba05b68ae4..97cb732d5e6ceb 100644 --- a/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py +++ b/python/paddle/fluid/contrib/slim/quantization/post_training_quantization.py @@ -344,7 +344,7 @@ def __init__(self, self._fetch_list = None self._data_loader = data_loader - self._out_scale_op_list = utils._out_scale_op_list + self._out_scale_op_list = utils.QUANT_SUPPORTED_OP_TYPE_LIST self._quantized_weight_var_name = set() self._quantized_act_var_name = set() self._weight_op_pairs = {} @@ -449,21 +449,7 @@ def quantize(self): self._collect_dynamic_quantize_op_threshold( self._dynamic_quantize_op_type) - # Move sub blocks persistable var to global block - global_block = self._program.global_block() - for _op in global_block.ops: - if _op.type == "while": - _block_id = _op.attr("sub_block").id - _block = self._program.block(_block_id) - persistables = [] - for _name, _var in _block.vars.items(): - if _var.persistable: - global_block._clone_variable(_var) - persistables.append(_name) - for _name in persistables: - _block._remove_var(_name) - persistables.extend(_op.input('X')) - _op.desc.set_input("X", persistables) + utils.move_persistable_var_to_global_block(self._program) if not self._return_graph: return self._program @@ -843,9 +829,6 @@ def _sample_histogram(self): hist, _ = np.histogram(var_tensor_abs, bins=bins) self._sampling_act_histogram[var_name][0] += hist - def l2_loss(self, gt, pred): - return ((gt - pred)**2).mean() - def _sample_ptf(self): """ The following code are modified from: @@ -885,10 +868,10 @@ def _sample_ptf(self): q_max) * scale4 quant_dequant_var_scale8 = np.clip(np.round(var_tensor / scale8), 0, q_max) * scale8 - score1 = self.l2_loss(var_tensor, quant_dequant_var_scale1) - score2 = self.l2_loss(var_tensor, quant_dequant_var_scale2) - score4 = self.l2_loss(var_tensor, quant_dequant_var_scale4) - score8 = self.l2_loss(var_tensor, quant_dequant_var_scale8) + score1 = utils.l2_loss(var_tensor, quant_dequant_var_scale1) + score2 = utils.l2_loss(var_tensor, quant_dequant_var_scale2) + score4 = utils.l2_loss(var_tensor, quant_dequant_var_scale4) + score8 = utils.l2_loss(var_tensor, quant_dequant_var_scale8) score = [score1, score2, score4, score8] mask = 2**score.index(min(score)) scale = scale1 * mask @@ -1035,7 +1018,7 @@ def _update_program(self): scope=self._scope, place=self._place, quantizable_op_type=minor_quantizable_op_types, - is_full_quantized=self._is_full_quantize) + is_full_quantized=True) for sub_graph in 
graph.all_sub_graphs(): sub_graph._for_test = True diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index f8d950aa5e0fe8..c94117830d79e9 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -44,6 +44,7 @@ 'AddQuantDequantPassV2', 'ReplaceFakeQuantDequantPass', 'QuantWeightPass', + 'AddQuantDequantForInferencePass', ] _fake_quant_op_list = [ @@ -58,6 +59,7 @@ _fake_quant_dequant_op_list = [ 'fake_quantize_dequantize_moving_average_abs_max', "fake_channel_wise_quantize_dequantize_abs_max", + "fake_quantize_dequantize_abs_max", ] _conv_ops = ['conv2d', 'depthwise_conv2d', 'conv2d_transpose'] @@ -1437,7 +1439,7 @@ def __init__(self, self._place = _get_paddle_place(place) self._moving_rate = moving_rate self._is_test = is_test - self._teller_set = utils._out_scale_op_list + self._teller_set = utils.QUANT_SUPPORTED_OP_TYPE_LIST self._scale_dict = scale_dict def apply(self, graph): @@ -1559,7 +1561,7 @@ def __init__(self, scope=None): scope(fluid.Scope): The scope is used to initialize these new parameters. """ self._scope = scope - self._teller_set = utils._out_scale_op_list + self._teller_set = utils.QUANT_SUPPORTED_OP_TYPE_LIST def apply(self, graph): """ @@ -1844,6 +1846,7 @@ class InsertQuantizeLinear(object): channel_wise(bool, optional): Whether quantization with per channel or not. Default is False. moving_rate(float): the rate for 'moving average' method. is_test(bool, optional): Whether quantization with training or not. Default is True. + scale_dict(dict, optional): calibration ranges of tensors output. """ def __init__(self, @@ -1853,7 +1856,8 @@ def __init__(self, quant_axis=-1, channel_wise=False, moving_rate=0.9, - is_test=True): + is_test=True, + scale_dict=None): self._place = place self._scope = scope self.quant_bits = quant_bits @@ -1861,6 +1865,7 @@ def __init__(self, self.channel_wise = channel_wise self._is_test = is_test self._moving_rate = moving_rate + self._scale_dict = scale_dict def insert_quant_op(self, graph, var_node, var_name=None): assert var_node.is_var(), '{} is not a var'.format(var_node.name()) @@ -1872,16 +1877,24 @@ def insert_quant_op(self, graph, var_node, var_name=None): var_dtype=var_node.dtype()) data_type = 'float64' if var_node.dtype( ) == core.VarDesc.VarType.FP64 else 'float32' + scale_name = self._quantized_scale_name(var_name) if self.channel_wise: scale_var_shape = var_node.shape()[self.quant_axis] scale_var_type = core.VarDesc.VarType.LOD_TENSOR - init_scale_value = np.zeros(scale_var_shape, dtype=data_type) + init_scale_value = np.ones(scale_var_shape, + dtype=data_type) * _SCALE_DEFAULT_VALUE else: scale_var_shape = 1 scale_var_type = var_node.type() init_scale_value = np.array([_SCALE_DEFAULT_VALUE], dtype=data_type) + + if self._scale_dict is not None and var_node.name( + ) in self._scale_dict.keys(): + init_scale_value = np.array([self._scale_dict[var_node.name()]], + dtype=data_type) + scale_var_node = graph.create_persistable_node( - name=self._quantized_scale_name(var_name), + name=scale_name, var_type=scale_var_type, shape=[scale_var_shape], var_dtype=var_node.dtype()) @@ -2338,7 +2351,8 @@ def __init__(self, skip_pattern=["skip_quant"], quantizable_op_type=["elementwise_add", "pool2d"], is_full_quantized=False, - is_test=None): + is_test=None, + scale_dict=None): """ Args: scope(paddle.Scope): The scope is used to initialize these new 
parameters. @@ -2358,7 +2372,8 @@ def __init__(self, quantization to all supported quantizable op type. If set is_full_quantized as False, only apply quantization to the op type according to the input quantizable_op_type. - + scale_dict(dict, optional): calibration ranges of tensors output. + Examples: .. code-block:: python # The original graph will be rewrite. @@ -2380,6 +2395,7 @@ def __init__(self, self._quant_bits = quant_bits self._is_test = is_test self._skip_pattern = skip_pattern + self._scale_dict = scale_dict if is_full_quantized: self._quantizable_op_type = utils._act_supported_quantizable_op_type @@ -2436,8 +2452,6 @@ def apply(self, graph): if is_skip or is_quantized: continue - op_node.op()._set_attr("quantization_type", - "qat_without_weight") arg_names = utils._get_op_input_var_names(op_node) for arg_name in arg_names: in_node = graph._find_node_by_name( @@ -2454,7 +2468,8 @@ def apply(self, graph): quant_axis=-1, channel_wise=False, moving_rate=self._moving_rate, - is_test=self._is_test) + is_test=self._is_test, + scale_dict=self._scale_dict) quant_var_node, scale_var_node = insert_quant_pass.insert_quant_op( graph, in_node) dequant_var_node = insert_quant_pass.insert_dequant_op( @@ -2483,14 +2498,15 @@ class ReplaceFakeQuantDequantPass(object): replace quant-dequant ops with quantize_linear and dequantize_linear ops. """ - def __init__(self, scope, place): + def __init__(self, scope, place, quant_bits=8): r""" Args: scope(paddle.Scope): The scope is used to initialize these new parameters. place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to initialize new parameters described above. If ``place`` is string, it can be It can be ``cpu`` or ``gpu:x``, where ``x`` is the index of the GPUs. - + quant_bits(int, optional): quantization bit number for activation. Default is 8. + Examples: .. code-block:: python # The original graph will be rewrite. @@ -2508,6 +2524,7 @@ def __init__(self, scope, place): """ self._place = _get_paddle_place(place) self._scope = scope + self._quant_bits = quant_bits assert self._scope != None, "scope must not be None." assert self._place != None, "place must not be None." @@ -2517,7 +2534,8 @@ def apply(self, graph): fake_quant_dequant_ops = [] for op in graph.all_op_nodes(): - if op.name() in _fake_quant_dequant_op_list: + if op.name() in _fake_quant_dequant_op_list or op.name( + ) == "moving_average_abs_max_scale": fake_quant_dequant_ops.append(op) for _op in fake_quant_dequant_ops: @@ -2536,7 +2554,7 @@ def _replace_op(self, graph, op): quant_axis = op.op().attr("quant_axis") if op.op().has_attr( "quant_axis") else -1 bit_length = op.op().attr("bit_length") if op.op().has_attr( - "bit_length") else 8 + "bit_length") else self._quant_bits zero_point_node = None quanted_node = x_node @@ -2725,3 +2743,140 @@ def _load_var(self, name): def _restore_var(self, name, array): tensor = self._scope.find_var(name).get_tensor() tensor.set(array, self._place) + + +class AddQuantDequantForInferencePass(object): + """ + When export quant model, it will traverse to find the output of each op, and then insert the quant/dequant op after it. + """ + + def __init__(self, scope, place, quant_bits=8): + """ + Args: + scope(fluid.Scope): The scope is used to initialize these new parameters. + place(paddle.CPUPlace|paddle.CUDAPlace|str): place is used to restore the weight tensors. + If it's string, it can be ``cpu``, and ``gpu:x``, where ``x`` is the index of the GPUs. + quant_bits(int, optional): quantization bit number for weight. Default is 8. 
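Editor's note: AddQuantDequantForInferencePass, introduced here, assumes a persistable scale variable named "<var_name>@scale" already exists for every output it touches (for example, left behind by earlier calibration passes); it then appends a quantize_linear/dequantize_linear pair behind each supported op output. A hypothetical application, with infer_program, scope and place assumed to come from load_inference_model:

from paddle.fluid import core
from paddle.fluid.framework import IrGraph
from paddle.fluid.contrib.slim.quantization.quantization_pass import (
    AddQuantDequantForInferencePass)

graph = IrGraph(core.Graph(infer_program.desc), for_test=True)
export_pass = AddQuantDequantForInferencePass(scope, place, quant_bits=8)
export_pass.apply(graph)              # insert quant/dequant_linear pairs after op outputs
infer_program = graph.to_program()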
+ """ + self._scope = scope + self._place = place + self._quant_bits = quant_bits + self._teller_set = utils.QUANT_SUPPORTED_OP_TYPE_LIST + + def apply(self, graph): + """ + Args: + graph(IrGraph): the target graph. + """ + assert isinstance(graph, + IrGraph), 'graph must be the instance of IrGraph.' + dequant_node_map = {} + dequantized_vars_map = collections.OrderedDict() + for op_node in graph.all_op_nodes(): + if op_node.name() in self._teller_set: + var_names = utils._get_op_output_var_names(op_node) + for var_name in var_names: + out_node = graph._find_node_by_name(op_node.outputs, + var_name) + if out_node.dtype() not in \ + [core.VarDesc.VarType.FP64, core.VarDesc.VarType.FP32]: + continue + if var_name in dequantized_vars_map: + dequant_var_node = dequantized_vars_map[var_name] + else: + dequant_var_node = self._insert_quant_dequant_op( + graph, out_node) + dequantized_vars_map[var_name] = dequant_var_node + dequant_node_map[var_name] = dequant_var_node + + # remove unuse node and link act quant/dequant linear to op node + for op_node in graph.all_op_nodes(): + if op_node.name() == 'moving_average_abs_max_scale': + graph.safe_remove_nodes(op_node) + else: + var_names = utils._get_op_input_var_names(op_node) + for var_name in var_names: + if var_name in dequant_node_map: + in_node = graph._find_node_by_name( + op_node.inputs, var_name) + graph.update_input_link(in_node, + dequant_node_map[var_name], + op_node) + + return graph + + def _scale_name(self, var_name): + """ + Return the scale name for the var named `var_name`. + """ + return "%s@scale" % (var_name) + + def _insert_quant_dequant_op(self, graph, var_node): + assert var_node.is_var(), '{} is not a var'.format(var_node.name()) + var_name = var_node.name() + quant_axis = -1 + quant_var_node = graph.create_var_node( + name="{}.quantized".format(var_name), + var_type=var_node.type(), + shape=var_node.shape(), + var_dtype=var_node.dtype()) + scale_var_node = graph._find_node_by_name(graph.all_persistable_nodes(), + self._scale_name(var_name)) + try: + zero_point_node = graph._find_node_by_name( + graph.all_persistable_nodes(), + "{}@zero_point".format(quant_var_node.name())) + except: + zero_point_node = graph.create_persistable_node( + name="{}@zero_point".format(quant_var_node.name()), + var_type=core.VarDesc.VarType.LOD_TENSOR, + shape=scale_var_node.shape(), + var_dtype=core.VarDesc.VarType.INT32) + _init_var_node(zero_point_node, + np.zeros(scale_var_node.shape(), dtype="int32"), + self._scope, self._place) + + inputs = {"X": var_node, "Scale": scale_var_node} + if zero_point_node is not None: + inputs["ZeroPoint"] = zero_point_node + + attrs = {"quant_axis": quant_axis, "bit_length": self._quant_bits} + attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward + outputs = {"Y": quant_var_node} + + quant_op_node = graph.create_op_node(op_type="quantize_linear", + attrs=attrs, + inputs=inputs, + outputs=outputs) + + graph.link_to(var_node, quant_op_node) + graph.link_to(scale_var_node, quant_op_node) + if zero_point_node is not None: + graph.link_to(zero_point_node, quant_op_node) + graph.link_to(quant_op_node, quant_var_node) + + # add dequant_linear node + dequant_var_node = graph.create_var_node( + name="{}.dequantized".format(quant_var_node.name()), + var_type=quant_var_node.type(), + shape=quant_var_node.shape(), + var_dtype=quant_var_node.dtype()) + + inputs = {"X": quant_var_node, "Scale": scale_var_node} + if zero_point_node is not None: + inputs["ZeroPoint"] = zero_point_node + + attrs = {"quant_axis": -1, 
"bit_length": self._quant_bits} + attrs["op_role"] = core.op_proto_and_checker_maker.OpRole.Forward + + dequant_op_node = graph.create_op_node(op_type="dequantize_linear", + attrs=attrs, + inputs=inputs, + outputs={"Y": dequant_var_node}) + + graph.link_to(quant_var_node, dequant_op_node) + graph.link_to(scale_var_node, dequant_op_node) + if zero_point_node is not None: + graph.link_to(zero_point_node, dequant_op_node) + graph.link_to(dequant_op_node, dequant_var_node) + return dequant_var_node diff --git a/python/paddle/fluid/contrib/slim/quantization/utils.py b/python/paddle/fluid/contrib/slim/quantization/utils.py index c2c24348f5b76c..fe4446939a5546 100644 --- a/python/paddle/fluid/contrib/slim/quantization/utils.py +++ b/python/paddle/fluid/contrib/slim/quantization/utils.py @@ -38,6 +38,7 @@ "mean", "not_equal", "reshape", + "reshape2", "dropout", "bilinear_interp", "nearest_interp", @@ -111,10 +112,12 @@ "reduce_max", ] -_out_scale_op_list = list( +QUANT_SUPPORTED_OP_TYPE_LIST = list( set(_weight_supported_quantizable_op_type + _act_supported_quantizable_op_type)) +_out_scale_op_list = QUANT_SUPPORTED_OP_TYPE_LIST + _channelwise_quant_axis1_ops = [ 'conv2d_transpose', 'mul', 'matmul', 'matmul_v2' ] @@ -329,9 +332,11 @@ def _clip(x, scale): x[x < -scale] = -scale return x - assert quant_axis in [0, 1], 'quant_axis should be 0 or 1 for now.' bnt = (1 << (weight_bits - 1)) - 1 + if isinstance(scale, list) and len(scale) == 1: + scale = scale[0] if isinstance(scale, list): + assert quant_axis in [0, 1], 'quant_axis should be 0 or 1 for now.' for i, s in enumerate(scale): if s == 0.0: s = 1e-8 @@ -428,6 +433,28 @@ def calculate_quant_cos_error(orig_tensor, qdq_tensor): return cos_sim +def move_persistable_var_to_global_block(program): + # Move sub blocks persistable var to global block + global_block = program.global_block() + for _op in global_block.ops: + if _op.type == "while": + _block_id = _op.attr("sub_block").id + _block = program.block(_block_id) + persistables = [] + for _name, _var in _block.vars.items(): + if _var.persistable: + global_block._clone_variable(_var) + persistables.append(_name) + for _name in persistables: + _block._remove_var(_name) + persistables.extend(_op.input('X')) + _op.desc.set_input("X", persistables) + + +def l2_loss(gt, pred): + return ((gt - pred)**2).mean() + + class tqdm(object): def __init__(self, total, bar_format='Loading|{bar}', ncols=80): diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py index 2c18eff983e4c4..3299119ef99c41 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat.py @@ -56,7 +56,10 @@ def set_vars(self): self.activation_quantize_type = 'moving_average_abs_max' self.onnx_format = False self.check_export_model_accuracy = True - self.diff_threshold = 0.01 + # The original model and quantized model may have different prediction. + # There are 32 test data and we allow at most one is different. 
+ # Hence, the diff_threshold is 1 / 32 = 0.03125 + self.diff_threshold = 0.03125 self.fuse_conv_bn = False def func_qat(self): @@ -65,7 +68,8 @@ def func_qat(self): imperative_qat = ImperativeQuantAware( weight_quantize_type=self.weight_quantize_type, activation_quantize_type=self.activation_quantize_type, - fuse_conv_bn=self.fuse_conv_bn) + fuse_conv_bn=self.fuse_conv_bn, + onnx_format=self.onnx_format) with fluid.dygraph.guard(): # For CI coverage @@ -184,8 +188,7 @@ def func_qat(self): input_spec=[ paddle.static.InputSpec(shape=[None, 1, 28, 28], dtype='float32') - ], - onnx_format=self.onnx_format) + ]) print('Quantized model saved in %s' % tmpdir) if core.is_compiled_with_cuda(): @@ -207,7 +210,7 @@ def func_qat(self): quant_acc = fluid.layers.accuracy(quant_out, label).numpy() paddle.enable_static() delta_value = fp32_acc - quant_acc - self.assertLess(delta_value, self.diff_threshold) + self.assertLessEqual(delta_value, self.diff_threshold) def test_qat(self): with _test_eager_guard(): @@ -221,7 +224,7 @@ def set_vars(self): self.weight_quantize_type = 'abs_max' self.activation_quantize_type = 'moving_average_abs_max' self.onnx_format = True - self.diff_threshold = 0.025 + self.diff_threshold = 0.03125 self.fuse_conv_bn = False diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py index 3770ee486499df..f9fa636debc458 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_channelwise.py @@ -43,7 +43,7 @@ class TestImperativeQatChannelWise(TestImperativeQat): def set_vars(self): self.weight_quantize_type = 'channel_wise_abs_max' self.activation_quantize_type = 'moving_average_abs_max' - self.diff_threshold = 0.01 + self.diff_threshold = 0.03125 self.onnx_format = False self.fuse_conv_bn = False print('weight_quantize_type', self.weight_quantize_type) @@ -55,7 +55,7 @@ def set_vars(self): self.weight_quantize_type = 'channel_wise_abs_max' self.activation_quantize_type = 'moving_average_abs_max' self.onnx_format = True - self.diff_threshold = 0.025 + self.diff_threshold = 0.03125 self.fuse_conv_bn = False print('weight_quantize_type', self.weight_quantize_type) diff --git a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py index db7f15c4cef177..4c491598d2124f 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py +++ b/python/paddle/fluid/contrib/slim/tests/test_imperative_qat_fuse.py @@ -43,7 +43,7 @@ class TestImperativeQatfuseBN(TestImperativeQat): def set_vars(self): self.weight_quantize_type = 'abs_max' self.activation_quantize_type = 'moving_average_abs_max' - self.diff_threshold = 0.01 + self.diff_threshold = 0.03125 self.onnx_format = False self.fuse_conv_bn = True diff --git a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py index 00a4e2c2aa49e0..471798dec28c5c 100644 --- a/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py +++ b/python/paddle/fluid/contrib/slim/tests/test_post_training_quantization_mobilenetv1.py @@ -77,13 +77,14 @@ def process_image(sample, mode, color_jitter, rotate): return img, sample[1] -def _reader_creator(file_list, - mode, - shuffle=False, - color_jitter=False, - rotate=False, - 
data_dir=DATA_DIR): - +def _reader_creator( + file_list, + mode, + shuffle=False, + color_jitter=False, + rotate=False, + data_dir=DATA_DIR, +): def reader(): with open(file_list) as flist: full_lines = [line.strip() for line in flist] @@ -98,10 +99,9 @@ def reader(): continue yield img_path, int(label) - mapper = functools.partial(process_image, - mode=mode, - color_jitter=color_jitter, - rotate=rotate) + mapper = functools.partial( + process_image, mode=mode, color_jitter=color_jitter, rotate=rotate + ) return paddle.reader.xmap_readers(mapper, reader, THREAD, BUF_SIZE) @@ -112,11 +112,11 @@ def val(data_dir=DATA_DIR): class TestPostTrainingQuantization(unittest.TestCase): - def setUp(self): self.int8_download = 'int8/download' - self.cache_folder = os.path.expanduser('~/.cache/paddle/dataset/' + - self.int8_download) + self.cache_folder = os.path.expanduser( + '~/.cache/paddle/dataset/' + self.int8_download + ) self.data_cache_folder = '' data_urls = [] data_md5s = [] @@ -129,31 +129,34 @@ def setUp(self): 'https://paddle-inference-dist.bj.bcebos.com/int8/ILSVRC2012_img_val.tar.gz.partab' ) data_md5s.append('1e9f15f64e015e58d6f9ec3210ed18b5') - self.data_cache_folder = self.download_data(data_urls, data_md5s, - "full_data", False) + self.data_cache_folder = self.download_data( + data_urls, data_md5s, "full_data", False + ) else: data_urls.append( 'http://paddle-inference-dist.bj.bcebos.com/int8/calibration_test_data.tar.gz' ) data_md5s.append('1b6c1c434172cca1bf9ba1e4d7a3157d') - self.data_cache_folder = self.download_data(data_urls, data_md5s, - "small_data", False) + self.data_cache_folder = self.download_data( + data_urls, data_md5s, "small_data", False + ) # reader/decorator.py requires the relative path to the data folder if not os.path.exists("./data/ILSVRC2012"): - cmd = 'rm -rf {0} && ln -s {1} {0}'.format("data", - self.data_cache_folder) + cmd = 'rm -rf {0} && ln -s {1} {0}'.format( + "data", self.data_cache_folder + ) os.system(cmd) self.batch_size = 1 if os.environ.get('DATASET') == 'full' else 50 - self.sample_iterations = 50 if os.environ.get( - 'DATASET') == 'full' else 2 - self.infer_iterations = 50000 if os.environ.get( - 'DATASET') == 'full' else 2 + self.infer_iterations = ( + 50000 if os.environ.get('DATASET') == 'full' else 2 + ) self.root_path = tempfile.TemporaryDirectory() - self.int8_model = os.path.join(self.root_path.name, - "post_training_quantization") + self.int8_model = os.path.join( + self.root_path.name, "post_training_quantization" + ) def tearDown(self): self.root_path.cleanup() @@ -161,7 +164,8 @@ def tearDown(self): def cache_unzipping(self, target_folder, zip_path): if not os.path.exists(target_folder): cmd = 'mkdir {0} && tar xf {1} -C {0}'.format( - target_folder, zip_path) + target_folder, zip_path + ) os.system(cmd) def download_data(self, data_urls, data_md5s, folder_name, is_model=True): @@ -173,13 +177,15 @@ def download_data(self, data_urls, data_md5s, folder_name, is_model=True): download(data_urls[i], self.int8_download, data_md5s[i]) file_names.append(data_urls[i].split('/')[-1]) - zip_path = os.path.join(self.cache_folder, - 'full_imagenet_val.tar.gz') + zip_path = os.path.join( + self.cache_folder, 'full_imagenet_val.tar.gz' + ) if not os.path.exists(zip_path): cat_command = 'cat' for file_name in file_names: - cat_command += ' ' + os.path.join(self.cache_folder, - file_name) + cat_command += ' ' + os.path.join( + self.cache_folder, file_name + ) cat_command += ' > ' + zip_path os.system(cat_command) @@ -199,8 +205,16 @@ def 
run_program(self, model_path, batch_size, infer_iterations): image_shape = [3, 224, 224] place = fluid.CPUPlace() exe = fluid.Executor(place) - [infer_program, feed_dict, fetch_targets] = \ - fluid.io.load_inference_model(model_path, exe) + [ + infer_program, + feed_dict, + fetch_targets, + ] = fluid.io.load_inference_model( + model_path, + exe, + model_filename="inference.pdmodel", + params_filename="inference.pdiparams", + ) val_reader = paddle.batch(val(), batch_size) iterations = infer_iterations @@ -208,23 +222,28 @@ def run_program(self, model_path, batch_size, infer_iterations): cnt = 0 periods = [] for batch_id, data in enumerate(val_reader()): - image = np.array([x[0].reshape(image_shape) - for x in data]).astype("float32") + image = np.array([x[0].reshape(image_shape) for x in data]).astype( + "float32" + ) label = np.array([x[1] for x in data]).astype("int64") label = label.reshape([-1, 1]) t1 = time.time() - _, acc1, _ = exe.run(infer_program, - feed={ - feed_dict[0]: image, - feed_dict[1]: label - }, - fetch_list=fetch_targets) + pred = exe.run( + infer_program, + feed={feed_dict[0]: image}, + fetch_list=fetch_targets, + ) t2 = time.time() period = t2 - t1 periods.append(period) - test_info.append(np.mean(acc1) * len(data)) + pred = np.array(pred[0]) + sort_array = pred.argsort(axis=1) + top_1_pred = sort_array[:, -1:][:, ::-1] + top_1 = np.mean(label == top_1_pred) + + test_info.append(np.mean(top_1) * len(data)) cnt += len(data) if (batch_id + 1) % 100 == 0: @@ -238,22 +257,25 @@ def run_program(self, model_path, batch_size, infer_iterations): acc1 = np.sum(test_info) / cnt return (throughput, latency, acc1) - def generate_quantized_model(self, - model_path, - quantizable_op_type, - batch_size, - algo="KL", - round_type="round", - is_full_quantize=False, - is_use_cache_file=False, - is_optimize_model=False, - batch_nums=10, - onnx_format=False): + def generate_quantized_model( + self, + model_path, + quantizable_op_type, + batch_size, + algo="KL", + round_type="round", + is_full_quantize=False, + is_use_cache_file=False, + is_optimize_model=False, + batch_nums=10, + onnx_format=False, + ): try: os.system("mkdir " + self.int8_model) except Exception as e: - print("Failed to create {} due to {}".format( - self.int8_model, str(e))) + print( + "Failed to create {} due to {}".format(self.int8_model, str(e)) + ) sys.exit(-1) place = fluid.CPUPlace() @@ -261,70 +283,98 @@ def generate_quantized_model(self, scope = fluid.global_scope() val_reader = val() - ptq = PostTrainingQuantization(executor=exe, - sample_generator=val_reader, - model_dir=model_path, - batch_size=batch_size, - batch_nums=batch_nums, - algo=algo, - quantizable_op_type=quantizable_op_type, - round_type=round_type, - is_full_quantize=is_full_quantize, - optimize_model=is_optimize_model, - onnx_format=onnx_format, - is_use_cache_file=is_use_cache_file) + ptq = PostTrainingQuantization( + executor=exe, + sample_generator=val_reader, + model_dir=model_path, + model_filename="inference.pdmodel", + params_filename="inference.pdiparams", + batch_size=batch_size, + batch_nums=batch_nums, + algo=algo, + quantizable_op_type=quantizable_op_type, + round_type=round_type, + is_full_quantize=is_full_quantize, + optimize_model=is_optimize_model, + onnx_format=onnx_format, + is_use_cache_file=is_use_cache_file, + ) ptq.quantize() - ptq.save_quantized_model(self.int8_model) - - def run_test(self, - model, - algo, - round_type, - data_urls, - data_md5s, - quantizable_op_type, - is_full_quantize, - is_use_cache_file, - 
is_optimize_model, - diff_threshold, - onnx_format=False, - batch_nums=10): + ptq.save_quantized_model( + self.int8_model, + model_filename="inference.pdmodel", + params_filename="inference.pdiparams", + ) + + def run_test( + self, + model, + algo, + round_type, + data_urls, + data_md5s, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + onnx_format=False, + batch_nums=10, + ): infer_iterations = self.infer_iterations batch_size = self.batch_size - sample_iterations = self.sample_iterations model_cache_folder = self.download_data(data_urls, data_md5s, model) - print("Start FP32 inference for {0} on {1} images ...".format( - model, infer_iterations * batch_size)) + print( + "Start FP32 inference for {0} on {1} images ...".format( + model, infer_iterations * batch_size + ) + ) (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program( - os.path.join(model_cache_folder, "model"), batch_size, - infer_iterations) - - print("Start INT8 post training quantization for {0} on {1} images ...". - format(model, sample_iterations * batch_size)) - self.generate_quantized_model(os.path.join(model_cache_folder, "model"), - quantizable_op_type, batch_size, - sample_iterations, algo, round_type, - is_full_quantize, is_use_cache_file, - is_optimize_model, batch_nums, - onnx_format) - - print("Start INT8 inference for {0} on {1} images ...".format( - model, infer_iterations * batch_size)) - (int8_throughput, int8_latency, - int8_acc1) = self.run_program(self.int8_model, batch_size, - infer_iterations) + os.path.join(model_cache_folder, "MobileNetV1_infer"), + batch_size, + infer_iterations, + ) + + print( + "Start INT8 post training quantization for {0} on {1} images ...".format( + model, batch_nums * batch_size + ) + ) + self.generate_quantized_model( + os.path.join(model_cache_folder, "MobileNetV1_infer"), + quantizable_op_type, + batch_size, + algo, + round_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + batch_nums, + onnx_format, + ) + + print( + "Start INT8 inference for {0} on {1} images ...".format( + model, infer_iterations * batch_size + ) + ) + (int8_throughput, int8_latency, int8_acc1) = self.run_program( + self.int8_model, batch_size, infer_iterations + ) print("---Post training quantization of {} method---".format(algo)) print( - "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}." 
- .format(model, batch_size, fp32_throughput, fp32_latency, - fp32_acc1)) + "FP32 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}.".format( + model, batch_size, fp32_throughput, fp32_latency, fp32_acc1 + ) + ) print( - "INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}.\n" - .format(model, batch_size, int8_throughput, int8_latency, - int8_acc1)) + "INT8 {0}: batch_size {1}, throughput {2} images/second, latency {3} second, accuracy {4}.\n".format( + model, batch_size, int8_throughput, int8_latency, int8_acc1 + ) + ) sys.stdout.flush() delta_value = fp32_acc1 - int8_acc1 @@ -332,15 +382,14 @@ def run_test(self, class TestPostTrainingKLForMobilenetv1(TestPostTrainingQuantization): - def test_post_training_kl_mobilenetv1(self): model = "MobileNet-V1" algo = "KL" round_type = "round" data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/MobileNetV1_infer.tar' ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] + data_md5s = ['5ee2b1775b11dc233079236cdc216c2e'] quantizable_op_type = [ "conv2d", "depthwise_conv2d", @@ -351,21 +400,30 @@ def test_post_training_kl_mobilenetv1(self): is_use_cache_file = False is_optimize_model = True diff_threshold = 0.025 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) + batch_nums = 3 + self.run_test( + model, + algo, + round_type, + data_urls, + data_md5s, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + ) class TestPostTrainingavgForMobilenetv1(TestPostTrainingQuantization): - def test_post_training_avg_mobilenetv1(self): model = "MobileNet-V1" algo = "avg" round_type = "round" data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/MobileNetV1_infer.tar' ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] + data_md5s = ['5ee2b1775b11dc233079236cdc216c2e'] quantizable_op_type = [ "conv2d", "depthwise_conv2d", @@ -375,21 +433,29 @@ def test_post_training_avg_mobilenetv1(self): is_use_cache_file = False is_optimize_model = True diff_threshold = 0.025 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) + self.run_test( + model, + algo, + round_type, + data_urls, + data_md5s, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + ) class TestPostTraininghistForMobilenetv1(TestPostTrainingQuantization): - def test_post_training_hist_mobilenetv1(self): model = "MobileNet-V1" algo = "hist" round_type = "round" data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/MobileNetV1_infer.tar' ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] + data_md5s = ['5ee2b1775b11dc233079236cdc216c2e'] quantizable_op_type = [ "conv2d", "depthwise_conv2d", @@ -400,29 +466,30 @@ def test_post_training_hist_mobilenetv1(self): is_optimize_model = True diff_threshold = 0.03 batch_nums = 3 - self.run_test(model, - algo, - round_type, - data_urls, - data_md5s, - quantizable_op_type, - is_full_quantize, - is_use_cache_file, - is_optimize_model, - diff_threshold, - 
batch_nums=batch_nums) + self.run_test( + model, + algo, + round_type, + data_urls, + data_md5s, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + batch_nums=batch_nums, + ) class TestPostTrainingAbsMaxForMobilenetv1(TestPostTrainingQuantization): - def test_post_training_abs_max_mobilenetv1(self): model = "MobileNet-V1" algo = "abs_max" round_type = "round" data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/MobileNetV1_infer.tar' ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] + data_md5s = ['5ee2b1775b11dc233079236cdc216c2e'] quantizable_op_type = [ "conv2d", "mul", @@ -432,21 +499,29 @@ def test_post_training_abs_max_mobilenetv1(self): is_optimize_model = False # The accuracy diff of post-training quantization (abs_max) maybe bigger diff_threshold = 0.05 - self.run_test(model, algo, round_type, data_urls, data_md5s, - quantizable_op_type, is_full_quantize, is_use_cache_file, - is_optimize_model, diff_threshold) + self.run_test( + model, + algo, + round_type, + data_urls, + data_md5s, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + ) class TestPostTrainingAvgONNXFormatForMobilenetv1(TestPostTrainingQuantization): - def test_post_training_onnx_format_mobilenetv1(self): model = "MobileNet-V1" algo = "emd" round_type = "round" data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' + 'https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/inference/MobileNetV1_infer.tar' ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] + data_md5s = ['5ee2b1775b11dc233079236cdc216c2e'] quantizable_op_type = [ "conv2d", "depthwise_conv2d", @@ -458,18 +533,20 @@ def test_post_training_onnx_format_mobilenetv1(self): onnx_format = True diff_threshold = 0.05 batch_nums = 3 - self.run_test(model, - algo, - round_type, - data_urls, - data_md5s, - quantizable_op_type, - is_full_quantize, - is_use_cache_file, - is_optimize_model, - diff_threshold, - onnx_format=onnx_format, - batch_nums=batch_nums) + self.run_test( + model, + algo, + round_type, + data_urls, + data_md5s, + quantizable_op_type, + is_full_quantize, + is_use_cache_file, + is_optimize_model, + diff_threshold, + onnx_format=onnx_format, + batch_nums=batch_nums, + ) if __name__ == '__main__': diff --git a/python/paddle/fluid/contrib/sparsity/asp.py b/python/paddle/fluid/contrib/sparsity/asp.py index 40c96e0ce3403d..692591d770f5dc 100644 --- a/python/paddle/fluid/contrib/sparsity/asp.py +++ b/python/paddle/fluid/contrib/sparsity/asp.py @@ -316,7 +316,7 @@ def prune_model(model, n=2, m=4, mask_algo='mask_1d', with_mask=True): m (int, optional): m of `n:m` sparse pattern. Default is 4. mask_algo (string, optional): The function name to generate spase mask. Default is `mask_1d`. The vaild inputs should be one of 'mask_1d', 'mask_2d_greedy' and 'mask_2d_best'. - with_mask (bool, optional): To prune mask Variables related to parameters or not. Ture is purning also, False is not. Defalut is True. + with_mask (bool, optional): To prune mask Variables related to parameters or not. Ture is purning also, False is not. Default is True. Returns: dictionary: A dictionary with key: `parameter name` (string) and value: its corresponding mask Variable. 
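Editor's note: the MobileNetV1 test changes above switch to the dygraph-exported inference model (inference.pdmodel / inference.pdiparams) and compute top-1 accuracy from raw predictions. A condensed sketch of the post-training quantization call they exercise, with the calibration reader and paths as placeholders:

import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib.slim.quantization import PostTrainingQuantization

paddle.enable_static()
exe = fluid.Executor(fluid.CPUPlace())

ptq = PostTrainingQuantization(
    executor=exe,
    sample_generator=val_reader,            # hypothetical calibration reader
    model_dir="./MobileNetV1_infer",        # hypothetical exported model dir
    model_filename="inference.pdmodel",
    params_filename="inference.pdiparams",
    batch_size=50,
    batch_nums=3,
    algo="emd",
    quantizable_op_type=["conv2d", "depthwise_conv2d", "mul"],
    onnx_format=True)
ptq.quantize()
ptq.save_quantized_model("./quantized_model",
                         model_filename="inference.pdmodel",
                         params_filename="inference.pdiparams")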
Examples: diff --git a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py index 8cd422d0d7691d..b0b64f27eccc1e 100644 --- a/python/paddle/fluid/contrib/sparsity/supported_layer_list.py +++ b/python/paddle/fluid/contrib/sparsity/supported_layer_list.py @@ -23,9 +23,9 @@ __all__ = ['add_supported_layer'] -_logger = get_logger(__name__, - logging.INFO, - fmt='%(asctime)s-%(levelname)s: %(message)s') +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' +) def _default_pruning(weight_nparray, m, n, func_name, param_name): @@ -38,13 +38,17 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name): exlude_cond_shape4 = len(shape) == 4 and shape[1] < m if exlude_cond_shape2: _logger.warning( - '{} is not pruned because the first dimension of {} is smaller than {}' - .format(param_name, shape, m)) + '{} is not pruned because the first dimension of {} is smaller than {}'.format( + param_name, shape, m + ) + ) return weight_pruned_nparray, weight_sparse_mask if exlude_cond_shape4: _logger.warning( - '{} is not pruned because the second dimension of {} is smaller than {}' - .format(param_name, shape, m)) + '{} is not pruned because the second dimension of {} is smaller than {}'.format( + param_name, shape, m + ) + ) return weight_pruned_nparray, weight_sparse_mask checked_func_name = sparsity.CheckMethod.get_checking_method(func_name) @@ -60,13 +64,13 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name): # sparsity/utils is row-major pruning. That is the reason we have to transpose weight # matrices beforce invoking create_mask. Then we transpose the result mask to make # sure its shape to be the same as the input weight. - weight_sparse_mask = sparsity.create_mask(weight_nparray.T, - func_name=func_name, - n=n, - m=m).T + weight_sparse_mask = sparsity.create_mask( + weight_nparray.T, func_name=func_name, n=n, m=m + ).T weight_pruned_nparray = np.multiply(weight_nparray, weight_sparse_mask) - assert sparsity.check_sparsity(weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name), \ - 'Pruning {} weight matrix failure!!!'.format(param_name) + assert sparsity.check_sparsity( + weight_pruned_nparray.T, n=n, m=m, func_name=checked_func_name + ), 'Pruning {} weight matrix failure!!!'.format(param_name) return weight_pruned_nparray, weight_sparse_mask @@ -78,28 +82,35 @@ def _default_pruning(weight_nparray, m, n, func_name, param_name): def add_supported_layer(layer, pruning_func=None): r""" + Add supported layers and its corresponding pruning function. Args: - name (string|Layer): The name or type of layer, needed to support. If layer is `Layer` then - it would be turn to string internally. ASP would use this name to match parameter's name and call - its the corresponding pruning function. + name (string|Layer): The name or type of layer, needed to support. If layer is `Layer` then + it would be turn to string internally. ASP would use this name to match parameter's name and call + its the corresponding pruning function. pruning_func (function, optional): a function type which receives five argument (weight_nparray, - m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight, - m, n, and func_name, please see `prune_model` for details. + m, n, func_name, param_name), weight_nparray is a nparray of weight, param_name is the name of weight, + m, n, and func_name, please see `prune_model` for details. 
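Editor's note: add_supported_layer, whose docstring is cleaned up above, registers a pruning function for an extra layer type. The sketch below shows the expected signature with a toy function that zeroes the n smallest-magnitude entries in every group of m along the flattened weight; it ignores the transpose handling done by _default_pruning, assumes the weight size is divisible by m, and the package-level import path is assumed.

import numpy as np
import paddle
from paddle.fluid.contrib import sparsity

def my_pruning(weight_nparray, m, n, func_name, param_name):
    # Toy 1D n:m pruning: zero the n smallest |values| in each group of m.
    flat = weight_nparray.reshape(-1, m).copy()
    order = np.argsort(np.absolute(flat), axis=1)
    for row, idx in zip(flat, order):
        row[idx[:n]] = 0.0
    pruned = flat.reshape(weight_nparray.shape)
    mask = (pruned != 0).astype(weight_nparray.dtype)
    return pruned, mask

# Register the function for Conv1D layers (a hypothetical choice); ASP will call it
# with (weight, m, n, func_name, param_name) when prune_model visits such a layer.
sparsity.add_supported_layer(paddle.nn.Conv1D, pruning_func=my_pruning)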
+ """ name = None if isinstance(layer, str): name = layer elif isinstance(layer, paddle.fluid.dygraph.layers.Layer): name = paddle.fluid.dygraph.layers._convert_camel_to_snake( - type(layer).__name__) + type(layer).__name__ + ) elif issubclass(layer, paddle.fluid.dygraph.layers.Layer): name = paddle.fluid.dygraph.layers._convert_camel_to_snake( - layer.__name__) + layer.__name__ + ) else: - assert "The type of layer should be string of Layer, but got {}!".format( - type(layer)) + assert ( + "The type of layer should be string of Layer, but got {}!".format( + type(layer) + ) + ) if pruning_func is None: pruning_func = _default_pruning _supported_layers_and_prune_func_map_lock.acquire() diff --git a/python/paddle/fluid/contrib/sparsity/utils.py b/python/paddle/fluid/contrib/sparsity/utils.py index 1d0694c4dde3cb..c6d706bd31e8ef 100644 --- a/python/paddle/fluid/contrib/sparsity/utils.py +++ b/python/paddle/fluid/contrib/sparsity/utils.py @@ -27,9 +27,16 @@ import threading __all__ = [ - 'calculate_density', 'check_mask_1d', 'get_mask_1d', 'check_mask_2d', - 'get_mask_2d_greedy', 'get_mask_2d_best', 'create_mask', 'check_sparsity', - 'MaskAlgo', 'CheckMethod' + 'calculate_density', + 'check_mask_1d', + 'get_mask_1d', + 'check_mask_2d', + 'get_mask_2d_greedy', + 'get_mask_2d_best', + 'create_mask', + 'check_sparsity', + 'MaskAlgo', + 'CheckMethod', ] @@ -76,8 +83,9 @@ def get_checking_method(mask_algo): CheckMethod.get_checking_method(MaskAlgo.MASK_2D_BEST) # CheckMethod.CHECK_2D """ - assert isinstance(mask_algo, MaskAlgo), \ - "mask_algo should be MaskAlgo type" + assert isinstance( + mask_algo, MaskAlgo + ), "mask_algo should be MaskAlgo type" if mask_algo == MaskAlgo.MASK_1D: return CheckMethod.CHECK_1D else: @@ -86,20 +94,25 @@ def get_checking_method(mask_algo): def calculate_density(x): r""" + Return the density of the input tensor. Args: x (nparray): The input tensor. + Returns: - float: The density of :attr:`x`. + float, The density of :attr:`x`. + Examples: .. code-block:: python - import paddle - import numpy as np - x = np.array([[0, 1, 3, 0], + import paddle + import numpy as np + + x = np.array([[0, 1, 3, 0], [1, 1, 0, 1]]) - paddle.incubate.asp.calculate_density(x) # 0.625 + paddle.incubate.asp.calculate_density(x) # 0.625 + """ x_flattened = x.flatten() return float(np.nonzero(x_flattened)[0].size) / x_flattened.size @@ -108,7 +121,7 @@ def calculate_density(x): def _reshape_1d(mat, m): r""" Reshape the input 2D matrix to shape (-1, m). - If the second dimension of :attr:`mat` is not a multiples of :attr:`m`, + If the second dimension of :attr:`mat` is not a multiples of :attr:`m`, then this function would pad the remainder with 0 before reshaping. .. math:: @@ -126,7 +139,7 @@ def _reshape_1d(mat, m): remainder = mat.shape[1] % m if mat.shape[1] % m > 0: mat_padded = np.zeros((mat.shape[0], mat.shape[1] + (m - remainder))) - mat_padded[:, :mat.shape[1]] = mat + mat_padded[:, : mat.shape[1]] = mat shape = mat_padded.shape return mat_padded.reshape(-1, m), shape else: @@ -136,7 +149,7 @@ def _reshape_1d(mat, m): def check_mask_1d(mat, n, m): r""" Check if every row of the input matrix :attr:`mat` is in 1D `n:m` sparse pattern. - This function would pad the second dimension of :attr:`mat` by zero + This function would pad the second dimension of :attr:`mat` by zero to be a multiples of :attr:`m` if necessary. 1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block. 
@@ -179,8 +192,8 @@ def check_mask_1d(mat, n, m): def get_mask_1d(mat, n, m): r""" - Generate 1D `n:m` sparse pattern mask of the input matrix :attr:`mat` - in row-directory. This function would pad the second dimension of :attr:`mat` + Generate 1D `n:m` sparse pattern mask of the input matrix :attr:`mat` + in row-directory. This function would pad the second dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation. 1D `n:m` sparse pattern: At least :attr:`n` zeros in every :math:`1 \times m` block. @@ -213,7 +226,7 @@ def get_mask_1d(mat, n, m): min_order_indices = np.argsort(np.absolute(sub_mat)) mask_flattern[i, min_order_indices[:n].tolist()] = 0 mask_flattern = mask_flattern.reshape(shape) - mask[:, :] = mask_flattern[:, :mat.shape[1]] + mask[:, :] = mask_flattern[:, : mat.shape[1]] return mask @@ -239,12 +252,12 @@ def _reshape_2d(mat, m): remainder_0 = mat.shape[0] % m remainder_1 = mat.shape[1] % m - new_shape = (mat.shape[0] if remainder_0 == 0 \ - else mat.shape[0] + (m - remainder_0), - mat.shape[1] if remainder_1 == 0 \ - else mat.shape[1] + (m - remainder_1)) + new_shape = ( + mat.shape[0] if remainder_0 == 0 else mat.shape[0] + (m - remainder_0), + mat.shape[1] if remainder_1 == 0 else mat.shape[1] + (m - remainder_1), + ) mat_padded = np.zeros(new_shape) - mat_padded[:mat.shape[0], :mat.shape[1]] = mat + mat_padded[: mat.shape[0], : mat.shape[1]] = mat mat_flattern = np.empty(new_shape).reshape(-1, m * m) curr_idx = 0 @@ -252,9 +265,9 @@ def _reshape_2d(mat, m): row_end = row_start + m for col_start in range(0, mat_padded.shape[1], m): col_end = col_start + m - sub_mat = np.squeeze(mat_padded[row_start:row_end, \ - col_start:col_end] \ - .reshape(-1)) + sub_mat = np.squeeze( + mat_padded[row_start:row_end, col_start:col_end].reshape(-1) + ) mat_flattern[curr_idx] = sub_mat curr_idx += 1 return mat_flattern, mat_padded.shape @@ -263,10 +276,10 @@ def _reshape_2d(mat, m): def check_mask_2d(mat, n, m): r""" Check if every :math:`m \times m` block of the input matrix :attr:`mat` is in 2D `n:m` sparse pattern. - This function would pad each dimension of :attr:`mat` by zero to be a multiples of + This function would pad each dimension of :attr:`mat` by zero to be a multiples of :attr:`m` if necessary. - 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block under the constraint of at least :attr:`n` zeros for each row and column. Args: @@ -304,18 +317,19 @@ def check_mask_2d(mat, n, m): mat_padded, shape = _reshape_2d(mat, m) for sub_mat in mat_padded: sub_mask = np.absolute(np.squeeze(sub_mat.reshape(m, m))) > 0 - if (np.sum(np.sum(sub_mask, axis=1) > (m-n)) != 0) and \ - (np.sum(np.sum(sub_mask, axis=0) > (m-n)) != 0): + if (np.sum(np.sum(sub_mask, axis=1) > (m - n)) != 0) and ( + np.sum(np.sum(sub_mask, axis=0) > (m - n)) != 0 + ): return False return True def get_mask_2d_greedy(mat, n, m): r""" - Greedily generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`. + Greedily generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat`. This function would pad each dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation. 
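For the 2D variant documented above, a hedged standalone check of the stated constraint (every m x m block keeps at most m - n nonzeros in each row and in each column) could look like the following. `blockwise_2d_ok` is our own name, the sketch assumes both dimensions are already multiples of m (the real utilities zero-pad via `_reshape_2d`), and it follows the docstring's wording rather than reproducing `check_mask_2d` exactly.

import numpy as np

def blockwise_2d_ok(mat, n=2, m=4):
    # Assumes mat.shape is a multiple of m in both dimensions.
    for r in range(0, mat.shape[0], m):
        for c in range(0, mat.shape[1], m):
            block = np.absolute(mat[r:r + m, c:c + m]) > 0
            # at most m - n nonzeros allowed per row and per column
            if (block.sum(axis=1) > m - n).any() or (block.sum(axis=0) > m - n).any():
                return False
    return True

print(blockwise_2d_ok(np.ones((4, 4))))   # False: no zeros at all
print(blockwise_2d_ok(np.zeros((4, 4))))  # True: trivially satisfies 2:4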
- 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block under the constraint of at least :attr:`n` zeros for each row and column. Greedily generating: For each :math:`m \times m` block, selecting values to keep in descent order. @@ -350,15 +364,17 @@ def get_mask_2d_greedy(mat, n, m): sub_mask = np.squeeze(mask_padded[idx]) min_order_1d_indices = np.argsort(sub_mat) - min_order_2d_indices = [(int(x / m), x % m) - for x in min_order_1d_indices] + min_order_2d_indices = [ + (int(x / m), x % m) for x in min_order_1d_indices + ] row_counter = collections.Counter() col_counter = collections.Counter() for i in range(len(min_order_1d_indices) - 1, -1, -1): matrix_entry = min_order_2d_indices[i] - if (row_counter[matrix_entry[0]] == n) or \ - (col_counter[matrix_entry[1]] == n): + if (row_counter[matrix_entry[0]] == n) or ( + col_counter[matrix_entry[1]] == n + ): continue sub_mask[matrix_entry[0], matrix_entry[1]] = 1.0 @@ -373,7 +389,7 @@ def get_mask_2d_greedy(mat, n, m): col_end = col_start + m mask[row_start:row_end, col_start:col_end] = mask_padded[curr_idx] curr_idx += 1 - return mask[:mat.shape[0], :mat.shape[1]] + return mask[: mat.shape[0], : mat.shape[1]] _valid_2d_patterns_lock = threading.Lock() @@ -384,7 +400,7 @@ def _compute_valid_2d_patterns(n, m): r""" Compute all vaild 2D `n:m` sparse patterns. - 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block under the constraint of at least :attr:`n` zeros for each row and column. Args: @@ -406,8 +422,11 @@ def _compute_valid_2d_patterns(n, m): patterns = patterns + patterns patterns = np.asarray(list(set(permutations(patterns, m)))) - valid = ((patterns.sum(axis=1) <= n).sum( - axis=1) == m).nonzero()[0].reshape(-1) + valid = ( + ((patterns.sum(axis=1) <= n).sum(axis=1) == m) + .nonzero()[0] + .reshape(-1) + ) valid_patterns = np.empty((valid.shape[0], m, m)) valid_patterns[:] = patterns[valid[:]] @@ -420,11 +439,11 @@ def _compute_valid_2d_patterns(n, m): def get_mask_2d_best(mat, n, m): r""" - Generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat` - to form sparse matrix with maximun L1 norm .This function would pad each + Generate 2D `n:m` sparse pattern mask of the input matrix :attr:`mat` + to form sparse matrix with maximun L1 norm .This function would pad each dimension of :attr:`mat` by zero to be a multiples of :attr:`m` before mask generation. - 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block + 2D `n:m` sparse pattern: At least :math:`n \times n` zeros in every :math:`m \times m` block under the constraint of at least :attr:`n` zeros for each row and column. *Note*: L1 norm of sparse matrix from `Best` API is greater than or equal to the one from `Greedy`. 
@@ -454,9 +473,10 @@ def get_mask_2d_best(mat, n, m): mat_flattern, shape = _reshape_2d(mat, m) mask_flattern = np.ones_like(mat_flattern).reshape(-1, m, m) - pmax = np.argmax(np.matmul(mat_flattern, - patterns.reshape(patterns.shape[0], m * m).T), - axis=1) + pmax = np.argmax( + np.matmul(mat_flattern, patterns.reshape(patterns.shape[0], m * m).T), + axis=1, + ) mask_flattern[:] = patterns[pmax[:]] mask = np.empty(shape) @@ -468,7 +488,7 @@ def get_mask_2d_best(mat, n, m): col_end = col_start + m mask[row_start:row_end, col_start:col_end] = mask_flattern[curr_idx] curr_idx += 1 - return mask[:mat.shape[0], :mat.shape[1]] + return mask[: mat.shape[0], : mat.shape[1]] def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): @@ -508,9 +528,10 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): dtype = tensor.dtype t = tensor.astype(float) - assert isinstance(func_name, MaskAlgo), \ - "func_name argumet of create_mask is only accepted as type MaskAlgo. " \ - "But got {}".format(type(func_name)) + assert isinstance(func_name, MaskAlgo), ( + "func_name argumet of create_mask is only accepted as type MaskAlgo. " + "But got {}".format(type(func_name)) + ) func = getattr(sys.modules[__name__], func_name.value, None) if len(shape) == 1: t = t.reshape(1, shape[0]) @@ -520,14 +541,20 @@ def create_mask(tensor, func_name=MaskAlgo.MASK_1D, n=2, m=4): t = t.reshape(shape[0] * shape[1], shape[2]) # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op elif len(shape) == 4: - t = t.transpose([0, 1, 3, 2]).reshape(shape[0] * shape[1] * shape[3], - shape[2]) + t = t.transpose([0, 1, 3, 2]).reshape( + shape[0] * shape[1] * shape[3], shape[2] + ) mask = func(t, n=n, m=m) - return mask.reshape([shape[0], shape[1], shape[3], - shape[2]]).transpose([0, 1, 3, 2]).astype(dtype) + return ( + mask.reshape([shape[0], shape[1], shape[3], shape[2]]) + .transpose([0, 1, 3, 2]) + .astype(dtype) + ) else: - raise ValueError("The dimension of input tensor is not supported in create_mask, " \ - "Only dimension < 4 is supported but got {}".format(len(shape))) + raise ValueError( + "The dimension of input tensor is not supported in create_mask, " + "Only dimension < 4 is supported but got {}".format(len(shape)) + ) mask = func(t, n=n, m=m) return mask.reshape(shape).astype(dtype) @@ -566,9 +593,10 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): shape = tensor.shape t = tensor.astype(float) - assert type(func_name) == CheckMethod, \ - "func_name argumet of check_sparsity is only accepted as type CheckMethod. " \ - "But got {}".format(type(func_name)) + assert type(func_name) == CheckMethod, ( + "func_name argumet of check_sparsity is only accepted as type CheckMethod. 
" + "But got {}".format(type(func_name)) + ) func = getattr(sys.modules[__name__], func_name.value, None) if len(shape) == 1: t = t.reshape(1, shape[0]) @@ -578,10 +606,13 @@ def check_sparsity(tensor, func_name=CheckMethod.CHECK_1D, n=2, m=4): t = t.reshape(shape[0] * shape[1], shape[2]) # 4d-tensor conv (h, w, in, out) -> (h*w*out, in) in GemmConvKernel Op elif len(shape) == 4: - t = t.transpose([0, 1, 3, - 2]).reshape([shape[0] * shape[1] * shape[3], shape[2]]) + t = t.transpose([0, 1, 3, 2]).reshape( + [shape[0] * shape[1] * shape[3], shape[2]] + ) else: - raise ValueError("The dimension of input tensor is not supported in create_mask, " \ - "Only dimension < 4 is supported but got {}".format(len(shape))) + raise ValueError( + "The dimension of input tensor is not supported in create_mask, " + "Only dimension < 4 is supported but got {}".format(len(shape)) + ) return func(t, n=n, m=m) diff --git a/python/paddle/fluid/core.py b/python/paddle/fluid/core.py index 1fa3c769d77fb3..6c642dba67a695 100644 --- a/python/paddle/fluid/core.py +++ b/python/paddle/fluid/core.py @@ -20,27 +20,24 @@ import warnings import platform -core_suffix = 'so' -if os.name == 'nt': - core_suffix = 'pyd' +has_paddle_dy_lib = False -has_avx_core = False -has_noavx_core = False +dy_lib_name = 'libpaddle' +dy_lib_suffix = 'so' +if os.name == 'nt': + dy_lib_suffix = 'pyd' current_path = os.path.abspath(os.path.dirname(__file__)) -if os.path.exists(current_path + os.sep + 'core_avx.' + core_suffix): - has_avx_core = True - -if os.path.exists(current_path + os.sep + 'core_noavx.' + core_suffix): - has_noavx_core = True +if os.path.exists(current_path + os.sep + dy_lib_name + '.' + dy_lib_suffix): + has_paddle_dy_lib = True try: if os.name == 'nt': third_lib_path = current_path + os.sep + '..' + os.sep + 'libs' # Will load shared library from 'path' on windows - os.environ[ - 'path'] = current_path + ';' + third_lib_path + ';' + os.environ[ - 'path'] + os.environ['path'] = ( + current_path + ';' + third_lib_path + ';' + os.environ['path'] + ) sys.path.insert(0, third_lib_path) # Note: from python3.8, PATH will not take effect # https://github.com/python/cpython/pull/12302 @@ -50,20 +47,24 @@ except ImportError as e: from .. import compat as cpt + if os.name == 'nt': executable_path = os.path.abspath(os.path.dirname(sys.executable)) raise ImportError( """NOTE: You may need to run \"set PATH=%s;%%PATH%%\" if you encounters \"DLL load failed\" errors. If you have python installed in other directory, replace \"%s\" with your own - directory. The original error is: \n %s""" % - (executable_path, executable_path, cpt.get_exception_message(e))) + directory. The original error is: \n %s""" + % (executable_path, executable_path, cpt.get_exception_message(e)) + ) else: raise ImportError( """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\" if you encounters \"libmkldnn.so not found\" errors. If you have python installed in other directory, replace \"/usr/local/lib\" with your own - directory. The original error is: \n""" + cpt.get_exception_message(e)) + directory. The original error is: \n""" + + cpt.get_exception_message(e) + ) except Exception as e: raise e @@ -73,36 +74,45 @@ def avx_supported(): Whether current system(Linux, MacOS, Windows) is supported with AVX. """ from .. 
import compat as cpt + sysstr = platform.system().lower() has_avx = False if sysstr == 'linux': try: - has_avx = os.popen('cat /proc/cpuinfo | grep -i avx').read() != '' + pipe = os.popen('cat /proc/cpuinfo | grep -i avx') + has_avx = pipe.read() != '' + pipe.close() except Exception as e: - sys.stderr.write('Can not get the AVX flag from /proc/cpuinfo.\n' - 'The original error is: %s\n' % - cpt.get_exception_message(e)) + sys.stderr.write( + 'Can not get the AVX flag from /proc/cpuinfo.\n' + 'The original error is: %s\n' % cpt.get_exception_message(e) + ) return has_avx elif sysstr == 'darwin': try: - has_avx = os.popen( - 'sysctl machdep.cpu.features | grep -i avx').read() != '' + pipe = os.popen('sysctl machdep.cpu.features | grep -i avx') + has_avx = pipe.read() != '' + pipe.close() except Exception as e: sys.stderr.write( 'Can not get the AVX flag from machdep.cpu.features.\n' - 'The original error is: %s\n' % cpt.get_exception_message(e)) + 'The original error is: %s\n' % cpt.get_exception_message(e) + ) if not has_avx: import subprocess + pipe = subprocess.Popen( 'sysctl machdep.cpu.leaf7_features | grep -i avx', shell=True, stdout=subprocess.PIPE, - stderr=subprocess.PIPE) + stderr=subprocess.PIPE, + ) _ = pipe.communicate() has_avx = True if pipe.returncode == 0 else False return has_avx elif sysstr == 'windows': import ctypes + ONE_PAGE = ctypes.c_size_t(0x1000) def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()): @@ -112,24 +122,31 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()): pfnVirtualAlloc.restype = ctypes.c_void_p MEM_COMMIT = ctypes.c_ulong(0x1000) PAGE_READWRITE = ctypes.c_ulong(0x4) - address = pfnVirtualAlloc(None, ONE_PAGE, MEM_COMMIT, - PAGE_READWRITE) + address = pfnVirtualAlloc( + None, ONE_PAGE, MEM_COMMIT, PAGE_READWRITE + ) if not address: raise Exception("Failed to VirtualAlloc") # Copy the code into the memory segment - memmove = ctypes.CFUNCTYPE(ctypes.c_void_p, ctypes.c_void_p, - ctypes.c_void_p, - ctypes.c_size_t)(ctypes._memmove_addr) + memmove = ctypes.CFUNCTYPE( + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_void_p, + ctypes.c_size_t, + )(ctypes._memmove_addr) if memmove(address, code_str, len(code_str)) < 0: raise Exception("Failed to memmove") # Enable execute permissions PAGE_EXECUTE = ctypes.c_ulong(0x10) pfnVirtualProtect = ctypes.windll.kernel32.VirtualProtect - res = pfnVirtualProtect(ctypes.c_void_p(address), - ONE_PAGE, PAGE_EXECUTE, - ctypes.byref(ctypes.c_ulong(0))) + res = pfnVirtualProtect( + ctypes.c_void_p(address), + ONE_PAGE, + PAGE_EXECUTE, + ctypes.byref(ctypes.c_ulong(0)), + ) if not res: raise Exception("Failed VirtualProtect") @@ -138,7 +155,8 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()): pfnGetCurrentProcess.restype = ctypes.c_void_p prochandle = ctypes.c_void_p(pfnGetCurrentProcess()) res = ctypes.windll.kernel32.FlushInstructionCache( - prochandle, ctypes.c_void_p(address), ONE_PAGE) + prochandle, ctypes.c_void_p(address), ONE_PAGE + ) if not res: raise Exception("Failed FlushInstructionCache") @@ -156,12 +174,14 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()): # Convert the code_str into a function that returns uint func, address = asm_func(code_str) retval = func() - ctypes.windll.kernel32.VirtualFree(ctypes.c_void_p(address), - ctypes.c_size_t(0), ONE_PAGE) + ctypes.windll.kernel32.VirtualFree( + ctypes.c_void_p(address), ctypes.c_size_t(0), ONE_PAGE + ) except Exception as e: - sys.stderr.write('Failed getting the AVX flag on Windows.\n' - 'The original error is: 
%s\n' % - cpt.get_exception_message(e)) + sys.stderr.write( + 'Failed getting the AVX flag on Windows.\n' + 'The original error is: %s\n' % cpt.get_exception_message(e) + ) return (retval & (1 << avx_bit)) > 0 else: sys.stderr.write('Do not get AVX flag on %s\n' % sysstr) @@ -170,10 +190,10 @@ def asm_func(code_str, restype=ctypes.c_uint32, argtypes=()): def run_shell_command(cmd): import subprocess - out, err = subprocess.Popen(cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True).communicate() + + out, err = subprocess.Popen( + cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True + ).communicate() if err: return None else: @@ -182,8 +202,9 @@ def run_shell_command(cmd): def get_dso_path(core_so, dso_name): if core_so and dso_name: - return run_shell_command("ldd %s|grep %s|awk '{print $3}'" % - (core_so, dso_name)) + return run_shell_command( + "ldd %s|grep %s|awk '{print $3}'" % (core_so, dso_name) + ) else: return None @@ -192,16 +213,15 @@ def load_dso(dso_absolute_path): if dso_absolute_path: try: from ctypes import cdll + cdll.LoadLibrary(dso_absolute_path) except: warnings.warn("Load {} failed".format(dso_absolute_path)) def pre_load(dso_name): - if has_avx_core: - core_so = current_path + os.sep + 'core_avx.' + core_suffix - elif has_noavx_core: - core_so = current_path + os.sep + 'core_noavx.' + core_suffix + if has_paddle_dy_lib: + core_so = current_path + os.sep + dy_lib_name + '.' + dy_lib_suffix else: core_so = None dso_path = get_dso_path(core_so, dso_name) @@ -239,7 +259,7 @@ def to_list(s): # (1) the number of dynamic shared librarys (DSO) loaded > 14, # (2) after that, load a dynamic shared library (DSO) with static TLS. # For paddle, the problem is that 'libgomp' is a DSO with static TLS, and it is loaded after 14 DSOs. -# So, here is a tricky way to solve the problem by pre load 'libgomp' before 'core_avx.so'. +# So, here is a tricky way to solve the problem by pre load 'libgomp' before 'libpaddle.so'. # The final solution is to upgrade glibc to > 2.22 on the target system. if platform.system().lower() == 'linux': libc_type, libc_ver = get_libc_ver() @@ -247,123 +267,75 @@ def to_list(s): try: pre_load('libgomp') except Exception as e: - # NOTE(zhiqiu): do not abort if failed, since it may success when import core_avx.so + # NOTE(zhiqiu): do not abort if failed, since it may success when import libpaddle.so sys.stderr.write('Error: Can not preload libgomp.so') -load_noavx = False - -if avx_supported(): - try: - from . 
import core_avx - core_avx.LoDTensor = core_avx.Tensor - - from .core_avx import * - from .core_avx import __doc__, __file__, __name__, __package__ - from .core_avx import __unittest_throw_exception__ - from .core_avx import _append_python_callable_object_and_return_id - from .core_avx import _cleanup, _Scope - from .core_avx import _get_use_default_grad_op_desc_maker_ops - from .core_avx import _get_all_register_op_kernels - from .core_avx import _is_program_version_supported - from .core_avx import _set_eager_deletion_mode - from .core_avx import _get_eager_deletion_vars - from .core_avx import _set_fuse_parameter_group_size - from .core_avx import _set_fuse_parameter_memory_size - from .core_avx import _is_dygraph_debug_enabled - from .core_avx import _dygraph_debug_level - from .core_avx import _switch_tracer - from .core_avx import _set_paddle_lib_path - from .core_avx import _create_loaded_parameter - from .core_avx import _cuda_synchronize - from .core_avx import _is_compiled_with_heterps - from .core_avx import _promote_types_if_complex_exists - from .core_avx import _set_cached_executor_build_strategy - from .core_avx import _device_synchronize - from .core_avx import _get_current_stream - from .core_avx import _Profiler, _ProfilerResult, _RecordEvent - from .core_avx import _set_current_stream - if sys.platform != 'win32': - from .core_avx import _set_process_pids - from .core_avx import _erase_process_pids - from .core_avx import _set_process_signal_handler - from .core_avx import _throw_error_if_process_failed - from .core_avx import _convert_to_tensor_list - from .core_avx import _array_to_share_memory_tensor - from .core_avx import _cleanup_mmap_fds - from .core_avx import _remove_tensor_list_mmap_fds - except Exception as e: - if has_avx_core: - sys.stderr.write( - 'Error: Can not import avx core while this file exists: ' + - current_path + os.sep + 'core_avx.' + core_suffix + '\n') - raise e - else: - from .. import compat as cpt - sys.stderr.write( - "Hint: Your machine support AVX, but the installed paddlepaddle doesn't have avx core. " - "Hence, no-avx core with worse preformance will be imported.\nIf you like, you could " - "reinstall paddlepaddle by 'python -m pip install --force-reinstall paddlepaddle-gpu[==version]' " - "to get better performance.\nThe original error is: %s\n" % - cpt.get_exception_message(e)) - load_noavx = True -else: - load_noavx = True - -if load_noavx: - try: - from . 
import core_noavx - core_noavx.LoDTensor = core_noavx.Tensor - - from .core_noavx import * - from .core_noavx import __doc__, __file__, __name__, __package__ - from .core_noavx import __unittest_throw_exception__ - from .core_noavx import _append_python_callable_object_and_return_id - from .core_noavx import _cleanup, _Scope - from .core_noavx import _get_use_default_grad_op_desc_maker_ops - from .core_noavx import _get_all_register_op_kernels - from .core_noavx import _is_program_version_supported - from .core_noavx import _set_eager_deletion_mode - from .core_noavx import _get_eager_deletion_vars - from .core_noavx import _set_fuse_parameter_group_size - from .core_noavx import _set_fuse_parameter_memory_size - from .core_noavx import _is_dygraph_debug_enabled - from .core_noavx import _dygraph_debug_level - from .core_noavx import _switch_tracer - from .core_noavx import _set_paddle_lib_path - from .core_noavx import _create_loaded_parameter - from .core_noavx import _cuda_synchronize - from .core_noavx import _is_compiled_with_heterps - from .core_noavx import _promote_types_if_complex_exists - from .core_noavx import _set_cached_executor_build_strategy - from .core_noavx import _device_synchronize - from .core_noavx import _get_current_stream - from .core_noavx import _set_current_stream - from .core_noavx import _Profiler, _ProfilerResult, _RecordEvent - if sys.platform != 'win32': - from .core_noavx import _set_process_pids - from .core_noavx import _erase_process_pids - from .core_noavx import _set_process_signal_handler - from .core_noavx import _throw_error_if_process_failed - from .core_noavx import _convert_to_tensor_list - from .core_noavx import _array_to_share_memory_tensor - from .core_noavx import _cleanup_mmap_fds - from .core_noavx import _remove_tensor_list_mmap_fds - except Exception as e: - if has_noavx_core: - sys.stderr.write( - 'Error: Can not import noavx core while this file exists: ' + - current_path + os.sep + 'core_noavx.' + core_suffix + '\n') - elif avx_supported(): - sys.stderr.write( - "Error: The installed PaddlePaddle is incorrect. You should reinstall it by " - "'python -m pip install --force-reinstall paddlepaddle-gpu[==version]'\n" - ) - else: - sys.stderr.write( - "Error: Your machine doesn't support AVX, but the installed PaddlePaddle is avx core, " - "you should reinstall paddlepaddle with no-avx core.\n") - - raise e +try: + from . import libpaddle + + if avx_supported() and not libpaddle.is_compiled_with_avx(): + sys.stderr.write( + "Hint: Your machine support AVX, but the installed paddlepaddle doesn't have avx core. 
" + "Hence, no-avx core with worse preformance will be imported.\nIf you like, you could " + "reinstall paddlepaddle by 'python -m pip install --force-reinstall paddlepaddle-gpu[==version]' " + "to get better performance.\n" + ) + + # assign tensor alias + libpaddle.LoDTensor = libpaddle.Tensor + + from .libpaddle import * + from .libpaddle import __doc__, __file__, __name__, __package__ + from .libpaddle import __unittest_throw_exception__ + from .libpaddle import _append_python_callable_object_and_return_id + from .libpaddle import _cleanup, _Scope + from .libpaddle import _get_use_default_grad_op_desc_maker_ops + from .libpaddle import _get_all_register_op_kernels + from .libpaddle import _is_program_version_supported + from .libpaddle import _set_eager_deletion_mode + from .libpaddle import _get_eager_deletion_vars + from .libpaddle import _set_fuse_parameter_group_size + from .libpaddle import _set_fuse_parameter_memory_size + from .libpaddle import _is_dygraph_debug_enabled + from .libpaddle import _dygraph_debug_level + from .libpaddle import _switch_tracer + from .libpaddle import _set_paddle_lib_path + from .libpaddle import _create_loaded_parameter + from .libpaddle import _cuda_synchronize + from .libpaddle import _is_compiled_with_heterps + from .libpaddle import _promote_types_if_complex_exists + from .libpaddle import _set_cached_executor_build_strategy + from .libpaddle import _device_synchronize + from .libpaddle import _get_current_stream + from .libpaddle import _Profiler, _ProfilerResult, _RecordEvent + from .libpaddle import _set_current_stream + from .libpaddle import _get_phi_kernel_name + + if sys.platform != 'win32': + from .libpaddle import _set_process_pids + from .libpaddle import _erase_process_pids + from .libpaddle import _set_process_signal_handler + from .libpaddle import _throw_error_if_process_failed + from .libpaddle import _convert_to_tensor_list + from .libpaddle import _array_to_share_memory_tensor + from .libpaddle import _cleanup_mmap_fds + from .libpaddle import _remove_tensor_list_mmap_fds +except Exception as e: + if has_paddle_dy_lib: + sys.stderr.write( + 'Error: Can not import paddle core while this file exists: ' + + current_path + + os.sep + + 'libpaddle.' 
+ + dy_lib_suffix + + '\n' + ) + if not avx_supported() and libpaddle.is_compiled_with_avx(): + sys.stderr.write( + "Error: Your machine doesn't support AVX, but the installed PaddlePaddle is avx core, " + "you should reinstall paddlepaddle with no-avx core.\n" + ) + raise e def set_paddle_custom_device_lib_path(lib_path): @@ -379,22 +351,26 @@ def set_paddle_custom_device_lib_path(lib_path): # set paddle lib path def set_paddle_lib_path(): - site_dirs = site.getsitepackages() if hasattr( - site, - 'getsitepackages') else [x for x in sys.path if 'site-packages' in x] + site_dirs = ( + site.getsitepackages() + if hasattr(site, 'getsitepackages') + else [x for x in sys.path if 'site-packages' in x] + ) for site_dir in site_dirs: lib_dir = os.path.sep.join([site_dir, 'paddle', 'libs']) if os.path.exists(lib_dir): _set_paddle_lib_path(lib_dir) set_paddle_custom_device_lib_path( - os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins'])) + os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']) + ) return if hasattr(site, 'USER_SITE'): lib_dir = os.path.sep.join([site.USER_SITE, 'paddle', 'libs']) if os.path.exists(lib_dir): _set_paddle_lib_path(lib_dir) set_paddle_custom_device_lib_path( - os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins'])) + os.path.sep.join([lib_dir, '..', '..', 'paddle-plugins']) + ) set_paddle_lib_path() diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index b30e3ff1d85274..cca6a5fc1c71de 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -27,6 +27,7 @@ import warnings from ..framework import _get_paddle_place, _in_legacy_dygraph, _in_eager_without_dygraph_check import paddle +import warnings __all__ = [ 'no_grad', 'no_grad_', 'grad', 'guard', 'enable_dygraph', 'disable_dygraph', @@ -45,6 +46,20 @@ def in_declarative_mode(): return _in_declarative_mode_ +def declarative_unsupport_argument_warning(func_name, input_names, inputs, + support_values): + """ + Warning if inputs do not elementwisely equals to support_values. + It's a utility function for dy2static when dygraph interface have + more inputs than static interface such as paddle.grad. + + """ + for name, inp, sup in zip(input_names, inputs, support_values): + if inp != sup: + warnings.warn(f"{func_name} has unsupported parameter in jit: " + + f"{name}, jit will discard it") + + def _switch_to_static_graph_(func): def __impl__(*args, **kwargs): @@ -290,6 +305,10 @@ def test_layer(): test_layer() """ + if in_declarative_mode(): + warnings.warn( + "paddle.no_grad is only supported for inference model, and not supported for training under @to_static." + ) if func is None: return _switch_tracer_mode_guard_(is_train=False) else: @@ -428,7 +447,7 @@ def guard(place=None): yield -@framework.dygraph_only +@framework.non_static_only def grad(outputs, inputs, grad_outputs=None, @@ -563,6 +582,16 @@ def test_dygraph_grad(grad_outputs=None): grad_y1 = paddle.to_tensor(3.0) print(test_dygraph_grad([grad_y1, grad_value])) # [24.] ''' + if in_declarative_mode(): + # In dy2static context, we call static interface `gradients` + # to calculate grads. 
+ from paddle.static import gradients + declarative_unsupport_argument_warning( + "paddle.grad", + ["retain_graph", "create_grad", "only_inputs", "allow_unused"], + [retain_graph, create_graph, only_inputs, allow_unused], + [None, False, True, False]) + return gradients(outputs, inputs, grad_outputs, no_grad_vars) def check_in_out(in_out_list, name): assert in_out_list is not None, "{} should not be None".format(name) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index e689797bde4494..e946969f1ab8ce 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -18,6 +18,7 @@ # It provides a compatibility layer between the AST of various Python versions, # as produced by ast.parse from the standard ast module. # See details in https://github.com/serge-sans-paille/gast/ + import os from paddle.utils import gast from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer @@ -29,6 +30,7 @@ from paddle.fluid.dygraph.dygraph_to_static.call_transformer import CallTransformer from paddle.fluid.dygraph.dygraph_to_static.cast_transformer import CastTransformer from paddle.fluid.dygraph.dygraph_to_static.grad_transformer import GradTransformer +from paddle.fluid.dygraph.dygraph_to_static.typehint_transformer import TypeHintTransformer from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import IfElseTransformer from paddle.fluid.dygraph.dygraph_to_static.list_transformer import ListTransformer from paddle.fluid.dygraph.dygraph_to_static.logical_transformer import LogicalTransformer @@ -38,6 +40,7 @@ from paddle.fluid.dygraph.dygraph_to_static.create_variable_transformer import CreateVariableTransformer from paddle.fluid.dygraph.dygraph_to_static.static_analysis import StaticAnalysisVisitor from paddle.fluid.dygraph.dygraph_to_static.tensor_shape_transformer import TensorShapeTransformer +from paddle.fluid.dygraph.dygraph_to_static.decorator_transformer import DecoratorTransformer from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code @@ -45,8 +48,6 @@ __all__ = ['DygraphToStaticAst'] -DECORATOR_NAMES = ['declarative', 'to_static', 'dygraph_to_static_func'] - def apply_optimization(transformers): """ @@ -104,7 +105,9 @@ def transfer_from_node_type(self, node_wrapper): PrintTransformer, # print statement CallTransformer, # transform call recursively CastTransformer, # type casting statement - GradTransformer, # transform paddle.grad to paddle.gradients + #GradTransformer, # transform paddle.grad to paddle.gradients + DecoratorTransformer, # transform decorators to function call + TypeHintTransformer, # remove all typehint in gast.Name ] apply_optimization(transformers) @@ -120,27 +123,6 @@ def visit_FunctionDef(self, node): self.decorate_func_name = node.name self.generic_visit(node) - # Remove the decorated name of dygraph_to_static - if hasattr(node, 'decorator_list'): - decorator_list = [] - for d in node.decorator_list: - if isinstance(d, gast.Name) and d.id not in DECORATOR_NAMES: - raise NotImplementedError( - "ProgramTranslator hasn't implemented multiple decorators. 
Please remove " - + d.id + " in " + self.decorate_func_name) - if isinstance(d, gast.Attribute): - full_attribute_name = get_attribute_full_name(d) - has_translate_decorator = False - for deco in DECORATOR_NAMES: - if deco in full_attribute_name: - has_translate_decorator = True - break - if not has_translate_decorator: - raise NotImplementedError( - "ProgramTranslator hasn't implemented multiple decorators. Please remove " - + full_attribute_name + " in " + - self.decorate_func_name) - node.decorator_list = decorator_list return node def get_module_name(self): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index 5bb75bda8de97c..a3d96b6fe0ad86 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -33,7 +33,7 @@ from paddle.fluid.dygraph.dygraph_to_static.program_translator import StaticFunction from paddle.fluid.dygraph.dygraph_to_static.program_translator import convert_to_static from paddle.fluid.dygraph.dygraph_to_static.program_translator import unwrap_decorators -from paddle.fluid.dygraph.dygraph_to_static.utils import is_paddle_func +from paddle.fluid.dygraph.dygraph_to_static.utils import is_paddle_func, unwrap from paddle.fluid.dygraph.layers import Layer __all__ = ["convert_call"] @@ -206,13 +206,19 @@ def dyfunc(x): # `foo` will be converted into a wrapper class, suppose as `StaticFunction`. # And `foo.__globals__['foo']` will still return this `StaticFunction` instead of # `foo` function. So `isinstance(fn, StaticFunction)` is added here. + _origfunc = unwrap(func) global_functions = set() - for fn in func.__globals__.values(): + for fn in _origfunc.__globals__.values(): if inspect.isfunction(fn): global_functions.add(fn) elif isinstance(fn, StaticFunction): _, fn = unwrap_decorators(fn) global_functions.add(fn) + elif inspect.isclass(fn): + if isinstance(fn.__dict__.get(func.__name__, None), + staticmethod): + global_functions.add( + func) # Add func to ensure that we will convert if func in global_functions: converted_call = convert_to_static(func) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index 938cf9c3228bef..e22d83d56f3a51 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -87,7 +87,10 @@ def convert_while_loop(cond, Args: cond(Callable): A callable object that returns a boolean variable to control whether to execute the loop body. It takes ``loop_vars`` as arguments. body(Callable): A callable object that returns a tuple or list of variables with the same arguments ``loops_vars`` as ``cond`` . - loop_vars(list|tuple): A list or tuple of variables passed to ``cond`` and ``body`` . + get_args(callable): Get all arguments that needed in true_fn and false_fn. + set_args(callable): Update arguments that modified in trure_fn and false_fn. + return_name_ids(list[string], optional): the returned names. + push_pop_names(list[string], optional): the names on which called .append() or .pop(). Returns: A list or tuple of variables which returned by ``body``. @@ -306,7 +309,8 @@ def convert_ifelse(pred, false_fn(callable): A callable to be performed if ``pred`` is false. get_args(callable): Get all arguments that needed in true_fn and false_fn. 
set_args(callable): Update arguments that modified in trure_fn and false_fn. - return_name_ids(list[string]): the returned names. + return_name_ids(list[string], optional): the returned names. + push_pop_names(list[string], optional): the names on which called .append() or .pop(). Returns: ``true_fn()`` if the predicate ``pred`` is true else ``false_fn()`` . diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/create_variable_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/create_variable_transformer.py index 8ae4c12eb8eafd..bcfa3e3ec1ca92 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/create_variable_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/create_variable_transformer.py @@ -41,6 +41,7 @@ def transform(self): def visit_FunctionDef(self, node): #attributes = set(filter(lambda x: '.' in x, node.pd_scope.modified_vars())) + self.generic_visit(node) bodys = node.body names = sorted(node.pd_scope.created_vars()) for name in names: diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/decorator_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/decorator_transformer.py new file mode 100644 index 00000000000000..299b5faa55402a --- /dev/null +++ b/python/paddle/fluid/dygraph/dygraph_to_static/decorator_transformer.py @@ -0,0 +1,140 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# + +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from paddle.utils import gast +from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer +from paddle.fluid.dygraph.dygraph_to_static.utils import create_funcDef_node, ast_to_source_code, is_paddle_api, Dygraph2StaticException +import warnings + +import re +from paddle.fluid.dygraph.dygraph_to_static.utils import RE_PYNAME, RE_PYMODULE + +IGNORE_NAMES = [ + 'declarative', 'to_static', 'dygraph_to_static_func', 'wraps', + 'staticmethod', 'classmethod', 'decorator' +] + + +class DecoratorTransformer(BaseTransformer): + """ + Transform decorators. + """ + + def __init__(self, wrapper_root): + assert isinstance( + wrapper_root, AstNodeWrapper + ), "Type of input node should be AstNodeWrapper, but received %s ." % type( + wrapper_root) + self.root = wrapper_root.node + + self.ancestor_nodes = [] + + def transform(self): + """ + Main function to transform AST. 
+ """ + self.visit(self.root) + + def visit_FunctionDef(self, node): + assert isinstance(node, gast.FunctionDef) + self.generic_visit(node) + + deco_list = node.decorator_list + node.decorator_list = [] + + # every decorator will append a node + decofun_nodes = [] + # func to be decoed next time + deco_target = '_orig_' + node.name + # last decoed func + decoed_func = '' + + for deco in reversed(deco_list): + # skip INGNORE_NAMES + deco_full_name = ast_to_source_code(deco).strip() + if isinstance(deco, gast.Call): + # match case like : + # 1: @_jst.Call(a.b.c.d.deco)() + # 2: @q.w.e.r.deco() + re_tmp = re.match( + r'({module})*({name}\(){{0,1}}({module})*({name})(\)){{0,1}}\(.*$' + .format(name=RE_PYNAME, module=RE_PYMODULE), deco_full_name) + deco_name = re_tmp.group(4) + else: + # match case like: + # @a.d.g.deco + re_tmp = re.match( + r'({module})*({name})$'.format(name=RE_PYNAME, + module=RE_PYMODULE), + deco_full_name) + deco_name = re_tmp.group(2) + if deco_name in IGNORE_NAMES: + continue + elif deco_name == 'contextmanager': + warnings.warn( + "Dy2Static : A context manager decorator is used, this may not work correctly after transform." + ) + + decoed_func = '_decoedby_' + deco_name + + # get function after decoration + if isinstance(deco, gast.Call): + if '_jst.Call' in deco_full_name: + # in this case , the deco_full_name will be like: + # '_jst.Call(deco)(5)' + rematch = re.match(r'\_jst\.Call\((.+?)\)\((.*)\)', + deco_full_name) + re_name = rematch.group(1) + re_args = rematch.group(2) + re_args_with_func = deco_target + ', ' + re_args + decofun_str = 'try:\n\t{0} = _jst.Call({1})({2})\nexcept:\n\t{0} = _jst.Call({1})({3})({4})'\ + .format(decoed_func, re_name, re_args_with_func, re_args, deco_target) + else: + # paddle api will not be transformed to '_jst.Call' + rematch = re.match(r'(.+?)\((.*)\)', deco_full_name) + re_name = rematch.group(1) + re_args = rematch.group(2) + re_args_with_func = deco_target + ', ' + re_args + decofun_str = 'try:\n\t{0} = {1}({2})\nexcept:\n\t{0} = {1}({3})({4})'\ + .format(decoed_func, re_name, re_args_with_func, re_args, deco_target) + + else: + decofun_str = '{} = _jst.Call({})({})'.format( + decoed_func, deco_full_name, deco_target) + + decofun_nodes.extend(gast.parse(decofun_str).body) + deco_target = decoed_func + + if not decofun_nodes: + return node + + orig_func_node = gast.FunctionDef(name='_orig_' + node.name, + args=node.args, + body=node.body, + decorator_list=[], + returns=None, + type_comment=None) + + args = [arg.id for arg in node.args.args] + arg_str = ','.join(args) + callfun_str = 'return {}({})'.format(decoed_func, arg_str) + callfun_node = gast.parse(callfun_str).body[0] + + node.body = [orig_func_node] + decofun_nodes + [callfun_node] + + return node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py index 1dfdda102c96a1..28053f00be9629 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/partial_program.py @@ -169,6 +169,25 @@ def __init__(self, custom_white_list=custom_white_list, custom_black_list=custom_black_list) + # program_id -> list(scope) + self._scope_cache = {} + + def _get_scope(self, program_id=None, use_scope_cache=False): + if use_scope_cache: + if program_id not in self._scope_cache: + scope = core.Scope() + self._scope_cache[program_id] = [scope] + return scope + else: + for scope in self._scope_cache[program_id]: + if scope._can_reuesd: 
+ return scope + scope = core.Scope() + self._scope_cache[program_id].append(scope) + return scope + else: + return core.Scope() + @LazyInitialized def __fake_vars(self): return _create_fake_var() @@ -555,11 +574,19 @@ def __call__(self, inputs): ('forward_global_block', self.forward_program.desc.block(0), 'backward_global_block', self.backward_program.desc.block(0))) - _legacy_C_ops.run_program(self._valid_vars(in_vars), - self._valid_vars(self._params), - self._valid_vars(out_vars), - self._create_scope_vec(), self._double_grads, - self._cuda_graph_vec, *attrs) + _legacy_C_ops.run_program( + self._valid_vars(in_vars), self._valid_vars(self._params), + self._valid_vars(out_vars), + self._create_scope_vec(program_id=self.program_id, + use_scope_cache=True), + self._double_grads, self._cuda_graph_vec, *attrs) + else: + _legacy_C_ops.run_program(self._valid_vars(in_vars), + self._valid_vars(self._params), + self._valid_vars(out_vars), + self._create_scope_vec(), + self._double_grads, self._cuda_graph_vec, + *attrs) restored_nest_out = self._restore_out(out_vars) return self._remove_no_value(restored_nest_out) @@ -572,16 +599,6 @@ def _cast_fp16_if_pure_fp16(self, in_vars): == paddle.float16): in_vars[i] = var.astype('float16') in_vars[i].name = name - if (self.forward_program.global_block().has_var(name) - and self.forward_program.global_block().var(name).dtype - == paddle.float16): - in_vars[i] = var.astype('float16') - in_vars[i].name = name - if (self.backward_program.global_block().has_var(name) - and self.backward_program.global_block().var(name).dtype - == paddle.float16): - in_vars[i] = var.astype('float16') - in_vars[i].name = name @property def program(self): @@ -735,10 +752,11 @@ def create_out(var_id): return input_vars, out_vars - def _create_scope_vec(self): + def _create_scope_vec(self, program_id=None, use_scope_cache=False): # Hold forward variables tmp_scope_vec = None - inner_scope = core.Scope() + inner_scope = self._get_scope(program_id=program_id, + use_scope_cache=use_scope_cache) if not framework._in_eager_mode_: tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], "program_out_scope", diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 77b55f35e2eb28..fbaa4b9f0ef523 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -42,7 +42,7 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import input_specs_compatible from paddle.fluid.dygraph.dygraph_to_static.utils import type_name from paddle.fluid.dygraph.dygraph_to_static.utils import unwrap -from paddle.fluid.dygraph.dygraph_to_static.utils import make_hashable +from paddle.fluid.dygraph.dygraph_to_static.utils import make_hashable, ALREADY_D2S from paddle.fluid.dygraph.dygraph_to_static.function_spec import FunctionSpec, _hash_spec_names from paddle.fluid.dygraph.dygraph_to_static.function_spec import get_buffers, get_parameters from paddle.fluid.wrapped_decorator import signature_safe_contextmanager @@ -136,8 +136,11 @@ def convert_to_static(function): Args: function(callable): The function with dygraph layers that will be converted into static layers. 
""" + if getattr(function, ALREADY_D2S, None): + return function with _CACHE_LOCK: static_func = _FUNCTION_CACHE.convert_with_cache(function) + setattr(static_func, ALREADY_D2S, True) return static_func @@ -380,7 +383,6 @@ def __call__(self, *args, **kwargs): try: concrete_program, partial_program_layer = self.get_concrete_program( *args, **kwargs, is_train=self._is_train_mode()) - # 3. synchronize self.training attribute. if isinstance(self._class_instance, layers.Layer): partial_program_layer.training = self._class_instance.training diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py index 3eadd455e1033e..17a55fa0b74301 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/return_transformer.py @@ -22,6 +22,8 @@ from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node from paddle.fluid.dygraph.dygraph_to_static.utils import ast_to_source_code from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer +from paddle.fluid.dygraph.dygraph_to_static.utils import Dygraph2StaticException +from paddle.fluid.dygraph.dygraph_to_static.utils import ORIGI_INFO __all__ = [ 'RETURN_NO_VALUE_MAGIC_NUM', 'RETURN_NO_VALUE_VAR_NAME', 'ReturnTransformer' @@ -90,50 +92,37 @@ class ReturnAnalysisVisitor(gast.NodeVisitor): def __init__(self, root_node): self.root = root_node + assert isinstance( + self.root, gast.FunctionDef), "Input is not gast.FunctionDef node" - # A list to store where the current function is. - self.function_def = [] + # the number of return statements + self.count_return = 0 - # Mapping from gast.FunctionDef node to the number of return statements - # Python allows define function inside function so we have to handle it - self.count_return = {} + # maximum number of variables + self.max_return_length = 0 - # Mapping from gast.FunctionDef node to the maximum number of variables - # returned by the function's return statement - self.max_return_length = {} self.visit(self.root) def visit_FunctionDef(self, node): - self.function_def.append(node) - self.count_return[node] = 0 - self.max_return_length[node] = 0 - self.generic_visit(node) - self.function_def.pop() - return node + """ + don't analysis closure, just analyze current func def level. + """ + if node == self.root: + self.generic_visit(node) def visit_Return(self, node): - assert len( - self.function_def) > 0, "Found 'return' statement out of function." 
- cur_func = self.function_def[-1] - if cur_func in self.count_return: - self.count_return[cur_func] += 1 - else: - self.count_return[cur_func] = 1 + self.count_return += 1 return_length = get_return_size(node) - if cur_func in self.max_return_length: - self.max_return_length[cur_func] = max( - self.max_return_length[cur_func], return_length) - else: - self.max_return_length[cur_func] = return_length + self.max_return_length = max(self.max_return_length, return_length) self.generic_visit(node) - def get_func_return_count(self, func_node): - return self.count_return[func_node] + def get_func_return_count(self): + return self.count_return - def get_func_max_return_length(self, func_node): - return self.max_return_length[func_node] + def get_func_max_return_length(self): + return self.max_return_length class ReturnTransformer(BaseTransformer): @@ -143,32 +132,51 @@ class ReturnTransformer(BaseTransformer): variable to store the early return statements and boolean states with if-else to skip the statements after the return. + Go through all the function definition and call SingleReturnTransformer for each function. + SingleReturnTransformer don't care the nested function def. """ def __init__(self, wrapper_root): self.wrapper_root = wrapper_root self.root = wrapper_root.node - pre_transformer = ReplaceReturnNoneTransformer(self.root) pre_transformer.transform() + def transform(self): + self.visit(self.root) + + def visit_FunctionDef(self, node): + node = self.generic_visit(node) + node = SingleReturnTransformer(node).transform() + return node + + +class SingleReturnTransformer(BaseTransformer): + """ + This function only apply to single function. don't care the nested function_def + """ + + def __init__(self, root): + self.root = root + assert isinstance( + self.root, gast.FunctionDef), "Input is not gast.FunctionDef node" + self.ancestor_nodes = [] - # The name of the variable which stores the final return value - # Mapping from FunctionDef node to string - self.return_value_name = {} - # The names of the variable which stores the boolean state that skip - # statments. Mapping from FunctionDef node to list - self.return_name = {} - # The names of the variable which is placeholder to handle various- - # length return. Mapping from FunctionDef node to list - self.return_no_value_name = {} - # A list of FunctionDef to store where the current function is. - self.function_def = [] + + # The name of return placeholder + self.return_value_name = None + + # Every return stmt corresponds to a bool value variable, and return name is the name of the boolean variable + self.return_name = [] self.pre_analysis = None - def transform(self): - self.visit(self.root) + def assert_parent_is_not_while(self, parent_node_of_return): + if isinstance(parent_node_of_return, (gast.While, gast.For)): + raise Dygraph2StaticException( + "Found return statement in While or For body and loop " + "is meaningless, please check you code and remove return in while/for." 
+ ) def generic_visit(self, node): # Because we change ancestor nodes during visit_Return, not current @@ -188,28 +196,46 @@ def visit(self, node): Self-defined visit for appending ancestor """ self.ancestor_nodes.append(node) - ret = super(ReturnTransformer, self).visit(node) + ret = super(SingleReturnTransformer, self).visit(node) self.ancestor_nodes.pop() return ret def visit_FunctionDef(self, node): - self.function_def.append(node) - self.return_value_name[node] = None - self.return_name[node] = [] - self.return_no_value_name[node] = [] + """ + don't analysis closure, just analyze current func def level. + """ + if node == self.root: + self.generic_visit(node) + return node + def append_assign_to_return_node(self, value, parent_node_of_return, + return_name, assign_nodes): + self.assert_parent_is_not_while(parent_node_of_return) + assert value in [True, False], "value must be True or False." + if isinstance(parent_node_of_return, gast.If): + # Prepend control flow boolean nodes such as '__return@1 = True' + node_str = "{} = _jst.create_bool_as_type({}, {})".format( + return_name, + ast_to_source_code(parent_node_of_return.test).strip(), value) + + assign_node = gast.parse(node_str).body[0] + assign_nodes.append(assign_node) + + def transform(self): + node = self.root self.pre_analysis = ReturnAnalysisVisitor(node) - max_return_length = self.pre_analysis.get_func_max_return_length(node) - while self.pre_analysis.get_func_return_count(node) > 1: - self.generic_visit(node) + max_return_length = self.pre_analysis.get_func_max_return_length() + while self.pre_analysis.get_func_return_count() > 0: + # every visit will decrease the number of returns. + # so we need a while. + self.visit(node) self.pre_analysis = ReturnAnalysisVisitor(node) if max_return_length == 0: - self.function_def.pop() return node # Prepend initialization of final return and append final return statement - value_name = self.return_value_name[node] + value_name = self.return_value_name if value_name is not None: node.body.append( gast.Return(value=gast.Name(id=value_name, @@ -225,42 +251,32 @@ def visit_FunctionDef(self, node): value=gast.Constant( kind=None, value=None)) node.body.insert(0, assign_return_value_node) - - # Prepend no value placeholders - self.function_def.pop() return node def visit_Return(self, node): - cur_func_node = self.function_def[-1] return_name = unique_name.generate(RETURN_PREFIX) - self.return_name[cur_func_node].append(return_name) - max_return_length = self.pre_analysis.get_func_max_return_length( - cur_func_node) + self.return_name.append(return_name) + max_return_length = self.pre_analysis.get_func_max_return_length() parent_node_of_return = self.ancestor_nodes[-2] for ancestor_index in reversed(range(len(self.ancestor_nodes) - 1)): ancestor = self.ancestor_nodes[ancestor_index] cur_node = self.ancestor_nodes[ancestor_index + 1] - if hasattr(ancestor, - "body") and index_in_list(ancestor.body, cur_node) != -1: - if cur_node == node: - self._replace_return_in_stmt_list(ancestor.body, cur_node, - return_name, - max_return_length, - parent_node_of_return) - self._replace_after_node_to_if_in_stmt_list( - ancestor.body, cur_node, return_name, parent_node_of_return) - elif hasattr(ancestor, "orelse") and index_in_list( - ancestor.orelse, cur_node) != -1: - if cur_node == node: - self._replace_return_in_stmt_list(ancestor.orelse, cur_node, - return_name, - max_return_length, - parent_node_of_return) - self._replace_after_node_to_if_in_stmt_list( - ancestor.orelse, cur_node, return_name, - 
parent_node_of_return) + def _deal_branches(branch_name): + if hasattr(ancestor, branch_name): + branch_node = getattr(ancestor, branch_name) + if index_in_list(branch_node, cur_node) != -1: + if cur_node == node: + self._replace_return_in_stmt_list( + branch_node, cur_node, return_name, + max_return_length, parent_node_of_return) + self._replace_after_node_to_if_in_stmt_list( + branch_node, cur_node, return_name, + parent_node_of_return) + + _deal_branches("body") + _deal_branches("orelse") # If return node in while loop, add `not return_name` in gast.While.test if isinstance(ancestor, gast.While): cond_var_node = gast.UnaryOp(op=gast.Not(), @@ -288,7 +304,7 @@ def visit_Return(self, node): while_node = new_stmts[-1] self.ancestor_nodes[ancestor_index] = while_node - if ancestor == cur_func_node: + if ancestor == self.root: break # return_node is replaced so we shouldn't return here @@ -301,34 +317,29 @@ def _replace_return_in_stmt_list(self, stmt_list, return_node, return_name, return False assign_nodes = [] - # Here assume that the parent node of return is gast.If - if isinstance(parent_node_of_return, gast.If): - # Prepend control flow boolean nodes such as '__return@1 = True' - node_str = "{} = _jst.create_bool_as_type({}, True)".format( - return_name, - ast_to_source_code(parent_node_of_return.test).strip()) + self.append_assign_to_return_node(True, parent_node_of_return, + return_name, assign_nodes) - assign_true_node = gast.parse(node_str).body[0] - assign_nodes.append(assign_true_node) - - cur_func_node = self.function_def[-1] return_length = get_return_size(return_node) # In this case we should NOT append RETURN_NO_VALUE placeholder if return_node.value is not None: - cur_func_node = self.function_def[-1] - if self.return_value_name[cur_func_node] is None: - self.return_value_name[cur_func_node] = unique_name.generate( + if self.return_value_name is None: + self.return_value_name = unique_name.generate( RETURN_VALUE_PREFIX) assign_nodes.append( gast.Assign(targets=[ - gast.Name(id=self.return_value_name[cur_func_node], + gast.Name(id=self.return_value_name, ctx=gast.Store(), annotation=None, type_comment=None) ], value=return_node.value)) + return_origin_info = getattr(return_node, ORIGI_INFO, None) + setattr(assign_nodes[-1], ORIGI_INFO, return_origin_info) + # If there is a return in the body or else of if, the remaining statements + # will not be executed, so they can be properly replaced. stmt_list[i:] = assign_nodes return True @@ -354,12 +365,8 @@ def _replace_after_node_to_if_in_stmt_list(self, stmt_list, node, stmt_list[i + 1:] = [if_stmt] # Here assume that the parent node of return is gast.If - if isinstance(parent_node_of_return, gast.If): - # Prepend control flow boolean nodes such as '__return@1 = False' - node_str = "{} = _jst.create_bool_as_type({}, False)".format( - return_name, - ast_to_source_code(parent_node_of_return.test).strip()) - assign_false_node = gast.parse(node_str).body[0] - - stmt_list[i:i] = [assign_false_node] + assign_nodes = [] + self.append_assign_to_return_node(False, parent_node_of_return, + return_name, assign_nodes) + stmt_list[i:i] = assign_nodes return True diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/typehint_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/typehint_transformer.py new file mode 100644 index 00000000000000..f258b98b507119 --- /dev/null +++ b/python/paddle/fluid/dygraph/dygraph_to_static/typehint_transformer.py @@ -0,0 +1,47 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.utils import gast +import warnings + +from paddle.fluid.dygraph.dygraph_to_static.static_analysis import AstNodeWrapper +from paddle.fluid.dygraph.dygraph_to_static import utils +from paddle.fluid.dygraph.dygraph_to_static.base_transformer import BaseTransformer + + +class TypeHintTransformer(BaseTransformer): + """ + A class remove all the typehint in gast.Name(annotation). + Please put it behind other transformers because other transformer may relay on typehints. + """ + + def __init__(self, wrapper_root): + assert isinstance( + wrapper_root, AstNodeWrapper + ), "Input non-AstNodeWrapper node for the initialization of TypeHintTransformer." + self.wrapper_root = wrapper_root + self.root = wrapper_root.node + + def transform(self): + self.visit(self.root) + + def visit_FunctionDef(self, node): + node.returns = None + self.generic_visit(node) + return node + + def visit_Name(self, node): + node.annotation = None + self.generic_visit(node) + return node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 41cd4676e608a9..1d97a81e616973 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -21,7 +21,8 @@ import collections from paddle.utils import gast import inspect -import os +import os, sys +import shutil import six import tempfile import textwrap @@ -35,6 +36,7 @@ from paddle.fluid.layers import assign import collections from functools import reduce +import warnings # Note(Aurelius): Do not forget the dot `.` to distinguish other # module such as paddlenlp. @@ -43,6 +45,7 @@ DYGRAPH_TO_STATIC_MODULE_PREFIX = 'paddle.fluid.dygraph.dygraph_to_static' GET_ARGS_FUNC_PREFIX = 'get_args' SET_ARGS_FUNC_PREFIX = 'set_args' +ALREADY_D2S = '__already_d2s' ARGS_NAME = '__args' # NOTE(liym27): Please use `getattr(ast_node, ORIGI_INFO)` instead of . operation to get the original information of ast node. ORIGI_INFO = "Original information of source code for ast node." @@ -50,7 +53,7 @@ class BaseNodeVisitor(gast.NodeVisitor): """ - Implement customized NodeVisitor inherited from gast.NodeVisitor. + Implement customized NodeVisitor inherited from gast.NodeVisitor. Ancestor nodes are traced to easily support more operations of currently visited node. """ @@ -82,6 +85,7 @@ def visit(self, node): "PolynomialDecay": "polynomial_decay", } +DEL_TEMP_DIR = True # A flag to avoid atexit.register more than once FOR_ITER_INDEX_PREFIX = '__for_loop_var_index' FOR_ITER_TUPLE_PREFIX = '__for_loop_iter_tuple' FOR_ITER_TARGET_PREFIX = '__for_loop_iter_target' @@ -91,12 +95,23 @@ def visit(self, node): FOR_ITER_VAR_NAME_PREFIX = '__for_loop_iter_var' FOR_ITER_ZIP_TO_LIST_PREFIX = '__for_loop_iter_zip' +RE_PYNAME = '[a-zA-Z0-9_]+' +RE_PYMODULE = r'[a-zA-Z0-9_]+\.' + # FullArgSpec is valid from Python3. Defined a Namedtuple to # to make it available in Python2. 
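# A small before/after illustration of what TypeHintTransformer (added above)
# does: it clears argument annotations (gast.Name.annotation) and the return
# annotation (FunctionDef.returns), so the generated source carries no hints.
#
#   def add(x: paddle.Tensor, y: paddle.Tensor) -> paddle.Tensor:   # before
#       return x + y
#
#   def add(x, y):                                                  # after
#       return x + y
#
# It is intended to run after the other transformers, since some of them may
# rely on the hints still being present.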
-FullArgSpec = collections.namedtuple('FullArgSpec', [ - 'args', 'varargs', 'varkw', 'defaults', 'kwonlyargs', 'kwonlydefaults', - 'annotations' -]) +FullArgSpec = collections.namedtuple( + 'FullArgSpec', + [ + 'args', + 'varargs', + 'varkw', + 'defaults', + 'kwonlyargs', + 'kwonlydefaults', + 'annotations', + ], +) def data_layer_not_check(name, shape, dtype='float32', lod_level=0): @@ -106,7 +121,7 @@ def data_layer_not_check(name, shape, dtype='float32', lod_level=0): data can be various-length. This API is used in translating dygraph into static graph. - Note: + Note: The default :code:`stop_gradient` attribute of the Tensor created by this API is true, which means the gradient won't be passed backward through the data Tensor. Set :code:`var.stop_gradient = False` If @@ -117,7 +132,7 @@ def data_layer_not_check(name, shape, dtype='float32', lod_level=0): for more details. shape (list|tuple): List|Tuple of integers declaring the shape. You can set "None" at a dimension to indicate the dimension can be of any - size. For example, it is useful to set changeable batch size as "None" + size. For example, it is useful to set changeable batch size as "None" dtype (np.dtype|VarType|str, optional): The type of the data. Supported dtype: bool, float16, float32, float64, int8, int16, int32, int64, uint8. Default: float32 @@ -134,35 +149,26 @@ def data_layer_not_check(name, shape, dtype='float32', lod_level=0): if shape[i] is None: shape[i] = -1 - return helper.create_global_variable(name=name, - shape=shape, - dtype=dtype, - type=core.VarDesc.VarType.LOD_TENSOR, - stop_gradient=True, - lod_level=lod_level, - is_data=True, - need_check_feed=False) - - -def create_undefined_var_like(variable): - """ create a undefined var with the same shape and dtype like varaible. - """ - from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM - var = data_layer_not_check(unique_name.generate("undefined_var"), - variable.shape, variable.dtype) - var.stop_gradient = False - helper = LayerHelper('create_undefined_var_like', **locals()) - saved_block_ids = helper.main_program.current_block_idx - helper.main_program.current_block_idx = 0 - assign(RETURN_NO_VALUE_MAGIC_NUM, var) - helper.main_program.current_block_idx = saved_block_ids - return var + return helper.create_global_variable( + name=name, + shape=shape, + dtype=dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + stop_gradient=True, + lod_level=lod_level, + is_data=True, + need_check_feed=False, + ) def create_undefined_variable(): - from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_NO_VALUE_MAGIC_NUM - var = data_layer_not_check(unique_name.generate("undefined_var"), [1], - "float64") + from paddle.fluid.dygraph.dygraph_to_static.return_transformer import ( + RETURN_NO_VALUE_MAGIC_NUM, + ) + + var = data_layer_not_check( + unique_name.generate("undefined_var"), [1], "float64" + ) var.stop_gradient = False # the variable is created in block(0), we append assign in block(0) either. helper = LayerHelper('create_undefined_variable', **locals()) @@ -174,17 +180,16 @@ def create_undefined_variable(): class UndefinedVar: - def __init__(self, name): self.name = name def check(self): raise UnboundLocalError( - "local variable '{}' should be created before using it.") + "local variable '{}' should be created before using it." 
+ ) class Dygraph2StaticException(Exception): - def __init__(self, message): super().__init__(message) @@ -201,13 +206,15 @@ def getfullargspec(target): return inspect.getfullargspec(target) else: argspec = inspect.getargspec(target) - return FullArgSpec(args=argspec.args, - varargs=argspec.varargs, - varkw=argspec.keywords, - defaults=argspec.defaults, - kwonlyargs=[], - kwonlydefaults=None, - annotations={}) + return FullArgSpec( + args=argspec.args, + varargs=argspec.varargs, + varkw=argspec.keywords, + defaults=argspec.defaults, + kwonlyargs=[], + kwonlydefaults=None, + annotations={}, + ) def parse_arg_and_kwargs(function): @@ -224,7 +231,7 @@ def parse_arg_and_kwargs(function): default_values = fullargspec.defaults if default_values: assert len(default_values) <= len(arg_names) - default_kwarg_names = arg_names[-len(default_values):] + default_kwarg_names = arg_names[-len(default_values) :] default_kwargs = dict(zip(default_kwarg_names, default_values)) return arg_names, default_kwargs @@ -298,8 +305,9 @@ def is_api_in_module(node, module_prefix): from paddle.fluid.dygraph import to_variable from paddle import to_tensor - return eval("_is_api_in_module_helper({}, '{}')".format( - func_str, module_prefix)) + return eval( + "_is_api_in_module_helper({}, '{}')".format(func_str, module_prefix) + ) except Exception: return False @@ -330,8 +338,10 @@ def is_numpy_api(node): func_str = astor.to_source(gast.gast_to_ast(node.func)) try: import numpy as np - module_result = eval("_is_api_in_module_helper({}, '{}')".format( - func_str, "numpy")) + + module_result = eval( + "_is_api_in_module_helper({}, '{}')".format(func_str, "numpy") + ) # BUG: np.random.uniform doesn't have module and cannot be analyzed # TODO: find a better way if not module_result: @@ -340,18 +350,19 @@ def is_numpy_api(node): return False -def is_control_flow_to_transform(node, - static_analysis_visitor=None, - var_name_to_type=None): +def is_control_flow_to_transform( + node, static_analysis_visitor=None, var_name_to_type=None +): """ Determines whether the node is a PaddlePaddle control flow statement which needs to be transformed into a static graph control flow statement. """ - assert isinstance(node, gast.AST), \ - "The type of input node must be gast.AST, but received %s." % type(node) - visitor = IsControlFlowVisitor(node, - static_analysis_visitor, - node_var_type_map=var_name_to_type) + assert isinstance( + node, gast.AST + ), "The type of input node must be gast.AST, but received %s." 
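# A small example of what parse_arg_and_kwargs above returns (the function
# _demo is made up):

def _demo(a, b=3, *args, **kwargs):
    pass

# parse_arg_and_kwargs(_demo) -> (['a', 'b'], {'b': 3})
# i.e. the declared positional/keyword argument names plus a dict that maps
# the defaulted names to their default values.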
% type(node) + visitor = IsControlFlowVisitor( + node, static_analysis_visitor, node_var_type_map=var_name_to_type + ) need_to_transform = visitor.transform() return need_to_transform @@ -360,6 +371,7 @@ def _delete_keywords_from(node): assert isinstance(node, gast.Call) func_src = astor.to_source(gast.gast_to_ast(node.func)) import paddle.fluid as fluid + full_args = eval("inspect.getargspec({})".format(func_src)) full_args_name = full_args[0] @@ -373,7 +385,8 @@ def to_static_api(dygraph_class): else: raise NotImplementedError( "Paddle dygraph API {} cannot be converted " - "to static graph at present.".format(dygraph_class)) + "to static graph at present.".format(dygraph_class) + ) def _add_keywords_to(node, dygraph_api_name): @@ -384,8 +397,10 @@ def _add_keywords_to(node, dygraph_api_name): ast_keyword.arg = "size" node.keywords.append( - gast.keyword(arg="num_flatten_dims", - value=gast.Constant(value=-1, kind=None))) + gast.keyword( + arg="num_flatten_dims", value=gast.Constant(value=-1, kind=None) + ) + ) if dygraph_api_name == "BilinearTensorProduct": for ast_keyword in node.keywords: @@ -404,15 +419,17 @@ def to_static_ast(node, class_node): assert isinstance(class_node, gast.Call) static_api = to_static_api(class_node.func.attr) - node.func = gast.Attribute(attr=static_api, - ctx=gast.Load(), - value=gast.Attribute(attr='layers', - ctx=gast.Load(), - value=gast.Name( - ctx=gast.Load(), - id='fluid', - annotation=None, - type_comment=None))) + node.func = gast.Attribute( + attr=static_api, + ctx=gast.Load(), + value=gast.Attribute( + attr='layers', + ctx=gast.Load(), + value=gast.Name( + ctx=gast.Load(), id='fluid', annotation=None, type_comment=None + ), + ), + ) update_args_of_func(node, class_node, 'forward') @@ -435,10 +452,13 @@ def update_args_of_func(node, dygraph_node, method_name): class_src = astor.to_source(gast.gast_to_ast(dygraph_node.func)) import paddle.fluid as fluid + if method_name == "__init__" or eval( - "issubclass({}, fluid.dygraph.Layer)".format(class_src)): - full_args = eval("inspect.getargspec({}.{})".format( - class_src, method_name)) + "issubclass({}, fluid.dygraph.Layer)".format(class_src) + ): + full_args = eval( + "inspect.getargspec({}.{})".format(class_src, method_name) + ) full_args_name = [ arg_name for arg_name in full_args[0] if arg_name != "self" ] @@ -453,21 +473,24 @@ def update_args_of_func(node, dygraph_node, method_name): def create_api_shape_node(tensor_shape_node): - assert isinstance(tensor_shape_node, - (gast.Name, gast.Attribute, gast.Subscript)) + assert isinstance( + tensor_shape_node, (gast.Name, gast.Attribute, gast.Subscript) + ) if isinstance(tensor_shape_node, gast.Name): api_shape_node = gast.Call( func=gast.parse('paddle.shape').body[0].value, args=[tensor_shape_node], - keywords=[]) + keywords=[], + ) return api_shape_node if isinstance(tensor_shape_node, gast.Attribute): api_shape_node = gast.Call( func=gast.parse('paddle.shape').body[0].value, args=[tensor_shape_node.value], - keywords=[]) + keywords=[], + ) return api_shape_node if isinstance(tensor_shape_node, gast.Subscript): @@ -477,14 +500,15 @@ def create_api_shape_node(tensor_shape_node): def get_constant_variable_node(name, value, shape=[1], dtype='int64'): - return gast.parse('%s = paddle.full(%s, "%s", %s)' % - (name, str(shape), str(value), dtype)) + return gast.parse( + '%s = paddle.full(%s, "%s", %s)' % (name, str(shape), str(value), dtype) + ) def get_attribute_full_name(node): assert isinstance( - node, - gast.Attribute), "Input non-Attribute node to get 
attribute full name" + node, gast.Attribute + ), "Input non-Attribute node to get attribute full name" return astor.to_source(gast.gast_to_ast(node)).strip() @@ -502,15 +526,15 @@ def generate_name_node(name_ids, ctx=gast.Load(), gen_tuple_if_single=False): name_ids = [name_ids] if not isinstance(name_ids, (list, tuple, set)): raise TypeError( - 'name_ids must be list or tuple or set, but received %s' % - type(type(name_ids))) + 'name_ids must be list or tuple or set, but received %s' + % type(type(name_ids)) + ) def create_node_for_name(name): if '.' not in name: - return gast.Name(id=name, - ctx=ctx, - annotation=None, - type_comment=None) + return gast.Name( + id=name, ctx=ctx, annotation=None, type_comment=None + ) return gast.parse(name).body[0].value gast_names = [create_node_for_name(name_id) for name_id in name_ids] @@ -532,12 +556,14 @@ def create_funcDef_node(nodes, name, input_args, return_name_ids): nodes.append(gast.Return(value=generate_name_node(return_name_ids))) else: nodes.append(gast.Return(value=None)) - func_def_node = gast.FunctionDef(name=name, - args=input_args, - body=nodes, - decorator_list=[], - returns=None, - type_comment=None) + func_def_node = gast.FunctionDef( + name=name, + args=input_args, + body=nodes, + decorator_list=[], + returns=None, + type_comment=None, + ) return func_def_node @@ -558,6 +584,23 @@ def create_assign_node(name, node): return targets, assign_node +def get_temp_dir(): + """ + Return @to_static temp directory. + """ + dir_name = "paddle/to_static_tmp/{pid}".format(pid=os.getpid()) + temp_dir = os.path.join(os.path.expanduser('~/.cache'), dir_name) + + is_windows = sys.platform.startswith('win') + if is_windows: + temp_dir = os.path.normpath(temp_dir) + + if not os.path.exists(temp_dir): + os.makedirs(temp_dir) + + return temp_dir + + def ast_to_func(ast_root, dyfunc, delete_on_exit=True): """ Transform modified AST of decorated function into python callable object. @@ -565,27 +608,42 @@ def ast_to_func(ast_root, dyfunc, delete_on_exit=True): function, the other inner functions are invisible for the decorated function. """ - def remove_if_exit(filepath): - if os.path.exists(filepath): - os.remove(filepath) + def remove_if_exit(dir_path): + if os.path.exists(dir_path): + shutil.rmtree(dir_path) + + def func_prefix(func): + pre_fix = func.__name__ + if hasattr(func, '__self__'): + try: + pre_fix = func.__self__.__class__.__name__ + '_' + func.__name__ + except: + pass + return pre_fix source = ast_to_source_code(ast_root) source = _inject_import_statements() + source - - f = tempfile.NamedTemporaryFile(mode='w', - suffix='.py', - delete=False, - encoding='utf-8') + temp_dir = get_temp_dir() + f = tempfile.NamedTemporaryFile( + mode='w', + prefix=func_prefix(dyfunc), + suffix='.py', + delete=False, + dir=temp_dir, + encoding='utf-8', + ) with f: module_name = os.path.basename(f.name[:-3]) f.write(source) - if delete_on_exit: - atexit.register(lambda: remove_if_exit(f.name)) - atexit.register(lambda: remove_if_exit(f.name[:-3] + ".pyc")) + global DEL_TEMP_DIR + if delete_on_exit and DEL_TEMP_DIR: + # Clear temporary files in TEMP_DIR while exitting Python process + atexit.register(remove_if_exit, dir_path=temp_dir) + DEL_TEMP_DIR = False - module = SourceFileLoader(module_name, f.name).load_module() func_name = dyfunc.__name__ + module = SourceFileLoader(module_name, f.name).load_module() # The 'forward' or 'another_forward' of 'TranslatedLayer' cannot be obtained # through 'func_name'. So set the special function name '__i_m_p_l__'. 
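# Where the generated sources now live, following get_temp_dir above (a
# minimal sketch that just recomputes the path):

import os
import sys

dir_name = "paddle/to_static_tmp/{pid}".format(pid=os.getpid())
temp_dir = os.path.join(os.path.expanduser('~/.cache'), dir_name)
if sys.platform.startswith('win'):
    temp_dir = os.path.normpath(temp_dir)
print(temp_dir)   # e.g. ~/.cache/paddle/to_static_tmp/<pid> on Linux

# Each transformed function is written there with a prefix derived from the
# function (or Class_method) name, and the whole directory is removed once at
# interpreter exit instead of deleting individual files.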
if hasattr(module, '__i_m_p_l__'): @@ -595,8 +653,9 @@ def remove_if_exit(filepath): callable_func = getattr(module, func_name) else: raise ValueError( - 'Function: %s doesn\'t exist in the Module transformed from AST.' % - func_name) + 'Function: %s doesn\'t exist in the Module transformed from AST.' + % func_name + ) # After transform dygraph function into callable_func saved in tmp file, # it lost the global variables from imported statements or defined in source file. # Recovers the necessary variables by `__globals__`. @@ -607,10 +666,14 @@ def remove_if_exit(filepath): def _inject_import_statements(): import_statements = [ - "import paddle", "from paddle import Tensor", - "import paddle.fluid as fluid", "import paddle.jit.dy2static as _jst", - "from typing import *", "import numpy as np", "import warnings", - "warnings.filterwarnings('ignore', category=DeprecationWarning)" + "import paddle", + "from paddle import Tensor", + "import paddle.fluid as fluid", + "import paddle.jit.dy2static as _jst", + "from typing import *", + "import numpy as np", + "import warnings", + "warnings.filterwarnings('ignore', category=DeprecationWarning)", ] return '\n'.join(import_statements) + '\n' @@ -633,8 +696,10 @@ def func_to_source_code(function, dedent=True): """ if not (inspect.isfunction(function) or inspect.ismethod(function)): raise TypeError( - "The type of 'function' should be a function or method, but received {}." - .format(type(function).__name__)) + "The type of 'function' should be a function or method, but received {}.".format( + type(function).__name__ + ) + ) source_code_list, _ = inspect.getsourcelines(function) # Replace comments with blank lines so that error messages are not misplaced source_code_list = [ @@ -654,8 +719,9 @@ def ast_to_source_code(ast_node): """ if not isinstance(ast_node, (gast.AST, ast.AST)): raise TypeError( - "Type of ast_root should be gast.AST or ast.AST, but received %s." % - type(ast_node)) + "Type of ast_root should be gast.AST or ast.AST, but received %s." + % type(ast_node) + ) if isinstance(ast_node, gast.AST): ast_node = gast.gast_to_ast(ast_node) @@ -671,8 +737,17 @@ def is_candidate_node(node): """ Nodes with specified type will be dependent on tensor. """ - is_compare_node = isinstance(node, (gast.Compare, gast.BoolOp, gast.UnaryOp, - gast.For, gast.If, gast.While)) + is_compare_node = isinstance( + node, + ( + gast.Compare, + gast.BoolOp, + gast.UnaryOp, + gast.For, + gast.If, + gast.While, + ), + ) # TODO(Aurelius84): `.numpy()` may be an customized function, # and should consider a more elegant way to solve this problem. has_numpy_attr = ".numpy()" in ast_to_source_code(node) @@ -688,9 +763,9 @@ def compare_with_none(node): # node.comparators is a list. if isinstance(child, list): child = child[0] - if (isinstance(child, gast.Constant) - and child.value is None) or (isinstance(child, gast.Name) - and child.id == 'None'): + if (isinstance(child, gast.Constant) and child.value is None) or ( + isinstance(child, gast.Name) and child.id == 'None' + ): return True return False @@ -725,20 +800,22 @@ class IsControlFlowVisitor(gast.NodeVisitor): because reshape_op may be called before this statement. """ - def __init__(self, - ast_node, - static_analysis_visitor=None, - node_var_type_map=None): + def __init__( + self, ast_node, static_analysis_visitor=None, node_var_type_map=None + ): assert isinstance( ast_node, gast.AST ), "Type of input node should be gast.AST, but received %s." 
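# Every generated temp file begins with the statements returned by
# _inject_import_statements above, i.e.
#
#   import paddle
#   from paddle import Tensor
#   import paddle.fluid as fluid
#   import paddle.jit.dy2static as _jst
#   from typing import *
#   import numpy as np
#   import warnings
#   warnings.filterwarnings('ignore', category=DeprecationWarning)
#
# followed by the transformed function source. The callable is then loaded
# back with SourceFileLoader and its original __globals__ are patched in so
# names defined next to the dygraph function remain visible.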
% type( - ast_node) + ast_node + ) self.ast_root = ast_node if static_analysis_visitor is None: from .static_analysis import StaticAnalysisVisitor + static_analysis_visitor = StaticAnalysisVisitor(ast_node) self.static_analysis_visitor = static_analysis_visitor - self.node_to_wrapper_map = self.static_analysis_visitor.get_node_to_wrapper_map( + self.node_to_wrapper_map = ( + self.static_analysis_visitor.get_node_to_wrapper_map() ) self.node_var_type_map = node_var_type_map @@ -767,7 +844,10 @@ def _visit_For(self, node): if isinstance(node.iter, gast.Call): # for in range(var[0]|var.numpy()[0]) or for in enumerate(var|var.numpy()) if isinstance(node.iter.func, gast.Name): - if node.iter.func.id == "range" or node.iter.func.id == "enumerate": + if ( + node.iter.func.id == "range" + or node.iter.func.id == "enumerate" + ): for arg in node.iter.args: self.visit(arg) else: @@ -866,7 +946,9 @@ def visit_Constant(self, node): return node def _is_node_with_tensor(self, node, name_id): - from paddle.fluid.dygraph.dygraph_to_static.static_analysis import NodeVarType + from paddle.fluid.dygraph.dygraph_to_static.static_analysis import ( + NodeVarType, + ) # Look up the node_var_type_map by name_id. if self.node_var_type_map: @@ -896,7 +978,7 @@ def _is_wrapped(f): return hasattr(f, '__wrapped__') unwrapped_f = func - while (_is_wrapped(unwrapped_f)): + while _is_wrapped(unwrapped_f): unwrapped_f = unwrapped_f.__wrapped__ return unwrapped_f @@ -920,10 +1002,12 @@ def input_specs_compatible(src_input_specs, desired_input_specs): if spec not in desired_input_specs: return False else: - for (src_spec, desired_spec) in zip(src_input_specs, - desired_input_specs): + for (src_spec, desired_spec) in zip( + src_input_specs, desired_input_specs + ): if isinstance(src_spec, paddle.static.InputSpec) or isinstance( - desired_spec, paddle.static.InputSpec): + desired_spec, paddle.static.InputSpec + ): if not _compatible_tensor_spec(src_spec, desired_spec): return False else: @@ -1008,15 +1092,14 @@ def slice_is_num(slice_node): class NameScope: - def __init__(self): - """ - A NameScope is a object which manager all the variable names. - only FunctionDef and Controlflow node will have a namescope property. + """ + A NameScope is a object which manager all the variable names. + only FunctionDef and Controlflow node will have a namescope property. - type can be "function" and "controlflow" + type can be "function" and "controlflow" - we don't analyze the read only variable because they don't affect the analysis. + we don't analyze the read only variable because they don't affect the analysis. """ self.globals = set() self.nonlocals = set() @@ -1024,6 +1107,7 @@ def __init__(self): self.father = None # point to the nearest function name scope. self.w_vars = set() # all qualified + normal names been stored self.created = set() # useful for control flow compatibility + # only valid in control_flow nodes # may be remove later. self.push_pop_vars = set() # we call push and pop in the vars @@ -1031,8 +1115,8 @@ def set_father(self, father): self.father = father def existed_vars(self): - """ vars existing in current scope. - they must not contain qualified names. + """vars existing in current scope. + they must not contain qualified names. """ local_vars = self.w_vars - self.globals - self.nonlocals - self.args return set(filter(lambda x: '.' 
not in x, local_vars)) @@ -1045,15 +1129,59 @@ def modified_vars(self): return self.w_vars def variadic_length_vars(self): - return self.push_pop_vars + """ + At present, we do not support global append, such as + + import numpy as np + a = [] + def func(): + a.append() # global names `a`, we will raise a warning. + p.append(a, 1) # global names `np`, we will raise a warning. + """ + non_global_push_pop_names = [] + for var in self.push_pop_vars: + if self._is_simple_name(var) and self.is_global_var(var): + warnings.warn( + f"Find variable `{var}` defined in global scope" + f" and call `{var}.append() or {var}.pop()`" + f", which will be ignored and never be transfered into" + f" tensor array." + ) + else: + non_global_push_pop_names.append(var) + return set(non_global_push_pop_names) def control_flow_vars(self): valid_names = self.w_vars - tmp = self.father.global_vars & valid_names, + tmp = (self.father.global_vars & valid_names,) return {"global": tmp, "nonlocal": self.w_vars - tmp} - def global_vars(self): - return self.globals + def _is_simple_name(self, name): + if '.' in name or '[' in name: + return False + return True + + def is_global_var(self, name): + """ + Return whether the name is a var created in global scope. + Search from bottom to top. If it is not created or modified, + it means global vars; otherwise, it means local vars. + Only valid after FunctionNameLivenessAnalysis visitor. + """ + assert self._is_simple_name( + name + ), "is_global_var accept a simple name, but get `{name}`." + ancestor = self + while ancestor is not None: + if name in ancestor.globals: + return True + if name in (ancestor.nonlocals | ancestor.w_vars): + return False + ancestor = ancestor.father + return True + + def is_local_var(self, name): + return not self.is_global_var(name) def merge_from(self, name_scope): self.globals |= name_scope.globals @@ -1064,46 +1192,46 @@ def merge_from(self, name_scope): class FunctionNameLivenessAnalysis(gast.NodeVisitor): - """ analyze the liveness of a function. - - every variables stored in this scope will be collected, - in addition with global/nonlocal information and - push_pop information. - - 1. global variable is stored in node.var_globals. - 2. nonlocal variable is stored in node.var_nonlocals. - 3. arguments is stored in node.var_args. - 4. if a variable's push and pop attribute is called, - it will be collected in push_pop_vars. They are - used for transformation to tensor_array. - NOTE: push_pop_vars **may not** in w_vars. - a.push(0) don't modify the variable a, but the content - of a. - - For example: - - def func(*args, **kargs): - a = 12 - global i,j - nonlocal x,y - print(a) - i = k - b = [] - c = [1,2,3] - for m in range(10): - q = 12 - b.push(1) - c.pop() - - After this visitor we have: - # node is the FunctionDef node with name: "func" - node.pd_scope = NameScope( - globals = ['i', 'j'], - nonlocals = ['x', 'y'], - args = ['args', 'kargs'], - wr_vars = ['a', 'i', 'q', 'm', 'c', 'b'] - push_pop_vars = ['b', 'c'] - ) + """analyze the liveness of a function. + + every variables stored in this scope will be collected, + in addition with global/nonlocal information and + push_pop information. + + 1. global variable is stored in node.var_globals. + 2. nonlocal variable is stored in node.var_nonlocals. + 3. arguments is stored in node.var_args. + 4. if a variable's push and pop attribute is called, + it will be collected in push_pop_vars. They are + used for transformation to tensor_array. + NOTE: push_pop_vars **may not** in w_vars. 
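# What the warning in variadic_length_vars above is about (a made-up example;
# `history` and `forward` are illustrative names):

history = []                   # created in the global scope

def forward(x):                # imagine this function is decorated by @to_static
    history.append(x)          # `history` is global, so this append is ignored
                               # by the list transformation (a warning is shown)
    buf = []                   # a local list: its append/pop can be considered
    buf.append(x * 2)          # for conversion into a tensor array
    return buf[0]

# is_global_var walks the NameScope chain from the innermost scope outwards:
# a simple name that is never written or declared nonlocal along the way is
# treated as global.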
+ a.push(0) don't modify the variable a, but the content + of a. + + For example: + + def func(*args, **kargs): + a = 12 + global i,j + nonlocal x,y + print(a) + i = k + b = [] + c = [1,2,3] + for m in range(10): + q = 12 + b.push(1) + c.pop() + + After this visitor we have: + # node is the FunctionDef node with name: "func" + node.pd_scope = NameScope( + globals = ['i', 'j'], + nonlocals = ['x', 'y'], + args = ['args', 'kargs'], + wr_vars = ['a', 'i', 'q', 'm', 'c', 'b'] + push_pop_vars = ['b', 'c'] + ) """ def __init__(self, root_node): @@ -1123,25 +1251,26 @@ def _current_name_scope(self): return self._get_name_scope(self.scope_node_stack[-1]) def _father_name_scope(self): - if len(self.scope_node_stack) == 1: return None + if len(self.scope_node_stack) == 1: + return None return self._get_name_scope(self.scope_node_stack[-2]) def _nearest_function_scope(self): - if len(self.scope_node_stack) == 1: return None + if len(self.scope_node_stack) == 1: + return None for node in self.scope_node_stack[-2::-1]: if isinstance(node, gast.FunctionDef): return self._get_name_scope(node) def visit_ListComp(self, node): - """ [ i for i in range(10) ] - In this case, `i` will not created in FunctionScope. - We don't collect `i` by not calling generic_visit. + """[ i for i in range(10) ] + In this case, `i` will not created in FunctionScope. + We don't collect `i` by not calling generic_visit. """ pass def visit_DictComp(self, node): - """ the same as ListComp. - """ + """the same as ListComp.""" pass def visit_Name(self, node): @@ -1151,62 +1280,86 @@ def visit_Name(self, node): self._current_name_scope().w_vars.add(node.id) def visit_FunctionDef(self, node): - def pre_func(): self._current_name_scope().args |= set( - self._get_argument_names(node)) + self._get_argument_names(node) + ) def post_func(): - """ NOTE: why we need merge w_vars and push_pop_vars here ? - because we do ifelse_transformer after loop_transformer. Loops will changed into functioons. but we know this function will be called in if. so we add w_vars to father function scope. + """NOTE: why we need merge w_vars and push_pop_vars here ? + because we do ifelse_transformer after loop_transformer. Loops will changed into functioons. but we know this function will be called in if. so we add w_vars to father function scope. 
""" - from paddle.fluid.dygraph.dygraph_to_static.loop_transformer import WHILE_CONDITION_PREFIX, WHILE_BODY_PREFIX, FOR_CONDITION_PREFIX, FOR_BODY_PREFIX - from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import TRUE_FUNC_PREFIX, FALSE_FUNC_PREFIX + from paddle.fluid.dygraph.dygraph_to_static.loop_transformer import ( + WHILE_CONDITION_PREFIX, + WHILE_BODY_PREFIX, + FOR_CONDITION_PREFIX, + FOR_BODY_PREFIX, + ) + from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import ( + TRUE_FUNC_PREFIX, + FALSE_FUNC_PREFIX, + ) + control_flow_function_def = [ - WHILE_BODY_PREFIX, WHILE_BODY_PREFIX, FOR_CONDITION_PREFIX, - FOR_BODY_PREFIX, TRUE_FUNC_PREFIX, FALSE_FUNC_PREFIX + WHILE_BODY_PREFIX, + WHILE_BODY_PREFIX, + FOR_CONDITION_PREFIX, + FOR_BODY_PREFIX, + TRUE_FUNC_PREFIX, + FALSE_FUNC_PREFIX, ] def is_control_flow_def_node(): for prefix in control_flow_function_def: - if node.name.startswith(prefix): return True + if node.name.startswith(prefix): + return True return False if self._father_name_scope() and is_control_flow_def_node(): - self._father_name_scope().w_vars |= self._current_name_scope( - ).w_vars - self._father_name_scope( - ).push_pop_vars |= self._current_name_scope().push_pop_vars + self._father_name_scope().w_vars |= ( + self._current_name_scope().w_vars + ) + self._father_name_scope().push_pop_vars |= ( + self._current_name_scope().push_pop_vars + ) self._visit_scope_node(node, pre_func, post_func) def _visit_scope_node(self, node, pre_func, post_func): - """ scope node main visit logic. - pre_func and post_func is callbacks + """scope node main visit logic. + pre_func and post_func is callbacks """ self._reset_name_scope(node) self.scope_node_stack.append(node) - self._current_name_scope().father = self._nearest_function_scope() - if pre_func: pre_func() + self._current_name_scope().set_father(self._nearest_function_scope()) + if pre_func: + pre_func() self.generic_visit(node) - if post_func: post_func() + if post_func: + post_func() self.scope_node_stack.pop() def _visit_controlflow_node(self, node): - def post_func(): self._father_name_scope().merge_from(self._current_name_scope()) self._nearest_function_scope().merge_from( - self._current_name_scope()) - self._current_name_scope().created = self._nearest_function_scope( - ).existed_vars() - node.before_created + self._current_name_scope() + ) + self._current_name_scope().created = ( + self._nearest_function_scope().existed_vars() + - node.before_created + ) # gather created vars into father and used in CreateUndefinedVarTransform - self._nearest_function_scope().created |= self._current_name_scope( - ).created + self._nearest_function_scope().created |= ( + self._current_name_scope().created + ) def pre_func(): - setattr(node, "before_created", - self._nearest_function_scope().existed_vars()) + setattr( + node, + "before_created", + self._nearest_function_scope().existed_vars(), + ) self._visit_scope_node(node, pre_func, post_func) @@ -1244,12 +1397,13 @@ def visit_Call(self, node): self._current_name_scope().push_pop_vars.add(name) def _get_argument_names(self, node): - """ get all arguments name in the functiondef node. - this node is local to the function and shouldn't - be created. + """get all arguments name in the functiondef node. + this node is local to the function and shouldn't + be created. 
""" assert isinstance( - node, gast.FunctionDef), "Input node is not function define node" + node, gast.FunctionDef + ), "Input node is not function define node" names = [a for a in node.args.args] names.append(node.args.vararg) names.append(node.args.kwarg) @@ -1270,20 +1424,19 @@ def empty_node(): func_def = """ def {func_name}(): return - """.format(func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX)) + """.format( + func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX) + ) return gast.parse(textwrap.dedent(func_def)).body[0] assert isinstance(names, (list, tuple)) - mapped = list(filter(lambda n: '.' not in n, names)) - nonlocal_names = sorted( - mapped, - key=mapped.index) # to keep the order, we can't use set() to unique + node = create_nonlocal_stmt_nodes(names) if not names: return empty_node() - if not nonlocal_names: + if node == []: nonlocal_vars = "\n" else: - nonlocal_vars = "nonlocal " + ",".join(nonlocal_names) + nonlocal_vars = ast_to_source_code(node[0]) template = """ def {func_name}(): {nonlocal_vars} @@ -1292,7 +1445,8 @@ def {func_name}(): func_def = template.format( func_name=unique_name.generate(GET_ARGS_FUNC_PREFIX), nonlocal_vars=nonlocal_vars, - vars=",".join(names)) + vars=",".join(names), + ) return gast.parse(textwrap.dedent(func_def)).body[0] @@ -1309,21 +1463,19 @@ def empty_node(): func_def = """ def {func_name}({args}): pass - """.format(func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), - args=ARGS_NAME) + """.format( + func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), args=ARGS_NAME + ) return gast.parse(textwrap.dedent(func_def)).body[0] assert isinstance(names, (list, tuple)) - mapped = list(filter(lambda n: '.' not in n, names)) - nonlocal_names = sorted( - mapped, - key=mapped.index) # to keep the order, we can't use set() to unique + node = create_nonlocal_stmt_nodes(names) if not names: return empty_node() - if not nonlocal_names: + if node == []: nonlocal_vars = "\n" else: - nonlocal_vars = "nonlocal " + ",".join(nonlocal_names) + nonlocal_vars = ast_to_source_code(node[0]) template = """ def {func_name}({args}): {nonlocal_vars} @@ -1333,7 +1485,8 @@ def {func_name}({args}): func_name=unique_name.generate(SET_ARGS_FUNC_PREFIX), args=ARGS_NAME, nonlocal_vars=nonlocal_vars, - vars=",".join(names)) + vars=",".join(names), + ) return gast.parse(textwrap.dedent(func_def)).body[0] @@ -1341,9 +1494,10 @@ def create_nonlocal_stmt_nodes(names): assert isinstance(names, (list, tuple)) mapped = list(filter(lambda n: '.' not in n, names)) + mapped = list(filter(lambda n: '[' not in n, mapped)) names = sorted( - mapped, - key=mapped.index) # to keep the order, we can't use set() to unique + mapped, key=mapped.index + ) # to keep the order, we can't use set() to unique if not names: return [] func_code = "nonlocal {}".format(','.join(names)) @@ -1351,10 +1505,10 @@ def create_nonlocal_stmt_nodes(names): class GetterSetterHelper: - """ we have two classes of names in setter and getter function: - w_vars(loop_vars) + push_pop_vars - To simplify the setter logic in convert_while and convert_cond, - we extract the helper class here. + """we have two classes of names in setter and getter function: + w_vars(loop_vars) + push_pop_vars + To simplify the setter logic in convert_while and convert_cond, + we extract the helper class here. 
""" def __init__(self, getter_func, setter_func, *name_lists): @@ -1370,22 +1524,33 @@ def union(self): return self._union def get(self, names): - if names is None: names = [] + if names is None: + names = [] vars = self.getter() - if vars is None: return tuple() + if vars is None: + return tuple() for n in names: - assert n in self.name2id, "the name `{}` not in name union set`{}`.".format( - n, self.name2id.keys()) + assert ( + n in self.name2id + ), "the name `{}` not in name union set`{}`.".format( + n, self.name2id.keys() + ) return tuple(map(lambda n: vars[self.name2id[n]], names)) def set(self, names, values): - if names is None: names = [] - if values is None: values = [] + if names is None: + names = [] + if values is None: + values = [] vars = self.getter() - if vars is None: return + if vars is None: + return for n in names: - assert n in self.name2id, "the name `{}` not in name union set`{}`.".format( - n, self.name2id.keys()) + assert ( + n in self.name2id + ), "the name `{}` not in name union set`{}`.".format( + n, self.name2id.keys() + ) vars = list(vars) indices = list(map(lambda n: self.name2id[n], names)) for i, v in zip(indices, values): @@ -1400,5 +1565,5 @@ def create_name_str(name_ids): if not name_ids: return 'None' - names_str = ["'%s'" % name for name in name_ids] + names_str = ["'%s'" % (name.replace("'", "\\'")) for name in name_ids] return "(%s, )" % ','.join(names_str) diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index eb4fdc682a7afb..5b6f095df99964 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -30,8 +30,14 @@ from paddle.fluid.layers.utils import _hash_with_id from paddle.fluid.dygraph.base import switch_to_static_graph from paddle.fluid.framework import _non_static_mode -from paddle.fluid.executor import _is_enable_standalone_executor, _is_dy2st_enable_standalone_executor -from paddle.fluid.dygraph.dygraph_to_static.partial_program import add_build_strategy_for, LazyInitialized +from paddle.fluid.executor import ( + _is_enable_standalone_executor, + _is_dy2st_enable_standalone_executor, +) +from paddle.fluid.dygraph.dygraph_to_static.partial_program import ( + add_build_strategy_for, + LazyInitialized, +) from paddle import _C_ops, _legacy_C_ops __all__ = ['TranslatedLayer'] @@ -53,17 +59,20 @@ def _load_program_desc(model_file_path): program_desc = core.ProgramDesc(program_desc_str) if not core._is_program_version_supported(program_desc._version()): - raise ValueError("Unsupported program version: %d\n" % - program_desc._version()) + raise ValueError( + "Unsupported program version: %d\n" % program_desc._version() + ) return program_desc def _is_persistable(var_desc): - if var_desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var_desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var_desc.type() == core.VarDesc.VarType.READER or \ - var_desc.type() == core.VarDesc.VarType.RAW: + if ( + var_desc.type() == core.VarDesc.VarType.FEED_MINIBATCH + or var_desc.type() == core.VarDesc.VarType.FETCH_LIST + or var_desc.type() == core.VarDesc.VarType.READER + or var_desc.type() == core.VarDesc.VarType.RAW + ): return False return var_desc.persistable() @@ -208,9 +217,11 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None): name_old = var.name() is_double_grad_var = "@GRAD" in name_old has_double_grad = has_double_grad or is_double_grad_var - should_rename = (include is None or name_old in include) and ( - exclude is None - or name_old not in exclude) and not 
is_double_grad_var + should_rename = ( + (include is None or name_old in include) + and (exclude is None or name_old not in exclude) + and not is_double_grad_var + ) if should_rename: temp_name = name_old.split('_') if len(temp_name) > 1 and temp_name[-1].isnumeric(): @@ -219,15 +230,19 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None): temp_name = name_old while True: name_new = _generate_unique_var_name_sync_with_main_program( - temp_name) - if name_new not in old_names[:var_idx] + old_names[var_idx + - 1:]: + temp_name + ) + if ( + name_new + not in old_names[:var_idx] + old_names[var_idx + 1 :] + ): break else: name_new = name_old if name_old != name_new: - cur_block._rename_var(cpt.to_bytes(name_old), - cpt.to_bytes(name_new)) + cur_block._rename_var( + cpt.to_bytes(name_old), cpt.to_bytes(name_new) + ) if not is_double_grad_var: dict_rename_var_old_new[name_old] = name_new dict_rename_var_new_old[name_new] = name_old @@ -242,13 +257,16 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None): var_name = var.name() if "@GRAD" in var_name and name_old in var_name: new_var_name = var_name.replace( - name_old, dict_rename_var_old_new[name_old]) + name_old, dict_rename_var_old_new[name_old] + ) double_grad_rename_dict[var_name] = new_var_name for var_name in double_grad_rename_dict: dict_rename_var_old_new[var_name] = double_grad_rename_dict[ - var_name] + var_name + ] dict_rename_var_new_old[ - double_grad_rename_dict[var_name]] = var_name + double_grad_rename_dict[var_name] + ] = var_name # Rename on program desc for b_idx in six.moves.range(program_desc.num_blocks()): @@ -257,27 +275,38 @@ def _rename_var_program_desc(program_desc, include=None, exclude=None): op = cur_block.op(op_idx) for input_arg_name in op.input_arg_names(): if input_arg_name in dict_rename_var_old_new: - if input_arg_name != dict_rename_var_old_new[input_arg_name]: + if ( + input_arg_name + != dict_rename_var_old_new[input_arg_name] + ): op._rename_input( input_arg_name, - dict_rename_var_old_new[input_arg_name]) + dict_rename_var_old_new[input_arg_name], + ) if cur_block.has_var(cpt.to_bytes(input_arg_name)): cur_block._rename_var( cpt.to_bytes(input_arg_name), cpt.to_bytes( - dict_rename_var_old_new[input_arg_name])) + dict_rename_var_old_new[input_arg_name] + ), + ) for output_arg_name in op.output_arg_names(): if output_arg_name in dict_rename_var_old_new: - if output_arg_name != dict_rename_var_old_new[ - output_arg_name]: + if ( + output_arg_name + != dict_rename_var_old_new[output_arg_name] + ): op._rename_output( output_arg_name, - dict_rename_var_old_new[output_arg_name]) + dict_rename_var_old_new[output_arg_name], + ) if cur_block.has_var(cpt.to_bytes(output_arg_name)): cur_block._rename_var( cpt.to_bytes(output_arg_name), cpt.to_bytes( - dict_rename_var_old_new[output_arg_name])) + dict_rename_var_old_new[output_arg_name] + ), + ) program_desc.flush() return dict_rename_var_new_old, dict_rename_var_old_new @@ -308,8 +337,8 @@ class _ProgramHolder(object): """ Holds the execution information of a Program. - _ProgramHolder is the execution unit of TranslatedLayer, - if TranslatedLayer contains multiple _ProgramHolder, + _ProgramHolder is the execution unit of TranslatedLayer, + if TranslatedLayer contains multiple _ProgramHolder, it can execute multiple methods _ProgramHolder is an internal concept. 
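# This is the machinery behind paddle.jit.load: every *.pdmodel program
# becomes a _ProgramHolder, and the holders are exposed as methods of the
# returned TranslatedLayer. A minimal usage sketch (the path and the input
# shape are hypothetical):

import paddle

layer = paddle.jit.load("./saved_infer/linear")   # -> TranslatedLayer
out = layer(paddle.randn([4, 8]))                 # runs the 'forward' program
layer.eval()                                      # the usual Layer API applies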
@@ -333,7 +362,8 @@ def __init__(self, program_desc): self._infer_program_desc = self._preprocess(program_desc) # forward + backward program self._train_program_desc = self._append_backward_desc( - self._infer_program_desc) + self._infer_program_desc + ) # forward: @switch_to_static_graph @@ -354,11 +384,13 @@ def _forward_program_desc(self): def _create_backward_train_program(self): whole_program = _build_program_by_desc(self._train_program_desc) start_op_index = self._infer_program_desc.block(0).op_size() + 2 * len( - self._output_descs) + self._output_descs + ) end_op_index = whole_program.desc.block(0).op_size() - if (start_op_index < end_op_index): - return add_build_strategy_for(whole_program, start_op_index, - end_op_index) + if start_op_index < end_op_index: + return add_build_strategy_for( + whole_program, start_op_index, end_op_index + ) else: return paddle.static.Program() @@ -406,7 +438,8 @@ def _preprocess(self, program_desc): # rename persistable variables of 'program_desc' list_persistable_var = _get_persistable_var_names(program_desc) rename_new_old_dict, _ = _rename_var_program_desc( - program_desc, list_persistable_var) + program_desc, list_persistable_var + ) # 1. Prune original program # remove feed, fetch and scale-1 op, remove op_callstack attr ops_to_remove = [] @@ -418,14 +451,17 @@ def _preprocess(self, program_desc): feed_var_name = cpt.to_bytes(op.input('X')[0]) root_block._remove_var(feed_var_name) self._input_descs.append( - root_block.find_var(cpt.to_bytes(op.output('Out')[0]))) + root_block.find_var(cpt.to_bytes(op.output('Out')[0])) + ) elif op.type() == 'scale' and op.output('Out')[0].startswith( - 'save_infer_model/scale_'): + 'save_infer_model/scale_' + ): ops_to_remove.append(i) out_var_name = cpt.to_bytes(op.output('Out')[0]) root_block._remove_var(out_var_name) self._output_descs.append( - root_block.find_var(cpt.to_bytes(op.input('X')[0]))) + root_block.find_var(cpt.to_bytes(op.input('X')[0])) + ) elif op.type() == 'fetch': ops_to_remove.append(i) fetch_var_name = cpt.to_bytes(op.output('Out')[0]) @@ -433,7 +469,8 @@ def _preprocess(self, program_desc): # NOTE: some old pre-train models have no extra scale_op if not op.input('X')[0].startswith('save_infer_model/scale_'): self._output_descs.append( - root_block.find_var(cpt.to_bytes(op.input('X')[0]))) + root_block.find_var(cpt.to_bytes(op.input('X')[0])) + ) else: if op.has_attr("op_callstack"): op.remove_attr("op_callstack") @@ -478,7 +515,8 @@ def _preprocess(self, program_desc): # there will be a problem of duplicate names, so here is unified # to add the LOADED suffix to the parameters of the model loaded self._suffix_varname_dict = _get_loaded_var_new_old( - program_desc, rename_new_old_dict) + program_desc, rename_new_old_dict + ) # - get persistable var self._persistable_names = _get_persistable_var_names(program_desc) @@ -492,9 +530,9 @@ def _append_scale_to_output(self, program): with framework.program_guard(program): for i, out in enumerate(self._output_descs): var = program.global_block().var(out.name()) - var = nn.scale(var, - 1., - name="translated_layer/scale_{}".format(i)) + var = nn.scale( + var, 1.0, name="translated_layer/scale_{}".format(i) + ) scale_output_vars.append(var) # 2. 
update output names & descs for i, var in enumerate(scale_output_vars): @@ -519,15 +557,19 @@ def _get_train_forward_program(self, infer_program_desc): block = program.block(block_idx) for op in block.ops: if op.type == "batch_norm": - if "ReserveSpace" not in op.output_names or len( - op.output("ReserveSpace")) == 0: + if ( + "ReserveSpace" not in op.output_names + or len(op.output("ReserveSpace")) == 0 + ): reserve_space = block.create_var( name=unique_name.generate_with_ignorable_key( - ".".join(["reserve_space", 'tmp'])), + ".".join(["reserve_space", 'tmp']) + ), dtype=block.var(op.input("X")[0]).dtype, type=core.VarDesc.VarType.LOD_TENSOR, persistable=False, - stop_gradient=True) + stop_gradient=True, + ) op.desc.set_output("ReserveSpace", [reserve_space.name]) return program @@ -573,9 +615,9 @@ def _append_backward_desc(self, infer_program_desc): # NOTE: [compatible] deal with model saved by save_inference_model, # which need get var info from program desc -def _load_persistable_vars_by_program(model_path, - program_holder, - params_filename=None): +def _load_persistable_vars_by_program( + model_path, program_holder, params_filename=None +): # make sure the path has been checked persistable_vars = _get_persistable_vars(program_holder.infer_program) load_var_dict = {} @@ -584,37 +626,43 @@ def _load_persistable_vars_by_program(model_path, if _is_parameter(each_var, program_holder.infer_program): # create output varbase if framework._in_eager_without_dygraph_check(): - new_var = framework.EagerParamBase(shape=each_var.shape(), - dtype=each_var.dtype(), - name=each_var.name(), - type=each_var.type(), - persistable=True) + new_var = framework.EagerParamBase( + shape=each_var.shape(), + dtype=each_var.dtype(), + name=each_var.name(), + type=each_var.type(), + persistable=True, + ) else: - new_var = framework.ParamBase(shape=each_var.shape(), - dtype=each_var.dtype(), - name=each_var.name(), - type=each_var.type(), - persistable=True) + new_var = framework.ParamBase( + shape=each_var.shape(), + dtype=each_var.dtype(), + name=each_var.name(), + type=each_var.type(), + persistable=True, + ) else: - new_var = framework._varbase_creator(type=each_var.type(), - name=each_var.name(), - shape=each_var.shape(), - dtype=each_var.dtype(), - persistable=True) + new_var = framework._varbase_creator( + type=each_var.type(), + name=each_var.name(), + shape=each_var.shape(), + dtype=each_var.dtype(), + persistable=True, + ) if params_filename is None: framework._dygraph_tracer().trace_op( type='load', inputs={}, outputs={'Out': new_var}, - attrs={'file_path': os.path.join(model_path, orig_each_name)}) + attrs={'file_path': os.path.join(model_path, orig_each_name)}, + ) new_var.stop_gradient = False load_var_dict[each_var.name()] = new_var if params_filename is not None: load_var_list = [] dict_name_old_new = { - v: k - for k, v in program_holder._suffix_varname_dict.items() + v: k for k, v in program_holder._suffix_varname_dict.items() } for name in sorted(dict_name_old_new.keys()): load_var_list.append(load_var_dict[dict_name_old_new[name]]) @@ -623,7 +671,8 @@ def _load_persistable_vars_by_program(model_path, type='load_combine', inputs={}, outputs={'Out': load_var_list}, - attrs={'file_path': os.path.join(model_path, params_filename)}) + attrs={'file_path': os.path.join(model_path, params_filename)}, + ) for each_var in persistable_vars: if not _is_parameter(each_var, program_holder.infer_program): @@ -645,8 +694,9 @@ def _load_persistable_vars_by_program(model_path, return load_var_dict -def 
_load_persistable_vars(model_path, var_info_path, program_holder, - params_filename): +def _load_persistable_vars( + model_path, var_info_path, program_holder, params_filename +): # 1. load extra var info with open(var_info_path, 'rb') as f: extra_var_info = pickle.load(f) @@ -655,8 +705,7 @@ def _load_persistable_vars(model_path, var_info_path, program_holder, load_var_dict = dict() load_var_list = [] inv_suffix_varname_dict = { - value: key - for key, value in program_holder._suffix_varname_dict.items() + value: key for key, value in program_holder._suffix_varname_dict.items() } # NOTE(chenweihang): we need load persistable vars based the program, @@ -667,7 +716,8 @@ def _load_persistable_vars(model_path, var_info_path, program_holder, raise RuntimeError( "The model to be loaded is not complete." "The variable `%s` of program cannot be found in loaded model.", - name) + name, + ) # get suffix var name, see [why need to append suffix to persistable vars] new_name = inv_suffix_varname_dict[name] # create output varbase @@ -680,7 +730,8 @@ def _load_persistable_vars(model_path, var_info_path, program_holder, ], # only to pass check, this shape is not meaningful dtype=core.VarDesc.VarType.FP32, name=new_name, - persistable=True) + persistable=True, + ) else: new_var = framework.ParamBase( shape=[ @@ -688,10 +739,12 @@ def _load_persistable_vars(model_path, var_info_path, program_holder, ], # only to pass check, this shape is not meaningful dtype=core.VarDesc.VarType.FP32, name=new_name, - persistable=True) + persistable=True, + ) else: - new_var = framework._varbase_creator(name=new_name, - persistable=True) + new_var = framework._varbase_creator( + name=new_name, persistable=True + ) new_var.stop_gradient = extra_var_info[name]['stop_gradient'] load_var_dict[new_name] = new_var @@ -704,10 +757,12 @@ def _load_persistable_vars(model_path, var_info_path, program_holder, if len(extra_var_info) != 0: raise ValueError("The model to be loaded is incomplete.") else: - framework._dygraph_tracer().trace_op(type='load_combine', - inputs={}, - outputs={'Out': load_var_list}, - attrs={'file_path': var_file_path}) + framework._dygraph_tracer().trace_op( + type='load_combine', + inputs={}, + outputs={'Out': load_var_list}, + attrs={'file_path': var_file_path}, + ) return load_var_dict @@ -729,17 +784,18 @@ def _construct_program_holders(model_path, model_filename=None): # [compatible] if assign model_filename, only can load one program as Layer.forward model_filename = os.path.basename(model_filename) model_file_path = os.path.join(model_path, model_filename) - model_name = model_filename[:-len(INFER_MODEL_SUFFIX)] - #Load every file that meets the requirements in the directory model_path. + model_name = model_filename[: -len(INFER_MODEL_SUFFIX)] + # Load every file that meets the requirements in the directory model_path. 
for filename in os.listdir(model_path): if model_filename == filename: func_name = 'forward' model_file_path = os.path.join(model_path, model_filename) elif filename.endswith(INFER_MODEL_SUFFIX) and filename.startswith( - model_name): - parsing_names = filename[len(model_name - ):-len(INFER_MODEL_SUFFIX) + - 1].split('.') + model_name + ): + parsing_names = filename[ + len(model_name) : -len(INFER_MODEL_SUFFIX) + 1 + ].split('.') if len(parsing_names) == 3 and len(parsing_names[1]) > 0: func_name = parsing_names[1] model_file_path = os.path.join(model_path, filename) @@ -748,7 +804,8 @@ def _construct_program_holders(model_path, model_filename=None): else: continue program_holder_dict[func_name] = _ProgramHolder( - _load_program_desc(model_file_path)) + _load_program_desc(model_file_path) + ) else: for _, _, file_names in os.walk(model_path): for name in file_names: @@ -760,30 +817,32 @@ def _construct_program_holders(model_path, model_filename=None): else: method_name.replace('model', '') program_holder_dict[method_name] = _ProgramHolder( - _load_program_desc(model_file_path)) + _load_program_desc(model_file_path) + ) return program_holder_dict -def _construct_params_and_buffers(model_path, - programs, - params_filename=None, - append_suffix=True): +def _construct_params_and_buffers( + model_path, programs, params_filename=None, append_suffix=True +): var_info_filename = str(params_filename) + ".info" var_info_path = os.path.join(model_path, var_info_filename) params_path = os.path.join(model_path, str(params_filename)) if os.path.exists(var_info_path): - var_dict = _load_persistable_vars(model_path, var_info_path, - programs['forward'], params_filename) - model_name = params_filename[:-len(INFER_PARAMS_SUFFIX)] - #Load every file that meets the requirements in the directory model_path. + var_dict = _load_persistable_vars( + model_path, var_info_path, programs['forward'], params_filename + ) + model_name = params_filename[: -len(INFER_PARAMS_SUFFIX)] + # Load every file that meets the requirements in the directory model_path. 
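# How the multi-method file names are matched, as a small self-contained demo
# of the slicing used in _construct_program_holders above ('mymodel' and
# 'infer' are made-up names):

INFER_MODEL_SUFFIX = ".pdmodel"
model_name = "mymodel"
filename = "mymodel.infer.pdmodel"
parsing_names = filename[len(model_name): -len(INFER_MODEL_SUFFIX) + 1].split('.')
print(parsing_names)   # ['', 'infer', ''] -> three pieces, so func_name = 'infer'

# The plain "mymodel.pdmodel" file maps to 'forward'; the loop below applies
# the same scheme with INFER_PARAMS_SUFFIX to find the parameter files of each
# method.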
for file_name in os.listdir(model_path): if file_name.startswith(model_name) and file_name.endswith( - INFER_PARAMS_SUFFIX): - parsing_names = file_name[len(model_name - ):-len(INFER_PARAMS_SUFFIX) + - 1].split('.') + INFER_PARAMS_SUFFIX + ): + parsing_names = file_name[ + len(model_name) : -len(INFER_PARAMS_SUFFIX) + 1 + ].split('.') if len(parsing_names) == 3 and len(parsing_names[1]) > 0: func_name = parsing_names[1] else: @@ -792,15 +851,17 @@ def _construct_params_and_buffers(model_path, continue var_info_path = os.path.join(model_path, var_info_filename) var_dict.update( - _load_persistable_vars(model_path, var_info_path, - programs[func_name], file_name)) + _load_persistable_vars( + model_path, var_info_path, programs[func_name], file_name + ) + ) elif params_filename is not None and not os.path.exists(params_path): # When saving XX, there is only '*.pdmodel' return dict() else: - var_dict = _load_persistable_vars_by_program(model_path, - programs['forward'], - params_filename) + var_dict = _load_persistable_vars_by_program( + model_path, programs['forward'], params_filename + ) if not append_suffix: var_dict = _remove_varname_suffix(var_dict, programs['forward']) @@ -813,13 +874,23 @@ def _valid_vars(vars): return vars if framework._in_eager_without_dygraph_check(): return [ - core.eager.Tensor(core.VarDesc.VarType.FP32, [], "Fake_var", - core.VarDesc.VarType.RAW, False) + core.eager.Tensor( + core.VarDesc.VarType.FP32, + [], + "Fake_var", + core.VarDesc.VarType.RAW, + False, + ) ] else: return [ - core.VarBase(core.VarDesc.VarType.FP32, [], "Fake_var", - core.VarDesc.VarType.RAW, False) + core.VarBase( + core.VarDesc.VarType.FP32, + [], + "Fake_var", + core.VarDesc.VarType.RAW, + False, + ) ] @@ -831,7 +902,8 @@ def _run_dygraph(instance, input, program_holder): if not isinstance(value, (np.ndarray, core.VarBase, core.eager.Tensor)): raise TypeError( "The type of input in TranslatedLayer must be numpy array or Variable(VarBase), but received %s." - % type(value)) + % type(value) + ) # NOTE: In order to unify the API, firstly convert the input to VarBase if isinstance(value, np.ndarray): if framework._in_eager_without_dygraph_check(): @@ -840,13 +912,16 @@ def _run_dygraph(instance, input, program_holder): name=program_holder.input_descs[i].name(), persistable=False, place=framework._current_expected_place(), - zero_copy=True) + zero_copy=True, + ) else: - var = core.VarBase(value=value, - name=program_holder.input_descs[i].name(), - persistable=False, - place=framework._current_expected_place(), - zero_copy=True) + var = core.VarBase( + value=value, + name=program_holder.input_descs[i].name(), + persistable=False, + place=framework._current_expected_place(), + zero_copy=True, + ) else: var = value # NOTE: we changed var name here, @@ -868,67 +943,112 @@ def _run_dygraph(instance, input, program_holder): else: raise ValueError( "The persistable variable %s does not exist in current TranslatedLayer." 
- % var_name) + % var_name + ) output_vars = [] for var_desc in program_holder.output_descs: if framework._in_eager_without_dygraph_check(): - var = core.eager.Tensor(dtype=var_desc.dtype(), - dims=var_desc.shape(), - name=var_desc.name(), - type=var_desc.type(), - persistable=False) + var = core.eager.Tensor( + dtype=var_desc.dtype(), + dims=var_desc.shape(), + name=var_desc.name(), + type=var_desc.type(), + persistable=False, + ) else: - var = core.VarBase(var_desc.dtype(), var_desc.shape(), - var_desc.name(), var_desc.type(), False) + var = core.VarBase( + var_desc.dtype(), + var_desc.shape(), + var_desc.name(), + var_desc.type(), + False, + ) output_vars.append(var) # hold forward variables if framework._in_eager_without_dygraph_check(): tmp_scope_vec = [program_holder.scope] else: - tmp_scope_vec = core.VarBase(core.VarDesc.VarType.FP32, [], - "program_out_scope", - core.VarDesc.VarType.STEP_SCOPES, True) + tmp_scope_vec = core.VarBase( + core.VarDesc.VarType.FP32, + [], + "program_out_scope", + core.VarDesc.VarType.STEP_SCOPES, + True, + ) tmp_scope_vec.value().set_scope(program_holder.scope) double_grad_vars = [] for var_desc in program_holder.double_grad_descs: if framework._in_eager_without_dygraph_check(): - var = core.eager.Tensor(dtype=var_desc.dtype(), - dims=var_desc.shape(), - name=var_desc.name(), - type=var_desc.type(), - persistable=False) + var = core.eager.Tensor( + dtype=var_desc.dtype(), + dims=var_desc.shape(), + name=var_desc.name(), + type=var_desc.type(), + persistable=False, + ) else: - var = core.VarBase(var_desc.dtype(), var_desc.shape(), - var_desc.name(), var_desc.type(), False) + var = core.VarBase( + var_desc.dtype(), + var_desc.shape(), + var_desc.name(), + var_desc.type(), + False, + ) double_grad_vars.append(var) # 2. 
run program by op - trace_program = program_holder.infer_program if instance._is_test else program_holder.train_program - forward_program = program_holder._infer_program_desc if instance._is_test else program_holder.forward_program + trace_program = ( + program_holder.infer_program + if instance._is_test + else program_holder.train_program + ) + forward_program = ( + program_holder._infer_program_desc + if instance._is_test + else program_holder.forward_program + ) end_op_index = program_holder.infer_program.block(0).op_size() attrs = [ 'global_block', - trace_program.block(0), 'start_op_index', 0, 'end_op_index', - end_op_index, 'is_test', instance._is_test, 'program_id', - _hash_with_id(trace_program, instance) + trace_program.block(0), + 'start_op_index', + 0, + 'end_op_index', + end_op_index, + 'is_test', + instance._is_test, + 'program_id', + _hash_with_id(trace_program, instance), ] - use_interpretorcore = _is_enable_standalone_executor( - ) and _is_dy2st_enable_standalone_executor() + use_interpretorcore = ( + _is_enable_standalone_executor() + and _is_dy2st_enable_standalone_executor() + ) attrs.extend(('use_interpretorcore', use_interpretorcore)) if use_interpretorcore: attrs.extend( - ('forward_global_block', forward_program.block(0), - 'backward_global_block', program_holder.backward_program.block(0))) - - _legacy_C_ops.run_program(_valid_vars(input_vars), - _valid_vars(persistable_vars), - _valid_vars(output_vars), tmp_scope_vec, - _valid_vars(double_grad_vars), None, *attrs) + ( + 'forward_global_block', + forward_program.block(0), + 'backward_global_block', + program_holder.backward_program.block(0), + ) + ) + + _legacy_C_ops.run_program( + _valid_vars(input_vars), + _valid_vars(persistable_vars), + _valid_vars(output_vars), + tmp_scope_vec, + _valid_vars(double_grad_vars), + None, + *attrs + ) # NOTE: [ why need set param's gradient type here ] # if user set sparse gradient mode, the param's gradient @@ -946,8 +1066,6 @@ def _run_dygraph(instance, input, program_holder): continue persistable_var._set_grad_type(grad_var.type()) - drop_scope_if_no_grad(instance, tmp_scope_vec) - # 3. 
prepare output, keep same form with inputs outs = output_vars if len(output_vars) == 1: @@ -955,27 +1073,26 @@ def _run_dygraph(instance, input, program_holder): return outs -def drop_scope_if_no_grad(instance, scope_vec): - tracer = framework._dygraph_tracer() - scope = scope_vec.value().get_scope() if isinstance( - scope_vec, (core.VarBase)) else scope_vec[0] - if (not instance._is_test) and (not tracer._has_grad): - scope.drop_kids() - - def _run_static_graph(input, program_holder, trace_program): main_program = framework.default_main_program() param_var_names = _get_persistable_var_names(trace_program) _, dict_rename_var_old_new = _rename_var_program_desc( - trace_program, exclude=param_var_names) + trace_program, exclude=param_var_names + ) trace_program.flush() output_names = [var.name() for var in program_holder.output_descs] # append blocks from 'trace_program' - _append_block(main_program, trace_program, program_holder, input, - dict_rename_var_old_new) + _append_block( + main_program, + trace_program, + program_holder, + input, + dict_rename_var_old_new, + ) main_program._sync_with_cpp() - outs = _get_output_from_program(main_program, program_holder, - dict_rename_var_old_new) + outs = _get_output_from_program( + main_program, program_holder, dict_rename_var_old_new + ) if len(outs) == 1: outs = outs[0] return outs @@ -984,7 +1101,7 @@ def _run_static_graph(input, program_holder, trace_program): def _collect_current_and_parent_var(program, block_idx): ''' Get variables in current block and its parent block. - + Args: program(Program): The program containing the current block. block_idx(int): index of current block. @@ -1003,46 +1120,55 @@ def _collect_current_and_parent_var(program, block_idx): return vars -def _append_block(dest_program, - src_program_desc, - program_holder, - input_variables, - dict_rename_var_old_new=None): +def _append_block( + dest_program, + src_program_desc, + program_holder, + input_variables, + dict_rename_var_old_new=None, +): ''' Append Variables and Operators in 'src_program_desc' to dest_program. - + Args: dest_program(Program): Variables and Operators are appended to it. src_program_desc(ProgramDesc): Variables in it will be appended to 'dest_program'. program_holder(_ProgramHolder): program_holder of TranslatedLayer input_variables(list): list of input variables - dict_rename_var_old_new(None|dict): When using '_rename_var_program_desc', + dict_rename_var_old_new(None|dict): When using '_rename_var_program_desc', use it to map the name of the variable before it was modified and the new name. ''' origin_block_idx = dest_program.current_block_idx - param_var_names = _collect_current_and_parent_var(dest_program, - origin_block_idx) - append_var_from_block_desc_static(dest_program.block(origin_block_idx), - src_program_desc.block(0), - exclude=param_var_names) + param_var_names = _collect_current_and_parent_var( + dest_program, origin_block_idx + ) + append_var_from_block_desc_static( + dest_program.block(origin_block_idx), + src_program_desc.block(0), + exclude=param_var_names, + ) name_inp_desc = [inp.name() for inp in program_holder.input_descs] input_names = [inp.name for inp in input_variables] if len(name_inp_desc) != len(input_names): raise ValueError( - "The number of input is invalid, expected {}, but received {}.". 
- format(len(name_inp_desc), len(input_names))) + "The number of input is invalid, expected {}, but received {}.".format( + len(name_inp_desc), len(input_names) + ) + ) for i, out_name in enumerate(name_inp_desc): if dict_rename_var_old_new: out_name = dict_rename_var_old_new[out_name] dest_program.block(origin_block_idx).append_op( type='assign', inputs={'X': [input_names[i]]}, - outputs={'Out': [out_name]}) + outputs={'Out': [out_name]}, + ) append_ops = append_op_from_block_desc_static( - dest_program.block(origin_block_idx), src_program_desc.block(0)) + dest_program.block(origin_block_idx), src_program_desc.block(0) + ) dest_program._sync_with_cpp() offset_block_idx = dest_program.num_blocks - 1 @@ -1056,11 +1182,12 @@ def _append_block(dest_program, else: parent_idx = origin_block_idx dest_block = dest_program._create_block(parent_idx=parent_idx) - append_var_from_block_desc_static(dest_block, - src_block, - exclude=param_var_names) + append_var_from_block_desc_static( + dest_block, src_block, exclude=param_var_names + ) append_ops += append_op_from_block_desc_static( - dest_block, src_block) + dest_block, src_block + ) dest_program._sync_with_cpp() for op in append_ops: @@ -1070,15 +1197,16 @@ def _append_block(dest_program, origin_id = sub.id if isinstance(sub, framework.Block): origin_id = sub.idx - op._set_attr('sub_block', - dest_program.block(offset_block_idx + origin_id)) + op._set_attr( + 'sub_block', dest_program.block(offset_block_idx + origin_id) + ) dest_program._sync_with_cpp() dest_program.current_block_idx = origin_block_idx -def _get_output_from_program(program, - program_holder, - dict_rename_var_old_new=None): +def _get_output_from_program( + program, program_holder, dict_rename_var_old_new=None +): """ Get output name of 'program' according to program_holder """ @@ -1127,20 +1255,21 @@ def append_op_from_desc_static(block, op_desc): op_type = op_desc.type() op_append = block.desc.append_op() op_append.copy_from(op_desc) - op = framework.Operator(block=block, - desc=op_append, - type=op_type, - inputs=None, - outputs=None, - attrs=None) + op = framework.Operator( + block=block, + desc=op_append, + type=op_type, + inputs=None, + outputs=None, + attrs=None, + ) block.ops.append(op) return op -def append_var_from_block_desc_static(block, - src_block_desc, - include=None, - exclude=None): +def append_var_from_block_desc_static( + block, src_block_desc, include=None, exclude=None +): """ Append Variables of 'src_block_desc' to current block. If 'include' is not `None`,variables that are not in include are not append. 
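The include/exclude filtering described in the docstring above reduces to a single predicate, combined with a check that the destination block does not already own the variable. A hedged sketch (names are hypothetical):

def should_append(var_name, existing_names, include=None, exclude=None):
    keep = (include is None or var_name in include) and (
        exclude is None or var_name not in exclude
    )
    return keep and var_name not in existing_names

print(should_append('x', set()))                  # True
print(should_append('w', set(), exclude={'w'}))   # False, e.g. a shared parameter
print(should_append('x', {'x'}))                  # False, already in the block
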
@@ -1159,13 +1288,14 @@ def append_var_from_block_desc_static(block, for var_desc in src_block_desc.all_vars(): var_desc_name = var_desc.name() should_append = (include is None or var_desc_name in include) and ( - exclude is None or var_desc_name not in exclude) + exclude is None or var_desc_name not in exclude + ) if not block.has_var(var_desc_name) and should_append: var_type = var_desc.type() if var_type in [ - core.VarDesc.VarType.SELECTED_ROWS, - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.LOD_TENSOR_ARRAY + core.VarDesc.VarType.SELECTED_ROWS, + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.LOD_TENSOR_ARRAY, ]: data_type = var_desc.dtype() var_shape = var_desc.shape() @@ -1173,8 +1303,8 @@ def append_var_from_block_desc_static(block, data_type = None var_shape = None if var_type in [ - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.LOD_TENSOR_ARRAY + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.LOD_TENSOR_ARRAY, ]: lod_level = var_desc.lod_level() else: @@ -1193,16 +1323,18 @@ def append_var_from_block_desc_static(block, shape=var_shape, lod_level=lod_level, persistable=var_desc.persistable(), - set_need_check_feed=var_desc.need_check_feed())) + set_need_check_feed=var_desc.need_check_feed(), + ) + ) return vars_append class TranslatedLayer(layers.Layer): """ - TranslatedLayer is a ``paddle.nn.Layer`` for holding the model - loaded by :ref:`api_paddle_jit_load` . It can be used like a + TranslatedLayer is a ``paddle.nn.Layer`` for holding the model + loaded by :ref:`api_paddle_jit_load` . It can be used like a general Layer object in eval or train mode. - + .. note: The TranslatedLayer objects should not be created by constructor, it only can be loaded and constructed by :ref:`api_paddle_jit_load` . @@ -1318,8 +1450,9 @@ def __init__(self, programs, persistable_vars): # the TranslatedLayer object holded var names count started from 0 with unique_name.guard(): for name, var in persistable_vars.items(): - if isinstance(var, - (framework.ParamBase, framework.EagerParamBase)): + if isinstance( + var, (framework.ParamBase, framework.EagerParamBase) + ): dy_name = _generate_unique_var_name(PARAMETER_NAME_PREFIX) self._persistable_var_name_dict[name] = dy_name self.add_parameter(dy_name, var) @@ -1353,7 +1486,8 @@ def _construct(model_path, configs=None): # 2. load layer parameters & buffers persistable_vars = _construct_params_and_buffers( - model_path, programs, params_filename) + model_path, programs, params_filename + ) # 3. construct TranslatedLayer object translated_layer = TranslatedLayer(programs, persistable_vars) @@ -1365,9 +1499,12 @@ def _construct(model_path, configs=None): ins.name() for ins in program_holder.input_descs ] setattr( - TranslatedLayer, method_name, + TranslatedLayer, + method_name, TranslatedLayer._execution_method_creator( - method_name, program_holder)) + method_name, program_holder + ), + ) # 5. set TranslatedLayer's default mode to eval translated_layer.eval() @@ -1376,7 +1513,6 @@ def _construct(model_path, configs=None): @staticmethod def _execution_method_creator(method_name, program_holder): - def __i_m_p_l__(self, *input): program_holder = self._program_holder_dict[__i_m_p_l__.__name__] # When using jit.save, it runs in static graph mode. @@ -1389,7 +1525,8 @@ def __i_m_p_l__(self, *input): # because '_run_static_graph' modifies 'ProgramDesc', 'OpDesc.op_size()' will return a very large wrong number. # A Segmentation fault error may occur if used 'p=ProgramDesc(program_holder.infer_program)'. 
p = framework.Program._construct_from_desc( - core.ProgramDesc(program_holder.infer_program)) + core.ProgramDesc(program_holder.infer_program) + ) return _run_static_graph(input, program_holder, p.desc) __i_m_p_l__.__name__ = method_name @@ -1410,13 +1547,13 @@ def program(self, method_name='forward'): Args: - method_name (string): mehtod name corresponding to the program to be obtained. Default: 'forward'. - + Returns: Program Examples: .. code-block:: python - + import numpy as np import paddle import paddle.nn as nn @@ -1502,8 +1639,9 @@ def _get_program_holder(self, method_name='forward'): program_holder = self._program_holder_dict.get(method_name, None) if program_holder is None: raise ValueError( - "The method `%s` does not exist in loaded TranslatedLayer." % - method_name) + "The method `%s` does not exist in loaded TranslatedLayer." + % method_name + ) return program_holder def _input_spec(self, method_name='forward'): @@ -1513,9 +1651,11 @@ def _input_spec(self, method_name='forward'): # 2. build input spec by input desc input_spec = [] for var_desc in program_holder.input_descs: - spec = paddle.static.InputSpec(shape=var_desc.shape(), - dtype=var_desc.dtype(), - name=var_desc.name()) + spec = paddle.static.InputSpec( + shape=var_desc.shape(), + dtype=var_desc.dtype(), + name=var_desc.name(), + ) input_spec.append(spec) return input_spec @@ -1530,9 +1670,11 @@ def _output_spec(self, method_name='forward'): # NOTE(chenweihang): InputSpec describes a tensor, not just input. # Maybe the name is not good enough. Here we use InputSpec to # construct the description of Output tensor - spec = paddle.static.InputSpec(shape=var_desc.shape(), - dtype=var_desc.dtype(), - name=var_desc.name()) + spec = paddle.static.InputSpec( + shape=var_desc.shape(), + dtype=var_desc.dtype(), + name=var_desc.name(), + ) output_spec.append(spec) return output_spec diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 856a21881c2338..9a0c7bda0a895b 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -381,7 +381,8 @@ def keep_name_table(self, value): def _parse_save_configs(configs): supported_configs = [ - 'output_spec', "with_hook", "combine_params", "clip_extra" + 'output_spec', "with_hook", "combine_params", "clip_extra", + "skip_forward" ] # input check @@ -396,7 +397,8 @@ def _parse_save_configs(configs): inner_config.output_spec = configs.get('output_spec', None) inner_config.with_hook = configs.get('with_hook', False) inner_config.combine_params = configs.get("combine_params", False) - inner_config.clip_extra = configs.get("clip_extra", False) + inner_config.clip_extra = configs.get("clip_extra", True) + inner_config.skip_forward = configs.get("skip_forward", False) return inner_config @@ -522,7 +524,10 @@ def _build_load_path_and_config(path, config): "don't know which one to load, please make sure that the specified target " "of ``path`` is unique." % (path, path)) elif not prefix_format_exist and not directory_format_exist: - raise ValueError("The ``path`` (%s) to load model not exists." % path) + raise ValueError("The ``path`` (%s) to load model not exists. " + "Please make sure that *.pdmodel exists or " + "don't using ``skip_forward=True`` to jit.save." 
% + path) else: if prefix_format_exist: file_prefix = os.path.basename(path) @@ -906,6 +911,7 @@ def fun(inputs): combine_vars = {} property_vals = [] # (value, key) + concrete_program = None for attr_func in functions: if isinstance(layer, Layer): static_func = getattr(inner_layer, attr_func, None) @@ -921,6 +927,10 @@ def fun(inputs): concrete_program = static_func.concrete_program_specify_input_spec( inner_input_spec, with_hook=with_hook) elif 'forward' == attr_func: + if configs.skip_forward: + # do not jit.save forward function + continue + # transform in jit.save, if input_spec is incomplete, declarative will throw error # inner_input_spec is list[InputSpec], it should be packed with same structure # as original input_spec here. @@ -1080,8 +1090,9 @@ def fun(inputs): ordered_vars)), filename=params_filename) # save property - property_filename = file_prefix + INFER_PROPERTY_SUFFIX - _save_property(property_filename, property_vals) + property_save_path = os.path.join(os.path.normpath(model_path), + file_prefix + INFER_PROPERTY_SUFFIX) + _save_property(property_save_path, property_vals) # NOTE(chenweihang): [ Save extra variable info ] # save_inference_model will lose some important variable information, including: @@ -1100,10 +1111,10 @@ def fun(inputs): # file `***.pdiparams.info` # "layer" can only be Layer or function or StaticFunction. - contain_parameter = False - for var in concrete_program.main_program.list_vars(): - contain_parameter |= isinstance(var, Parameter) + if concrete_program is not None: + for var in concrete_program.main_program.list_vars(): + contain_parameter |= isinstance(var, Parameter) if (isinstance(layer, Layer) or contain_parameter) and extra_var_info: with scope_guard(scope): @@ -1639,7 +1650,7 @@ def forward(self, input): check_type( f, "each element of fetch", int, "fluid.dygraph.jit.TracedLayer.save_inference_model") - clip_extra = kwargs.get('clip_extra', False) + clip_extra = kwargs.get('clip_extra', True) # path check file_prefix = os.path.basename(path) if file_prefix == "": diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 879900085d57e3..d21db965a54d0f 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -32,12 +32,25 @@ from .. import unique_name from paddle.fluid import core from .layer_object_helper import LayerObjectHelper -from .layer_hooks import record_program_ops_pre_hook, set_op_customized_attrs_post_hook, LayerOpsRecoder -from .base import program_desc_tracing_guard, param_guard, in_declarative_mode, _convert_into_variable +from .layer_hooks import ( + record_program_ops_pre_hook, + set_op_customized_attrs_post_hook, + LayerOpsRecoder, +) +from .base import ( + program_desc_tracing_guard, + param_guard, + in_declarative_mode, + _convert_into_variable, +) from paddle.fluid import framework from ..param_attr import ParamAttr from paddle.fluid.executor import Executor, global_scope -from paddle.fluid.framework import _non_static_mode, convert_np_dtype_to_dtype_, in_dygraph_mode +from paddle.fluid.framework import ( + _non_static_mode, + convert_np_dtype_to_dtype_, + in_dygraph_mode, +) from paddle.fluid.framework import Program, program_guard from paddle.fluid.framework import _current_expected_place as _get_device from paddle.fluid.core import VarDesc @@ -67,7 +80,7 @@ def _addindent(string, indent): class HookRemoveHelper(object): - """ A HookRemoveHelper that can be used to remove hook. 
""" + """A HookRemoveHelper that can be used to remove hook.""" next_hook_id = 0 @@ -153,13 +166,14 @@ def __init__(self, name_scope=None, dtype="float32"): def train(self): """ + Sets this Layer and all its sublayers to training mode. This only effects certain modules like `Dropout` and `BatchNorm`. Returns: None - Example:: + Examples: .. code-block:: python import paddle @@ -236,6 +250,7 @@ def forward(self, input): def apply(self, fn): """ + Applies ``fn`` recursively to every sublayer (as returned by ``.sublayers()``) as well as self. Typical use includes initializing the parameters of a model. @@ -243,7 +258,7 @@ def apply(self, fn): fn (function): a function to be applied to each sublayer Returns: - Layer: self + Layer, self Example:: .. code-block:: python @@ -263,6 +278,7 @@ def init_weights(layer): net.apply(init_weights) print(net.state_dict()) + """ for layer in self.children(): layer.apply(fn) @@ -272,10 +288,12 @@ def init_weights(layer): return self def full_name(self): - """Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__ + """ + + Full name for this layer, composed by name_scope + "/" + MyLayer.__class__.__name__ Returns: - str: full name of this layer. + str, full name of this layer. Example:: .. code-block:: python @@ -297,7 +315,9 @@ def forward(self, x): return self._full_name def register_forward_post_hook(self, hook): - """Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed. + """ + + Register a forward post-hook for Layer. The hook will be called after `forward` function has been computed. It should have the following form, `input` and `output` of the `hook` is `input` and `output` of the `Layer` respectively. User can use forward post-hook to change the output of the Layer or perform information statistics tasks on the Layer. @@ -308,7 +328,7 @@ def register_forward_post_hook(self, hook): hook(function): a function registered as a forward post-hook Returns: - HookRemoveHelper: a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . + HookRemoveHelper, a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . Examples: .. code-block:: python @@ -340,13 +360,16 @@ def forward_post_hook(layer, input, output): # hook change the linear's output to output * 2, so out0 is equal to out1 * 2. assert (out0.numpy() == (out1.numpy()) * 2).any() + """ hook_remove_helper = HookRemoveHelper(self._forward_post_hooks) self._forward_post_hooks[hook_remove_helper._hook_id] = hook return hook_remove_helper def register_forward_pre_hook(self, hook): - """Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed. + """ + + Register a forward pre-hook for Layer. The hook will be called before `forward` function has been computed. It should have the following form, `input` of the `hook` is `input` of the `Layer`, hook can either return a tuple or a single modified value in the hook. We will wrap the value into a tuple if @@ -359,7 +382,7 @@ def register_forward_pre_hook(self, hook): hook(function): a function registered as a forward pre-hook Returns: - HookRemoveHelper: a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . + HookRemoveHelper, a HookRemoveHelper object that can be used to remove the added hook by calling `hook_remove_helper.remove()` . Examples: .. 
code-block:: python @@ -398,12 +421,14 @@ def forward_pre_hook(layer, input): self._forward_pre_hooks[hook_remove_helper._hook_id] = hook return hook_remove_helper - def create_parameter(self, - shape, - attr=None, - dtype=None, - is_bias=False, - default_initializer=None): + def create_parameter( + self, + shape, + attr=None, + dtype=None, + is_bias=False, + default_initializer=None, + ): """Create parameters for this layer. Parameters: @@ -443,12 +468,15 @@ def forward(self, input): temp_attr = copy.deepcopy(attr) if isinstance(temp_attr, six.string_types) and temp_attr == "": temp_attr = None - return self._helper.create_parameter(temp_attr, shape, dtype, is_bias, - default_initializer) - - @deprecated(since="2.0.0", - update_to="paddle.nn.Layer.create_tensor", - reason="New api in create_tensor, easier to use.") + return self._helper.create_parameter( + temp_attr, shape, dtype, is_bias, default_initializer + ) + + @deprecated( + since="2.0.0", + update_to="paddle.nn.Layer.create_tensor", + reason="New api in create_tensor, easier to use.", + ) def create_variable(self, name=None, persistable=None, dtype=None): """ @@ -488,14 +516,16 @@ def forward(self, input): if name is not None: var_name = ".".join([self._full_name, name]) else: - var_name = unique_name.generate(".".join( - [self._full_name, "_generated_var"])) + var_name = unique_name.generate( + ".".join([self._full_name, "_generated_var"]) + ) return self._helper.main_program.current_block().create_var( name=var_name, persistable=persistable, dtype=dtype, - type=core.VarDesc.VarType.LOD_TENSOR) + type=core.VarDesc.VarType.LOD_TENSOR, + ) # TODO: Add more parameter list when we need them def create_tensor(self, name=None, persistable=None, dtype=None): @@ -538,38 +568,46 @@ def forward(self, input): if name is not None: var_name = ".".join([self._full_name, name]) else: - var_name = unique_name.generate(".".join( - [self._full_name, "_generated_var"])) + var_name = unique_name.generate( + ".".join([self._full_name, "_generated_var"]) + ) return self._helper.main_program.current_block().create_var( name=var_name, persistable=persistable, dtype=dtype, - type=core.VarDesc.VarType.LOD_TENSOR) + type=core.VarDesc.VarType.LOD_TENSOR, + ) def parameters(self, include_sublayers=True): - """Returns a list of all Parameters from current layer and its sub-layers. + """ + + Returns a list of all Parameters from current layer and its sub-layers. Returns: - list of Tensor : a list of Parameters. + list of Tensor, a list of Parameters. Examples: .. code-block:: python - import paddle + import paddle - linear = paddle.nn.Linear(1,1) - print(linear.parameters()) # print linear_0.w_0 and linear_0.b_0 + linear = paddle.nn.Linear(1,1) + print(linear.parameters()) # print linear_0.w_0 and linear_0.b_0 """ ret = [ - param for _, param in self.named_parameters( - include_sublayers=include_sublayers) + param + for _, param in self.named_parameters( + include_sublayers=include_sublayers + ) ] return ret def children(self): - """Returns an iterator over immediate children layers. + """ + + Returns an iterator over immediate children layers. Yields: Layer: a child layer @@ -619,13 +657,15 @@ def named_children(self): yield name, layer def sublayers(self, include_self=False): - """Returns a list of sub layers. + """ + + Returns a list of sub layers. Parameters: include_self(bool, optional): Whether return self as sublayers. Default: False Returns: - list of Layer : a list of sub layers. + list of Layer, a list of sub layers. Examples: .. 
code-block:: python @@ -678,9 +718,11 @@ def named_parameters(self, prefix='', include_sublayers=True): """ params_set = set() - named_sublayers = self.named_sublayers( - prefix=prefix, include_self=True) if include_sublayers else zip( - [prefix], [self]) + named_sublayers = ( + self.named_sublayers(prefix=prefix, include_self=True) + if include_sublayers + else zip([prefix], [self]) + ) for layer_prefix, sublayer in named_sublayers: params = sublayer._parameters.items() for key, param in params: @@ -724,9 +766,9 @@ def named_sublayers(self, prefix='', include_self=False, layers_set=None): if layer is None: continue layer_prefix = prefix + ('.' if prefix else '') + key - for p, l in layer.named_sublayers(prefix=layer_prefix, - include_self=True, - layers_set=layers_set): + for p, l in layer.named_sublayers( + prefix=layer_prefix, include_self=True, layers_set=layers_set + ): yield p, l def register_buffer(self, name, tensor, persistable=True): @@ -769,25 +811,32 @@ def register_buffer(self, name, tensor, persistable=True): if '_buffers' not in self.__dict__: raise ValueError( - "super(YourLayer, self).__init__() should be called first") + "super(YourLayer, self).__init__() should be called first" + ) elif not isinstance(name, six.string_types): raise TypeError( - "The name of buffer should be a string, but received {}.". - format(type(name).__name__)) + "The name of buffer should be a string, but received {}.".format( + type(name).__name__ + ) + ) elif '.' in name: raise KeyError( "The name of buffer can not contain `.`, " "because when you access the newly added buffer in the " - "form of `self.**.**`, it will cause AttributeError.") + "form of `self.**.**`, it will cause AttributeError." + ) elif name == '': raise KeyError("The name of buffer can not be empty.") elif hasattr(self, name) and name not in self._buffers: raise KeyError("attribute '{}' already exists.".format(name)) - elif tensor is not None and not (type(tensor) == core.VarBase - or type(tensor) == core.eager.Tensor): + elif tensor is not None and not ( + type(tensor) == core.VarBase or type(tensor) == core.eager.Tensor + ): raise TypeError( - "The registered buffer should be a Paddle.Tensor, but received {}." - .format(type(tensor).__name__)) + "The registered buffer should be a Paddle.Tensor, but received {}.".format( + type(tensor).__name__ + ) + ) else: self._buffers[name] = tensor if persistable: @@ -797,13 +846,14 @@ def register_buffer(self, name, tensor, persistable=True): def buffers(self, include_sublayers=True): """ + Returns a list of all buffers from current layer and its sub-layers. Parameters: include_sublayers(bool, optional): Whether include the buffers of sublayers. If True, also include the buffers from sublayers. Default: True Returns: - list of Tensor : a list of buffers. + list of Tensor, a list of buffers. Examples: .. 
code-block:: python @@ -820,8 +870,10 @@ def buffers(self, include_sublayers=True): """ ret = [ - buffer for _, buffer in self.named_buffers( - include_sublayers=include_sublayers) + buffer + for _, buffer in self.named_buffers( + include_sublayers=include_sublayers + ) ] return ret @@ -862,9 +914,11 @@ def named_buffers(self, prefix='', include_sublayers=True): """ buffers_set = set() - named_sublayers = self.named_sublayers( - prefix=prefix, include_self=True) if include_sublayers else zip( - [prefix], [self]) + named_sublayers = ( + self.named_sublayers(prefix=prefix, include_self=True) + if include_sublayers + else zip([prefix], [self]) + ) for layer_prefix, sublayer in named_sublayers: buffers = sublayer._buffers.items() for key, buffer in buffers: @@ -910,7 +964,7 @@ def _dygraph_call_func(self, *inputs, **kwargs): hook_result = forward_pre_hook(self, inputs) if hook_result is not None: if not isinstance(hook_result, tuple): - hook_result = (hook_result, ) + hook_result = (hook_result,) inputs = hook_result if not self._built: @@ -920,16 +974,20 @@ def _dygraph_call_func(self, *inputs, **kwargs): # TODO(liuyuhui) Only xpu broadcast parameters here. # The other device is to call _sync_params_buffers in DataParallel # to realize the parameter synchronization among multiply cards. - if parallel_helper._is_data_parallel_mode( - ) and paddle.is_compiled_with_xpu(): + if ( + parallel_helper._is_data_parallel_mode() + and paddle.is_compiled_with_xpu() + ): parallel_helper._broadcast_parameters( - self._parameters.values()) + self._parameters.values() + ) self._built = True if in_profiler_mode(): - with profiler.RecordEvent(self.__class__.__name__, - profiler.TracerEventType.Forward): + with profiler.RecordEvent( + self.__class__.__name__, profiler.TracerEventType.Forward + ): outputs = self.forward(*inputs, **kwargs) else: outputs = self.forward(*inputs, **kwargs) @@ -942,8 +1000,14 @@ def _dygraph_call_func(self, *inputs, **kwargs): return outputs def __call__(self, *inputs, **kwargs): - if (not in_declarative_mode()) and (not self._forward_pre_hooks) \ - and (not self._forward_post_hooks) and (not self._built) and in_dygraph_mode() and (not in_profiler_mode()): + if ( + (not in_declarative_mode()) + and (not self._forward_pre_hooks) + and (not self._forward_post_hooks) + and (not self._built) + and in_dygraph_mode() + and (not in_profiler_mode()) + ): self._build_once(*inputs, **kwargs) return self.forward(*inputs, **kwargs) else: @@ -964,7 +1028,9 @@ def backward(self, *inputs): raise ValueError("Layer shouldn't implement backward") def add_sublayer(self, name, sublayer): - """Adds a sub Layer instance. + """ + + Adds a sub Layer instance. Added sublayer can be accessed by self.name @@ -972,7 +1038,7 @@ def add_sublayer(self, name, sublayer): name(str): name of this sublayer. sublayer(Layer): an instance of Layer. Returns: - Layer: the sublayer passed in. + Layer, the sublayer passed in. Examples: .. code-block:: python @@ -999,8 +1065,9 @@ def forward(self, input): model = MySequential(fc1, fc2) for prefix, layer in model.named_sublayers(): print(prefix, layer) + """ - assert (isinstance(sublayer, Layer) or sublayer == None) + assert isinstance(sublayer, Layer) or sublayer == None self._sub_layers[name] = sublayer return sublayer @@ -1014,7 +1081,7 @@ def add_parameter(self, name, parameter): name(str): name of this sublayer. parameter(Parameter): an instance of Parameter. Returns: - Parameter: the parameter passed in. + Parameter, the parameter passed in. Examples: .. 
code-block:: python @@ -1037,32 +1104,42 @@ def forward(self, input): """ if '_parameters' not in self.__dict__: raise RuntimeError( - "super(YourLayer, self).__init__() should be called firstly.") + "super(YourLayer, self).__init__() should be called firstly." + ) elif not isinstance(name, six.string_types): raise TypeError( - "The name of parameter should be a string, but received {}.". - format(type(name).__name__)) + "The name of parameter should be a string, but received {}.".format( + type(name).__name__ + ) + ) elif '.' in name: raise KeyError( "The name of parameter can not contain `.`, " "because when you access the newly added parameter in the " - "form of `self.**.**`, it will cause AttributeError.") + "form of `self.**.**`, it will cause AttributeError." + ) elif name == '': raise KeyError("The name of parameter can not be empty.") elif hasattr(self, name) and name not in self._parameters: raise KeyError("The parameter '{}' already exists.".format(name)) - elif parameter is not None and not isinstance(parameter, - framework.Parameter): + elif parameter is not None and not isinstance( + parameter, framework.Parameter + ): raise TypeError( - "The parameter to be added should be a Parameter, but received {}." - .format(type(parameter).__name__)) + "The parameter to be added should be a Parameter, but received {}.".format( + type(parameter).__name__ + ) + ) else: if parameter is None: self._parameters[name] = None if len(self._loaddict_holder) > 0: - assert parameter.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in state_dict".format( - parameter.name) + assert ( + parameter.name in self._loaddict_holder + ), "Parameter not found, Can't not find [ {} ] in state_dict".format( + parameter.name + ) parameter.set_value(self._loaddict_holder[parameter.name]) @@ -1081,37 +1158,50 @@ def _set_op_attrs(self, attrs): """ def is_already_registered(is_pre_hook): - layers_hooks = self._forward_pre_hooks if is_pre_hook else self._forward_post_hooks - candidate_hook = record_program_ops_pre_hook if is_pre_hook else set_op_customized_attrs_post_hook + layers_hooks = ( + self._forward_pre_hooks + if is_pre_hook + else self._forward_post_hooks + ) + candidate_hook = ( + record_program_ops_pre_hook + if is_pre_hook + else set_op_customized_attrs_post_hook + ) already_registed = False if layers_hooks: last_key = next(reversed(layers_hooks)) - already_registed = (layers_hooks[last_key] == candidate_hook) + already_registed = layers_hooks[last_key] == candidate_hook return already_registed if not isinstance(attrs, dict): raise TypeError( "attrs should be type(dict), but received {}".format( - type(attrs).__name__)) + type(attrs).__name__ + ) + ) # NOTE: Overwrite behavior for same key. self._customized_attrs.update(attrs) if not is_already_registered(is_pre_hook=True): pre_hook_helper = self.register_forward_pre_hook( - record_program_ops_pre_hook) + record_program_ops_pre_hook + ) assert len(self._op_recorder.hooks) == 0 self._op_recorder.hooks = [pre_hook_helper] # manually register post_hook to ensure it is inserted into the head. 
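The hook registries here are ordered dicts, so "was our hook registered last?" is a peek at the final key, and "run our post-hook first" is a move_to_end(..., last=False). A tiny sketch of just those two operations (hook values are placeholders):

from collections import OrderedDict

hooks = OrderedDict()
hooks[0] = 'user_hook'
hooks[1] = 'attrs_hook'

last_key = next(reversed(hooks))
print(hooks[last_key] == 'attrs_hook')  # True: our hook is the last one registered

hooks.move_to_end(1, last=False)        # promote it to run before user hooks
print(list(hooks.values()))             # ['attrs_hook', 'user_hook']
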
if not is_already_registered(is_pre_hook=False): post_hook_helper = self.register_forward_post_hook( - set_op_customized_attrs_post_hook) + set_op_customized_attrs_post_hook + ) if len(self._forward_post_hooks) > 1: - self._forward_post_hooks.move_to_end(post_hook_helper._hook_id, - last=False) + self._forward_post_hooks.move_to_end( + post_hook_helper._hook_id, last=False + ) assert len(self._op_recorder.hooks) == 1 @@ -1144,7 +1234,6 @@ def __getattr__(self, name): return object.__getattribute__(self, name) def __setattr__(self, name, value): - def _remove_if_exist(*dicts): for d in dicts: if name in d: @@ -1156,10 +1245,14 @@ def _remove_if_exist(*dicts): if isinstance(value, framework.Parameter): if params is None: raise ValueError( - "super(YourLayer, self).__init__() should be called first") + "super(YourLayer, self).__init__() should be called first" + ) if len(self._loaddict_holder) > 0: - assert value.name in self._loaddict_holder, "Parameter not found, Can't not find [ {} ] in state_dict".format( - value.name) + assert ( + value.name in self._loaddict_holder + ), "Parameter not found, Can't not find [ {} ] in state_dict".format( + value.name + ) value.set_value(self._loaddict_holder[value.name]) @@ -1168,9 +1261,10 @@ def _remove_if_exist(*dicts): elif params is not None and name in params: if value is not None: raise TypeError( - "assignment to parameter '{}' should be of type Parameter or None, but got '{}'" - .format(name, - type(value).__name__)) + "assignment to parameter '{}' should be of type Parameter or None, but got '{}'".format( + name, type(value).__name__ + ) + ) params[name] = None else: layers = self.__dict__.get('_sub_layers', None) @@ -1185,9 +1279,10 @@ def _remove_if_exist(*dicts): elif layers is not None and name in layers: if value is not None: raise TypeError( - "assignment to sublayer '{}' should be of type Layer or None, but got '{}'" - .format(name, - type(value).__name__)) + "assignment to sublayer '{}' should be of type Layer or None, but got '{}'".format( + name, type(value).__name__ + ) + ) layers[name] = None else: _buffers = self.__dict__.get('_buffers', None) @@ -1196,8 +1291,9 @@ def _remove_if_exist(*dicts): raise ValueError( "super(YourLayer, self).__init__() should be called first" ) - _remove_if_exist(self.__dict__, self._parameters, - self._sub_layers) + _remove_if_exist( + self.__dict__, self._parameters, self._sub_layers + ) # Set persistable=False by default. Only `register_buffer` can # add a persistable buffer. if name not in self._buffers: @@ -1211,6 +1307,7 @@ def _remove_if_exist(*dicts): # value via `assign`. if type(value) == framework.Variable: from paddle import assign + # Note(zhhsplendid): the condition below happens in PaddleGan model, # but should all non-Variable _buffers[name] be re-assign? We # should consider it in the future. I current wrote this as @@ -1218,18 +1315,23 @@ def _remove_if_exist(*dicts): if in_declarative_mode() and _buffers[name] is None: raise RuntimeError( 'In Dy2stat, self.{0} is a buffer and self.{0} is ' - 'not allowed to be set to Variable when self.{0} is None.' 
- .format(name)) - elif _buffers[name] is None or type(getattr( - self, name)) == core.VarBase: + 'not allowed to be set to Variable when self.{0} is None.'.format( + name + ) + ) + elif ( + _buffers[name] is None + or type(getattr(self, name)) == core.VarBase + ): _buffers[name] = assign(value) else: assign(value, getattr(self, name)) elif value is not None: raise TypeError( - "assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'" - .format(name, - type(value).__name__)) + "assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'".format( + name, type(value).__name__ + ) + ) else: # Assigning None will remove the buffer, but if re-assign a new varBase to it, # it will be remarked as a buffer with same `persistable` attribute. @@ -1316,12 +1418,14 @@ def register_state_dict_hook(self, hook): self._state_dict_hooks[hook_remove_helper._hook_id] = hook return hook_remove_helper - def _obtain_parameters_buffers(self, - destination=None, - include_sublayers=True, - structured_name_prefix=""): + def _obtain_parameters_buffers( + self, + destination=None, + include_sublayers=True, + structured_name_prefix="", + ): """ - The difference from state_dict() is that state_dict_hook will not be called, + The difference from state_dict() is that state_dict_hook will not be called, but the original types of parameters and buffers will be maintained. """ if destination is None: @@ -1330,7 +1434,10 @@ def _obtain_parameters_buffers(self, if data is not None: destination[structured_name_prefix + name] = data for name, buffer in self._buffers.items(): - if buffer is not None and name not in self._non_persistable_buffer_names_set: + if ( + buffer is not None + and name not in self._non_persistable_buffer_names_set + ): destination[structured_name_prefix + name] = buffer if include_sublayers: @@ -1339,17 +1446,22 @@ def _obtain_parameters_buffers(self, destination_temp = destination.copy() destination_temp.update( layer_item._obtain_parameters_buffers( - destination_temp, include_sublayers, - structured_name_prefix + layer_name + ".")) + destination_temp, + include_sublayers, + structured_name_prefix + layer_name + ".", + ) + ) destination = destination_temp return destination - def _state_dict_impl(self, - destination=None, - include_sublayers=True, - structured_name_prefix="", - include_non_persistable_buffer=False, - use_hook=True): + def _state_dict_impl( + self, + destination=None, + include_sublayers=True, + structured_name_prefix="", + include_non_persistable_buffer=False, + use_hook=True, + ): """ Get all parameters and persistable buffers of current layer and its sub-layers. 
And set them into a dict @@ -1367,7 +1479,10 @@ def _state_dict_impl(self, destination[structured_name_prefix + name] = data for name, buffer in self._buffers.items(): if not include_non_persistable_buffer: - if buffer is not None and name not in self._non_persistable_buffer_names_set: + if ( + buffer is not None + and name not in self._non_persistable_buffer_names_set + ): destination[structured_name_prefix + name] = buffer else: if buffer is not None: @@ -1379,9 +1494,13 @@ def _state_dict_impl(self, destination_temp = destination.copy() destination_temp.update( layer_item._state_dict_impl( - destination_temp, include_sublayers, + destination_temp, + include_sublayers, structured_name_prefix + layer_name + ".", - include_non_persistable_buffer, use_hook)) + include_non_persistable_buffer, + use_hook, + ) + ) destination = destination_temp if use_hook: for state_dict_hook in self._state_dict_hooks.values(): @@ -1391,12 +1510,15 @@ def _state_dict_impl(self, return destination - def to_static_state_dict(self, - destination=None, - include_sublayers=True, - structured_name_prefix="", - use_hook=True): + def to_static_state_dict( + self, + destination=None, + include_sublayers=True, + structured_name_prefix="", + use_hook=True, + ): ''' + Get all parameters and buffers of current layer and its sub-layers. And set them into a dict Parameters: @@ -1405,7 +1527,7 @@ def to_static_state_dict(self, use_hook(bool, optional) : If true, the operations contained in _state_dict_hooks will be appended to the destination. Default: True Retruns: - dict: a dict contains all the parameters and persistable buffers. + dict, a dict contains all the parameters and persistable buffers. Examples: .. code-block:: python @@ -1423,13 +1545,16 @@ def to_static_state_dict(self, include_sublayers=include_sublayers, structured_name_prefix=structured_name_prefix, include_non_persistable_buffer=True, - use_hook=use_hook) - - def state_dict(self, - destination=None, - include_sublayers=True, - structured_name_prefix="", - use_hook=True): + use_hook=use_hook, + ) + + def state_dict( + self, + destination=None, + include_sublayers=True, + structured_name_prefix="", + use_hook=True, + ): ''' Get all parameters and persistable buffers of current layer and its sub-layers. 
And set them into a dict @@ -1457,7 +1582,8 @@ def state_dict(self, include_sublayers=include_sublayers, structured_name_prefix=structured_name_prefix, include_non_persistable_buffer=False, - use_hook=use_hook) + use_hook=use_hook, + ) @framework.deprecate_stat_dict def set_state_dict(self, state_dict, use_structured_name=True): @@ -1489,22 +1615,31 @@ def _check_match(key, param): state = state_dict.get(key, None) if state is None: raise ValueError( - "{} is not found in the provided dict.".format(key)) - if (isinstance(state, dict) or isinstance(state, list)): - if (len(state) != len(param)): - raise ValueError("{} receieves the length of {}, " - "but the expected shape is {}".format( - key, len(state), len(param))) + "{} is not found in the provided dict.".format(key) + ) + if isinstance(state, dict) or isinstance(state, list): + if len(state) != len(param): + raise ValueError( + "{} receieves the length of {}, " + "but the expected shape is {}".format( + key, len(state), len(param) + ) + ) else: return param, state else: - state_shape = state.shape() if inspect.ismethod( - state.shape) else state.shape + state_shape = ( + state.shape() + if inspect.ismethod(state.shape) + else state.shape + ) if list(state_shape) != list(param.shape): raise ValueError( - "{} receives a shape {}, but the expected shape is {}.". - format(key, list(state_shape), list(param.shape))) + "{} receives a shape {}, but the expected shape is {}.".format( + key, list(state_shape), list(param.shape) + ) + ) return param, state matched_param_state = [] @@ -1541,8 +1676,10 @@ def _set_var(var, ndarray): executor = Executor(_get_device())._default_executor # restore parameter states core._create_loaded_parameter( - [param for param, state in matched_param_state], global_scope(), - executor) + [param for param, state in matched_param_state], + global_scope(), + executor, + ) for param, state in matched_param_state: _set_var(param, state) @@ -1559,7 +1696,7 @@ def to(self, device=None, dtype=None, blocking=None): blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be asynchronous with respect to the host. Otherwise, the argument has no effect. If None, the blocking is set True. Default: None. - + Returns: self @@ -1594,11 +1731,13 @@ def to(self, device=None, dtype=None, blocking=None): # [ 0.33960250, 0.96878713]]) ''' - return self._to_impl(device=device, - dtype=dtype, - blocking=blocking, - include_sublayers=True, - floating_only=False) + return self._to_impl( + device=device, + dtype=dtype, + blocking=blocking, + include_sublayers=True, + floating_only=False, + ) def _apply(self, func, device, dtype, blocking, include_sublayers=True): if include_sublayers: @@ -1612,8 +1751,9 @@ def _apply(self, func, device, dtype, blocking, include_sublayers=True): if param.grad is not None: with no_grad(): - grad_applied = func(param._grad_ivar(), device, dtype, - blocking) + grad_applied = func( + param._grad_ivar(), device, dtype, blocking + ) for key, buf in self._buffers.items(): if buf is not None: @@ -1637,12 +1777,14 @@ def _transform(self, t, device, dtype, blocking): # Note(zhangbo): Paddle GPU minimum memory allocation unit is 256 bytes, waiting_alloc_memory will comput ‘t’ occupied memory space. # Coefficient 1.2 is used to avoid OOM that may occur in this critical state when the memory is just enough. 
waiting_alloc_memory = ( - (np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 + ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 + ) gpu_memory_available = core.gpu_memory_available() if gpu_memory_available < waiting_alloc_memory: # Copy param / Tensor to cpu - t_used = t._copy_to(paddle.CPUPlace(), - blocking) # k-v type will error + t_used = t._copy_to( + paddle.CPUPlace(), blocking + ) # k-v type will error # Release mem of t t.value().get_tensor()._clear() else: @@ -1653,7 +1795,8 @@ def _transform(self, t, device, dtype, blocking): # 2. cast param / Tensor to dtype if dtype is not None and dtype != t_used.dtype: with paddle.fluid.framework._dygraph_place_guard( - place=t_used.place): + place=t_used.place + ): t_casted = t_used.cast(dtype=dtype) else: t_casted = t_used @@ -1671,12 +1814,14 @@ def _transform(self, t, device, dtype, blocking): return t - def _to_impl(self, - device=None, - dtype=None, - blocking=None, - include_sublayers=True, - floating_only=False): + def _to_impl( + self, + device=None, + dtype=None, + blocking=None, + include_sublayers=True, + floating_only=False, + ): ''' Cast the parameters and buffers of Layer by the give device, dtype and blocking. @@ -1689,7 +1834,7 @@ def _to_impl(self, blocking(bool|None, optional): If False and the source is in pinned memory, the copy will be asynchronous with respect to the host. Otherwise, the argument has no effect. If None, the blocking is set True. Default: None. - + include_sublayers(bool|True, optional): If True, deal with self and all sublayers parameters and buffers, if not only deal with self parameters and buffers. Default: True. floating_only(bool|False, optional): If True, only cast all floating point parameters and buffers of Layer by the give device, dtype and blocking. 
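The device-transfer path above estimates how much GPU memory the casted tensor will need by rounding its raw byte size up to Paddle's 256-byte allocation unit and adding a 20% safety margin, and stages the copy through CPU memory when that estimate does not fit. A sketch of just the arithmetic:

import numpy as np

def waiting_alloc_memory(shape, element_size):
    # element_size in bytes, e.g. 4 for float32.
    return ((np.prod(shape) * element_size) / 256 + 1) * 256 * 1.2

print(waiting_alloc_memory([1024, 1024], 4))  # ~5.03e6 bytes for a float32 1024x1024 tensor
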
@@ -1705,20 +1850,28 @@ def _to_impl(self, if device is not None: if isinstance(device, str): device = paddle.device._convert_to_place(device) - elif isinstance(device, (core.CPUPlace, core.CUDAPlace, - core.CUDAPinnedPlace, core.XPUPlace)): + elif isinstance( + device, + ( + core.CPUPlace, + core.CUDAPlace, + core.CUDAPinnedPlace, + core.XPUPlace, + ), + ): pass else: raise ValueError( "device value error, must be str, paddle.CPUPlace(), paddle.CUDAPlace(), paddle.CUDAPinnedPlace() or paddle.XPUPlace(), but the type of device is " - + type(device).__name__) + + type(device).__name__ + ) if blocking is None: blocking = True else: assert isinstance( - blocking, - bool), "blocking value error, must be the True, False or None" + blocking, bool + ), "blocking value error, must be the True, False or None" def transform(t, device, dtype, blocking): if floating_only and (not paddle.is_floating_point(t)): diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index 91f22842a45612..7b1ea80d872955 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -420,7 +420,7 @@ def sync_params_buffers(model, paddle.distributed.broadcast(coalesced_var, src=src_rank, group=comm_group, - use_calc_stream=True) + sync_op=True) for coalesced_var, origin_vars, var_shapes in coalesced_vars: var_len = [np.prod(v_shape) for v_shape in var_shapes] diff --git a/python/paddle/fluid/dygraph/varbase_patch_methods.py b/python/paddle/fluid/dygraph/varbase_patch_methods.py index cb6907d842ca6a..b5f89b295c7982 100644 --- a/python/paddle/fluid/dygraph/varbase_patch_methods.py +++ b/python/paddle/fluid/dygraph/varbase_patch_methods.py @@ -919,7 +919,7 @@ def values(self): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1, 2, 3, 4, 5] dense_shape = [3, 4] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int32'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape) + sparse_x = paddle.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int32'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape) print(sparse_x.values()) #[1, 2, 3, 4, 5] """ @@ -944,7 +944,7 @@ def to_dense(self): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1, 2, 3, 4, 5] dense_shape = [3, 4] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int64'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape) + sparse_x = paddle.sparse.sparse_coo_tensor(paddle.to_tensor(indices, dtype='int64'), paddle.to_tensor(values, dtype='float32'), shape=dense_shape) dense_x = sparse_x.to_dense() #[[0., 1., 0., 2.], # [0., 0., 3., 0.], diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 447cab7119b8c6..21e4cc6644d7af 100755 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -78,7 +78,7 @@ def _switch_scope(scope): @signature_safe_contextmanager def scope_guard(scope): """ - + This function switches scope through python `with` statement. Scope records the mapping between variable names and variables ( :ref:`api_guide_Variable` ), similar to brackets in programming languages. @@ -96,7 +96,7 @@ def scope_guard(scope): None Examples: - + .. code-block:: python import paddle @@ -147,10 +147,12 @@ def as_numpy(tensor, copy=False): assert isinstance(tensor, core.LoDTensor) lod = tensor.lod() if len(lod) > 0: - raise RuntimeError("Some of your fetched tensors hold LoD information. 
\ + raise RuntimeError( + "Some of your fetched tensors hold LoD information. \ They can not be completely cast to Python ndarray. \ Please set the parameter 'return_numpy' as 'False' to \ - return LoDTensor itself directly.") + return LoDTensor itself directly." + ) if tensor._is_initialized(): if copy: return np.array(tensor) @@ -164,10 +166,10 @@ def dtype_is_compatible_with(first, second): """ Returns True if the first dtype can be compatible the second one. Currently, we require the two dtype's have to be same. - + Args: dtype (np.dtype|VarType|str): The type of data: float32, int64, etc. - + Returns: True if the two types are same. """ @@ -223,7 +225,7 @@ def check_feed_shape_type(var, feed, num_places=1): 2. Each non-negative number of the two dimensions are same. 3. For negative number or 'None' in a dimension, it means unknown so it is compatible with any number. - + Args: var (Variable): the Variable object feed (LoDTensor): the fed value, which must be a LoDTensor @@ -240,21 +242,29 @@ def check_feed_shape_type(var, feed, num_places=1): if diff_shape is not None: raise ValueError( 'The fed Variable %r should have dimensions = %d, shape = ' - '%r, but received fed shape %r on each device' % - (var.name, len(var.shape), var.shape, diff_shape)) + '%r, but received fed shape %r on each device' + % (var.name, len(var.shape), var.shape, diff_shape) + ) if not dtype_is_compatible_with(feed._dtype(), var.dtype): - var_dtype_format = convert_dtype(var.dtype) if isinstance( - var.dtype, core.VarDesc.VarType) else var.dtype - feed_dtype_format = convert_dtype(feed._dtype()) if isinstance( - feed._dtype(), core.VarDesc.VarType) else feed._dtype() + var_dtype_format = ( + convert_dtype(var.dtype) + if isinstance(var.dtype, core.VarDesc.VarType) + else var.dtype + ) + feed_dtype_format = ( + convert_dtype(feed._dtype()) + if isinstance(feed._dtype(), core.VarDesc.VarType) + else feed._dtype() + ) raise ValueError( - 'The data type of fed Variable %r must be %r, but received %r' % - (var.name, var_dtype_format, feed_dtype_format)) + 'The data type of fed Variable %r must be %r, but received %r' + % (var.name, var_dtype_format, feed_dtype_format) + ) return True def has_feed_operators(block, feed_targets, feed_holder_name): - """ Check whether the block already has feed operators. + """Check whether the block already has feed operators. Return false if the block does not have any feed operators. If some feed operators have been prepended to the block, check that @@ -283,20 +293,22 @@ def has_feed_operators(block, feed_targets, feed_holder_name): if feed_target_name not in feed_targets: raise Exception( "'feed_targets' does not have {} variable".format( - feed_target_name)) + feed_target_name + ) + ) else: break if feed_count > 0 and feed_count != len(feed_targets): raise Exception( - "Feed operators in program desc do not match 'feed_targets'") + "Feed operators in program desc do not match 'feed_targets'" + ) return feed_count > 0 -def has_fetch_operators(block, - fetch_targets, - fetch_holder_name, - fetch_op='fetch'): - """ Check whether the block already has fetch operators. +def has_fetch_operators( + block, fetch_targets, fetch_holder_name, fetch_op='fetch' +): + """Check whether the block already has fetch operators. Return false if the block does not have any fetch operators. 
If some fetch operators have been appended to the block, check that @@ -324,25 +336,25 @@ def has_fetch_operators(block, assert op.desc.output('Out')[0] == fetch_holder_name fetch_target_name = op.desc.input('X')[0] if fetch_target_name not in [ - var.desc.name() for var in fetch_targets + var.desc.name() for var in fetch_targets ]: raise Exception( "'fetch_targets' does not have {} variable".format( - fetch_target_name)) + fetch_target_name + ) + ) idx = op.desc.attr('col') assert fetch_target_name == fetch_targets[idx].desc.name() if fetch_count > 0 and fetch_count != len(fetch_targets): raise Exception( - "Fetch operators in program desc do not match 'fetch_targets'") + "Fetch operators in program desc do not match 'fetch_targets'" + ) return fetch_count > 0 -def _add_feed_fetch_ops(program, - feed, - fetch_list, - feed_var_name, - fetch_var_name, - use_fetch_v2=False): +def _add_feed_fetch_ops( + program, feed, fetch_list, feed_var_name, fetch_var_name, use_fetch_v2=False +): tmp_program = program.clone() global_block = tmp_program.global_block() @@ -353,7 +365,8 @@ def _add_feed_fetch_ops(program, feed_var = global_block.create_var( name=feed_var_name, type=core.VarDesc.VarType.FEED_MINIBATCH, - persistable=True) + persistable=True, + ) if fetch_var_name in global_block.vars: fetch_var = global_block.var(fetch_var_name) @@ -361,21 +374,25 @@ def _add_feed_fetch_ops(program, fetch_var = global_block.create_var( name=fetch_var_name, type=core.VarDesc.VarType.FETCH_LIST, - persistable=True) + persistable=True, + ) # prepend feed operators if not has_feed_operators(global_block, feed, feed_var_name): for i, name in enumerate(feed): if global_block.has_var(name): out = global_block.var(name) - global_block._prepend_op(type='feed', - inputs={'X': [feed_var]}, - outputs={'Out': [out]}, - attrs={'col': i}) + global_block._prepend_op( + type='feed', + inputs={'X': [feed_var]}, + outputs={'Out': [out]}, + attrs={'col': i}, + ) else: warnings.warn( "The variable %s is not found in program. It is not declared or is pruned." 
- % name) + % name + ) if use_fetch_v2: fetch_op = 'fetch_v2' @@ -383,22 +400,26 @@ def _add_feed_fetch_ops(program, fetch_op = 'fetch' # append fetch_operators - if not has_fetch_operators(global_block, fetch_list, fetch_var_name, - fetch_op): + if not has_fetch_operators( + global_block, fetch_list, fetch_var_name, fetch_op + ): for i, var in enumerate(fetch_list): assert isinstance(var, Variable) or isinstance( - var, six.string_types), ("Wrong type for fetch_list[%s]: %s" % - (i, type(var))) - global_block.append_op(type=fetch_op, - inputs={'X': [var]}, - outputs={'Out': [fetch_var]}, - attrs={'col': i}) + var, six.string_types + ), "Wrong type for fetch_list[%s]: %s" % (i, type(var)) + global_block.append_op( + type=fetch_op, + inputs={'X': [var]}, + outputs={'Out': [fetch_var]}, + attrs={'col': i}, + ) return tmp_program -def _apply_inplace_addto_pass(program, enable_inplace, enable_addto, - skip_var_names): +def _apply_inplace_addto_pass( + program, enable_inplace, enable_addto, skip_var_names +): use_cuda = True if core.is_compiled_with_cuda() else False attrs = {"use_cuda": use_cuda, "mem_opt_skip_vars": skip_var_names} @@ -407,12 +428,14 @@ def _apply_inplace_addto_pass(program, enable_inplace, enable_addto, empty_startup_program = Program() if enable_inplace: pass_name = "buffer_shared_inplace_pass" - _apply_pass(program, empty_startup_program, pass_name, attrs, - attr_types) + _apply_pass( + program, empty_startup_program, pass_name, attrs, attr_types + ) if enable_addto and use_cuda: pass_name = "inplace_addto_op_pass" - _apply_pass(program, empty_startup_program, pass_name, attrs, - attr_types) + _apply_pass( + program, empty_startup_program, pass_name, attrs, attr_types + ) def _fetch_var(name, scope=None, return_numpy=True): @@ -441,7 +464,8 @@ def _fetch_var(name, scope=None, return_numpy=True): assert var is not None, ( "Cannot find " + name + " in scope. Perhaps you need to make the" " variable persistable by using var.persistable = True in your" - " program.") + " program." 
+ ) tensor = var.get_tensor() if return_numpy: tensor = as_numpy(tensor, copy=True) @@ -449,7 +473,6 @@ def _fetch_var(name, scope=None, return_numpy=True): def _to_name_str(var): - def _to_str(var): if isinstance(var, Variable): return var.desc.name() @@ -474,19 +497,26 @@ def _to_str(var): def _is_enable_standalone_executor(): - return framework._enable_standalone_executor_ is None or framework._enable_standalone_executor_ in [ - 1, '1', True, 'True', 'true' - ] + return ( + framework._enable_standalone_executor_ is None + or framework._enable_standalone_executor_ + in [1, '1', True, 'True', 'true'] + ) def _is_dy2st_enable_standalone_executor(): return framework._dy2st_enable_standalone_executor_ in [ - 1, '1', True, 'True', 'true' + 1, + '1', + True, + 'True', + 'true', ] def _prepare_fleet_executor(): from ..distributed.fleet.proto import fleet_executor_desc_pb2 + trainer_endpoints_str = os.getenv("PADDLE_TRAINER_ENDPOINTS", "") trainer_endpoints = trainer_endpoints_str.split(',') fleet_exe_desc = fleet_executor_desc_pb2.FleetExecutorDesc() @@ -504,7 +534,8 @@ def _prepare_fleet_executor(): def _get_strong_program_cache_key_for_new_exe(program, feed, fetch_list): return program.desc.cached_hash_str() + _get_program_cache_key( - feed, fetch_list) + feed, fetch_list + ) def _get_strong_program_cache_key(program, feed, fetch_list): @@ -515,10 +546,16 @@ def _get_varname_from_block(block): block_str.append(var_name) return "\n".join(block_str) - inner_program = program._program if isinstance( - program, compiler.CompiledProgram) else program - return _get_varname_from_block(inner_program.blocks[0]) + str( - id(program)) + _get_program_cache_key(feed, fetch_list) + inner_program = ( + program._program + if isinstance(program, compiler.CompiledProgram) + else program + ) + return ( + _get_varname_from_block(inner_program.blocks[0]) + + str(id(program)) + + _get_program_cache_key(feed, fetch_list) + ) def _get_program_cache_key(feed, fetch_list): @@ -534,30 +571,35 @@ def _get_program_cache_key(feed, fetch_list): def _as_lodtensor(data, place, dtype=None): """ - Convert numpy.ndarray to Tensor, its only support Tensor without LoD information. - For higher dimensional sequence data, please use LoDTensor directly. + Convert numpy.ndarray to Tensor, its only support Tensor without LoD information. + For higher dimensional sequence data, please use LoDTensor directly. - Examples: - >>> import paddle.fluid as fluid - >>> place = fluid.CPUPlace() - >>> exe = fluid.executor(place) - >>> data = np.array(size=(100, 200, 300)) - >>> np_outs = map(lambda x: fluid.executor._as_lodtensor(x, place), data) - >>> ... + Examples: + >>> import paddle.fluid as fluid + >>> place = fluid.CPUPlace() + >>> exe = fluid.executor(place) + >>> data = np.array(size=(100, 200, 300)) + >>> np_outs = map(lambda x: fluid.executor._as_lodtensor(x, place), data) + >>> ... 
- Args: - data(numpy.ndarray|list|tuple|scalar): a instance of array, scalar, list or tuple - data(core.Place): the place of created tensor - dtype(core.VarDesc.VarType|str): the expected data type of created tensor + Args: + data(numpy.ndarray|list|tuple|scalar): a instance of array, scalar, list or tuple + data(core.Place): the place of created tensor + dtype(core.VarDesc.VarType|str): the expected data type of created tensor - Returns: - LoDTensor - """ - #NOTE(zhiqiu): convert python builtin, like float, int, and list, to numpy ndarray + Returns: + LoDTensor + """ + # NOTE(zhiqiu): convert python builtin, like float, int, and list, to numpy ndarray if not isinstance(data, np.ndarray): - assert dtype is not None, 'The dtype should be given when feed data is not np.ndarray' - dtype = convert_dtype(dtype) if isinstance( - dtype, core.VarDesc.VarType) else dtype + assert ( + dtype is not None + ), 'The dtype should be given when feed data is not np.ndarray' + dtype = ( + convert_dtype(dtype) + if isinstance(dtype, core.VarDesc.VarType) + else dtype + ) if np.isscalar(data): data = np.array([data]).astype(dtype) elif isinstance(data, (list, tuple)): @@ -572,7 +614,9 @@ def _as_lodtensor(data, place, dtype=None): else: raise TypeError( "Convert data of type {} to Tensor is not supported".format( - type(data))) + type(data) + ) + ) # convert numpy.ndarray to tensor tensor = core.LoDTensor() @@ -581,7 +625,6 @@ def _as_lodtensor(data, place, dtype=None): class FetchHandler(object): - def __init__(self, var_dict=None, period_secs=60): assert var_dict != None self.var_dict = var_dict @@ -595,7 +638,8 @@ def handler(self, res_dict): @staticmethod def help(): - print(""" + print( + """ class FetchHandlerExample(FetchHandler): def handler(self, res_dict): print(res_dict["auc"]) @@ -604,11 +648,11 @@ def handler(self, res_dict): auc = Variable() var_dict = {"auc": auc} handler = FetchHandlerExample(var_dict=var_dict) -""") +""" + ) class _StandaloneExecutor(object): - def __init__(self, place, main_program, scope): self._place = core.Place() self._place.set_place(place) @@ -621,15 +665,16 @@ def run(self, scope, feed_names, fetch_list, return_numpy=True): Args: feed_names(list): This parameter represents the input names of the model. fetch_list(list): This parameter represents the Tensors that need to be returned - after the model runs. The default is None. + after the model runs. The default is None. return_numpy(bool): This parameter indicates whether convert the fetched Tensors (the Tensor specified in the fetch list) to numpy.ndarray. if it is False, the type of the return value is a list of :code:`LoDTensor`. The default is True. """ fetch_list = self._check_fetch(fetch_list) - tensors = self._new_exe.run(scope, feed_names, - fetch_list)._move_to_list() + tensors = self._new_exe.run( + scope, feed_names, fetch_list + )._move_to_list() if return_numpy: return as_numpy(tensors, copy=True) else: @@ -642,10 +687,10 @@ def _create_new_executor(self): def _update_feed(self, feed): """ - Update the feed dict, remove the feed item which is pruned in program. + Update the feed dict, remove the feed item which is pruned in program. Notes: This is a very low level API. Users should not use this API - directly. + directly. Args: feed(list|dict): feed dict or list. @@ -661,8 +706,9 @@ def _update_feed(self, feed): if not isinstance(feed, dict): raise TypeError( - "feed requires dict as its Parameter. But you passed in %s" % - (type(feed))) + "feed requires dict as its Parameter. 
But you passed in %s" + % (type(feed)) + ) global_block = self._main_program.global_block() for feed_name in list(feed.keys()): @@ -670,7 +716,8 @@ def _update_feed(self, feed): feed.pop(feed_name) warnings.warn( "The variable %s is not found in program. It is not declared or is pruned." - % feed_name) + % feed_name + ) return feed @@ -684,19 +731,27 @@ def _check_fetch(self, fetch_list): fetch_var = fetch_var.name elif not isinstance(fetch_var, str): raise TypeError( - "Required fetch_var shall be str|Variable, but received {}". - format(type(fetch_var).__name__)) + "Required fetch_var shall be str|Variable, but received {}".format( + type(fetch_var).__name__ + ) + ) res.append(fetch_var) return res class _ExecutorCache(object): - class _CachedData(object): - - def __init__(self, program, feed, fetch_list, feed_var_name, - fetch_var_name, place, scope): + def __init__( + self, + program, + feed, + fetch_list, + feed_var_name, + fetch_var_name, + place, + scope, + ): self.program = program self.feed = feed self.fetch_list = fetch_list @@ -708,17 +763,29 @@ def __init__(self, program, feed, fetch_list, feed_var_name, # NOTE(Ruibiao): Not all changeable item is considered for key at present, # ONLY: program, feed, and fetch_list if isinstance(self.program, compiler.CompiledProgram): + if not self.program._program: + # The program holds no _program, maybe it is constructed by graph. + # Convert graph to program in order to generate key. + self.program._program = framework.IrGraph( + self.program._graph + ).to_program() self.key = hash( _get_strong_program_cache_key_for_new_exe( - self.program._program, feed, fetch_list)) + self.program._program, feed, fetch_list + ) + ) else: self.key = hash( _get_strong_program_cache_key_for_new_exe( - self.program, feed, fetch_list)) + self.program, feed, fetch_list + ) + ) def __eq__(self, other): - return isinstance( - other, _ExecutorCache._CachedData) and self.key == other.key + return ( + isinstance(other, _ExecutorCache._CachedData) + and self.key == other.key + ) def __hash__(self): return self.key @@ -728,21 +795,41 @@ def __init__(self): # the _ExecutorCache instance, otherwise a global cache may not be released after # the Executor instance deleted self._get_cached_program_and_executor = lru_cache(maxsize=8)( - self._get_program_and_executor) + self._get_program_and_executor + ) def clear(self): self._get_cached_program_and_executor.cache_clear() - def get_program_and_executor(self, program, feed, fetch_list, feed_var_name, - fetch_var_name, place, scope): + def get_program_and_executor( + self, + program, + feed, + fetch_list, + feed_var_name, + fetch_var_name, + place, + scope, + ): return self._get_cached_program_and_executor( - self._CachedData(program, feed, fetch_list, feed_var_name, - fetch_var_name, place, scope)) + self._CachedData( + program, + feed, + fetch_list, + feed_var_name, + fetch_var_name, + place, + scope, + ) + ) def _get_program_and_executor(self, cached_data): program = cached_data.program - inner_program = program._program if isinstance( - program, compiler.CompiledProgram) else program + inner_program = ( + program._program + if isinstance(program, compiler.CompiledProgram) + else program + ) feed = cached_data.feed fetch_list = cached_data.fetch_list feed_var_name = cached_data.feed_var_name @@ -752,9 +839,13 @@ def _get_program_and_executor(self, cached_data): # To apply IR pass, compile the Program to IrGraph and convert it back to Program if isinstance(program, compiler.CompiledProgram) or isinstance( - program._graph, 
compiler.CompiledProgram): - compiled_program = program if isinstance( - program, compiler.CompiledProgram) else program._graph + program._graph, compiler.CompiledProgram + ): + compiled_program = ( + program + if isinstance(program, compiler.CompiledProgram) + else program._graph + ) build_strategy = compiled_program._build_strategy # print(f"Program before convert:\n {inner_program}", flush=True) compiled_program._compile(scope, place) @@ -766,37 +857,55 @@ def _get_program_and_executor(self, cached_data): inner_program = converted_program # print(f"Program after convert:\n {inner_program}", flush=True) - warnings.warn( - "FLAGS_USE_STANDALONE_EXECUTOR and FLAGS_CONVERT_GRAPH_TO_PROGRAM is set to 1. Graph will be converted to Program and executed using new executor." - ) else: build_strategy = None from paddle.incubate.autograd import prim_enabled, prim2orig + if prim_enabled() and program == default_main_program(): prim2orig() inner_program = program - program = _add_feed_fetch_ops(program=inner_program, - feed=feed, - fetch_list=fetch_list, - feed_var_name=feed_var_name, - fetch_var_name=fetch_var_name, - use_fetch_v2=True) + program = _add_feed_fetch_ops( + program=inner_program, + feed=feed, + fetch_list=fetch_list, + feed_var_name=feed_var_name, + fetch_var_name=fetch_var_name, + use_fetch_v2=True, + ) - # If there are multiple blocks in the program, subblock will not be executed with the new executor in temporary - if program.num_blocks > 1: - warnings.warn("There are more than 1 block in program.") + if ( + os.environ.get('FLAGS_CONVERT_GRAPH_TO_PROGRAM', None) + in [1, '1', True, 'True', 'true'] + and not program._is_start_up_program_ + ): + if program.num_blocks > 1: + # If there are multiple blocks in the program, subblock will not be executed with the new executor in temporary + logging.warning("There are more than 1 block in program.") + elif program.num_blocks == 1: + logging.warning("There are 1 block in program.") + else: + logging.warning("There are no block in program.") # standalone executor will apply buffer_shared_inplace_pass and # inplace_addto_op_pass to program according to build_strategy - enable_inplace = True if build_strategy is None or build_strategy.enable_inplace else False - enable_addto = True if build_strategy is not None and build_strategy.enable_addto else False + enable_inplace = ( + True + if build_strategy is None or build_strategy.enable_inplace + else False + ) + enable_addto = ( + True + if build_strategy is not None and build_strategy.enable_addto + else False + ) if enable_inplace or enable_addto: # inplace should skip feed and fetch var skip_var_names = eval(_get_program_cache_key(feed, fetch_list)) - _apply_inplace_addto_pass(program, enable_inplace, enable_addto, - skip_var_names) + _apply_inplace_addto_pass( + program, enable_inplace, enable_addto, skip_var_names + ) new_program = program.clone() new_exe = _StandaloneExecutor(place, new_program, scope) @@ -816,10 +925,10 @@ class Executor(object): will set the default device according to its installation version. If Paddle is CPU version, the default device would be set to `CPUPlace()` . If Paddle is GPU version, the default device would be set to `CUDAPlace(0)` . Default is None. - If ``place`` is string, it can be ``cpu``, and ``gpu:x``, where ``x`` + If ``place`` is string, it can be ``cpu``, and ``gpu:x``, where ``x`` is the index of the GPUs. Note: users only pass one Place or None to initialize Executor when using multiple-cards. Other APIs will override the cards. 
See - `document for multiple-cards `_ + `document for multiple-cards `_ Returns: Executor @@ -890,6 +999,7 @@ def __init__(self, place=None): self.ctx_caches = dict() self.trainer_caches = dict() self.scope_caches = dict() + self.micro_scope_cache = dict() self.var_caches = dict() self.pruned_program_caches = dict() p = core.Place() @@ -900,7 +1010,8 @@ def __init__(self, place=None): self._prepare_to_run_called = False self._auto_checkpoint_name = unique_name.generate( - "__auto_checkpoint_executor__") + "__auto_checkpoint_executor__" + ) # NOTE: Whether to use experimental executor `StandaloneExecutor`. self._enable_interpreter_core = _is_enable_standalone_executor() @@ -953,6 +1064,12 @@ def _add_trainer_cache(self, trainer_cache_key, ctx): def _add_scope_cache(self, scope_cache_key, scope): self.scope_caches[scope_cache_key] = scope + def _add_micro_scopes_cache(self, program_cache_key, micro_scopes: list): + self.micro_scope_cache[program_cache_key] = micro_scopes + + def _get_micro_scopes_cache(self, program_cache_key): + return self.micro_scope_cache.get(program_cache_key, None) + # just for testing, will be removed later @lru_cache() def _log_force_set_program_cache(self, use_program_cache): @@ -970,8 +1087,9 @@ def _feed_data(self, program, feed, feed_var_name, scope): var = global_block.var(feed_target_name) if var.dtype != core.VarDesc.VarType.STRINGS: if not isinstance(cur_feed, core.LoDTensor): - cur_feed = _as_lodtensor(cur_feed, self.place, - var.dtype) + cur_feed = _as_lodtensor( + cur_feed, self.place, var.dtype + ) check_feed_shape_type(var, cur_feed) idx = op.desc.attr('col') core.set_feed_variable(scope, cur_feed, feed_var_name, idx) @@ -998,7 +1116,7 @@ def _split_optimize_ops_in_fetch_list(cls, fetch_list): Returns: optimize_ops(list): The optimize operators splited from fetch_list. - fetch_list(list): The updated fetch_list which does not contain optimize operators. + fetch_list(list): The updated fetch_list which does not contain optimize operators. """ _optimize_ops = [] _fetch_list = [] @@ -1009,14 +1127,19 @@ def _get_targets(_optimize_ops, _fetch_list, item): _optimize_ops.append(item) else: raise TypeError( - "The operator in fetch_list is not an optimize_op") - elif isinstance(item, Variable) or isinstance( - item, str) or isinstance(item, six.string_types): + "The operator in fetch_list is not an optimize_op" + ) + elif ( + isinstance(item, Variable) + or isinstance(item, str) + or isinstance(item, six.string_types) + ): _fetch_list.append(item) else: raise TypeError( "The item in fetch_list should be str, variable or optimize_op, but received %s.", - type(item)) + type(item), + ) for index, item in enumerate(fetch_list): # NOTE(zhiqiu): to support (optimizer_ops, param_and_grads) and optimizer_ops in fetch_list @@ -1028,9 +1151,10 @@ def _get_targets(_optimize_ops, _fetch_list, item): elif isinstance(item, tuple): if not isinstance(item[0], (list, tuple)): raise TypeError( - "Requires fetch_list[{}][0] shall be one of (list, tuple) when type(fetch_list[{}]) is `tuple`, but received fetch_list[{}][0]'s type is `{}`." 
- .format(index, index, index, - type(item[0]).__name__)) + "Requires fetch_list[{}][0] shall be one of (list, tuple) when type(fetch_list[{}]) is `tuple`, but received fetch_list[{}][0]'s type is `{}`.".format( + index, index, index, type(item[0]).__name__ + ) + ) for i in item[0]: _get_targets(_optimize_ops, _fetch_list, i) else: @@ -1039,19 +1163,17 @@ def _get_targets(_optimize_ops, _fetch_list, item): return _fetch_list, _optimize_ops @classmethod - def _prune_program(cls, - program, - feed=None, - fetch_list=None, - optimize_ops=None): + def _prune_program( + cls, program, feed=None, fetch_list=None, optimize_ops=None + ): """ Prune operators and variables which are not needed to generate - :code:`fetch_list` and optimize operators. - Prune operators and variables which are needed - to generate variables to be feeded. + :code:`fetch_list` and optimize operators. + Prune operators and variables which are needed + to generate variables to be feeded. Notes: This is a very low level API. Users should not use this API - directly. + directly. Args: program(Program): the origin program @@ -1105,10 +1227,10 @@ def _prune_program(cls, @classmethod def _update_feed(cls, program, feed): """ - Update the feed dict, remove the feed item which is pruned in program. + Update the feed dict, remove the feed item which is pruned in program. Notes: This is a very low level API. Users should not use this API - directly. + directly. Args: program(Program): the pruned program. @@ -1125,6 +1247,7 @@ def _update_feed(cls, program, feed): warnings.warn( "The program holds no _program, maybe it is constructed by graph." ) + return feed else: global_block = program.global_block() @@ -1134,7 +1257,8 @@ def _update_feed(cls, program, feed): feed.pop(feed_name) warnings.warn( "The variable %s is not found in program. It is not declared or is pruned." - % feed_name) + % feed_name + ) elif isinstance(feed, list) or isinstance(feed, tuple): for i, each in enumerate(feed): @@ -1143,7 +1267,8 @@ def _update_feed(cls, program, feed): each.pop(feed_name) warnings.warn( "The variable %s is not found in program. It is not declared or is pruned." - % feed_name) + % feed_name + ) return feed ''' @@ -1178,9 +1303,18 @@ def close(self): del trainer_instance self._default_executor.close() - def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, - return_numpy, return_merged): + def _run_parallel( + self, + program, + scope, + feed, + fetch_list, + fetch_var_name, + return_numpy, + return_merged, + ): from paddle.optimizer.lr import LRScheduler + exe = program._executor # TODO(zhenghuihuang): quantization uses Graph in CompiledProgram # instead of program. 
We will add support for checking Vars in Graph @@ -1195,9 +1329,11 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, if not isinstance(feed_tensor, core.LoDTensor): # always set to CPU place, since the tensor need to be split # it is fast in CPU - feed_tensor = _as_lodtensor(feed[feed_name], - core.CPUPlace(), - var.dtype if var else None) + feed_tensor = _as_lodtensor( + feed[feed_name], + core.CPUPlace(), + var.dtype if var else None, + ) if need_check_feed: check_feed_shape_type(var, feed_tensor, exe.device_count()) feed_tensor_dict[feed_name] = feed_tensor @@ -1208,16 +1344,20 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, for i, each in enumerate(feed): if not isinstance(each, dict): raise TypeError( - "Each element of feed list should be a dict") + "Each element of feed list should be a dict" + ) res_dict = dict() for feed_name in each: tensor = each[feed_name] - var = global_block.var( - feed_name) if need_check_feed else None + var = ( + global_block.var(feed_name) if need_check_feed else None + ) if not isinstance(tensor, core.LoDTensor): - tensor = _as_lodtensor(each[feed_name], - program._places[i], - var.dtype if var else None) + tensor = _as_lodtensor( + each[feed_name], + program._places[i], + var.dtype if var else None, + ) if need_check_feed: check_feed_shape_type(var, tensor) res_dict[feed_name] = tensor @@ -1238,23 +1378,26 @@ def _run_parallel(self, program, scope, feed, fetch_list, fetch_var_name, ) else: exe.feed_and_split_tensor_into_local_scopes( - {lr_sheduler._var_name: lr_tensor}) + {lr_sheduler._var_name: lr_tensor} + ) fetch_var_names = list(map(_to_name_str, fetch_list)) tensors = exe.run(fetch_var_names, return_merged)._move_to_list() return as_numpy(tensors) if return_numpy else tensors - def run(self, - program=None, - feed=None, - fetch_list=None, - feed_var_name='feed', - fetch_var_name='fetch', - scope=None, - return_numpy=True, - use_program_cache=False, - return_merged=True, - use_prune=False): + def run( + self, + program=None, + feed=None, + fetch_list=None, + feed_var_name='feed', + fetch_var_name='fetch', + scope=None, + return_numpy=True, + use_program_cache=False, + return_merged=True, + use_prune=False, + ): """ Run the specified :code:`Program` or :code:`CompiledProgram`. It should be noted that the executor will execute all the operators in :code:`Program` or :code:`CompiledProgram` without pruning some @@ -1278,12 +1421,12 @@ def run(self, so the length of this list should be equal to the number of places. The default is None. fetch_list(list): This parameter represents the Tensors that need to be returned - after the model runs. The default is None. + after the model runs. The default is None. feed_var_name(str): This parameter represents the name of the input Tensor of the feed operator. The default is "feed". fetch_var_name(str): This parameter represents the name of the output Tensor of the fetch operator. The default is "fetch". - scope(Scope): the scope used to run this program, you can switch + scope(Scope): the scope used to run this program, you can switch it to different scope. default is :code:`paddle.static.global_scope()` return_numpy(bool): This parameter indicates whether convert the fetched Tensors (the Tensor specified in the fetch list) to numpy.ndarray. if it is False, @@ -1304,14 +1447,14 @@ def run(self, results are variant, please set :code:`return_merged` as False, which denotes that the fetched results will not be merged. 
The default is True, but it is just for the compatibility, and may use False as default value in the future version. - use_prune(bool): This parameter indicates whether the input :code:`Program` will be pruned. + use_prune(bool): This parameter indicates whether the input :code:`Program` will be pruned. If the parameter is True, the program will be pruned accroding to the given feed and fetch_list, - which means the operators and variables in program that generate :code:`feed` and are not - needed to generate :code:`fetch_list` will be pruned. The default is False, which means the + which means the operators and variables in program that generate :code:`feed` and are not + needed to generate :code:`fetch_list` will be pruned. The default is False, which means the program will not pruned and all the operators and variables will be executed during running. - Note that if the tuple returned from :code:`Optimizer.minimize()` is passed to :code:`fetch_list`, + Note that if the tuple returned from :code:`Optimizer.minimize()` is passed to :code:`fetch_list`, :code:`use_prune` will be overrided to True, and the program will be pruned. - + Returns: List: The fetched result list. @@ -1429,32 +1572,49 @@ def run(self, """ # Temporary FLAGS, just for testing the performance of program cache force_use_program_cache = os.environ.get( - 'FLAGS_FORCE_USE_PROGRAM_CACHE', None) + 'FLAGS_FORCE_USE_PROGRAM_CACHE', None + ) if force_use_program_cache is not None: use_program_cache = force_use_program_cache in [ - 1, '1', True, 'True', 'true' + 1, + '1', + True, + 'True', + 'true', ] self._log_force_set_program_cache(use_program_cache) try: - res = self._run_impl(program=program, - feed=feed, - fetch_list=fetch_list, - feed_var_name=feed_var_name, - fetch_var_name=fetch_var_name, - scope=scope, - return_numpy=return_numpy, - use_program_cache=use_program_cache, - use_prune=use_prune, - return_merged=return_merged) + res = self._run_impl( + program=program, + feed=feed, + fetch_list=fetch_list, + feed_var_name=feed_var_name, + fetch_var_name=fetch_var_name, + scope=scope, + return_numpy=return_numpy, + use_program_cache=use_program_cache, + use_prune=use_prune, + return_merged=return_merged, + ) core.update_autotune_status() return res except Exception as e: six.reraise(*sys.exc_info()) - def _run_impl(self, program, feed, fetch_list, feed_var_name, - fetch_var_name, scope, return_numpy, use_program_cache, - return_merged, use_prune): + def _run_impl( + self, + program, + feed, + fetch_list, + feed_var_name, + fetch_var_name, + scope, + return_numpy, + use_program_cache, + return_merged, + use_prune, + ): if self._closed: raise RuntimeError("Attempted to use a closed Executor") @@ -1473,17 +1633,20 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name, program=program, feed=feed, fetch_list=fetch_list, - with_standalone_executor=self. 
- _fleet_executor_with_standalone) + with_standalone_executor=self._fleet_executor_with_standalone, + return_numpy=return_numpy, + ) if "startup_program" in program._pipeline_opt: program = program._pipeline_opt["startup_program"] else: - return self._run_pipeline(program, - fetch_list=fetch_list, - use_program_cache=use_program_cache) + return self._run_pipeline( + program, + fetch_list=fetch_list, + use_program_cache=use_program_cache, + ) if isinstance(program, Program) and program._heter_pipeline_opt: - #print("program._heter_pipeline_opt: {}".format( + # print("program._heter_pipeline_opt: {}".format( # program._heter_pipeline_opt)) ## change default executor heter_place = program._heter_pipeline_opt["heter_place"] @@ -1493,20 +1656,26 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name, self._default_executor = core.Executor(p) # TODO(zhangminxu): support heterps pipeline training using exe.run if "startup_program" in program._heter_pipeline_opt: - #print("get startup_program from _pipeline_opt") + # print("get startup_program from _pipeline_opt") program = program._heter_pipeline_opt["startup_program"] - if isinstance(program, Program) and \ - len(program.global_block().ops) == 0: + if ( + isinstance(program, Program) + and len(program.global_block().ops) == 0 + ): if use_default_main_program: - error_info = "Now you are using default_main_program, "\ - "but there are no operators in the program to be executed. "\ - "Please ensure you create model correctly or you can pass "\ + error_info = ( + "Now you are using default_main_program, " + "but there are no operators in the program to be executed. " + "Please ensure you create model correctly or you can pass " "the Program or the CompiledProgram manually." + ) else: - error_info = "There are no operators in the program to be executed. "\ - "If you pass Program manually, please use fluid.program_guard "\ + error_info = ( + "There are no operators in the program to be executed. " + "If you pass Program manually, please use fluid.program_guard " "to ensure the current Program is being used." + ) warnings.warn(error_info) if scope is None: @@ -1516,27 +1685,36 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name, _origin_fetch_list = fetch_list _origin_program = program fetch_list, optimize_ops = self._split_optimize_ops_in_fetch_list( - fetch_list) + fetch_list + ) if optimize_ops: use_prune = True if use_prune: - cache_key = _get_strong_program_cache_key(program, feed, - _origin_fetch_list) + cache_key = _get_strong_program_cache_key( + program, feed, _origin_fetch_list + ) cached_pruned_program = self._get_pruned_program_cache(cache_key) if cached_pruned_program is None: if isinstance(program, compiler.CompiledProgram): program_scope_cache = self._get_pruned_program_scope_cache( - str(id(_origin_program))) + str(id(_origin_program)) + ) # copy the original program, so it can be cached. program = copy.copy(program) # share the local scopes for same original CompiledProgram. 
program._share_vars_from = program_scope_cache - if self._get_pruned_program_scope_cache( - str(id(_origin_program))) is None: + if ( + self._get_pruned_program_scope_cache( + str(id(_origin_program)) + ) + is None + ): self._add_pruned_program_scope_cache( - str(id(_origin_program)), program) - pruned_program = self._prune_program(program, feed, fetch_list, - optimize_ops) + str(id(_origin_program)), program + ) + pruned_program = self._prune_program( + program, feed, fetch_list, optimize_ops + ) self._add_pruned_program_cache(cache_key, pruned_program) else: pruned_program = cached_pruned_program @@ -1546,69 +1724,93 @@ def _run_impl(self, program, feed, fetch_list, feed_var_name, def _can_use_interpreter_core(program, place): if core.is_compiled_with_mlu() or isinstance( - place, core.CustomPlace): + place, core.CustomPlace + ): return False use_standalone_executor_for_compiled_program = os.environ.get( - 'FLAGS_CONVERT_GRAPH_TO_PROGRAM', - None) in [1, '1', True, 'True', 'true'] + 'FLAGS_CONVERT_GRAPH_TO_PROGRAM', None + ) in [1, '1', True, 'True', 'true'] # Only support fleet when 'FLAGS_CONVERT_GRAPH_TO_PROGRAM' is set to true from paddle.distributed.fleet import fleet - if fleet._role_maker is not None and not use_standalone_executor_for_compiled_program: - warnings.warn("Standalone executor is not used for fleet", - UserWarning) + + if ( + fleet._role_maker is not None + and not use_standalone_executor_for_compiled_program + ): + warnings.warn( + "Standalone executor is not used for fleet", UserWarning + ) return False - compiled = isinstance(program, - compiler.CompiledProgram) or isinstance( - program._graph, compiler.CompiledProgram) + compiled = isinstance( + program, compiler.CompiledProgram + ) or isinstance(program._graph, compiler.CompiledProgram) if compiled: - compiled_program = program if isinstance( - program, compiler.CompiledProgram) else program._graph - # Unsupported case 1 : the CompiledProgram is constructed by Graph - if compiled_program._program is None: - warnings.warn("Standalone executor is not used for Graph", - UserWarning) - return False - - # Unsupported case 2: data parallel - if compiled_program._is_data_parallel and len( + compiled_program = ( + program + if isinstance(program, compiler.CompiledProgram) + else program._graph + ) + # Unsupported case 1: data parallel + if ( + compiled_program._is_data_parallel + and len( compiled_program._get_places( - place, compiled_program._places)) != 1: + place, compiled_program._places + ) + ) + != 1 + ): warnings.warn( "Standalone executor is not used for data parallel", - UserWarning) + UserWarning, + ) return False - # Unsupported case 3 : parallel graph + # Unsupported case 2: parallel graph if core.globals()['FLAGS_enable_parallel_graph'] in [ - 1, '1', True, 'True', 'true' + 1, + '1', + True, + 'True', + 'true', ]: warnings.warn( "Standalone executor is not used for parallel graph", - UserWarning) + UserWarning, + ) return False - # Unsupported case 4: inference + # Unsupported case 3: inference if compiled_program._is_inference: warnings.warn( "Standalone executor is not used for inference", - UserWarning) + UserWarning, + ) return False - # Unsupported case 5: CUDA Graph - if compiled_program._build_strategy is not None and compiled_program._build_strategy.allow_cuda_graph_capture: + # Unsupported case 4: CUDA Graph + if ( + compiled_program._build_strategy is not None + and compiled_program._build_strategy.allow_cuda_graph_capture + ): warnings.warn( "Standalone executor is not used for CUDA Graph", - 
UserWarning) + UserWarning, + ) return False - # Unsupported case 6: async mode - if compiled_program._build_strategy is not None and compiled_program._build_strategy.async_mode: + # Unsupported case 5: async mode + if ( + compiled_program._build_strategy is not None + and compiled_program._build_strategy.async_mode + ): warnings.warn( "Standalone executor is not used for async mode", - UserWarning) + UserWarning, + ) return False return use_standalone_executor_for_compiled_program @@ -1618,8 +1820,11 @@ def _can_use_interpreter_core(program, place): # NOTE: This is an experimental feature. If `export FLAGS_USE_STANDALONE_EXECUTOR=1 `, # use StandaloneExecutor to run the program. - if return_merged and self._enable_interpreter_core and _can_use_interpreter_core( - program, self.place): + if ( + return_merged + and self._enable_interpreter_core + and _can_use_interpreter_core(program, self.place) + ): if feed is None: feed = {} @@ -1629,18 +1834,27 @@ def _can_use_interpreter_core(program, place): if not isinstance(feed, dict): raise TypeError( "feed requires dict as its Parameter. But you passed in %s" - % (type(feed))) + % (type(feed)) + ) feed = self._update_feed(program, feed) program, new_exe = self._executor_cache.get_program_and_executor( - program, feed, fetch_list, feed_var_name, fetch_var_name, - self.place, scope) + program, + feed, + fetch_list, + feed_var_name, + fetch_var_name, + self.place, + scope, + ) self._feed_data(program, feed, feed_var_name, scope) if hasattr(program, 'lr_sheduler'): from paddle.optimizer.lr import LRScheduler - assert isinstance(program.lr_sheduler, - LRScheduler), "must be LRScheduler" + + assert isinstance( + program.lr_sheduler, LRScheduler + ), "must be LRScheduler" lr_sheduler = program.lr_sheduler lr_value = lr_sheduler() lr_var = program.global_block().vars[lr_sheduler._var_name] @@ -1654,13 +1868,10 @@ def _can_use_interpreter_core(program, place): else: tensor._copy_from(cpu_tensor, self.place) - warnings.warn( - "FLAGS_USE_STANDALONE_EXECUTOR is set to 1. New executor is used to execute Program." + return new_exe.run( + scope, list(feed.keys()), fetch_list, return_numpy ) - return new_exe.run(scope, list(feed.keys()), fetch_list, - return_numpy) - compiled = isinstance(program, compiler.CompiledProgram) # Check if fluid.data() variable no feed data @@ -1674,13 +1885,15 @@ def _can_use_interpreter_core(program, place): varobj = global_block.vars[varname] # Can not check var build by fluid.layers.data(), bucause fluid.layers.data() had not set need_check_feed - if vardesc.persistable() == False and \ - vardesc.type() == core.VarDesc.VarType.LOD_TENSOR and \ - vardesc.need_check_feed() == True and \ - varobj.stop_gradient == True and \ - varobj.is_data == True and \ - varobj.belong_to_optimizer == False and \ - varname not in feed: + if ( + vardesc.persistable() == False + and vardesc.type() == core.VarDesc.VarType.LOD_TENSOR + and vardesc.need_check_feed() == True + and varobj.stop_gradient == True + and varobj.is_data == True + and varobj.belong_to_optimizer == False + and varname not in feed + ): raise ValueError('Need feed data for variable %s' % varname) acp._auto_checkpoint(self, program) @@ -1688,46 +1901,63 @@ def _can_use_interpreter_core(program, place): # For backward compatibility, run directly. 
if not compiled: # In distributed training, the compiled program is saved in Program._graph - has_compiled_graph = isinstance(program._graph, - compiler.CompiledProgram) + has_compiled_graph = isinstance( + program._graph, compiler.CompiledProgram + ) if has_compiled_graph: program._graph._compile(scope, self.place) # _graph in program does not support inference since the _graph is optimized # through optimizer.minimize function and should not be used as inference graph # assert not program._graph._is_inference - return self._run_parallel(program._graph, - scope=scope, - feed=feed, - fetch_list=fetch_list, - fetch_var_name=fetch_var_name, - return_numpy=return_numpy, - return_merged=return_merged) - - return self._run_program(program, - feed=feed, - fetch_list=fetch_list, - feed_var_name=feed_var_name, - fetch_var_name=fetch_var_name, - scope=scope, - return_numpy=return_numpy, - use_program_cache=use_program_cache) + return self._run_parallel( + program._graph, + scope=scope, + feed=feed, + fetch_list=fetch_list, + fetch_var_name=fetch_var_name, + return_numpy=return_numpy, + return_merged=return_merged, + ) + + return self._run_program( + program, + feed=feed, + fetch_list=fetch_list, + feed_var_name=feed_var_name, + fetch_var_name=fetch_var_name, + scope=scope, + return_numpy=return_numpy, + use_program_cache=use_program_cache, + ) program._compile(scope, self.place) if program._is_inference: return self._run_inference(program._executor, feed) else: - return self._run_parallel(program, - scope=scope, - feed=feed, - fetch_list=fetch_list, - fetch_var_name=fetch_var_name, - return_numpy=return_numpy, - return_merged=return_merged) - - def _run_program(self, program, feed, fetch_list, feed_var_name, - fetch_var_name, scope, return_numpy, use_program_cache): + return self._run_parallel( + program, + scope=scope, + feed=feed, + fetch_list=fetch_list, + fetch_var_name=fetch_var_name, + return_numpy=return_numpy, + return_merged=return_merged, + ) + + def _run_program( + self, + program, + feed, + fetch_list, + feed_var_name, + fetch_var_name, + scope, + return_numpy, + use_program_cache, + ): from paddle.optimizer.lr import LRScheduler + if feed is None: feed = {} elif isinstance(feed, (list, tuple)): @@ -1736,19 +1966,22 @@ def _run_program(self, program, feed, fetch_list, feed_var_name, if not isinstance(feed, dict): raise TypeError( - "feed requires dict as its Parameter. But you passed in %s" % - (type(feed))) + "feed requires dict as its Parameter. But you passed in %s" + % (type(feed)) + ) assert program is not None, "The program should not be Empty" if not isinstance(program, Program): raise TypeError( "Executor requires Program as its Parameter. But you passed in %s" - % (type(program))) + % (type(program)) + ) if not isinstance(fetch_var_name, str): raise TypeError( "The name of fetch variable requires string as its Parameter. 
But you passed in %s" - % (type(fetch_var_name))) + % (type(fetch_var_name)) + ) if use_program_cache: cache_key = _get_strong_program_cache_key(program, feed, fetch_list) @@ -1761,35 +1994,41 @@ def _run_program(self, program, feed, fetch_list, feed_var_name, feed=feed, fetch_list=fetch_list, feed_var_name=feed_var_name, - fetch_var_name=fetch_var_name) + fetch_var_name=fetch_var_name, + ) self._add_program_cache(cache_key, cached_program) fetch_list_str = list(map(_to_name_str, fetch_list)) cached_ctx = self._default_executor.prepare( - cached_program.desc, 0, fetch_list_str, False) + cached_program.desc, 0, fetch_list_str, False + ) # currently, we cache program, vars, sub_scope here # we suppose that in a life cycle of training, a user # will not create many programs. So, here the basic # rule of caching is to cache all unseen (program, var, scope) # when a user use use_program_cache. cached_scope = scope.new_scope() - self._default_executor.create_variables(cached_program.desc, - cached_scope, 0) + self._default_executor.create_variables( + cached_program.desc, cached_scope, 0 + ) self._add_ctx_cache(cache_key, cached_ctx) self._add_scope_cache(cache_key, cached_scope) program = cached_program ctx = cached_ctx scope = cached_scope else: - program = _add_feed_fetch_ops(program=program, - feed=feed, - fetch_list=fetch_list, - feed_var_name=feed_var_name, - fetch_var_name=fetch_var_name) + program = _add_feed_fetch_ops( + program=program, + feed=feed, + fetch_list=fetch_list, + feed_var_name=feed_var_name, + fetch_var_name=fetch_var_name, + ) self._feed_data(program, feed, feed_var_name, scope) if hasattr(program, 'lr_sheduler'): - assert isinstance(program.lr_sheduler, - LRScheduler), "must be LRScheduler" + assert isinstance( + program.lr_sheduler, LRScheduler + ), "must be LRScheduler" lr_sheduler = program.lr_sheduler lr_value = lr_sheduler() lr_var = program.global_block().vars[lr_sheduler._var_name] @@ -1798,11 +2037,13 @@ def _run_program(self, program, feed, fetch_list, feed_var_name, tensor.set(data, self.place) if not use_program_cache: - self._default_executor.run(program.desc, scope, 0, True, True, - [fetch_var_name]) + self._default_executor.run( + program.desc, scope, 0, True, True, [fetch_var_name] + ) else: - self._default_executor.run_prepared_ctx(ctx, scope, False, False, - False) + self._default_executor.run_prepared_ctx( + ctx, scope, False, False, False + ) arr = scope.find_var(fetch_var_name).get_fetch_list() tensors = arr._move_to_list() if return_numpy: @@ -1814,17 +2055,21 @@ def _run_inference(self, exe, feed): return exe.run(feed) def _check_fetch_list(self, fetch_list): - is_fetch_var = lambda var: isinstance(var, - (Variable, str, six.string_types)) + is_fetch_var = lambda var: isinstance( + var, (Variable, str, six.string_types) + ) is_tuple_list = lambda var: isinstance(var, (tuple, list)) - if fetch_list is None: return [] - if is_fetch_var(fetch_list): return [fetch_list] + if fetch_list is None: + return [] + if is_fetch_var(fetch_list): + return [fetch_list] - assert is_tuple_list(fetch_list), \ - "Currently , The fetch_list type only should be list or tuple, \n"\ - "but the input type is {}. For more information please refer to \n"\ + assert is_tuple_list(fetch_list), ( + "Currently , The fetch_list type only should be list or tuple, \n" + "but the input type is {}. 
For more information please refer to \n" "the executor.run(...).".format(type(fetch_list)) + ) res = [] for i, var in enumerate(fetch_list): @@ -1838,9 +2083,10 @@ def _check_fetch_list(self, fetch_list): res.append(var) else: raise TypeError( - "Require fetch_list[{}] 's type shall be one of (Variable, str), but received {}." - .format(i, - type(var).__name__)) + "Require fetch_list[{}] 's type shall be one of (Variable, str), but received {}.".format( + i, type(var).__name__ + ) + ) return res @@ -1857,25 +2103,30 @@ def _adjust_pipeline_resource(self, pipeline_opt, dataset, pipeline_num): pipeline_num = filelist_length print( "Pipeline training: setting the pipeline num to %d is enough because there are only %d files" - % (filelist_length, filelist_length)) + % (filelist_length, filelist_length) + ) if filelist_length < pipeline_num * pipeline_opt["concurrency_list"][0]: print( "Pipeline training: setting the 1st element in concurrency_list to %d is enough because there are only %d files" - % (filelist_length // pipeline_num, filelist_length)) - pipeline_opt["concurrency_list"][ - 0] = filelist_length // pipeline_num + % (filelist_length // pipeline_num, filelist_length) + ) + pipeline_opt["concurrency_list"][0] = ( + filelist_length // pipeline_num + ) dataset.set_thread(pipeline_opt["concurrency_list"][0] * pipeline_num) return pipeline_num - def _prepare_trainer(self, - program=None, - dataset=None, - scope=None, - thread=0, - debug=False, - fetch_list=None, - fetch_info=None, - print_period=100): + def _prepare_trainer( + self, + program=None, + dataset=None, + scope=None, + thread=0, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100, + ): is_heter = 0 use_ps_gpu = 0 if not program._fleet_opt is None: @@ -1896,16 +2147,19 @@ def _prepare_trainer(self, if is_heter: from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil + fu = FleetUtil() ret = fu.split_program_by_device(program) if not compiled: # TODO: Need a better way to distinguish and specify different execution mode if program._pipeline_opt: trainer = TrainerFactory()._create_trainer( - program._pipeline_opt) + program._pipeline_opt + ) elif program._heter_pipeline_opt: trainer = TrainerFactory()._create_trainer( - program._heter_pipeline_opt) + program._heter_pipeline_opt + ) else: trainer = TrainerFactory()._create_trainer(program._fleet_opt) trainer._set_thread_barrier(program._is_distributed) @@ -1915,13 +2169,16 @@ def _prepare_trainer(self, else: if program._pipeline_opt: trainer = TrainerFactory()._create_trainer( - program.program._pipeline_opt) + program.program._pipeline_opt + ) elif program._heter_pipeline_opt: trainer = TrainerFactory()._create_trainer( - program.program._heter_pipeline_opt) + program.program._heter_pipeline_opt + ) else: trainer = TrainerFactory()._create_trainer( - program.program._fleet_opt) + program.program._fleet_opt + ) trainer._set_program(program.program) if thread <= 0: @@ -1930,7 +2187,8 @@ def _prepare_trainer(self, elif dataset.thread_num <= 0: raise RuntimeError( "You should set thread num first, either in Dataset" - "or in Executor.train_from_dataset") + "or in Executor.train_from_dataset" + ) else: trainer._set_thread(dataset.thread_num) else: @@ -1940,19 +2198,22 @@ def _prepare_trainer(self, trainer._set_fetch_var_and_info(fetch_list, fetch_info, print_period) return scope, trainer - def _run_from_dataset(self, - program=None, - dataset=None, - scope=None, - thread=0, - 
is_infer=False, - debug=False, - fetch_list=None, - fetch_info=None, - print_period=100, - fetch_handler=None): + def _run_from_dataset( + self, + program=None, + dataset=None, + scope=None, + thread=0, + is_infer=False, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100, + fetch_handler=None, + ): if program._pipeline_opt is not None: import paddle + if dataset is not None: raise RuntimeError("dataset should be None for pipeline mode") # The following fake dataset is created to call @@ -1963,24 +2224,28 @@ def _run_from_dataset(self, data_vars.append(var) if core.is_compiled_with_npu(): dataset = paddle.fluid.DatasetFactory().create_dataset( - 'InMemoryDataset') + 'InMemoryDataset' + ) else: dataset = paddle.fluid.DatasetFactory().create_dataset( - 'FileInstantDataset') + 'FileInstantDataset' + ) dataset.set_batch_size(1) dataset.set_thread(1) dataset.set_filelist(['None']) dataset.set_use_var(data_vars) elif program._heter_pipeline_opt is not None: stage_id = program._heter_pipeline_opt["pipeline_stage"] - #print("test_fl_stage_id: {}".format(stage_id)) + # print("test_fl_stage_id: {}".format(stage_id)) heter_place = program._heter_pipeline_opt["heter_place"] if stage_id != 0: if "is_fl_mode" not in program._heter_pipeline_opt: import paddle + if dataset is not None: raise RuntimeError( - "dataset should be None for heter pipeline mode") + "dataset should be None for heter pipeline mode" + ) # The following fake dataset is created to call # the _prepare_trainer api, and it is meaningless. data_vars = [] @@ -1988,7 +2253,8 @@ def _run_from_dataset(self, if var.is_data: data_vars.append(var) dataset = paddle.fluid.DatasetFactory().create_dataset( - 'InMemoryDataset') + 'InMemoryDataset' + ) dataset.set_batch_size(1) dataset.set_thread(1) dataset.set_filelist(['None']) @@ -1996,7 +2262,8 @@ def _run_from_dataset(self, else: if dataset is None: raise RuntimeError( - "dataset is need and should be initialized") + "dataset is need and should be initialized" + ) ## change default executor heter_place = framework._get_paddle_place(heter_place) p = core.Place() @@ -2023,7 +2290,8 @@ def _run_from_dataset(self, feed=[], fetch_list=real_fetch_list, feed_var_name='feed', - fetch_var_name='fetch') + fetch_var_name='fetch', + ) main_block = program._pipeline_opt["section_program"].block(0) for op in main_block.ops: # set the op_role of fetch op to Optimize to avoid @@ -2031,16 +2299,19 @@ def _run_from_dataset(self, if op.type == 'fetch': op._set_attr( 'op_role', - core.op_proto_and_checker_maker.OpRole.Optimize) + core.op_proto_and_checker_maker.OpRole.Optimize, + ) fetch_list = None - scope, trainer = self._prepare_trainer(program=program, - dataset=dataset, - scope=scope, - thread=thread, - debug=debug, - fetch_list=fetch_list, - fetch_info=fetch_info, - print_period=print_period) + scope, trainer = self._prepare_trainer( + program=program, + dataset=dataset, + scope=scope, + thread=thread, + debug=debug, + fetch_list=fetch_list, + fetch_info=fetch_info, + print_period=print_period, + ) trainer._set_infer(is_infer) trainer._gen_trainer_desc() @@ -2055,8 +2326,11 @@ def _run_from_dataset(self, dataset._dynamic_adjust_before_train(trainer.proto_desc.thread_num) if program._heter_pipeline_opt is None: - trainer_instance = self._default_executor.init_for_dataset( # -->InitForDataset - program.desc, trainer._desc(), scope, dataset.dataset) + trainer_instance = ( + self._default_executor.init_for_dataset( # -->InitForDataset + program.desc, trainer._desc(), scope, dataset.dataset + ) 
+ ) else: # cache trainer instance for heterps pipeline training if fetch_list == None: @@ -2065,8 +2339,9 @@ def _run_from_dataset(self, trainer_instance = self._get_trainer_cache(cache_key) if trainer_instance is None: trainer_instance = self._default_executor.init_for_dataset( - program.desc, trainer._desc(), scope, dataset.dataset) - #print("test_fl_ps - trainer_desc: {}\n".format(trainer)) + program.desc, trainer._desc(), scope, dataset.dataset + ) + # print("test_fl_ps - trainer_desc: {}\n".format(trainer)) self._add_trainer_cache(cache_key, trainer_instance) else: trainer_instance.ResetDataset(dataset.dataset) @@ -2093,18 +2368,20 @@ def _run_from_dataset(self, return None - def _prepare_pipeline_ctx(self, - program=None, - dataset=None, - scope=None, - thread=0, - is_infer=False, - debug=False, - fetch_list=None, - fetch_info=None, - print_period=100, - fetch_handler=None, - use_program_cache=False): + def _prepare_pipeline_ctx( + self, + program=None, + dataset=None, + scope=None, + thread=0, + is_infer=False, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100, + fetch_handler=None, + use_program_cache=False, + ): assert program._pipeline_opt is not None assert dataset is None, "dataset should be None for pipeline mode" @@ -2124,10 +2401,12 @@ def _get_dataset(): data_vars.append(var) if core.is_compiled_with_npu(): dataset = paddle.fluid.DatasetFactory().create_dataset( - 'InMemoryDataset') + 'InMemoryDataset' + ) else: dataset = paddle.fluid.DatasetFactory().create_dataset( - 'FileInstantDataset') + 'FileInstantDataset' + ) dataset.set_batch_size(1) dataset.set_thread(1) dataset.set_filelist(['None']) @@ -2148,11 +2427,13 @@ def _get_real_program_fetch_list(): if fetch_var_name in real_program.global_block().vars: real_fetch_list.append(fetch_var) - real_program = _add_feed_fetch_ops(program=real_program, - feed=[], - fetch_list=real_fetch_list, - feed_var_name='feed', - fetch_var_name='fetch') + real_program = _add_feed_fetch_ops( + program=real_program, + feed=[], + fetch_list=real_fetch_list, + feed_var_name='feed', + fetch_var_name='fetch', + ) main_block = real_program.block(0) for op in main_block.ops: # set the op_role of fetch op to Optimize to avoid @@ -2160,7 +2441,8 @@ def _get_real_program_fetch_list(): if op.type == 'fetch': op._set_attr( 'op_role', - core.op_proto_and_checker_maker.OpRole.Optimize) + core.op_proto_and_checker_maker.OpRole.Optimize, + ) return real_program, real_fetch_list real_program, real_fetch_list = _get_real_program_fetch_list() @@ -2168,14 +2450,16 @@ def _get_real_program_fetch_list(): program._pipeline_opt["section_program"] = real_program fetch_list = None - scope, trainer = self._prepare_trainer(program=program, - dataset=dataset, - scope=scope, - thread=thread, - debug=debug, - fetch_list=fetch_list, - fetch_info=fetch_info, - print_period=print_period) + scope, trainer = self._prepare_trainer( + program=program, + dataset=dataset, + scope=scope, + thread=thread, + debug=debug, + fetch_list=fetch_list, + fetch_info=fetch_info, + print_period=print_period, + ) trainer._set_infer(is_infer) trainer._gen_trainer_desc() @@ -2190,93 +2474,148 @@ def _get_real_program_fetch_list(): trainer_desc = trainer._desc() # slow, cache trainer_instance = self._default_executor.init_for_dataset( - program.desc, trainer_desc, scope, dataset.dataset) + program.desc, trainer_desc, scope, dataset.dataset + ) ctx = [scope, real_fetch_list, trainer_instance] - if use_program_cache: self._add_ctx_cache(cache_key, ctx) + if use_program_cache: 
+ self._add_ctx_cache(cache_key, ctx) return ctx - def _prepare_fleet_executor_carrier(self, - carrier_id="", - program=None, - scope=None, - fleet_opt=None, - with_standalone_executor=False): - num_micro_batches = fleet_opt[ - "num_micro_batches"] if "num_micro_batches" in fleet_opt else 1 + def _prepare_fleet_executor_carrier( + self, + carrier_id="", + program=None, + scope=None, + fleet_opt=None, + micro_scope_list=[], + with_standalone_executor=False, + ): + num_micro_batches = ( + fleet_opt["num_micro_batches"] + if "num_micro_batches" in fleet_opt + else 1 + ) cur_rank = int(os.getenv("PADDLE_TRAINER_ID", 0)) trainer_endpoints = os.getenv("PADDLE_TRAINER_ENDPOINTS", "").split(',') nrank = len(trainer_endpoints) - assert 'scheduler' in fleet_opt or 'tasks' in fleet_opt, \ - "Fleet executor need configuration for scheduler, you can choose from 1F1B or Origin. " \ + assert 'scheduler' in fleet_opt or 'tasks' in fleet_opt, ( + "Fleet executor need configuration for scheduler, you can choose from 1F1B or Origin. " "Or you can provide a list of task nodes to init fleet executor directly." + ) if 'tasks' in fleet_opt: - assert 'task_id_to_rank' in fleet_opt, "If you provide tasks to init fleet executor," \ - " task_id_to_rank should also be provided." + assert 'task_id_to_rank' in fleet_opt, ( + "If you provide tasks to init fleet executor," + " task_id_to_rank should also be provided." + ) print('fleet executor will use user defined task nodes') tasks = [task.task_node() for task in fleet_opt['tasks']] task_id_to_rank = fleet_opt['task_id_to_rank'] else: scheduler = fleet_opt['scheduler'] if scheduler == '1F1B': - from paddle.distributed.fleet.fleet_executor_utils import run1f1b - if "dist_strategy" not in fleet_opt or \ - "pp_degree" not in fleet_opt["dist_strategy"] or \ - fleet_opt["dist_strategy"]["pp_degree"] == 1: + from paddle.distributed.fleet.fleet_executor_utils import ( + run1f1b, + ) + + if ( + "dist_strategy" not in fleet_opt + or "pp_degree" not in fleet_opt["dist_strategy"] + or fleet_opt["dist_strategy"]["pp_degree"] == 1 + ): warnings.warn("Using 1F1B scheduler with pp_degree == 1.") tasks, task_id_to_rank = run1f1b( - program, cur_rank, fleet_opt.get('num_micro_batches', 1), - fleet_opt.get('dist_strategy', {}), nrank, - with_standalone_executor) + program, + cur_rank, + fleet_opt.get('num_micro_batches', 1), + fleet_opt.get('dist_strategy', {}), + nrank, + with_standalone_executor, + ) elif scheduler == 'Origin': from paddle.distributed.fleet.fleet_executor_utils import origin - if "dist_strategy" in fleet_opt and \ - "pp_degree" in fleet_opt["dist_strategy"]: - assert fleet_opt["dist_strategy"]["pp_degree"] == 1, \ - "For pipeline mode, the scheduler should be 1F1B instead of Origin." + + if ( + "dist_strategy" in fleet_opt + and "pp_degree" in fleet_opt["dist_strategy"] + ): + assert ( + fleet_opt["dist_strategy"]["pp_degree"] == 1 + ), "For pipeline mode, the scheduler should be 1F1B instead of Origin." if "num_micro_batches" in fleet_opt: - assert fleet_opt["num_micro_batches"] == 1, \ - "For origin scheduler mode, the num micro batches should be 1." + assert ( + fleet_opt["num_micro_batches"] == 1 + ), "For origin scheduler mode, the num micro batches should be 1." tasks, task_id_to_rank = origin(program, cur_rank) else: - raise "Fleet_executor only supports 1F1B and Origin scheduler, " \ - "but received " + str(scheduler) + "." + raise "Fleet_executor only supports 1F1B and Origin scheduler, " "but received " + str( + scheduler + ) + "." 
# NOTE: have to hold these vars, otherwise will be destructed fleet_opt['tasks'] = tasks fleet_opt['task_id_to_rank'] = task_id_to_rank place = core.Place() place.set_place(self.place) - # NOTE: the last argument is used to force create some vars in root scope, - # won't be used during train. - self._fleet_executor.init(carrier_id, program.desc, scope, place, - num_micro_batches, tasks, task_id_to_rank, []) - - def _run_using_fleet_executor(self, - program=None, - feed=None, - feed_var_name="feed", - fetch_var_name="fetch", - fetch_list=None, - with_standalone_executor=False): + + inference_root_scope_vars = ( + fleet_opt["fetch_var"] if "fetch_var" in fleet_opt else [] + ) + self._fleet_executor.init( + carrier_id, + program.desc, + scope, + place, + num_micro_batches, + tasks, + task_id_to_rank, + inference_root_scope_vars, + micro_scope_list, + ) + + def _run_using_fleet_executor( + self, + program=None, + feed=None, + feed_var_name="feed", + fetch_var_name="fetch", + fetch_list=None, + with_standalone_executor=False, + return_numpy=True, + ): cache_key = _get_strong_program_cache_key(program, feed, fetch_list) cached_program = self._get_program_cache(cache_key) cached_scope = self._get_scope_cache(cache_key) + micro_cached_scopes = self._get_micro_scopes_cache(cache_key) + fleet_opt = program._pipeline_opt["fleet_opt"] if cached_scope is None: cached_scope = global_scope() self._add_scope_cache(cache_key, cached_scope) + if micro_cached_scopes is None: + micro_cached_scopes = [] + if ( + "inference_generation" in fleet_opt + and fleet_opt["inference_generation"] + ): + for _ in range(int(fleet_opt["num_micro_batches"])): + micro_cached_scopes.append(cached_scope.new_scope()) + self._add_micro_scopes_cache(cache_key, micro_cached_scopes) if cached_program is None: - assert program._pipeline_opt, "program should have _pipeline_opt to start carrier" + assert ( + program._pipeline_opt + ), "program should have _pipeline_opt to start carrier" real_feed = [] if feed is None else feed real_program = program if "section_program" in program._pipeline_opt: real_program = program._pipeline_opt["section_program"] - cached_program = _add_feed_fetch_ops(program=real_program, - feed=real_feed, - fetch_list=fetch_list, - feed_var_name=feed_var_name, - fetch_var_name=fetch_var_name) + cached_program = _add_feed_fetch_ops( + program=real_program, + feed=real_feed, + fetch_list=fetch_list, + feed_var_name=feed_var_name, + fetch_var_name=fetch_var_name, + ) main_block = cached_program.block(0) for op in main_block.ops: # set the op_role of fetch op to Optimize to avoid @@ -2284,9 +2623,9 @@ def _run_using_fleet_executor(self, if op.type == 'fetch': op._set_attr( 'op_role', - core.op_proto_and_checker_maker.OpRole.Optimize) + core.op_proto_and_checker_maker.OpRole.Optimize, + ) self._add_program_cache(cache_key, cached_program) - fleet_opt = program._pipeline_opt["fleet_opt"] if 'tasks' in fleet_opt: # Insert feed/fetch op for cloned program in each task node, # these ops has already been inserted into the origin program. 
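The per-micro-batch scopes introduced in `_run_using_fleet_executor` keep generated results from different queries separate; pulled out as a standalone sketch (the function name is made up, and the executor's cache-key handling is omitted).

.. code-block:: python

    from paddle.fluid.executor import global_scope

    def build_micro_scopes(fleet_opt, root_scope=None):
        # One child scope per micro batch, so each generated query's results stay isolated.
        root_scope = root_scope or global_scope()
        micro_scopes = []
        if fleet_opt.get("inference_generation"):
            for _ in range(int(fleet_opt["num_micro_batches"])):
                micro_scopes.append(root_scope.new_scope())
        return micro_scopes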
@@ -2298,9 +2637,11 @@ def _run_using_fleet_executor(self, feed_task = fleet_opt['tasks'][0] print("Inserting feed ops for task", feed_task.task_id()) feed_program = feed_task.get_program() - feed_program = self._add_feed_ops(program=feed_program, - feed=real_feed, - feed_var_name=feed_var_name) + feed_program = self._add_feed_ops( + program=feed_program, + feed=real_feed, + feed_var_name=feed_var_name, + ) feed_task.set_program(feed_program) # Insert fetch ops @@ -2310,7 +2651,8 @@ def _run_using_fleet_executor(self, fetch_program = self._add_fetch_ops( program=fetch_program, fetch_list=fetch_list, - fetch_var_name=fetch_var_name) + fetch_var_name=fetch_var_name, + ) main_block = fetch_program.block(0) for op in main_block.ops: # set the op_role of fetch op to Optimize to avoid @@ -2318,7 +2660,8 @@ def _run_using_fleet_executor(self, if op.type == 'fetch': op._set_attr( 'op_role', - core.op_proto_and_checker_maker.OpRole.Optimize) + core.op_proto_and_checker_maker.OpRole.Optimize, + ) fetch_task.set_program(fetch_program) self._prepare_fleet_executor_carrier( @@ -2326,7 +2669,9 @@ def _run_using_fleet_executor(self, program=cached_program, scope=cached_scope, fleet_opt=fleet_opt, - with_standalone_executor=with_standalone_executor) + micro_scope_list=micro_cached_scopes, + with_standalone_executor=with_standalone_executor, + ) if feed: # NOTE: don't have to traverse programs in task nodes, @@ -2335,18 +2680,49 @@ def _run_using_fleet_executor(self, self._feed_data(cached_program, feed, feed_var_name, cached_scope) from paddle.optimizer.lr import LRScheduler + if hasattr(program, 'lr_sheduler'): lr_sheduler = program.lr_sheduler assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler" lr_value = lr_sheduler() lr_var = program.global_block().vars[lr_sheduler._var_name] data = np.array([lr_value]).astype(convert_dtype(lr_var.dtype)) - tensor = core.get_variable_tensor(cached_scope, - lr_sheduler._var_name) + tensor = core.get_variable_tensor( + cached_scope, lr_sheduler._var_name + ) tensor.set(data, self.place) self._fleet_executor.run(cache_key) + if "fetch_var" in fleet_opt: + # If we speed up the generation in evaluation, we need to generate + # multiple queries at the same time. Each query will in separate scope in order + # not mix up. It indicate that final result will in multiple scopes and need to + # fetch each. 
+ result_list = [] + for scope in micro_cached_scopes: + scope_result_list = [] + for varname in fleet_opt["fetch_var"]: + tensor = None + try: + tensor = core.get_variable_tensor(scope, varname) + if return_numpy: + tensor = as_numpy(tensor) + except: + var = scope.find_var(varname) + tensor = var.get_lod_tensor_array() + if return_numpy: + tensor = as_numpy(tensor) + else: + tensor = [t for t in tensor] + + if tensor: + scope_result_list.append(tensor) + + if scope_result_list: + result_list.append(scope_result_list) + return result_list + if fetch_list: arr = cached_scope.find_var(fetch_var_name).get_fetch_list() tensors = arr._move_to_list() @@ -2364,30 +2740,32 @@ def _add_feed_ops(self, program, feed, feed_var_name): feed_var = global_block.create_var( name=feed_var_name, type=core.VarDesc.VarType.FEED_MINIBATCH, - persistable=True) + persistable=True, + ) # prepend feed operators if not has_feed_operators(global_block, feed, feed_var_name): for i, name in enumerate(feed): if global_block.has_var(name): out = global_block.var(name) - global_block._prepend_op(type='feed', - inputs={'X': [feed_var]}, - outputs={'Out': [out]}, - attrs={'col': i}) + global_block._prepend_op( + type='feed', + inputs={'X': [feed_var]}, + outputs={'Out': [out]}, + attrs={'col': i}, + ) else: warnings.warn( "The variable %s is not found in program. It is not declared or is pruned." - % name) + % name + ) return tmp_program @classmethod - def _add_fetch_ops(cls, - program, - fetch_list, - fetch_var_name, - use_fetch_v2=False): + def _add_fetch_ops( + cls, program, fetch_list, fetch_var_name, use_fetch_v2=False + ): tmp_program = program.clone() global_block = tmp_program.global_block() @@ -2398,7 +2776,8 @@ def _add_fetch_ops(cls, fetch_var = global_block.create_var( name=fetch_var_name, type=core.VarDesc.VarType.FETCH_LIST, - persistable=True) + persistable=True, + ) if use_fetch_v2: fetch_op = 'fetch_v2' @@ -2406,17 +2785,19 @@ def _add_fetch_ops(cls, fetch_op = 'fetch' # append fetch_operators - if not has_fetch_operators(global_block, fetch_list, fetch_var_name, - fetch_op): + if not has_fetch_operators( + global_block, fetch_list, fetch_var_name, fetch_op + ): for i, var in enumerate(fetch_list): assert isinstance(var, Variable) or isinstance( - var, - six.string_types), ("Wrong type for fetch_list[%s]: %s" % - (i, type(var))) - global_block.append_op(type=fetch_op, - inputs={'X': [var]}, - outputs={'Out': [fetch_var]}, - attrs={'col': i}) + var, six.string_types + ), "Wrong type for fetch_list[%s]: %s" % (i, type(var)) + global_block.append_op( + type=fetch_op, + inputs={'X': [var]}, + outputs={'Out': [fetch_var]}, + attrs={'col': i}, + ) return tmp_program @@ -2431,25 +2812,36 @@ def _remove_fetch_ops(cls, program, fetch_op_name='fetch'): return tmp_program - def _run_pipeline(self, - program=None, - dataset=None, - scope=None, - thread=0, - is_infer=False, - debug=False, - fetch_list=None, - fetch_info=None, - print_period=100, - fetch_handler=None, - use_program_cache=False): - scope, real_fetch_list, trainer_instance = \ - self._prepare_pipeline_ctx(program, dataset, scope, thread, - is_infer, debug, fetch_list, fetch_info, - print_period, fetch_handler, - use_program_cache) + def _run_pipeline( + self, + program=None, + dataset=None, + scope=None, + thread=0, + is_infer=False, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100, + fetch_handler=None, + use_program_cache=False, + ): + scope, real_fetch_list, trainer_instance = self._prepare_pipeline_ctx( + program, + dataset, + 
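For reference, the learning-rate feeding used above in `_run_using_fleet_executor` (and again in `_run_pipeline` below) reads as a small standalone helper; this is a sketch, the function name is invented, and the `lr_sheduler` spelling follows the existing attribute name.

.. code-block:: python

    import numpy as np
    from paddle.fluid import core
    from paddle.fluid.data_feeder import convert_dtype
    from paddle.optimizer.lr import LRScheduler

    def feed_scheduled_lr(program, scope, place):
        # Push the scheduler's current value into the scope so the next run picks it up.
        if not hasattr(program, 'lr_sheduler'):
            return
        sched = program.lr_sheduler
        assert isinstance(sched, LRScheduler), "must be LRScheduler"
        lr_var = program.global_block().vars[sched._var_name]
        data = np.array([sched()]).astype(convert_dtype(lr_var.dtype))
        core.get_variable_tensor(scope, sched._var_name).set(data, place)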
scope, + thread, + is_infer, + debug, + fetch_list, + fetch_info, + print_period, + fetch_handler, + use_program_cache, + ) from paddle.optimizer.lr import LRScheduler + if hasattr(program, 'lr_sheduler'): lr_sheduler = program.lr_sheduler assert isinstance(lr_sheduler, LRScheduler), "must be LRScheduler" @@ -2471,16 +2863,18 @@ def _run_pipeline(self, return None - def infer_from_dataset(self, - program=None, - dataset=None, - scope=None, - thread=0, - debug=False, - fetch_list=None, - fetch_info=None, - print_period=100, - fetch_handler=None): + def infer_from_dataset( + self, + program=None, + dataset=None, + scope=None, + thread=0, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100, + fetch_handler=None, + ): """ Infer from a pre-defined Dataset. Dataset is defined in paddle.fluid.dataset. Given a program, either a program or compiled program, infer_from_dataset will @@ -2536,26 +2930,39 @@ def infer_from_dataset(self, dataset=dataset) """ - return self._run_from_dataset(program, dataset, scope, thread, True, - debug, fetch_list, fetch_info, - print_period, fetch_handler) - - def start_heter_trainer(self, - program=None, - scope=None, - debug=False, - fetch_list=None, - fetch_info=None, - print_period=100, - fetch_handler=None): - scope, trainer = self._prepare_trainer(program=program, - dataset=None, - scope=scope, - thread=1, - debug=debug, - fetch_list=fetch_list, - fetch_info=fetch_info, - print_period=print_period) + return self._run_from_dataset( + program, + dataset, + scope, + thread, + True, + debug, + fetch_list, + fetch_info, + print_period, + fetch_handler, + ) + + def start_heter_trainer( + self, + program=None, + scope=None, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100, + fetch_handler=None, + ): + scope, trainer = self._prepare_trainer( + program=program, + dataset=None, + scope=scope, + thread=1, + debug=debug, + fetch_list=fetch_list, + fetch_info=fetch_info, + print_period=print_period, + ) trainer._set_infer(False) trainer._gen_trainer_desc() @@ -2563,32 +2970,35 @@ def start_heter_trainer(self, self._dump_debug_info(program=program, trainer=trainer) trainer_instance = self._default_executor.init_for_dataset( - program.desc, trainer._desc(), scope, None) + program.desc, trainer._desc(), scope, None + ) - #if fetch_handler is not None: + # if fetch_handler is not None: # scope0 = trainer_instance.get_worker_scope(0) # fetch_monitor = FetchHandlerMonitor(scope0, fetch_handler) # fetch_monitor.start() # self._default_executor.run_from_dataset(trainer_instance) # fetch_monitor.stop() # self._default_executor.release_trainer(trainer_instance) - #else: + # else: self._default_executor.run_from_dataset(trainer_instance) - #self._default_executor.release_trainer(trainer_instance) + # self._default_executor.release_trainer(trainer_instance) return trainer_instance - def train_from_dataset(self, - program=None, - dataset=None, - scope=None, - thread=0, - debug=False, - fetch_list=None, - fetch_info=None, - print_period=100, - fetch_handler=None): + def train_from_dataset( + self, + program=None, + dataset=None, + scope=None, + thread=0, + debug=False, + fetch_list=None, + fetch_info=None, + print_period=100, + fetch_handler=None, + ): """ Train from a pre-defined Dataset. Dataset is defined in paddle.fluid.dataset. Given a program, either a program or compiled program, train_from_dataset will @@ -2610,7 +3020,7 @@ def train_from_dataset(self, for each run. 
default is global_scope thread(int): number of thread a user wants to run in this function. Default is 0, which means using thread num of dataset - debug(bool): whether a user wants to run train_from_dataset + debug(bool): whether a user wants to run train_from_dataset fetch_list(Tensor List): fetch Tensor list, each variable will be printed during training fetch_info(String List): print information for each Tensor, its length should be equal @@ -2620,9 +3030,9 @@ def train_from_dataset(self, Returns: None - + Examples: - + .. code-block:: python import paddle @@ -2643,6 +3053,15 @@ def train_from_dataset(self, dataset=dataset) """ - return self._run_from_dataset(program, dataset, scope, thread, False, - debug, fetch_list, fetch_info, - print_period, fetch_handler) + return self._run_from_dataset( + program, + dataset, + scope, + thread, + False, + debug, + fetch_list, + fetch_info, + print_period, + fetch_handler, + ) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index bf56b125fd7187..4fbcdc78536acf 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -75,7 +75,7 @@ CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() _dygraph_tracer_ = None -_in_eager_mode_ = (os.environ.get('FLAGS_enable_eager_mode', '1') == '1') +_in_eager_mode_ = os.environ.get('FLAGS_enable_eager_mode', '1') == '1' _global_expected_place_ = None _current_device = None global_prog_seed = 0 @@ -84,10 +84,12 @@ _already_patch_varbase = False _current_cuda_graph_mode = None _global_flags_ = core.globals() -_enable_standalone_executor_ = (os.environ.get('FLAGS_USE_STANDALONE_EXECUTOR', - None)) -_dy2st_enable_standalone_executor_ = (os.environ.get( - 'FLAGS_DY2ST_USE_STANDALONE_EXECUTOR', 0)) +_enable_standalone_executor_ = os.environ.get( + 'FLAGS_USE_STANDALONE_EXECUTOR', None +) +_dy2st_enable_standalone_executor_ = os.environ.get( + 'FLAGS_DY2ST_USE_STANDALONE_EXECUTOR', 0 +) # Some explanation of our execution system 2022.03 # For now we have 3 kinds of execution system, since we refactored dygraph mode to @@ -148,6 +150,7 @@ def _update_monkey_methods(is_eager): def _switch_tensor_bind_type(is_eager): import paddle + if is_eager: paddle.Tensor = core.eager.Tensor else: @@ -182,8 +185,12 @@ def _fallback_legacy_dygraph(): global _is_first_import_ need_fallback = False # Only enable eager on CPU/GPU - is_not_support = core.is_compiled_with_xpu() or core.is_compiled_with_npu( - ) or core.is_compiled_with_ipu() or core.is_compiled_with_mlu() + is_not_support = ( + core.is_compiled_with_xpu() + or core.is_compiled_with_npu() + or core.is_compiled_with_ipu() + or core.is_compiled_with_mlu() + ) if _in_eager_mode_ and is_not_support: # switch into legacy dygraph mode @@ -283,16 +290,16 @@ def ipu_shard_guard(index=-1, stage=-1): index(int, optional): Specify which ipu the Tensor is computed on, (such as '0, 1, 2, 3'). The default value is -1, which means the Op only run on IPU 0. stage(int, optional): Specify the computation order of the sharded model(such as '0, 1, 2, 3'). - The sharded model will be computed from small to large. The default value is -1, + The sharded model will be computed from small to large. 
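A compact end-to-end sketch of the dataset-based entry points whose signatures are reformatted above; the slot variables and the input file name are made up, and `InMemoryDataset` is the fluid dataset type already used elsewhere in this file.

.. code-block:: python

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    x = fluid.data(name='x', shape=[None, 10], dtype='float32')
    y = fluid.data(name='y', shape=[None, 1], dtype='int64')

    dataset = fluid.DatasetFactory().create_dataset('InMemoryDataset')
    dataset.set_use_var([x, y])
    dataset.set_batch_size(32)
    dataset.set_filelist(['train_data.txt'])   # hypothetical input file
    dataset.load_into_memory()

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    exe.train_from_dataset(program=fluid.default_main_program(), dataset=dataset)
    # same dataset, but no parameter updates
    exe.infer_from_dataset(program=fluid.default_main_program(), dataset=dataset)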
The default value is -1, which means no pipelining computation order and run Ops in terms of graph. - - **Note**: - Only if the enable_manual_shard=True, the 'index' is able to be set not -1. Please refer - to :code:`paddle.static.IpuStrategy` . - Only if the enable_pipelining=True, the 'stage' is able to be set not -1. Please refer - to :code:`paddle.static.IpuStrategy` . - A index is allowed to match none stage or a stage. A stage is only allowed to match a new or - duplicated index. + + Note: + Only if the enable_manual_shard=True, the 'index' is able to be set not -1. Please refer + to :ref:`api_paddle_static_IpuStrategy`. + Only if the enable_pipelining=True, the 'stage' is able to be set not -1. Please refer + to :ref:`api_paddle_static_IpuStrategy`. + A index is allowed to match none stage or a stage. A stage is only allowed to match a new or + duplicated index. Examples: .. code-block:: python @@ -331,18 +338,22 @@ def set_ipu_shard(call_func, index=-1, stage=-1): """ Shard the ipu with the given call function. Set every ops in call function to the given ipu sharding. + Note: + Only when enable_manual_shard=True to set the index to a value other than -1. please refer to :ref:`api_paddle_static_IpuStrategy` . + Only when enable_pipelining=True to set stage to a value other than -1. please refer to :ref:`api_paddle_static_IpuStrategy` . + An index supports a corresponding None stage or a stage, and a stage only supports a new index or a duplicate index. + Args: call_func(Layer|function): Specify the call function to be wrapped. index(int, optional): Specify which ipu the Tensor is computed on, (such as ‘0, 1, 2, 3’). The default value is -1, which means the Op only run on IPU 0. stage(int, optional): Specify the computation order of the sharded model(such as ‘0, 1, 2, 3’). - The sharded model will be computed from small to large. The default value is -1, + The sharded model will be computed from small to large. The default value is -1, which means no pipelining computation order and run Ops in terms of graph. Returns: The wrapped call function. - Examples: .. code-block:: python @@ -357,7 +368,6 @@ def set_ipu_shard(call_func, index=-1, stage=-1): """ def decorate(func): - def wrapper(*args, **kwargs): with ipu_shard_guard(index=index, stage=stage): return func(*args, **kwargs) @@ -365,16 +375,17 @@ def wrapper(*args, **kwargs): return wrapper from .dygraph.layers import Layer + if not isinstance(call_func, Layer): if callable(call_func): return decorate(call_func) else: raise TypeError( - "Unsupported type. Only accept paddle.nn.Layer or function.") + "Unsupported type. Only accept paddle.nn.Layer or function." + ) # patch paddle.nn.Layer class BlockFn(type(call_func)): - def __call__(self, *args, **kwargs): with ipu_shard_guard(index=index, stage=stage): return super().__call__(*args, **kwargs) @@ -386,62 +397,68 @@ def __call__(self, *args, **kwargs): def require_version(min_version, max_version=None): """ - Check if the installed version of PaddlePaddle is in [min_version, max_version], - if the installed version is lower than ``min_version`` or higher than ``max_version``, - an exception will be thrown, NO returns if the installed version is satisfied. + Check if the installed version of PaddlePaddle is in [min_version, max_version], + if the installed version is lower than ``min_version`` or higher than ``max_version``, + an exception will be thrown, NO returns if the installed version is satisfied. - Args: - min_version (str): the minimum version required (like '1.4.0'). 
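An illustrative use of the IPU sharding helpers documented above; a sketch only, assuming a PaddlePaddle build with IPU support and an `IpuStrategy` with `enable_manual_shard`/`enable_pipelining` turned on.

.. code-block:: python

    import paddle

    paddle.enable_static()
    a = paddle.static.data(name='data', shape=[None, 1], dtype='int32')
    with paddle.static.ipu_shard_guard(index=0, stage=0):
        b = a + 1    # ops placed on IPU 0, pipeline stage 0
    with paddle.static.ipu_shard_guard(index=1, stage=1):
        c = b + 1    # ops placed on IPU 1, pipeline stage 1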
- max_version (str, optional): the max version required (like '1.6.0'), default is None, - meaning any version equal or higher than ``min_version`` is acceptable. + Args: + min_version (str): the minimum version required (like '1.4.0'). + max_version (str, optional): the max version required (like '1.6.0'), default is None, + meaning any version equal or higher than ``min_version`` is acceptable. - Returns: - None. + Returns: + None. - Raises: - TypeError: if the type of ``min_version`` is not str. - TypeError: if the type of ``max_version`` is not str or type(None). - ValueError: if the value of ``min_version`` is not in version format. - ValueError: if the value of ``max_version`` is not in version format or None. - Exception: if the installed version is lower than ``min_version`` or higher than ``max_version``. + Raises: + TypeError: if the type of ``min_version`` is not str. + TypeError: if the type of ``max_version`` is not str or type(None). + ValueError: if the value of ``min_version`` is not in version format. + ValueError: if the value of ``max_version`` is not in version format or None. + Exception: if the installed version is lower than ``min_version`` or higher than ``max_version``. - Examples: - .. code-block:: python + Examples: + .. code-block:: python - import paddle.fluid as fluid + import paddle.fluid as fluid - # any version >= 0.1.0 is acceptable. - fluid.require_version('0.1.0') + # any version >= 0.1.0 is acceptable. + fluid.require_version('0.1.0') - # if 0.1.0 <= version <= 10.0.0, it is acceptable. - fluid.require_version(min_version='0.1.0', max_version='10.0.0') - """ + # if 0.1.0 <= version <= 10.0.0, it is acceptable. + fluid.require_version(min_version='0.1.0', max_version='10.0.0') + """ if not isinstance(min_version, str): raise TypeError( "The type of 'min_version' in require_version must be str, but received %s." - % (type(min_version))) + % (type(min_version)) + ) if not isinstance(max_version, (str, type(None))): raise TypeError( "The type of 'max_version' in require_version must be str or type(None), but received %s." - % (type(max_version))) + % (type(max_version)) + ) check_format = re.match(r'\d+(\.\d+){0,3}', min_version) if check_format is None or check_format.group() != min_version: raise ValueError( "The value of 'min_version' in require_version must be in format '\\d+(\\.\\d+){0,3}', " - "like '1.5.2.0', but received %s" % min_version) + "like '1.5.2.0', but received %s" % min_version + ) if max_version is not None: check_format = re.match(r'\d+(\.\d+){0,3}', max_version) if check_format is None or check_format.group() != max_version: raise ValueError( "The value of 'max_version' in require_version must be in format '\\d+(\\.\\d+){0,3}', " - "like '1.5.2.0', but received %s" % max_version) + "like '1.5.2.0', but received %s" % max_version + ) version_installed = [ - fluid_version.major, fluid_version.minor, fluid_version.patch, - fluid_version.rc + fluid_version.major, + fluid_version.minor, + fluid_version.patch, + fluid_version.rc, ] zero_version = ['0', '0', '0', '0'] @@ -458,64 +475,86 @@ def version_cmp(ver_a, ver_b): warnings.warn( "PaddlePaddle version in [%s, %s] required, but %s installed. " "Maybe you are using a develop version, " - "please make sure the version is good with your code." % - (min_version, max_version, fluid_version.full_version)) + "please make sure the version is good with your code." 
+ % (min_version, max_version, fluid_version.full_version) + ) else: warnings.warn( "PaddlePaddle version %s or higher is required, but %s installed, " "Maybe you are using a develop version, " - "please make sure the version is good with your code." % - (min_version, fluid_version.full_version)) + "please make sure the version is good with your code." + % (min_version, fluid_version.full_version) + ) return min_version_split = min_version.split('.') - min_version_to_check = min_version_split + zero_version[ - len(min_version_split):] + min_version_to_check = ( + min_version_split + zero_version[len(min_version_split) :] + ) if max_version is not None: max_version_split = max_version.split('.') - max_version_to_check = max_version_split + zero_version[ - len(max_version_split):] + max_version_to_check = ( + max_version_split + zero_version[len(max_version_split) :] + ) - if version_cmp(version_installed, - max_version_to_check) > 0 or version_cmp( - version_installed, min_version_to_check) < 0: + if ( + version_cmp(version_installed, max_version_to_check) > 0 + or version_cmp(version_installed, min_version_to_check) < 0 + ): raise Exception( "VersionError: PaddlePaddle version in [%s, %s] required, but %s installed." - % (min_version, max_version, fluid_version.full_version)) + % (min_version, max_version, fluid_version.full_version) + ) else: if version_cmp(version_installed, min_version_to_check) < 0: raise Exception( "VersionError: PaddlePaddle version %s or higher is required, but %s installed, " "please upgrade your PaddlePaddle to %s or other higher version." - % (min_version, fluid_version.full_version, min_version)) + % (min_version, fluid_version.full_version, min_version) + ) def _dygraph_not_support_(func): - def __impl__(*args, **kwargs): - assert not _non_static_mode( - ), "We don't support %s in dynamic graph mode" % func.__name__ + assert not _non_static_mode(), ( + "We don't support %s in dynamic graph mode" % func.__name__ + ) return func(*args, **kwargs) return __impl__ def _dygraph_only_(func): + def __impl__(*args, **kwargs): + assert _non_static_mode(), ( + "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." + % func.__name__ + ) + return func(*args, **kwargs) + return __impl__ + + +def _non_static_only_(func): def __impl__(*args, **kwargs): - assert _non_static_mode( - ), "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." % func.__name__ + from .dygraph.base import in_declarative_mode + + assert _non_static_mode() or in_declarative_mode(), ( + "We only support '%s()' in dynamic graph mode, please call 'paddle.disable_static()' to enter dynamic graph mode." + % func.__name__ + ) return func(*args, **kwargs) return __impl__ def _static_only_(func): - def __impl__(*args, **kwargs): - assert not _non_static_mode( - ), "In PaddlePaddle 2.x, we turn on dynamic graph mode by default, and '%s()' is only supported in static graph mode. So if you want to use this api, please call 'paddle.enable_static()' before this api to enter static graph mode." % func.__name__ + assert not _non_static_mode(), ( + "In PaddlePaddle 2.x, we turn on dynamic graph mode by default, and '%s()' is only supported in static graph mode. So if you want to use this api, please call 'paddle.enable_static()' before this api to enter static graph mode." 
+ % func.__name__ + ) return func(*args, **kwargs) return __impl__ @@ -534,14 +573,14 @@ def _set_pipeline_stage(stage): # TODO(zhiqiu): We should make VarBase consistent with Variable in future, for example, by inheritting # same base class. def _fake_interface_only_(func): - def __impl__(*args, **kwargs): raise AssertionError( "'%s' only can be called by `paddle.Tensor` in dynamic graph mode. Suggestions:\n" " 1. If you are in static graph mode, you can switch to dynamic graph mode by turning off `paddle.enable_static()` or calling `paddle.disable_static()`.\n" " 2. If you are using `@paddle.jit.to_static`, you can turn off ProgramTranslator by calling `paddle.jit.ProgramTranslator().enable(False)`. " "If you have to translate dynamic graph to static graph, please use other API to replace '%s'." - % (func.__name__, func.__name__)) + % (func.__name__, func.__name__) + ) return __impl__ @@ -552,13 +591,13 @@ def __impl__(*args, **kwargs): # NOTE(chenweihang): not using `wrap_decorator` here is because `wrap_decorator` will # move kwargs to args, which doesn't work in this decorate case def deprecate_stat_dict(func): - @functools.wraps(func) def wrapper(*args, **kwargs): if 'stat_dict' in kwargs: warnings.warn( "The argument `stat_dict` has deprecated, please change it to `state_dict`.", - DeprecationWarning) + DeprecationWarning, + ) kwargs['state_dict'] = kwargs['stat_dict'] kwargs.pop('stat_dict') return func(*args, **kwargs) @@ -570,6 +609,7 @@ def wrapper(*args, **kwargs): dygraph_only = wrap_decorator(_dygraph_only_) static_only = wrap_decorator(_static_only_) fake_interface_only = wrap_decorator(_fake_interface_only_) +non_static_only = wrap_decorator(_non_static_only_) def _dygraph_tracer(): @@ -639,12 +679,12 @@ def _set_expected_place(place): # TODO(zhiqiu): remove this function. def _var_base_to_np(var_base): - """ - convert VarBase tp numpy + """ + convert VarBase tp numpy - Args: - var_base(VarBase) : the VarBase to convert - Returns (np.ndarray): the np.ndarray contain the value of VarBase + Args: + var_base(VarBase) : the VarBase to convert + Returns (np.ndarray): the np.ndarray contain the value of VarBase """ warnings.warn( @@ -663,7 +703,9 @@ def _cpu_num(): 'And if this parameter are set as N (equal to the number of physical CPU core) the program may be faster.\n\n' 'export CPU_NUM={} # for example, set CPU_NUM as number of physical CPU core which is {}.\n\n' '!!! The default number of CPU_NUM=1.\n'.format( - multiprocessing.cpu_count(), multiprocessing.cpu_count())) + multiprocessing.cpu_count(), multiprocessing.cpu_count() + ) + ) os.environ['CPU_NUM'] = str(1) cpu_num = os.environ.get('CPU_NUM') return int(cpu_num) @@ -742,14 +784,14 @@ def disable_signal_handler(): Paddle installs signal handlers at C++ level to log debug information upon failing. However, conflicts can happen if another python module is making use of such signal. Such being the case, one may disblae paddle signal handler via this interface. - + Known frameworks that require disabling signal handler includes: 1. TVM 2. ADLIK Make sure you called paddle.disable_signal_handler() before using above mentioned frameworks. - Returns: None + Returns: None Examples: .. code-block:: python @@ -822,7 +864,7 @@ def cuda_places(device_ids=None): If :code:`device_ids` is not None, it should be the device ids of GPUs. For example, if :code:`device_ids=[0,1,2]`, - the returned list would be + the returned list would be [paddle.CUDAPlace(0), paddle.CUDAPlace(1), paddle.CUDAPlace(2)]. 
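The `stat_dict` to `state_dict` shim above (`deprecate_stat_dict`) is an instance of a small reusable pattern; a generic standalone sketch, not the Paddle helper itself.

.. code-block:: python

    import functools
    import warnings

    def deprecate_kwarg(old_name, new_name):
        # Accept the old keyword for backward compatibility, warn, and forward it under the new name.
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                if old_name in kwargs:
                    warnings.warn(
                        "The argument `%s` has been deprecated, please use `%s` instead."
                        % (old_name, new_name),
                        DeprecationWarning,
                    )
                    kwargs[new_name] = kwargs.pop(old_name)
                return func(*args, **kwargs)
            return wrapper
        return decorator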
Parameters: @@ -832,21 +874,20 @@ def cuda_places(device_ids=None): list of paddle.CUDAPlace: Created GPU place list. Examples: - + .. code-block:: python import paddle import paddle.static as static # required: gpu - + paddle.enable_static() cuda_places = static.cuda_places() """ - assert core.is_compiled_with_cuda(), \ - "Not compiled with CUDA" + assert core.is_compiled_with_cuda(), "Not compiled with CUDA" if device_ids is None: device_ids = _cuda_ids() elif not isinstance(device_ids, (list, tuple)): @@ -867,9 +908,9 @@ def xpu_places(device_ids=None): xpu places would be returned. If :code:`device_ids` is not None, it should be the device ids of XPUs. For example, if :code:`device_ids=[0,1,2]`, - the returned list would be + the returned list would be [paddle.XPUPlace(0), paddle.XPUPlace(1), paddle.XPUPlace(2)]. - + Parameters: device_ids (list or tuple of int, optional): list of XPU device ids. Returns: @@ -881,12 +922,11 @@ def xpu_places(device_ids=None): import paddle import paddle.static as static - + paddle.enable_static() xpu_places = static.xpu_places() """ - assert core.is_compiled_with_xpu(), \ - "Not compiled with XPU" + assert core.is_compiled_with_xpu(), "Not compiled with XPU" if device_ids is None: device_ids = _xpu_ids() elif not isinstance(device_ids, (list, tuple)): @@ -898,7 +938,7 @@ def npu_places(device_ids=None): """ **Note**: For multi-card tasks, please use `FLAGS_selected_npus` environment variable to set the visible NPU device. - + This function creates a list of :code:`paddle.NPUPlace` objects. If :code:`device_ids` is None, environment variable of :code:`FLAGS_selected_npus` would be checked first. For example, if @@ -908,9 +948,9 @@ def npu_places(device_ids=None): npu places would be returned. If :code:`device_ids` is not None, it should be the device ids of NPUs. For example, if :code:`device_ids=[0,1,2]`, - the returned list would be + the returned list would be [paddle.NPUPlace(0), paddle.NPUPlace(1), paddle.NPUPlace(2)]. - + Parameters: device_ids (list or tuple of int, optional): list of NPU device ids. Returns: @@ -922,12 +962,11 @@ def npu_places(device_ids=None): import paddle import paddle.static as static - + paddle.enable_static() npu_places = static.npu_places() """ - assert core.is_compiled_with_npu(), \ - "Not compiled with NPU" + assert core.is_compiled_with_npu(), "Not compiled with NPU" if device_ids is None: device_ids = _npu_ids() elif not isinstance(device_ids, (list, tuple)): @@ -940,7 +979,7 @@ def cpu_places(device_count=None): This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list. If :code:`device_count` is None, the device count would - be determined by environment variable :code:`CPU_NUM`. + be determined by environment variable :code:`CPU_NUM`. If :code:`CPU_NUM` is not set, the default value is 1, i.e. CPU_NUM=1. :code:`CPU_NUM` indicates the number of devices used in the current task. @@ -953,7 +992,7 @@ def cpu_places(device_count=None): list of paddle.CPUPlace: Created list of CPU places. Examples: - + .. code-block:: python import paddle @@ -974,7 +1013,7 @@ def cuda_pinned_places(device_count=None): This function creates a list of :code:`fluid.CUDAPinnedPlace` objects. If :code:`device_count` is None, the device count would - be determined by environment variable :code:`CPU_NUM`. + be determined by environment variable :code:`CPU_NUM`. If :code:`CPU_NUM` is not set, the default value is 1, i.e. CPU_NUM=1. :code:`CPU_NUM` indicates the number of devices used in the current task. 
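A usage sketch for the place helpers documented in this stretch of the diff; the GPU branch assumes a CUDA build, and `CPU_NUM` controls how many CPU places are returned (default 1).

.. code-block:: python

    import os
    import paddle
    import paddle.static as static

    paddle.enable_static()
    os.environ['CPU_NUM'] = '4'
    cpu_places = static.cpu_places()       # [CPUPlace()] * 4
    if paddle.is_compiled_with_cuda():
        gpu_places = static.cuda_places()  # honours FLAGS_selected_gpus when device_ids is None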
@@ -995,8 +1034,7 @@ def cuda_pinned_places(device_count=None): cuda_pinned_places = fluid.cuda_pinned_places(1) """ - assert core.is_compiled_with_cuda(), \ - "Not compiled with CUDA" + assert core.is_compiled_with_cuda(), "Not compiled with CUDA" if device_count is None: device_count = len(_cuda_ids()) return [core.CUDAPinnedPlace()] * device_count @@ -1004,19 +1042,20 @@ def cuda_pinned_places(device_count=None): def mlu_places(device_ids=None): """ - **Note**: + This function creates a list of :code:`paddle.device.MLUPlace` objects. + If :code:`device_ids` is None, environment variable of + :code:`FLAGS_selected_mlus` would be checked first. For example, if + :code:`FLAGS_selected_mlus=0,1,2`, the returned list would + be [paddle.device.MLUPlace(0), paddle.device.MLUPlace(1), paddle.device.MLUPlace(2)]. + If :code:`FLAGS_selected_mlus` is not set, all visible + mlu places would be returned. + If :code:`device_ids` is not None, it should be the device + ids of MLUs. For example, if :code:`device_ids=[0,1,2]`, + the returned list would be + [paddle.device.MLUPlace(0), paddle.device.MLUPlace(1), paddle.device.MLUPlace(2)]. + + Note: For multi-card tasks, please use `FLAGS_selected_mlus` environment variable to set the visible MLU device. - This function creates a list of :code:`paddle.device.MLUPlace` objects. - If :code:`device_ids` is None, environment variable of - :code:`FLAGS_selected_mlus` would be checked first. For example, if - :code:`FLAGS_selected_mlus=0,1,2`, the returned list would - be [paddle.device.MLUPlace(0), paddle.device.MLUPlace(1), paddle.device.MLUPlace(2)]. - If :code:`FLAGS_selected_mlus` is not set, all visible - mlu places would be returned. - If :code:`device_ids` is not None, it should be the device - ids of MLUs. For example, if :code:`device_ids=[0,1,2]`, - the returned list would be - [paddle.device.MLUPlace(0), paddle.device.MLUPlace(1), paddle.device.MLUPlace(2)]. Parameters: device_ids (list or tuple of int, optional): list of MLU device ids. @@ -1035,8 +1074,7 @@ def mlu_places(device_ids=None): paddle.enable_static() mlu_places = static.mlu_places() """ - assert core.is_compiled_with_mlu(), \ - "Not compiled with MLU" + assert core.is_compiled_with_mlu(), "Not compiled with MLU" if device_ids is None: device_ids = _mlu_ids() elif not isinstance(device_ids, (list, tuple)): @@ -1045,7 +1083,6 @@ def mlu_places(device_ids=None): class NameScope(object): - def __init__(self, name="", parent=None): self._children = dict() self._name = name @@ -1056,8 +1093,9 @@ def child(self, prefix): new_child = NameScope(prefix, self) self._children[prefix] = [new_child] else: - new_child = NameScope(prefix + "_%d" % len(self._children[prefix]), - self) + new_child = NameScope( + prefix + "_%d" % len(self._children[prefix]), self + ) self._children[prefix].append(new_child) return new_child @@ -1077,7 +1115,7 @@ def name_scope(prefix=None): Generate hierarchical name prefix for the operators in Static Graph. - Note: + Note: This should only used for debugging and visualization purpose. Don't use it for serious analysis such as graph/program transformations. Don't use it in dygraph, since it will cause memory leak. @@ -1086,7 +1124,7 @@ def name_scope(prefix=None): prefix(str, optional): prefix. Default is none. Examples: - + .. code-block:: python import paddle @@ -1103,7 +1141,7 @@ def name_scope(prefix=None): with paddle.static.name_scope("s4"): g = f - 1 - # Op are created in the default main program. + # Op are created in the default main program. 
for op in paddle.static.default_main_program().block(0).ops: # elementwise_add is created in /s1/ if op.type == 'elementwise_add': @@ -1147,6 +1185,7 @@ def _full_name_scope(): def generate_control_dev_var_name(): import random + return CONTROL_DEP_VAR_PREFIX + "@" + str(random.random()) @@ -1214,8 +1253,9 @@ def dtype_is_floating(dtype): dtype = convert_np_dtype_to_dtype_(dtype) return dtype in [ - core.VarDesc.VarType.FP16, core.VarDesc.VarType.FP32, - core.VarDesc.VarType.FP64 + core.VarDesc.VarType.FP16, + core.VarDesc.VarType.FP32, + core.VarDesc.VarType.FP64, ] @@ -1235,16 +1275,20 @@ def _debug_string_(proto, throw_on_error=True): if not proto.IsInitialized(error_fields) and throw_on_error: raise ValueError( "{0} are not initialized.\nThe message is {1}:\n".format( - error_fields, proto)) + error_fields, proto + ) + ) return proto.__str__() -def _varbase_creator(type=core.VarDesc.VarType.LOD_TENSOR, - name=None, - shape=None, - dtype=None, - persistable=None, - **kwargs): +def _varbase_creator( + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=None, + dtype=None, + persistable=None, + **kwargs +): if dtype is not None: if not isinstance(dtype, core.VarDesc.VarType): dtype = convert_np_dtype_to_dtype_(dtype) @@ -1252,16 +1296,21 @@ def _varbase_creator(type=core.VarDesc.VarType.LOD_TENSOR, if _in_eager_mode_: eager_tensor = core.eager.Tensor( dtype if dtype else core.VarDesc.VarType.FP32, - list(shape) if shape else [], name, + list(shape) if shape else [], + name, type if type else core.VarDesc.VarType.LOD_TENSOR, - True if persistable else False) + True if persistable else False, + ) eager_tensor.retain_grads() return eager_tensor else: - return core.VarBase(dtype if dtype else core.VarDesc.VarType.FP32, - list(shape) if shape else [], name, - type if type else core.VarDesc.VarType.LOD_TENSOR, - True if persistable else False) + return core.VarBase( + dtype if dtype else core.VarDesc.VarType.FP32, + list(shape) if shape else [], + name, + type if type else core.VarDesc.VarType.LOD_TENSOR, + True if persistable else False, + ) def _all_is_type(vals, expected_type): @@ -1271,12 +1320,12 @@ def _all_is_type(vals, expected_type): NOTE: BuiltIn all() will always return True if vals is empty. """ assert isinstance(vals, (list, tuple)) - if not vals: return False + if not vals: + return False return all(isinstance(v, expected_type) for v in vals) class VariableMetaClass(type): - @classmethod def __instancecheck__(cls, instance): t = type(instance) @@ -1289,7 +1338,6 @@ def __instancecheck__(cls, instance): class ParameterMetaClass(VariableMetaClass): - @classmethod def __instancecheck__(cls, instance): t = type(instance) @@ -1304,12 +1352,13 @@ def __instancecheck__(cls, instance): @six.add_metaclass(VariableMetaClass) class Variable(object): """ - **Notes**: - **The constructor of Variable should not be invoked directly.** - **In Static Graph Mode: Please use** `Block.create_var` **to create a Static variable which has no data until being feed.** + Notes: + The constructor of Variable should not be invoked directly. + + In Static Graph Mode: Please use ** `Block.create_var` ** to create a Static variable which has no data until being feed. - **In Dygraph Mode: Please use** :ref:`api_fluid_dygraph_to_variable` **to create a dygraph variable with real data** + In Dygraph Mode: Please use ** :ref:`api_fluid_dygraph_to_variable` ** to create a dygraph variable with real data. In Fluid, every input and output of an OP is a variable. 
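As the note above says, a static-graph `Variable` is normally created through `Block.create_var` rather than the constructor; a minimal sketch.

.. code-block:: python

    import paddle

    paddle.enable_static()
    cur_program = paddle.static.Program()
    cur_block = cur_program.current_block()
    new_variable = cur_block.create_var(
        name="X", shape=[-1, 23, 48], dtype='float32'
    )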
In most cases, variables are used for holding different kinds of data or training @@ -1346,21 +1395,23 @@ class Variable(object): """ - def __init__(self, - block, - type=core.VarDesc.VarType.LOD_TENSOR, - name=None, - shape=None, - dtype=None, - lod_level=None, - capacity=None, - persistable=None, - error_clip=None, - stop_gradient=False, - is_data=False, - need_check_feed=False, - belong_to_optimizer=False, - **kwargs): + def __init__( + self, + block, + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=None, + dtype=None, + lod_level=None, + capacity=None, + persistable=None, + error_clip=None, + stop_gradient=False, + is_data=False, + need_check_feed=False, + belong_to_optimizer=False, + **kwargs + ): self.block = block if name is None: name = unique_name.generate('_generated_var') @@ -1373,6 +1424,9 @@ def __init__(self, type = core.VarDesc.VarType.STRINGS lod_level = None + if type == core.VarDesc.VarType.SPARSE_COO: + lod_level = None + self.belong_to_optimizer = belong_to_optimizer self.error_clip = error_clip @@ -1388,10 +1442,11 @@ def __init__(self, if is_new_var: self.desc.set_type(type) elif self.desc.type() != type: - raise ValueError("Variable '{0}' has been created before. The " - "previous type is {1}, the new type is {2}. They" - " are not matched".format(self.name, - self.desc.type(), type)) + raise ValueError( + "Variable '{0}' has been created before. The " + "previous type is {1}, the new type is {2}. They" + " are not matched".format(self.name, self.desc.type(), type) + ) if shape is not None: if is_new_var: @@ -1403,29 +1458,32 @@ def __init__(self, raise ValueError( "Variable '{0}' has been created before. The previous " "shape is {1}, the new shape is {2}. They are not " - "matched.".format(self.name, old_shape, shape)) + "matched.".format(self.name, old_shape, shape) + ) if dtype is not None: if is_new_var: self.desc.set_dtype(dtype) else: old_dtype = self.dtype if dtype != old_dtype: - raise ValueError("Variable '{0}' has been created before. " - "The previous data type is {1}, the new " - "data type is {2}. They are not " - "matched.".format(self.name, old_dtype, - dtype)) + raise ValueError( + "Variable '{0}' has been created before. " + "The previous data type is {1}, the new " + "data type is {2}. They are not " + "matched.".format(self.name, old_dtype, dtype) + ) if lod_level is not None: if is_new_var: self.desc.set_lod_level(lod_level) else: if lod_level != self.lod_level: - raise ValueError("Variable '{0}' has been created before. " - "The previous lod_level is {1}, the new " - "lod_level is {2}. They are not " - "matched".format(self.name, self.lod_level, - lod_level)) + raise ValueError( + "Variable '{0}' has been created before. " + "The previous lod_level is {1}, the new " + "lod_level is {2}. They are not " + "matched".format(self.name, self.lod_level, lod_level) + ) if persistable is not None: if is_new_var: self.desc.set_persistable(persistable) @@ -1435,7 +1493,9 @@ def __init__(self, "Variable '{0}' has been created before." "The previous persistable is {1}, the new " "persistable is {2}. They are not matched".format( - self.name, self.persistable, persistable)) + self.name, self.persistable, persistable + ) + ) if need_check_feed and is_new_var: self.desc.set_need_check_feed(need_check_feed) @@ -1455,12 +1515,13 @@ def __init__(self, def detach(self): """ + Returns a new Variable, detached from the current graph. It will share data with origin Variable and without tensor copy. 
In addition, the detached Variable doesn't provide gradient propagation. Returns: - ( :ref:`api_guide_Variable_en` | dtype is same as current Variable): The detached Variable. + ( :ref:`api_guide_Variable_en` | dtype is same as current Variable), The detached Variable. Examples: .. code-block:: python @@ -1474,22 +1535,25 @@ def detach(self): # create a detached Variable y = x.detach() + """ - assert self.type == core.VarDesc.VarType.SELECTED_ROWS or \ - self.type == core.VarDesc.VarType.LOD_TENSOR, \ - "only support a variable with SELECTED_ROWS or LOD_TENSOR to be detached" + assert ( + self.type == core.VarDesc.VarType.SELECTED_ROWS + or self.type == core.VarDesc.VarType.LOD_TENSOR + ), "only support a variable with SELECTED_ROWS or LOD_TENSOR to be detached" output = self.block.create_var( name=unique_name.generate_with_ignorable_key("detach_" + self.name), dtype=self.dtype, type=self.type, persistable=self.persistable, - stop_gradient=True) + stop_gradient=True, + ) - self.block.append_op(type='share_data', - inputs={'X': [self]}, - outputs={'Out': [output]}) + self.block.append_op( + type='share_data', inputs={'X': [self]}, outputs={'Out': [output]} + ) return output @fake_interface_only @@ -1679,14 +1743,20 @@ def _to_readable_code(self): """ # VarType.LOD_TENSOR -> LOD_TENSOR type_str = str(self.type).split('.')[1] - if self.type == core.VarDesc.VarType.SELECTED_ROWS or self.type == core.VarDesc.VarType.LOD_TENSOR: + if ( + self.type == core.VarDesc.VarType.SELECTED_ROWS + or self.type == core.VarDesc.VarType.LOD_TENSOR + ): dtype_str = str(self.dtype).split('.')[1] - var_str = "{name} : {type}.shape{shape}.dtype({dtype}).stop_gradient({stop_gradient})".\ - format(name=self.name, type=type_str, shape=self.shape, - dtype=dtype_str, stop_gradient=self.stop_gradient) + var_str = "{name} : {type}.shape{shape}.dtype({dtype}).stop_gradient({stop_gradient})".format( + name=self.name, + type=type_str, + shape=self.shape, + dtype=dtype_str, + stop_gradient=self.stop_gradient, + ) else: - var_str = "{name} : {type})".\ - format(name=self.name, type=type_str) + var_str = "{name} : {type})".format(name=self.name, type=type_str) if self.is_parameter: if self.trainable: @@ -1699,12 +1769,16 @@ def _to_readable_code(self): if self.persistable: var_str = "persist " + var_str - from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context + from paddle.distributed.auto_parallel.dist_context import ( + get_default_distributed_context, + ) + dist_context = get_default_distributed_context() dist_tensor = dist_context.get_dist_tensor_for_program(self) if dist_tensor is not None: - var_str += ", {name} = {value}".format(name="dist_attr", - value=dist_tensor) + var_str += ", {name} = {value}".format( + name="dist_attr", value=dist_tensor + ) return var_str @@ -1738,15 +1812,18 @@ def to_string(self, throw_on_error, with_details=False): print(new_variable.to_string(True, True)) """ assert isinstance(throw_on_error, bool) and isinstance( - with_details, bool) + with_details, bool + ) protostr = self.desc.serialize_to_string() proto = framework_pb2.VarDesc.FromString(six.binary_type(protostr)) res_str = _debug_string_(proto, throw_on_error) if with_details: - additional_attr = ("error_clip", ) + additional_attr = ("error_clip",) for attr_name in additional_attr: - res_str += "%s: %s\n" % (attr_name, - cpt.to_text(getattr(self, attr_name))) + res_str += "%s: %s\n" % ( + attr_name, + cpt.to_text(getattr(self, attr_name)), + ) return res_str @@ -1755,7 +1832,7 @@ def to_string(self, 
throw_on_error, with_details=False): def element_size(self): """ Returns the size in bytes of an element in the Tensor. - + Examples: .. code-block:: python @@ -2011,6 +2088,7 @@ def type(self): @property def T(self): """ + Permute current Variable with its dimensions reversed. If `n` is the dimensions of `x` , `x.T` is equivalent to `x.transpose([n-1, n-2, ..., 0])`. @@ -2029,6 +2107,7 @@ def T(self): x_T_np = exe.run(paddle.static.default_main_program(), fetch_list=[x_T])[0] print(x_T_np.shape) # (5, 3, 2) + """ if len(self.shape) == 1: return self @@ -2041,32 +2120,33 @@ def T(self): dtype=self.dtype, type=self.type, persistable=False, - stop_gradient=False) + stop_gradient=False, + ) input_shape = self.block.create_var( name=unique_name.generate_with_ignorable_key(self.name + '.tmp'), dtype=self.dtype, type=core.VarDesc.VarType.LOD_TENSOR, persistable=False, - stop_gradient=False) - - self.block.append_op(type='transpose2', - inputs={'X': [self]}, - outputs={ - 'Out': [out], - 'XShape': [input_shape] - }, - attrs={'axis': perm}) + stop_gradient=False, + ) + + self.block.append_op( + type='transpose2', + inputs={'X': [self]}, + outputs={'Out': [out], 'XShape': [input_shape]}, + attrs={'axis': perm}, + ) return out def clone(self): """ Returns a new static Variable, which is the clone of the original static - Variable. It remains in the current graph, that is, the cloned Variable + Variable. It remains in the current graph, that is, the cloned Variable provides gradient propagation. Calling ``out = tensor.clone()`` is same as ``out = assign(tensor)`` . Returns: - Variable: The cloned Variable. + Variable, The cloned Variable. Examples: .. code-block:: python @@ -2086,15 +2166,17 @@ def clone(self): dtype=self.dtype, type=self.type, persistable=self.persistable, - stop_gradient=self.stop_gradient) + stop_gradient=self.stop_gradient, + ) - self.block.append_op(type='assign', - inputs={'X': [self]}, - outputs={'Out': [output]}) + self.block.append_op( + type='assign', inputs={'X': [self]}, outputs={'Out': [output]} + ) return output def _set_error_clip(self, error_clip): """ + Set the error_clip. Args: @@ -2102,19 +2184,22 @@ def _set_error_clip(self, error_clip): Returns: None + """ self.error_clip = error_clip def _set_info(self, key, value): """ + Set key-value information for this variable. Args: key(str): Key for this information. value(object): The value associated to the key. - Returns: + Returns: None + """ if not hasattr(self, "_info"): self._info = {} @@ -2122,13 +2207,15 @@ def _set_info(self, key, value): def _get_info(self, key): """ + Get the information of this variable corresponding to key. Args: key(str): Key for this information. - Returns: + Returns: object + """ if hasattr(self, "_info") and key in self._info: return self._info[key] @@ -2136,7 +2223,9 @@ def _get_info(self, key): def _slice_indices(self, slice, length): """ + Reference implementation for the slice.indices method. + """ # Compute step and length as integers. step = 1 if slice.step is None else slice.step @@ -2156,8 +2245,9 @@ def _slice_indices(self, slice, length): start = upper if step < 0 else lower else: start = slice.start - start = max(start + - length, lower) if start < 0 else min(start, upper) + start = ( + max(start + length, lower) if start < 0 else min(start, upper) + ) # Compute stop. 
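To make the `clone`/`detach` distinction touched in this hunk concrete, a small sketch in static graph mode.

.. code-block:: python

    import paddle

    paddle.enable_static()
    x = paddle.static.data(name="x", shape=[None, 4], dtype='float32')
    y = x.clone()    # equivalent to paddle.assign(x); keeps gradient propagation
    z = x.detach()   # shares data via the share_data op, but stops gradients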
if slice.stop is None: @@ -2203,11 +2293,15 @@ def _detectContinuesSlice(self, item): for index, o in enumerate(item): if isinstance(o, int): start = int(o) - if (index > 0 and index >= self.shape[index]) \ - or (index < 0 and (index + self.shape[index]) < 0): + if (index > 0 and index >= self.shape[index]) or ( + index < 0 and (index + self.shape[index]) < 0 + ): raise IndexError("invalid index") - start = max(start + self.shape[index], 0) if start < 0 else min( - start, self.shape[index]) + start = ( + max(start + self.shape[index], 0) + if start < 0 + else min(start, self.shape[index]) + ) starts.append(start) ends.append(start + 1) elif isinstance(o, slice): @@ -2225,30 +2319,31 @@ def _cloneVar(self, copy=False): if not copy: return self.block.create_var( name=unique_name.generate_with_ignorable_key(self.name), - dtype=self.dtype) + dtype=self.dtype, + ) else: return self def _sliceVar(self, axes, starts, ends): new_var = self._cloneVar() - self.block.append_op(type="slice", - inputs={'Input': [self]}, - outputs={'Out': [new_var]}, - attrs={ - 'axes': axes, - 'starts': starts, - 'ends': ends - }) + self.block.append_op( + type="slice", + inputs={'Input': [self]}, + outputs={'Out': [new_var]}, + attrs={'axes': axes, 'starts': starts, 'ends': ends}, + ) return new_var def _concatVar(self, inputs, axis): new_var = self._cloneVar() - self.block.append_op(type="concat", - inputs={'X': inputs}, - outputs={'Out': [new_var]}, - attrs={ - 'axis': axis, - }) + self.block.append_op( + type="concat", + inputs={'X': inputs}, + outputs={'Out': [new_var]}, + attrs={ + 'axis': axis, + }, + ) return new_var def _sliceAndConcatVar(self, item, axis): @@ -2262,21 +2357,24 @@ def _sliceAndConcatVar(self, item, axis): vars = [] if step > 0: while start < stop: - vars.append(self._sliceVar([axis], [start], - [start + 1])) + vars.append( + self._sliceVar([axis], [start], [start + 1]) + ) start += step else: while start > stop: - vars.append(self._sliceVar([axis], [start], - [start + 1])) + vars.append( + self._sliceVar([axis], [start], [start + 1]) + ) start += step return self._concatVar(vars, axis) elif isinstance(item, int): if self.shape[axis] < 0: return self._cloneVar(True) index = int(item) - if (index > 0 and index >= self.shape[axis]) \ - or (index < 0 and (index + self.shape[axis]) < 0): + if (index > 0 and index >= self.shape[axis]) or ( + index < 0 and (index + self.shape[axis]) < 0 + ): raise IndexError("invalid index") return self._sliceVar([axis], [index], [index + 1]) else: @@ -2290,21 +2388,21 @@ def __setitem__(self, item, value): def get_value(self, scope=None): """ - Get the value of variable in given scope. + Get the value of variable in given scope. Args: - scope(Scope, optional) : If `scope` is None, it will be set to global scope + scope(Scope, optional) : If `scope` is None, it will be set to global scope obtained through 'paddle.static.global_scope()'. Otherwise, use `scope`. Default: None Returns: - Tensor: the value in given scope. + Tensor, the value in given scope. Examples: .. code-block:: python import paddle - import paddle.static as static + import paddle.static as static import numpy as np paddle.enable_static() @@ -2333,38 +2431,43 @@ def get_value(self, scope=None): # can not be imported at the begainning of this file. # Therefore, the above two modules are dynamically imported. from .executor import global_scope + if scope is not None and not isinstance(scope, core._Scope): raise TypeError( - "`scope` should be None or `paddle.static.Scope` type, but received {}." 
- .format(type(scope))) + "`scope` should be None or `paddle.static.Scope` type, but received {}.".format( + type(scope) + ) + ) if scope is None: scope = global_scope() var_temp = scope.find_var(self.name) if var_temp is None: - raise ValueError("Can not find Variable '{}' in the Scope.".format( - self.name)) + raise ValueError( + "Can not find Variable '{}' in the Scope.".format(self.name) + ) t = var_temp.get_tensor() return t def set_value(self, value, scope=None): ''' - Set the value to the tensor in given scope. + + Set the value to the tensor in given scope. Args: value(Tensor/ndarray) : The value to be set. - scope(Scope, optional) : If `scope` is None, it will be set to global scope + scope(Scope, optional) : If `scope` is None, it will be set to global scope obtained through 'paddle.static.global_scope()'. Otherwise, use `scope`. Default: None Returns: None - + Examples: .. code-block:: python import paddle - import paddle.static as static + import paddle.static as static import numpy as np paddle.enable_static() @@ -2388,6 +2491,7 @@ def set_value(self, value, scope=None): if var.persistable: t_load = paddle.load(path+var.name+'.pdtensor') var.set_value(t_load) + ''' # The 'framework' is a low-level module, and 'executor' @@ -2397,21 +2501,26 @@ def set_value(self, value, scope=None): if not (isinstance(value, np.ndarray) or hasattr(value, '__array__')): raise TypeError( - "`value` should be `numpy.ndarray` or `LoDTensor`, but received {}." - .format(type(value))) + "`value` should be `numpy.ndarray` or `LoDTensor`, but received {}.".format( + type(value) + ) + ) if scope is not None and not isinstance(scope, core._Scope): raise TypeError( - "`scope` should be None or `paddle.static.Scope` type, but received {}." - .format(type(scope))) + "`scope` should be None or `paddle.static.Scope` type, but received {}.".format( + type(scope) + ) + ) if scope is None: scope = global_scope() var_temp = scope.find_var(self.name) if var_temp is None: - raise ValueError("Can not find Variable '{}' in the Scope.".format( - self.name)) + raise ValueError( + "Can not find Variable '{}' in the Scope.".format(self.name) + ) t = var_temp.get_tensor() @@ -2422,8 +2531,10 @@ def set_value(self, value, scope=None): value_shape = value.shape if list(t.shape()) != list(value_shape): raise ValueError( - "{} expected a shape {}, but the received shape is {}.". - format(self.name, list(t.shape()), list(value_shape))) + "{} expected a shape {}, but the received shape is {}.".format( + self.name, list(t.shape()), list(value_shape) + ) + ) p = t._place() if p.is_cpu_place(): @@ -2451,10 +2562,11 @@ def set_value(self, value, scope=None): def size(self): """ + Returns the number of elements for current Variable, which is a int64 Variable with shape [1] Returns: - Variable: the number of elements for current Variable + Variable, the number of elements for current Variable Examples: .. code-block:: python @@ -2468,36 +2580,42 @@ def size(self): # get the number of elements of the Variable y = x.size() + """ output = self.block.create_var( name=unique_name.generate_with_ignorable_key(self.name + "_size"), - dtype=core.VarDesc.VarType.INT64) + dtype=core.VarDesc.VarType.INT64, + ) - self.block.append_op(type='size', - inputs={'Input': [self]}, - outputs={'Out': [output]}) + self.block.append_op( + type='size', inputs={'Input': [self]}, outputs={'Out': [output]} + ) return output def _set_attr(self, name, val): """ + Set the value of attribute by attribute's name. Args: name(str): the attribute name. 
val(int|str|list): the value of the attribute. + """ self._update_desc_attr(name, val) def _has_attr(self, name): """ + Whether this Variable has the attribute with the name `name` or not. Args: name(str): the attribute name. Returns: - bool: True if has this attribute. + bool, True if has this attribute. + """ return self.desc.has_attr(name) @@ -2527,7 +2645,7 @@ def _get_attr(self, name): name(str): the attribute name. Returns: - int|str|list: The attribute value. The return value + int|str|list, The attribute value. The return value can be any valid attribute type. """ return self.desc.attr(name) @@ -2575,8 +2693,8 @@ def instance(cls): def __init__(self): assert not hasattr( - self.__class__, - '_instance'), 'Please use `instance()` to get OpProtoHolder object!' + self.__class__, '_instance' + ), 'Please use `instance()` to get OpProtoHolder object!' op_protos = get_all_op_protos() self.op_proto_map = {} for proto in op_protos: @@ -2612,7 +2730,7 @@ def generated_op_attr_names(): core.op_proto_and_checker_maker.kOpRoleVarAttrName(), core.op_proto_and_checker_maker.kOpNameScopeAttrName(), core.op_proto_and_checker_maker.kOpCreationCallstackAttrName(), - core.op_proto_and_checker_maker.kOpDeviceAttrName() + core.op_proto_and_checker_maker.kOpDeviceAttrName(), } @@ -2659,24 +2777,44 @@ class Operator(object): inputs={"X": [var1, var2, var3]}, outputs={"Out": [var1]}) """ + OP_WITHOUT_KERNEL_SET = { - 'feed', 'fetch', 'recurrent', 'go', 'rnn_memory_helper_grad', - 'conditional_block', 'while', 'send', 'recv', 'listen_and_serv', - 'fl_listen_and_serv', 'ncclInit', 'select', 'checkpoint_notify', - 'gen_bkcl_id', 'c_gen_bkcl_id', 'gen_nccl_id', 'c_gen_nccl_id', - 'c_comm_init', 'c_sync_calc_stream', 'c_sync_comm_stream', - 'queue_generator', 'dequeue', 'enqueue', 'heter_listen_and_serv', - 'c_wait_comm', 'c_wait_compute', 'c_gen_hccl_id', 'c_comm_init_hccl', - 'copy_cross_scope', 'c_gen_cncl_id' + 'feed', + 'fetch', + 'recurrent', + 'go', + 'rnn_memory_helper_grad', + 'conditional_block', + 'while', + 'send', + 'recv', + 'listen_and_serv', + 'fl_listen_and_serv', + 'ncclInit', + 'select', + 'checkpoint_notify', + 'gen_bkcl_id', + 'c_gen_bkcl_id', + 'gen_nccl_id', + 'c_gen_nccl_id', + 'c_comm_init', + 'c_sync_calc_stream', + 'c_sync_comm_stream', + 'queue_generator', + 'dequeue', + 'enqueue', + 'heter_listen_and_serv', + 'c_wait_comm', + 'c_wait_compute', + 'c_gen_hccl_id', + 'c_comm_init_hccl', + 'copy_cross_scope', + 'c_gen_cncl_id', } - def __init__(self, - block, - desc, - type=None, - inputs=None, - outputs=None, - attrs=None): + def __init__( + self, block, desc, type=None, inputs=None, outputs=None, attrs=None + ): # read attr type index from op proto to avoid unexpected type # conversions, e.g. narrowing conversion like double to float try: @@ -2690,7 +2828,8 @@ def __init__(self, if _non_static_mode(): if type is None: raise ValueError( - "`type` to initialized an Operator can not be None.") + "`type` to initialized an Operator can not be None." 
+ ) self._type = type self.attrs = attrs if attrs else {} else: @@ -2710,11 +2849,14 @@ def __init__(self, if op_maker.kOpRoleAttrName() not in op_attrs: op_attrs[ - op_maker.kOpRoleAttrName()] = self.block.program._op_role + op_maker.kOpRoleAttrName() + ] = self.block.program._op_role role_var_name = op_maker.kOpRoleVarAttrName() - if len(self.block.program._op_role_var - ) != 0 and role_var_name not in op_attrs: + if ( + len(self.block.program._op_role_var) != 0 + and role_var_name not in op_attrs + ): op_attrs[role_var_name] = self.block.program._op_role_var if role_var_name in op_attrs and len(op_attrs[role_var_name]) == 0: @@ -2729,16 +2871,20 @@ def __init__(self, return if type is None: raise ValueError( - "`type` to initialized an Operator can not be None.") + "`type` to initialized an Operator can not be None." + ) else: callstack_var_name = op_maker.kOpCreationCallstackAttrName() op_attrs[callstack_var_name] = [] for frame in traceback.extract_stack(): op_attrs[callstack_var_name].append( ' File "{}", line {}, in {}'.format( - frame[0], frame[1], frame[2])) - op_attrs[callstack_var_name].append(' {}'.format( - frame[3])) + frame[0], frame[1], frame[2] + ) + ) + op_attrs[callstack_var_name].append( + ' {}'.format(frame[3]) + ) self.desc.set_type(type) proto = OpProtoHolder.instance().get_op_proto(type) @@ -2754,20 +2900,25 @@ def __init__(self, op_device = op_maker.kOpDeviceAttrName() op_attrs[op_device] = _current_device else: - warnings.warn("The Op(%s) is not support to set device." % - type) + warnings.warn( + "The Op(%s) is not support to set device." % type + ) if 'force_cpu' in op_attrs: - if (type == 'less_than' and op_attrs['force_cpu'] != None - ) or op_attrs['force_cpu'] != False: + if ( + type == 'less_than' and op_attrs['force_cpu'] != None + ) or op_attrs['force_cpu'] != False: warnings.warn( "The Attr(force_cpu) of Op(%s) will be deprecated in the future, " "please use 'device_guard' instead. 'device_guard' has higher priority when they are " - "used at the same time." % type) + "used at the same time." % type + ) if _current_pipeline_stage is not None: - pipeline_attr_name = 'pipeline_stage' + core.kAutoParallelSuffix( + pipeline_attr_name = ( + 'pipeline_stage' + core.kAutoParallelSuffix() + ) + self._update_desc_attr( + pipeline_attr_name, _current_pipeline_stage ) - self._update_desc_attr(pipeline_attr_name, - _current_pipeline_stage) def find_name(var_list, name): for var_name in var_list: @@ -2778,8 +2929,9 @@ def find_name(var_list, name): if inputs is not None: for in_proto in proto.inputs: found = find_name(inputs, in_proto.name) - assert found or in_proto.dispensable, "Input {} not found".format( - in_proto.name) + assert ( + found or in_proto.dispensable + ), "Input {} not found".format(in_proto.name) if found: in_args = inputs[in_proto.name] if not isinstance(in_args, (list, tuple)): @@ -2787,7 +2939,8 @@ def find_name(var_list, name): if not in_proto.duplicable and len(in_args) > 1: raise ValueError( "Input %s expects only one input, but %d are given." - % (in_proto.name, len(in_args))) + % (in_proto.name, len(in_args)) + ) in_arg_names = [] for index, arg in enumerate(in_args): if isinstance(arg, six.string_types): @@ -2801,8 +2954,9 @@ def find_name(var_list, name): "The type of '%s' in operator %s should be " "one of [basestring(), str, Varibale] in python2, " "or one of [str, bytes, Variable] in python3." 
- "but received : %s" % - (in_proto.name, type, arg)) + "but received : %s" + % (in_proto.name, type, arg) + ) self.desc.set_input(in_proto.name, in_arg_names) else: self.desc.set_input(in_proto.name, []) @@ -2813,9 +2967,12 @@ def find_name(var_list, name): continue if not ((m.name in outputs) or m.dispensable): raise ValueError( - ("Incorrect setting for output(s) of " - "operator \"%s\", should set: [%s].") % - (type, m.name)) + ( + "Incorrect setting for output(s) of " + "operator \"%s\", should set: [%s]." + ) + % (type, m.name) + ) for out_proto in proto.outputs: if out_proto.name not in outputs: continue @@ -2825,7 +2982,8 @@ def find_name(var_list, name): if not out_proto.duplicable and len(out_args) > 1: raise ValueError( "Output %s expects only one output, but %d are given." - % (out_proto.name, len(out_args))) + % (out_proto.name, len(out_args)) + ) out_arg_names = [] for arg in out_args: if isinstance(arg, six.string_types): @@ -2846,27 +3004,32 @@ def find_name(var_list, name): raise TypeError("'attrs' should be a dict.") for attr in proto.attrs: attr_name = attr.name - if (attr_name - not in op_attrs) or (op_attrs[attr_name] is None): + if (attr_name not in op_attrs) or ( + op_attrs[attr_name] is None + ): continue attr_val = op_attrs[attr_name] self._update_desc_attr(attr_name, attr_val) for attr_name in extra_attrs_map.keys(): - if (attr_name - not in op_attrs) or (op_attrs[attr_name] is None): - self._update_desc_attr(attr_name, - extra_attrs_map[attr_name]) + if (attr_name not in op_attrs) or ( + op_attrs[attr_name] is None + ): + self._update_desc_attr( + attr_name, extra_attrs_map[attr_name] + ) else: self._update_desc_attr(attr_name, op_attrs[attr_name]) # proto.attrs doesn't include ipu_index if core.is_compiled_with_ipu(): if global_ipu_index >= 0: - self._update_desc_attr(ipu_index_attr_name, - global_ipu_index) + self._update_desc_attr( + ipu_index_attr_name, global_ipu_index + ) if global_ipu_stage >= 0: - self._update_desc_attr(ipu_stage_attr_name, - global_ipu_stage) + self._update_desc_attr( + ipu_stage_attr_name, global_ipu_stage + ) self.desc.check_attrs() if self._has_kernel(type): @@ -2925,7 +3088,8 @@ def _to_readable_code(self, skip_op_callstack=True): assert isinstance( skip_op_callstack, bool ), "skip_op_callstack parameter's type is error, expect bool, received {}".format( - type(skip_op_callstack)) + type(skip_op_callstack) + ) outputs_str = "{" for i in range(0, len(self.output_names)): outputs_str += "{name}=".format(name=self.output_names[i]) @@ -2955,9 +3119,9 @@ def _to_readable_code(self, skip_op_callstack=True): attr_type = self.desc.attr_type(name, True) if attr_type == core.AttrType.VAR: attr_var_name = self.desc.attr(name, True).name() - a = "{name} = Var['{value}']".format(name=name, - type=attr_type, - value=attr_var_name) + a = "{name} = Var['{value}']".format( + name=name, type=attr_type, value=attr_var_name + ) attrs_str += a if i != len(attr_names) - 1: attrs_str += ", " @@ -2968,7 +3132,8 @@ def _to_readable_code(self, skip_op_callstack=True): "'%s'" % var.name() for var in self.desc.attr(name, True) ] a = "{name} = Vars[{value}]".format( - name=name, type=attr_type, value=','.join(attr_var_names)) + name=name, type=attr_type, value=','.join(attr_var_names) + ) attrs_str += a if i != len(attr_names) - 1: attrs_str += ", " @@ -2976,7 +3141,8 @@ def _to_readable_code(self, skip_op_callstack=True): if attr_type == core.AttrType.BLOCK: a = "{name} = block[{value}]".format( - name=name, type=attr_type, value=self._block_attr_id(name)) + 
name=name, type=attr_type, value=self._block_attr_id(name) + ) attrs_str += a if i != len(attr_names) - 1: attrs_str += ", " @@ -2984,17 +3150,19 @@ def _to_readable_code(self, skip_op_callstack=True): if attr_type == core.AttrType.BLOCKS: a = "{name} = blocks{value}".format( - name=name, - type=attr_type, - value=self._blocks_attr_ids(name)) + name=name, type=attr_type, value=self._blocks_attr_ids(name) + ) attrs_str += a if i != len(attr_names) - 1: attrs_str += ", " continue # it is bytes of serialized protobuf - if is_compiled_with_cinn( - ) and self.type == 'cinn_launch' and name == 'compilation_key': + if ( + is_compiled_with_cinn() + and self.type == 'cinn_launch' + and name == 'compilation_key' + ): key = self.desc.attr(name) v = core.get_serialize_comile_key(key) prog = Program() @@ -3006,28 +3174,36 @@ def _to_readable_code(self, skip_op_callstack=True): else: value = self.desc.attr(name) - a = "{name} = {value}".format(name=name, - type=attr_type, - value=value) + a = "{name} = {value}".format( + name=name, type=attr_type, value=value + ) attrs_str += a if i != len(attr_names) - 1: attrs_str += ", " - from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context + from paddle.distributed.auto_parallel.dist_context import ( + get_default_distributed_context, + ) + dist_context = get_default_distributed_context() dist_op = dist_context.get_dist_op_for_program(self) if dist_op is not None: - attrs_str += ", {name} = {value}".format(name="dist_attr", - value=dist_op) + attrs_str += ", {name} = {value}".format( + name="dist_attr", value=dist_op + ) if outputs_str != "{}": - op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".\ - format(outputs=outputs_str, op_type=self.type, - inputs=inputs_str, attrs=attrs_str) + op_str = "{outputs} = {op_type}(inputs={inputs}, {attrs})".format( + outputs=outputs_str, + op_type=self.type, + inputs=inputs_str, + attrs=attrs_str, + ) else: - op_str = "{op_type}(inputs={inputs}, {attrs})".\ - format(op_type=self.type, inputs=inputs_str, attrs=attrs_str) + op_str = "{op_type}(inputs={inputs}, {attrs})".format( + op_type=self.type, inputs=inputs_str, attrs=attrs_str + ) return op_str def __str__(self): @@ -3041,14 +3217,16 @@ def type(self): def input(self, name): r""" + Get the input arguments according to the input parameter name. Args: name(str): The input parameter name. Returns: - list: return the list of argument names that associated with \ + list, return the list of argument names that associated with \ the specific parameter name. + """ return self.desc.input(name) @@ -3113,7 +3291,8 @@ def idx(self): if op == self: return i raise ValueError( - "Can't find op itself in it's block. It could be a bug of Paddle.") + "Can't find op itself in it's block. It could be a bug of Paddle." 
+ ) def has_attr(self, name): """ @@ -3175,8 +3354,9 @@ def _update_desc_attr(self, name, val): self.desc.set_block_attr(name, val.desc) elif isinstance(val, list) and val and _all_is_type(val, Block): self.desc.set_blocks_attr(name, [v.desc for v in val]) - elif isinstance(val, core.BlockDesc) or \ - isinstance(val, core.ProgramDesc): + elif isinstance(val, core.BlockDesc) or isinstance( + val, core.ProgramDesc + ): self.desc.set_serialized_attr(name, val.serialize_to_string()) else: self._update_desc_plain_attr(name, val) @@ -3257,7 +3437,7 @@ def _block_attr(self, name): """ id = self._block_attr_id(name) - assert (id >= 0 and id < len(self.block.program.blocks)) + assert id >= 0 and id < len(self.block.program.blocks) return self.block.program.blocks[id] def _blocks_attr(self, name): @@ -3272,7 +3452,7 @@ def _blocks_attr(self, name): """ attrs = [] for i in self._blocks_attr_ids(name): - assert (i >= 0 and i < len(self.block.program.blocks)) + assert i >= 0 and i < len(self.block.program.blocks) attrs.append(self.block.program.blocks[i]) return attrs @@ -3301,8 +3481,11 @@ def _var_attr(self, name): Variable: the Variable attribute. """ attr_type = self.desc.attr_type(name, True) - assert attr_type == core.AttrType.VAR, "Required type attr({}) is Variable, but received {}".format( - name, attr_type) + assert ( + attr_type == core.AttrType.VAR + ), "Required type attr({}) is Variable, but received {}".format( + name, attr_type + ) attr_var_name = self.desc.attr(name, True).name() return self.block._var_recursive(attr_var_name) @@ -3317,8 +3500,11 @@ def _vars_attr(self, name): Variables: the Variables attribute. """ attr_type = self.desc.attr_type(name, True) - assert attr_type == core.AttrType.VARS, "Required type attr({}) is list[Variable], but received {}".format( - name, attr_type) + assert ( + attr_type == core.AttrType.VARS + ), "Required type attr({}) is list[Variable], but received {}".format( + name, attr_type + ) attr_vars = [ self.block._var_recursive(var.name()) for var in self.desc.attr(name, True) @@ -3465,7 +3651,8 @@ def _to_readable_code(self, skip_op_callstack=True): assert isinstance( skip_op_callstack, bool ), "skip_op_callstack parameter's type is error, expect bool, received {}".format( - type(skip_op_callstack)) + type(skip_op_callstack) + ) block_str = "{ // block " block_str += "{}\n".format(self.idx) for var in list(self.vars.values()): @@ -3473,7 +3660,8 @@ def _to_readable_code(self, skip_op_callstack=True): block_str += "\n" for op in self.ops: block_str += " {}\n".format( - op._to_readable_code(skip_op_callstack)) + op._to_readable_code(skip_op_callstack) + ) block_str += "}" return block_str @@ -3492,22 +3680,28 @@ def to_string(self, throw_on_error, with_details=False): str: The debug string. 
""" assert isinstance(throw_on_error, bool) and isinstance( - with_details, bool) + with_details, bool + ) if with_details: re_add_indent = re.compile(r"\n(.)") res_str = "blocks {\n idx: %d\n parent_idx: %d" % ( - self.idx, self.parent_idx) + self.idx, + self.parent_idx, + ) for var in list(self.vars.values()): res_str += "\n vars {\n %s }" % re_add_indent.sub( - r"\n \1", var.to_string(throw_on_error, with_details)) + r"\n \1", var.to_string(throw_on_error, with_details) + ) for op in self.ops: res_str += "\n ops {\n %s }" % re_add_indent.sub( - r"\n \1", op.to_string(throw_on_error)) + r"\n \1", op.to_string(throw_on_error) + ) res_str += "\n}" else: protostr = self.desc.serialize_to_string() proto = framework_pb2.BlockDesc.FromString( - six.binary_type(protostr)) + six.binary_type(protostr) + ) res_str = _debug_string_(proto, throw_on_error) return res_str @@ -3561,8 +3755,9 @@ def var(self, name): """ if not isinstance(name, six.string_types): raise TypeError( - "var require string as parameter, but get %s instead." % - (type(name))) + "var require string as parameter, but get %s instead." + % (type(name)) + ) v = self.vars.get(name, None) if v is None: raise ValueError("var %s not in this block" % name) @@ -3628,8 +3823,11 @@ def all_parameters(self): return list(self.iter_parameters()) def iter_parameters(self): - return (item[1] for item in six.iteritems(self.vars) - if isinstance(item[1], Parameter)) + return ( + item[1] + for item in six.iteritems(self.vars) + if isinstance(item[1], Parameter) + ) def create_var(self, *args, **kwargs): if _non_static_mode(): @@ -3684,43 +3882,51 @@ def _rename_var(self, name, new_name): d = self.desc.find_var(cpt.to_bytes(new_name)) if var_type == "Parameter": if in_dygraph_mode(): - var = EagerParamBase(d.shape(), - d.dtype(), - type=orig_var_type, - name=new_name, - stop_gradient=stop_gradient, - trainable=trainable, - optimize_attr=optimize_attr, - regularizer=regularizer, - error_clip=error_clip) + var = EagerParamBase( + d.shape(), + d.dtype(), + type=orig_var_type, + name=new_name, + stop_gradient=stop_gradient, + trainable=trainable, + optimize_attr=optimize_attr, + regularizer=regularizer, + error_clip=error_clip, + ) else: if _in_legacy_dygraph(): - var = ParamBase(d.shape(), - d.dtype(), - type=orig_var_type, - name=new_name, - stop_gradient=stop_gradient, - trainable=trainable, - optimize_attr=optimize_attr, - regularizer=regularizer, - error_clip=error_clip) + var = ParamBase( + d.shape(), + d.dtype(), + type=orig_var_type, + name=new_name, + stop_gradient=stop_gradient, + trainable=trainable, + optimize_attr=optimize_attr, + regularizer=regularizer, + error_clip=error_clip, + ) else: - var = Parameter(self, - d.shape(), - d.dtype(), - type=orig_var_type, - name=new_name, - stop_gradient=stop_gradient, - trainable=trainable, - optimize_attr=optimize_attr, - regularizer=regularizer, - error_clip=error_clip) + var = Parameter( + self, + d.shape(), + d.dtype(), + type=orig_var_type, + name=new_name, + stop_gradient=stop_gradient, + trainable=trainable, + optimize_attr=optimize_attr, + regularizer=regularizer, + error_clip=error_clip, + ) elif var_type == "Variable": - var = Variable(self, - type=orig_var_type, - name=new_name, - error_clip=error_clip, - stop_gradient=stop_gradient) + var = Variable( + self, + type=orig_var_type, + name=new_name, + error_clip=error_clip, + stop_gradient=stop_gradient, + ) # rename the python side, _sync_with_cpp will only add # new vars/ops to python side. 
@@ -3757,8 +3963,9 @@ def _is_inited_by(block, var): # Think of "c_broadcast" and "c_sync_comm_stream" as a special case here. # NOTE: "coalesce_tensor" is a special case for rnn with cudnn support if op.type in [ - "c_broadcast", "c_sync_comm_stream", - "coalesce_tensor" + "c_broadcast", + "c_sync_comm_stream", + "coalesce_tensor", ]: continue init_ops.append(op) @@ -3768,9 +3975,12 @@ def _is_inited_by(block, var): init_ops = _is_inited_by(global_block, param) init_ops_len = len(init_ops) if init_ops_len > 1: - raise RuntimeError("param " + param.name + - " is inited by multiple init ops " + - str(init_ops)) + raise RuntimeError( + "param " + + param.name + + " is inited by multiple init ops " + + str(init_ops) + ) elif init_ops_len == 1: # TODO already inited, do nothing, should log a warning pass @@ -3792,24 +4002,31 @@ def append_op(self, *args, **kwargs): warnings.warn( "Op `%s` is executed through `append_op` under the dynamic mode, " "the corresponding API implementation needs to be upgraded to " - "using `_C_ops` method." % type, DeprecationWarning) - op = Operator(block=self, - desc=None, - type=type, - inputs=None, - outputs=None, - attrs=attrs) + "using `_C_ops` method." % type, + DeprecationWarning, + ) + op = Operator( + block=self, + desc=None, + type=type, + inputs=None, + outputs=None, + attrs=attrs, + ) # record ops in tracer rather than blocks # # TODO(minqiyang): add op stop_gradient support in static mode too. # currently, we only support stop_gradient in dygraph mode. - _dygraph_tracer().trace_op(type, kwargs.get("inputs", {}), - kwargs.get("outputs", - {}), attrs if attrs else {}, - kwargs.get("stop_gradient", False), - inplace_map) + _dygraph_tracer().trace_op( + type, + kwargs.get("inputs", {}), + kwargs.get("outputs", {}), + attrs if attrs else {}, + kwargs.get("stop_gradient", False), + inplace_map, + ) else: from paddle.fluid.dygraph.base import param_guard @@ -3820,12 +4037,14 @@ def append_op(self, *args, **kwargs): inputs = kwargs.get("inputs", None) outputs = kwargs.get("outputs", None) with param_guard(inputs), param_guard(outputs): - op = Operator(block=self, - desc=op_desc, - type=kwargs.get("type", None), - inputs=inputs, - outputs=outputs, - attrs=kwargs.get("attrs", None)) + op = Operator( + block=self, + desc=op_desc, + type=kwargs.get("type", None), + inputs=inputs, + outputs=outputs, + attrs=kwargs.get("attrs", None), + ) self.ops.append(op) @@ -3846,7 +4065,7 @@ def _insert_op(self, index, *args, **kwargs): def _insert_op_without_sync(self, index, *args, **kwargs): """ - Insert an Operator according to the giving arguments, + Insert an Operator according to the giving arguments, without sync_with_cpp to meke the compilation faster. 
Args: @@ -3892,25 +4111,27 @@ def _prepend_op(self, *args, **kwargs): if _non_static_mode(): type = kwargs.get("type", None) attrs = kwargs.get("attrs", {}) - op = Operator(self, - None, - type=type, - inputs=None, - outputs=None, - attrs=attrs) - - _dygraph_tracer().trace_op(type, kwargs.get("inputs", {}), - kwargs.get("outputs", {}), - attrs if attrs else {}, - kwargs.get("stop_gradient", False)) + op = Operator( + self, None, type=type, inputs=None, outputs=None, attrs=attrs + ) + + _dygraph_tracer().trace_op( + type, + kwargs.get("inputs", {}), + kwargs.get("outputs", {}), + attrs if attrs else {}, + kwargs.get("stop_gradient", False), + ) else: op_desc = self.desc._prepend_op() - op = Operator(self, - op_desc, - type=kwargs.get("type", None), - inputs=kwargs.get("inputs", None), - outputs=kwargs.get("outputs", None), - attrs=kwargs.get("attrs", None)) + op = Operator( + self, + op_desc, + type=kwargs.get("type", None), + inputs=kwargs.get("inputs", None), + outputs=kwargs.get("outputs", None), + attrs=kwargs.get("attrs", None), + ) self.ops.insert(0, op) return op @@ -3927,17 +4148,21 @@ def _sync_with_cpp(self): if var.has_stop_gradient(): is_stop_gradient = var.stop_gradient() if var.has_is_parameter() and var.is_parameter(): - self.create_parameter(name=var.name(), - desc=var, - type=var.type(), - shape=var.shape(), - dtype=var.dtype(), - stop_gradient=is_stop_gradient) + self.create_parameter( + name=var.name(), + desc=var, + type=var.type(), + shape=var.shape(), + dtype=var.dtype(), + stop_gradient=is_stop_gradient, + ) else: - self.create_var(name=var.name(), - desc=var, - type=var.type(), - stop_gradient=is_stop_gradient) + self.create_var( + name=var.name(), + desc=var, + type=var.type(), + stop_gradient=is_stop_gradient, + ) # sync variables removed from c++ end for var in list(self.vars.keys()): @@ -3983,9 +4208,12 @@ def _sync_with_cpp(self): ops_in_cpp_index = 0 ops_in_python_index = 0 while ops_in_python_index < len( - self.ops) and ops_in_cpp_index < len(ops_in_cpp): - if self.ops[ops_in_python_index].desc != ops_in_cpp[ - ops_in_cpp_index]: + self.ops + ) and ops_in_cpp_index < len(ops_in_cpp): + if ( + self.ops[ops_in_python_index].desc + != ops_in_cpp[ops_in_cpp_index] + ): del self.ops[ops_in_python_index] else: ops_in_cpp_index += 1 @@ -4011,7 +4239,8 @@ def _copy_param_info_from(self, other): """ if not isinstance(other, Block): raise TypeError( - "_copy_param_info_from should be invoked with Block") + "_copy_param_info_from should be invoked with Block" + ) for p in other.iter_parameters(): assert isinstance(p, Parameter) v = self.vars.get(p.name, None) @@ -4021,28 +4250,32 @@ def _copy_param_info_from(self, other): assert isinstance(v, Variable) new_p = None if in_dygraph_mode(): - new_p = EagerParamBase(shape=v.shape, - dtype=v.dtype, - type=v.type, - lod_level=v.lod_level, - stop_gradient=p.stop_gradient, - trainable=p.trainable, - optimize_attr=p.optimize_attr, - regularizer=p.regularizer, - error_clip=p.error_clip, - name=v.name) + new_p = EagerParamBase( + shape=v.shape, + dtype=v.dtype, + type=v.type, + lod_level=v.lod_level, + stop_gradient=p.stop_gradient, + trainable=p.trainable, + optimize_attr=p.optimize_attr, + regularizer=p.regularizer, + error_clip=p.error_clip, + name=v.name, + ) else: if _in_legacy_dygraph(): - new_p = ParamBase(shape=v.shape, - dtype=v.dtype, - type=v.type, - lod_level=v.lod_level, - stop_gradient=p.stop_gradient, - trainable=p.trainable, - optimize_attr=p.optimize_attr, - regularizer=p.regularizer, - error_clip=p.error_clip, - 
name=v.name) + new_p = ParamBase( + shape=v.shape, + dtype=v.dtype, + type=v.type, + lod_level=v.lod_level, + stop_gradient=p.stop_gradient, + trainable=p.trainable, + optimize_attr=p.optimize_attr, + regularizer=p.regularizer, + error_clip=p.error_clip, + name=v.name, + ) else: new_p = Parameter( block=self, @@ -4050,13 +4283,15 @@ def _copy_param_info_from(self, other): dtype=v.dtype, type=v.type, lod_level=v.lod_level - if v.type == core.VarDesc.VarType.LOD_TENSOR else None, + if v.type == core.VarDesc.VarType.LOD_TENSOR + else None, stop_gradient=p.stop_gradient, trainable=p.trainable, optimize_attr=p.optimize_attr, regularizer=p.regularizer, error_clip=p.error_clip, - name=v.name) + name=v.name, + ) self.vars[new_p.name] = new_p def _clone_variable(self, var, force_persistable=True): @@ -4076,13 +4311,13 @@ def _clone_variable(self, var, force_persistable=True): ret_var = None # make STEP_SCOPES var can be safely cloned. if var.type == core.VarDesc.VarType.STEP_SCOPES: - ret_var = self.create_var(name=var.name, - persistable=var.persistable, - type=var.type) + ret_var = self.create_var( + name=var.name, persistable=var.persistable, type=var.type + ) elif var.type == core.VarDesc.VarType.RAW: - ret_var = self.create_var(name=var.name, - persistable=var.persistable, - type=var.type) + ret_var = self.create_var( + name=var.name, persistable=var.persistable, type=var.type + ) elif var.type == core.VarDesc.VarType.SELECTED_ROWS: ret_var = self.create_var( name=var.name, @@ -4091,7 +4326,8 @@ def _clone_variable(self, var, force_persistable=True): type=var.type, persistable=True if force_persistable else var.persistable, is_data=var.is_data, - need_check_feed=var.desc.need_check_feed()) + need_check_feed=var.desc.need_check_feed(), + ) else: ret_var = self.create_var( name=var.name, @@ -4101,7 +4337,8 @@ def _clone_variable(self, var, force_persistable=True): lod_level=var.lod_level, persistable=True if force_persistable else var.persistable, is_data=var.is_data, - need_check_feed=var.desc.need_check_feed()) + need_check_feed=var.desc.need_check_feed(), + ) return ret_var @@ -4111,17 +4348,20 @@ def _clone_variable(self, var, force_persistable=True): # re-constructed inside this method. The underlying VarDesc(OpDesc) # of some old Python Variables(all old Python Operators) may have # been destructed. -def _apply_pass(main_program, - startup_program, - pass_name, - pass_attrs={}, - pass_attr_types={}): +def _apply_pass( + main_program, startup_program, pass_name, pass_attrs={}, pass_attr_types={} +): assert isinstance(pass_attrs, dict), "pass_attrs must be dict" assert isinstance(pass_attr_types, dict), "pass_attr_types must be dict" tmp_main_program = core.ProgramDesc(main_program.desc) tmp_startup_program = core.ProgramDesc(startup_program.desc) - attrs = core.apply_pass(tmp_main_program, tmp_startup_program, pass_name, - pass_attrs, pass_attr_types) + attrs = core.apply_pass( + tmp_main_program, + tmp_startup_program, + pass_name, + pass_attrs, + pass_attr_types, + ) main_program._rebuild_from_desc(tmp_main_program) startup_program._rebuild_from_desc(tmp_startup_program) return attrs @@ -4139,8 +4379,9 @@ def __init__(self, node): Args: node(core.Node): C++ Node. """ - assert isinstance(node, - core.Node), 'node must be the instance of core.Node.' + assert isinstance( + node, core.Node + ), 'node must be the instance of core.Node.' self.node = node def name(self): @@ -4316,8 +4557,9 @@ def __init__(self, node): Args: node(core.Node): C++ Node. 
""" - assert isinstance(node, core.Node) and node.is_var(), \ - 'node must be the instance of core.Node and it must be a variable node.' + assert ( + isinstance(node, core.Node) and node.is_var() + ), 'node must be the instance of core.Node and it must be a variable node.' super(IrVarNode, self).__init__(node) self.node = node @@ -4328,8 +4570,9 @@ def set_shape(self, shape): Args: shape(list): shape to be set. """ - assert self.node.var() is not None, \ - "The node variable description can not be None." + assert ( + self.node.var() is not None + ), "The node variable description can not be None." self.node.var().set_shape(shape) def persistable(self): @@ -4339,8 +4582,9 @@ def persistable(self): Returns: bool: indicate whether the variable is persistable. """ - assert self.node.var() is not None, \ - "The node variable description can not be None." + assert ( + self.node.var() is not None + ), "The node variable description can not be None." return self.node.var().persistable() def type(self): @@ -4350,8 +4594,9 @@ def type(self): Returns: core.VarDesc.VarType: the variable type. """ - assert self.node.var() is not None, \ - "The node variable description can not be None." + assert ( + self.node.var() is not None + ), "The node variable description can not be None." return self.node.var().type() def dtype(self): @@ -4361,8 +4606,9 @@ def dtype(self): Returns: core.VarDesc.VarType: the variable data type. """ - assert self.node.var() is not None, \ - "The node variable description can not be None." + assert ( + self.node.var() is not None + ), "The node variable description can not be None." return self.node.var().dtype() def shape(self): @@ -4372,8 +4618,9 @@ def shape(self): Returns: list: the variable shape. """ - assert self.node.var() is not None, \ - "The node variable description can not be None." + assert ( + self.node.var() is not None + ), "The node variable description can not be None." return self.node.var().shape() @property @@ -4409,8 +4656,9 @@ def __init__(self, node): Args: node(core.Node): C++ Node. """ - assert isinstance(node, core.Node) and node.is_op(), \ - 'node must be the instance of core.Node and it must be a operator node.' + assert ( + isinstance(node, core.Node) and node.is_op() + ), 'node must be the instance of core.Node and it must be a operator node.' super(IrOpNode, self).__init__(node) self.node = node @@ -4422,8 +4670,9 @@ def rename_input(self, old_input_name, new_input_name): old_input_name(str): the old input name. new_input_name(str): the new input name. """ - assert self.node.op() is not None, \ - "The node operator description can not be None." + assert ( + self.node.op() is not None + ), "The node operator description can not be None." self.node.op()._rename_input(old_input_name, new_input_name) def rename_output(self, old_output_name, new_output_name): @@ -4434,8 +4683,9 @@ def rename_output(self, old_output_name, new_output_name): old_output_name(str): the old output name. new_output_name(str): the new output name. """ - assert self.node.op() is not None, \ - "The node operator description can not be None." + assert ( + self.node.op() is not None + ), "The node operator description can not be None." self.node.op()._rename_output(old_output_name, new_output_name) def input(self, name): @@ -4448,8 +4698,9 @@ def input(self, name): Returns: list(str): the argument name list. """ - assert self.node.op() is not None, \ - "The node operator description can not be None." 
+ assert ( + self.node.op() is not None + ), "The node operator description can not be None." return self.node.op().input(name) def output(self, name): @@ -4462,8 +4713,9 @@ def output(self, name): Returns: list(str): the argument name list. """ - assert self.node.op() is not None, \ - "The node operator description can not be None." + assert ( + self.node.op() is not None + ), "The node operator description can not be None." return self.node.op().output(name) def set_type(self, new_type): @@ -4473,8 +4725,9 @@ def set_type(self, new_type): Args: new_type(str): new operator type to be set. """ - assert self.node.op() is not None, \ - "The node operator description can not be None." + assert ( + self.node.op() is not None + ), "The node operator description can not be None." return self.node.op().set_type(new_type) def set_attr(self, name, val): @@ -4491,8 +4744,9 @@ def _update_desc_attr(self, name, val): """ Update the value of the op desc's attribute by attribute's name. """ - assert self.node.op() is not None, \ - "The node operator description can not be None." + assert ( + self.node.op() is not None + ), "The node operator description can not be None." desc = self.node.op() if isinstance(val, Variable): desc.set_var_attr(name, val.desc) @@ -4502,8 +4756,9 @@ def _update_desc_attr(self, name, val): desc.set_block_attr(name, val.desc) elif isinstance(val, list) and val and _all_is_type(val, Block): desc.set_blocks_attr(name, [v.desc for v in val]) - elif isinstance(val, core.BlockDesc) or \ - isinstance(val, core.ProgramDesc): + elif isinstance(val, core.BlockDesc) or isinstance( + val, core.ProgramDesc + ): desc.set_serialized_attr(name, val.serialize_to_string()) else: desc._set_attr(name, val) @@ -4515,8 +4770,9 @@ def input_arg_names(self): Returns: list(str): input arguments' names of this op node. """ - assert self.node.op() is not None, \ - "The node operator description can not be None." + assert ( + self.node.op() is not None + ), "The node operator description can not be None." return self.node.op().input_arg_names() def output_arg_names(self): @@ -4526,8 +4782,9 @@ def output_arg_names(self): Returns: list(str): output arguments' names of this op node. """ - assert self.node.op() is not None, \ - "The node operator description can not be None." + assert ( + self.node.op() is not None + ), "The node operator description can not be None." return self.node.op().output_arg_names() @property @@ -4568,7 +4825,8 @@ def __init__(self, graph, for_test=False): for_test(bool): True for the test graph and false for the train graph. """ assert isinstance( - graph, core.Graph), 'graph must be the instance of core.Graph.' + graph, core.Graph + ), 'graph must be the instance of core.Graph.' 
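# A minimal usage sketch for the IrGraph wrapper whose constructor appears
# above, assuming static mode is enabled and the default main program has
# already been built; for_test=True asks for the inference-oriented view.
import paddle
from paddle.fluid import core
from paddle.fluid.framework import IrGraph

paddle.enable_static()
prog = paddle.static.default_main_program()
graph = IrGraph(core.Graph(prog.desc), for_test=True)
for var_node in graph.all_persistable_nodes():
    # IrVarNode wraps the underlying variable node and exposes
    # name()/shape()/dtype(), as shown in the methods above
    print(var_node.name(), var_node.shape())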
self.graph = graph self._for_test = for_test @@ -4609,8 +4867,11 @@ def all_persistable_nodes(self): """ persistable_nodes = set() for node in self.graph.nodes(): - if node.is_var() and node.var() is not None and node.var( - ).persistable(): + if ( + node.is_var() + and node.var() is not None + and node.var().persistable() + ): persistable_nodes.add(node) return {IrVarNode(p) for p in persistable_nodes} @@ -4717,13 +4978,15 @@ def create_op_node(self, op_type, attrs, inputs, outputs): for input_name, var_nodes in six.iteritems(inputs): if not isinstance(var_nodes, list): var_nodes = [var_nodes] - op_desc.set_input(input_name, - [var_node.name() for var_node in var_nodes]) + op_desc.set_input( + input_name, [var_node.name() for var_node in var_nodes] + ) for output_name, var_nodes in six.iteritems(outputs): if not isinstance(var_nodes, list): var_nodes = [var_nodes] - op_desc.set_output(output_name, - [var_node.name() for var_node in var_nodes]) + op_desc.set_output( + output_name, [var_node.name() for var_node in var_nodes] + ) return IrOpNode(self.graph.create_op_node(op_desc)) def create_op_node_from_desc(self, op_desc): @@ -4747,9 +5010,11 @@ def update_input_link(self, old_input_node, new_input_node, op_node): new_input_node(IrNode): the new input node of the giving op_node. op_node(IrOpNode): the operator node that is needed to update input's link. """ - assert old_input_node.node in self.graph.nodes() and new_input_node.node in \ - self.graph.nodes() and op_node.node in self.graph.nodes(), \ - 'The three arguments(old_input_node&new_input_node&op_node) must be in the graph nodes.' + assert ( + old_input_node.node in self.graph.nodes() + and new_input_node.node in self.graph.nodes() + and op_node.node in self.graph.nodes() + ), 'The three arguments(old_input_node&new_input_node&op_node) must be in the graph nodes.' old_input_node.remove_output(op_node) op_node.remove_input(old_input_node) new_input_node.append_output(op_node) @@ -4765,9 +5030,11 @@ def update_output_link(self, old_output_node, new_output_node, op_node): new_output_node(IrNode): the new output node of the giving op_node. op_node(IrOpNode): the operator node that is needed to update input's link. """ - assert old_output_node.node in self.graph.nodes() and new_output_node.node in \ - self.graph.nodes() and op_node.node in self.graph.nodes(), \ - 'The three arguments(old_output_node &new_output_node &op_node) must be in the graph nodes.' + assert ( + old_output_node.node in self.graph.nodes() + and new_output_node.node in self.graph.nodes() + and op_node.node in self.graph.nodes() + ), 'The three arguments(old_output_node &new_output_node &op_node) must be in the graph nodes.' old_output_node.remove_input(op_node) op_node.remove_output(old_output_node) new_output_node.append_input(op_node) @@ -4783,9 +5050,11 @@ def link_to(self, node_in, node_out): node_out(IrNode): the output node. """ assert node_in.node in self.graph.nodes(), ( - 'node_in(%s) must be in the graph nodes.' % node_in.node.name()) + 'node_in(%s) must be in the graph nodes.' % node_in.node.name() + ) assert node_out.node in self.graph.nodes(), ( - 'node_out(%s) must be in the graph nodes.' % node_out.node.name()) + 'node_out(%s) must be in the graph nodes.' 
% node_out.node.name() + ) node_in.append_output(node_out) node_out.append_input(node_in) @@ -4822,8 +5091,8 @@ def resolve_hazard(self): ] else: var_nodes[each_var_name].append( - self._find_node_by_name(node.outputs, - each_var_name)) + self._find_node_by_name(node.outputs, each_var_name) + ) self.graph.resolve_hazard(var_nodes) def has_circle(self): @@ -4885,13 +5154,15 @@ def draw(self, save_path, name, marked_nodes=None, remove_ctr_var=True): def _convert_to_pdf(dot_file_path): pdf_save_path = os.path.splitext(dot_file_path)[0] + '.pdf' - exited_code = subprocess.call('dot -Tpdf ' + dot_file_path + - ' -o ' + pdf_save_path, - shell=True) + exited_code = subprocess.call( + 'dot -Tpdf ' + dot_file_path + ' -o ' + pdf_save_path, + shell=True, + ) if exited_code != 0: print('The dot command is needed for creating pdf files.') - print('The {} is saved as the dot filetype.'.format( - dot_file_path)) + print( + 'The {} is saved as the dot filetype.'.format(dot_file_path) + ) remove_ctr_vars = set() if remove_ctr_var: @@ -4948,7 +5219,8 @@ def _find_node_by_name(self, nodes, node_name): if n.name() == node_name: target_node = n assert target_node is not None, ( - "Cannot find the target node (%s)in the giving set." % node_name) + "Cannot find the target node (%s)in the giving set." % node_name + ) return target_node def _update_desc_attr(self, desc, name, val): @@ -4963,8 +5235,9 @@ def _update_desc_attr(self, desc, name, val): desc.set_block_attr(name, val.desc) elif isinstance(val, list) and val and _all_is_type(val, Block): desc.set_blocks_attr(name, [v.desc for v in val]) - elif isinstance(val, core.BlockDesc) or \ - isinstance(val, core.ProgramDesc): + elif isinstance(val, core.BlockDesc) or isinstance( + val, core.ProgramDesc + ): desc.set_serialized_attr(name, val.serialize_to_string()) else: desc._set_attr(name, val) @@ -5066,7 +5339,8 @@ def __init__(self): # identifier for auto checkpoint self._auto_checkpoint_name = unique_name.generate( - "__auto_checkpoint_program__") + "__auto_checkpoint_program__" + ) # compiled program, i.e. 
Graph self._graph = None @@ -5086,7 +5360,7 @@ def get_var_desc_attr_or_none(var_desc, attr_name, allowed_types): all_new_vars = [] block_num = new_desc.num_blocks() for idx in range(block_num): - if (idx > (len(self.blocks) - 1)): + if idx > (len(self.blocks) - 1): self._create_block() new_block_desc = new_desc.block(idx) all_new_vars.append([]) @@ -5098,60 +5372,75 @@ def get_var_desc_attr_or_none(var_desc, attr_name, allowed_types): old_var = None kwargs = { - 'type': - new_var_desc.type(), - 'name': - new_var_desc.name(), - 'shape': - get_var_desc_attr_or_none(new_var_desc, "shape", [ - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.SELECTED_ROWS, - core.VarDesc.VarType.LOD_TENSOR_ARRAY, - ]), - 'dtype': - get_var_desc_attr_or_none(new_var_desc, "dtype", [ - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.SELECTED_ROWS, - core.VarDesc.VarType.LOD_TENSOR_ARRAY, - ]), - 'lod_level': - get_var_desc_attr_or_none(new_var_desc, "lod_level", [ - core.VarDesc.VarType.LOD_TENSOR, - core.VarDesc.VarType.LOD_TENSOR_ARRAY, - ]), - 'error_clip': - old_var.error_clip if old_var is not None else None, - 'stop_gradient': - old_var.stop_gradient if old_var is not None else False, - 'is_data': - old_var.is_data if old_var is not None else False, - 'need_check_feed': - new_var_desc.need_check_feed(), - 'belong_to_optimizer': - old_var.belong_to_optimizer - if old_var is not None else False, + 'type': new_var_desc.type(), + 'name': new_var_desc.name(), + 'shape': get_var_desc_attr_or_none( + new_var_desc, + "shape", + [ + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.SELECTED_ROWS, + core.VarDesc.VarType.LOD_TENSOR_ARRAY, + ], + ), + 'dtype': get_var_desc_attr_or_none( + new_var_desc, + "dtype", + [ + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.SELECTED_ROWS, + core.VarDesc.VarType.LOD_TENSOR_ARRAY, + ], + ), + 'lod_level': get_var_desc_attr_or_none( + new_var_desc, + "lod_level", + [ + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.LOD_TENSOR_ARRAY, + ], + ), + 'error_clip': old_var.error_clip + if old_var is not None + else None, + 'stop_gradient': old_var.stop_gradient + if old_var is not None + else False, + 'is_data': old_var.is_data + if old_var is not None + else False, + 'need_check_feed': new_var_desc.need_check_feed(), + 'belong_to_optimizer': old_var.belong_to_optimizer + if old_var is not None + else False, } if isinstance(old_var, Parameter): - kwargs.update({ - 'trainable': old_var.trainable, - 'optimize_attr': old_var.optimize_attr, - 'regularizer': old_var.regularizer, - 'do_model_average': old_var.do_model_average, - 'need_clip': old_var.need_clip, - 'is_distributed': old_var.is_distributed, - 'is_parameter': old_var.is_parameter, - }) - block_new_vars.append({ - 'class': Parameter, - 'kwargs': copy.deepcopy(kwargs), - }) + kwargs.update( + { + 'trainable': old_var.trainable, + 'optimize_attr': old_var.optimize_attr, + 'regularizer': old_var.regularizer, + 'do_model_average': old_var.do_model_average, + 'need_clip': old_var.need_clip, + 'is_distributed': old_var.is_distributed, + 'is_parameter': old_var.is_parameter, + } + ) + block_new_vars.append( + { + 'class': Parameter, + 'kwargs': copy.deepcopy(kwargs), + } + ) else: kwargs['persistable'] = new_var_desc.persistable() - block_new_vars.append({ - 'class': Variable, - 'kwargs': copy.deepcopy(kwargs), - }) + block_new_vars.append( + { + 'class': Variable, + 'kwargs': copy.deepcopy(kwargs), + } + ) return all_new_vars @@ -5383,7 +5672,8 @@ def _to_readable_code(self, 
skip_op_callstack=True): assert isinstance( skip_op_callstack, bool ), "skip_op_callstack parameter's type is error, expect bool, received {}".format( - type(skip_op_callstack)) + type(skip_op_callstack) + ) program_str = "" for block in self.blocks: program_str += block._to_readable_code(skip_op_callstack) @@ -5425,11 +5715,13 @@ def to_string(self, throw_on_error, with_details=False): assert isinstance( throw_on_error, bool ), "The type of throw_on_error parameter is wrong, expected bool, but received {}.".format( - type(throw_on_error)) + type(throw_on_error) + ) assert isinstance( with_details, bool ), "The type of with_details parameter is wrong, expected bool, but received {}.".format( - type(with_details)) + type(with_details) + ) if with_details: res_str = "" @@ -5438,7 +5730,8 @@ def to_string(self, throw_on_error, with_details=False): else: protostr = self.desc.serialize_to_string() proto = framework_pb2.ProgramDesc.FromString( - six.binary_type(protostr)) + six.binary_type(protostr) + ) res_str = _debug_string_(proto, throw_on_error) return res_str @@ -5458,8 +5751,8 @@ def _version(self): def clone(self, for_test=False): """ .. note::: - 1. :code:`Program.clone()` method DOES NOT clone :ref:`api_paddle_io_DataLoader` . - 2. Recommend you to use :code:`clone` before using :code:`Opimizer.minimize` . + 1. :code:`Program.clone()` method DOES NOT clone :ref:`api_paddle_io_DataLoader` . + 2. Recommend you to use :code:`clone` before using :code:`Opimizer.minimize` . 3. This API has no effect in Dygraph Mode. Create a new Program with forward content of original one when ``for_test=True``. @@ -5636,7 +5929,8 @@ def network(): if for_test: forward_prog = Program() forward_prog.desc, pruned_origin_block_id_map = core.prune_backward( - self.desc) + self.desc + ) forward_prog.blocks = [ Block(forward_prog, i) for i in six.moves.range(forward_prog.desc.num_blocks()) @@ -5687,8 +5981,8 @@ def _prune(self, targets): def _prune_with_input(self, feeded_var_names, targets): """ Prune operators and variables which are not needed to generate - :code:`targets`. Prune operators and variables which are needed - to generate feeded_var + :code:`targets`. Prune operators and variables which are needed + to generate feeded_var Notes: This is a very low level API. Users should not use this API directly. This API is in flux and not stable. @@ -5716,7 +6010,8 @@ def _prune_with_input(self, feeded_var_names, targets): if not isinstance(var, six.string_types): raise ValueError( "All feeded_var_names of Program._prune_with_input() can only be " - "str, but received %s." % type(var)) + "str, but received %s." % type(var) + ) # find out all variables that can be generated or updated with given feed generatable_vars = set() @@ -5744,7 +6039,8 @@ def _prune_with_input(self, feeded_var_names, targets): else: raise ValueError( "All targets of Program._prune_with_input() can only be " - "Variable or Operator, but received %s." % type(t)) + "Variable or Operator, but received %s." 
% type(t) + ) # NOTEZ(zhiqiu): For variable to be fed in fetch_list, there two cases: # (1) the variable is leaf, it has no op that generates it; @@ -5778,7 +6074,8 @@ def _prune_with_input(self, feeded_var_names, targets): res = Program() res.desc, pruned_origin_block_id_map = core.prune( - self.desc, set(feeded_var_names), targets_idx) + self.desc, set(feeded_var_names), targets_idx + ) res.blocks = [ Block(res, i) for i in six.moves.range(res.desc.num_blocks()) ] @@ -5819,8 +6116,10 @@ def _inference_optimize(self, prune_read_op=True): root_block = res.desc.block(0) if prune_read_op: while True: - if read_op_idx >= root_block.op_size() or root_block.op( - read_op_idx).type() == 'read': + if ( + read_op_idx >= root_block.op_size() + or root_block.op(read_op_idx).type() == 'read' + ): break read_op_idx += 1 if read_op_idx < root_block.op_size(): @@ -5835,7 +6134,7 @@ def _inference_optimize(self, prune_read_op=True): for j in six.moves.range(block.op_size()): op = block.op(j) if op.has_attr('is_test'): - op._set_attr('is_test', True) + op._set_bool_attr('is_test', True) if op.type() == "batch_norm": # Remove the output ReserveSpace of batch_norm if exists. op.remove_output("ReserveSpace") @@ -5867,9 +6166,7 @@ def _remove_training_info(self, clip_extra=True): # Note: The op_role and op_role_var cann't be deleted currently, # and we will try to remove them in the future. - common_clipped_attrs_list = [ - 'op_namescope', 'op_callstack', 'op_device', 'with_quant_attr' - ] + common_clipped_attrs_list = ['op_callstack', 'with_quant_attr'] for i in six.moves.range(res.desc.num_blocks()): block = res.desc.block(i) @@ -5898,8 +6195,9 @@ def _remove_training_info(self, clip_extra=True): break if not find: remove_input_list.append(name) - for name in remove_input_list: - op.remove_input(name) + # The extra input of op will be removed in the future + # for name in remove_input_list: + # op.remove_input(name) remove_output_list = [] for name in op.output_names(): @@ -5913,19 +6211,30 @@ def _remove_training_info(self, clip_extra=True): break if not find: remove_output_list.append(name) - for name in remove_output_list: - op.remove_output(name) + # The extra output of op will be removed in the future + # for name in remove_output_list: + # op.remove_output(name) - remove_attr_list = [] - op_quant_name = core.op_proto_and_checker_maker.kOpWithQuantAttrName( + op_quant_name = ( + core.op_proto_and_checker_maker.kOpWithQuantAttrName() + ) + quant = ( + bool(op.attr(op_quant_name)) + if op_quant_name in op.attr_names() + else False ) - quant = bool(op.attr(op_quant_name) - ) if op_quant_name in op.attr_names() else False quant_attrs = [ - op_quant_name, "quantization_type", "skip_quant", - "activation_bits", "bit_length", "quantize_weight_bits", - "weight_quant_scale" + op_quant_name, + "quantization_type", + "skip_quant", + "activation_bits", + "bit_length", + "quantize_weight_bits", + "weight_quant_scale", ] + for extra_attr_name in extra_attrs_map.keys(): + op.remove_attr(extra_attr_name) + remove_attr_list = [] for name in op.attr_names(): if quant: if name in quant_attrs: @@ -5933,15 +6242,13 @@ def _remove_training_info(self, clip_extra=True): if name.endswith("_threshold"): continue if len(extra_attrs_map) > 0: - if name in extra_attrs_map or name in common_clipped_attrs_list: + if name in common_clipped_attrs_list: op.remove_attr(name) continue find = False for attr_proto in proto.attrs: if attr_proto.name != name: continue - if attr_proto.extra: - remove_attr_list.append(name) find = True break 
if not find: @@ -5954,7 +6261,7 @@ def _remove_training_info(self, clip_extra=True): def parse_from_string(binary_str): """ .. note:: - 1. All information about parameters will be lost after serialization; + 1. All information about parameters will be lost after serialization; 2. This API has no effect in Dygraph mode. Deserialize a Program from `protobuf `_ binary string. @@ -6019,7 +6326,7 @@ def random_seed(self): The default random seed for random operators in Program. ``0`` means get the random seed from random device. - .. note:: + .. note:: It must be set before the operators have been added. Returns: @@ -6057,7 +6364,7 @@ def num_blocks(self): """ The number of :ref:`api_guide_Block_en` in this Program. - .. note:: + .. note:: This API has no effect in Dygraph mode. Returns: @@ -6086,7 +6393,8 @@ def random_seed(self, seed): if not isinstance(seed, int): raise ValueError( "Program.random_seed's input seed must be an integer, but received %s." - % type(seed)) + % type(seed) + ) self._seed = seed def __repr__(self): @@ -6183,8 +6491,11 @@ def _create_block(self, parent_idx=None): Block: The new block. """ new_block_idx = len(self.blocks) - parent = self.current_block() if parent_idx is None else self.block( - parent_idx) + parent = ( + self.current_block() + if parent_idx is None + else self.block(parent_idx) + ) self.desc.append_block(parent.desc) self.current_block_idx = new_block_idx self.blocks.append(Block(self, self.current_block_idx)) @@ -6230,7 +6541,8 @@ def _copy_param_info_from(self, other): if not isinstance(other, Program): raise TypeError( "Function Program._copy_param_info_from() needs to pass in a source Program, but received %s" - % type(other)) + % type(other) + ) self.global_block()._copy_param_info_from(other.global_block()) @@ -6247,7 +6559,8 @@ def _copy_dist_param_info_from(self, other): if not isinstance(other, Program): raise TypeError( "Function Program._copy_param_info_from() needs to pass in a source Program, but received %s" - % type(other)) + % type(other) + ) self._is_distributed = other._is_distributed self._is_chief = other._is_chief self._parameters_on_pservers = other._parameters_on_pservers @@ -6265,8 +6578,8 @@ def _copy_data_info_from(self, other, pruned_origin_block_id_map=None): Args: other(Program): Other program pruned_origin_block_id_map(dict{int:int}): A dict which maps the block id in program - self to the block id in program other. For example, {0:0, 1:1, 2:3} means block 0 in self is - cloned from block 0 in other, etc. Default is None, which means default mapped, + self to the block id in program other. For example, {0:0, 1:1, 2:3} means block 0 in self is + cloned from block 0 in other, etc. Default is None, which means default mapped, {0:0, 1:1,..., n:n}. Returns: @@ -6275,12 +6588,12 @@ def _copy_data_info_from(self, other, pruned_origin_block_id_map=None): if not isinstance(other, Program): raise TypeError( "Function Program._copy_param_info_from() needs to pass in a source Program, but received %s" - % type(other)) + % type(other) + ) if not pruned_origin_block_id_map: pruned_origin_block_id_map = { - i: i - for i in six.moves.range(self.desc.num_blocks()) + i: i for i in six.moves.range(self.desc.num_blocks()) } # NOTE(zhiqiu): All vars in cloned program exist in original program. @@ -6372,12 +6685,12 @@ def state_dict(self, mode='all', scope=None): This function MUST called after run start_up_program Args: - mode(str, optional): Source of the obtained parameters and buffers. 
- 'opt' : The return value only contains the variable in the optimizer. - 'param' : The return value only contains the variable in the network, not the variable in the optimizer. + mode(str, optional): Source of the obtained parameters and buffers. + 'opt' : The return value only contains the variable in the optimizer. + 'param' : The return value only contains the variable in the network, not the variable in the optimizer. 'all' : The return value contains the variable in the network and optimizer. Default: 'all' - scope(Scope, optional) : If scope is None, state_dict will be set to global scope + scope(Scope, optional) : If scope is None, state_dict will be set to global scope obtained through 'paddle.static.global_scope()'. Otherwise, value will be set to scope. Default: None @@ -6408,10 +6721,13 @@ def state_dict(self, mode='all', scope=None): # can not be imported at the begainning of this file. # Therefore, the above two modules are dynamically imported. from .executor import global_scope + if scope is not None and not isinstance(scope, core._Scope): raise TypeError( - "`scope` should be None or `paddle.static.Scope'` type, but received {}." - .format(type(scope))) + "`scope` should be None or `paddle.static.Scope'` type, but received {}.".format( + type(scope) + ) + ) if scope is None: scope = global_scope() @@ -6419,15 +6735,19 @@ def state_dict(self, mode='all', scope=None): if not isinstance(mode, str): raise TypeError( "Type of `mode` should be string, but received {}.".format( - type(mode))) + type(mode) + ) + ) def is_parameter(var): return isinstance(var, Parameter) def is_persistable(var): - if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var.desc.type() == core.VarDesc.VarType.READER: + if ( + var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH + or var.desc.type() == core.VarDesc.VarType.FETCH_LIST + or var.desc.type() == core.VarDesc.VarType.READER + ): return False return var.persistable @@ -6446,8 +6766,10 @@ def condition(var): return is_parameter(var) or is_belong_to_optimizer(var) else: raise ValueError( - "`mode` string should be 'param', 'opt' or 'all', but received {}." - .format(mode)) + "`mode` string should be 'param', 'opt' or 'all', but received {}.".format( + mode + ) + ) var_list = filter(condition, self.list_vars()) @@ -6456,28 +6778,30 @@ def condition(var): var_temp = scope.find_var(var.name) if var_temp is None: raise ValueError( - "Can not find Variable '{}' in the scope. Make sure it is initialized" - .format(var.name)) + "Can not find Variable '{}' in the scope. Make sure it is initialized".format( + var.name + ) + ) state_dict[var.name] = var_temp.get_tensor() return state_dict def set_state_dict(self, state_dict, scope=None): """ - Set parameters and persistable buffers in state_dict to program. + Set parameters and persistable buffers in state_dict to program. An exception will throw if shape or dtype of the parameters is not match. - + .. note:: This function MUST called after run start_up_program Args: - state_dict(dict): the dict store parameters and persistable buffers. + state_dict(dict): the dict store parameters and persistable buffers. The key is the name of the parameter or the name of the buffer. The value is the tensor of this variable in the given scope. 
- scope(Scope, optional) : If scope is None, state_dict will be set to global scope + scope(Scope, optional) : If scope is None, state_dict will be set to global scope obtained through 'paddle.static.global_scope()'. Otherwise, value will be set to scope. Default: None - + Returns: None @@ -6507,10 +6831,14 @@ def set_state_dict(self, state_dict, scope=None): if not isinstance(state_dict, dict): raise TypeError( "Type of `state_dict` should be dict, but received {}.".format( - type(state_dict))) + type(state_dict) + ) + ) vars_dict = {var.name: var for var in self.list_vars()} - condition = True if 'StructuredToParameterName@@' in state_dict else False + condition = ( + True if 'StructuredToParameterName@@' in state_dict else False + ) for name, value in state_dict.items(): if condition: if name == "StructuredToParameterName@@": @@ -6522,14 +6850,20 @@ def set_state_dict(self, state_dict, scope=None): vars_dict[name].set_value(value, scope) except ValueError as err: warnings.warn( - ("Skip loading for '{}'. ".format(name) + str(err))) + ("Skip loading for '{}'. ".format(name) + str(err)) + ) except TypeError as err: warnings.warn( - ("Skip loading for '{}'. ".format(name) + str(err))) + ("Skip loading for '{}'. ".format(name) + str(err)) + ) else: warnings.warn( - ("Skip loading for '{0}'. Because '{0}' not in the program." - .format(name))) + ( + "Skip loading for '{0}'. Because '{0}' not in the program.".format( + name + ) + ) + ) @six.add_metaclass(ParameterMetaClass) @@ -6553,16 +6887,18 @@ class Parameter(Variable): be applied on the parameter. Default: None do_model_average(bool): True if the model average strategy will be applied on this parameter. - need_clip (bool): Whether the parameter gradient need to be cliped + need_clip (bool): Whether the parameter gradient need to be cliped in optimizer. Default is True. 
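The attributes listed for ``Parameter`` (trainable, regularizer, need_clip, ...) are normally supplied through ``paddle.ParamAttr`` when a parameter is created; a minimal static-graph sketch, with the parameter name chosen purely for illustration:

    import paddle
    import paddle.static as static

    paddle.enable_static()
    w_attr = paddle.ParamAttr(
        name='fc_w',                                   # illustrative name
        learning_rate=1.0,
        regularizer=paddle.regularizer.L2Decay(1e-4),
        trainable=True,
        need_clip=True,
    )
    w = static.create_parameter(shape=[10, 10], dtype='float32', attr=w_attr)
    print(w.trainable, w.optimize_attr, w.need_clip)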
""" - def __init__(self, - block, - shape, - dtype, - type=core.VarDesc.VarType.LOD_TENSOR, - **kwargs): + def __init__( + self, + block, + shape, + dtype, + type=core.VarDesc.VarType.LOD_TENSOR, + **kwargs + ): if shape is None: raise ValueError("The shape of Parameter should not be None") if dtype is None: @@ -6570,21 +6906,25 @@ def __init__(self, if len(shape) == 0: raise ValueError( - "The dimensions of shape for Parameter must be greater than 0") + "The dimensions of shape for Parameter must be greater than 0" + ) for each in shape: if each < 0: raise ValueError( "Each dimension of shape for Parameter must be greater than 0, but received %s" - % list(shape)) - - Variable.__init__(self, - block, - persistable=True, - shape=shape, - dtype=dtype, - type=type, - **kwargs) + % list(shape) + ) + + Variable.__init__( + self, + block, + persistable=True, + shape=shape, + dtype=dtype, + type=type, + **kwargs + ) self.trainable = kwargs.get('trainable', True) self.optimize_attr = kwargs.get('optimize_attr', {'learning_rate': 1.0}) @@ -6625,14 +6965,22 @@ def to_string(self, throw_on_error, with_details=False): print(debug_str) """ assert isinstance(throw_on_error, bool) and isinstance( - with_details, bool) + with_details, bool + ) if with_details: res_str = Variable.to_string(self, throw_on_error, True) - additional_attr = ("trainable", "optimize_attr", "regularizer", - "do_model_average", "need_clip") + additional_attr = ( + "trainable", + "optimize_attr", + "regularizer", + "do_model_average", + "need_clip", + ) for attr_name in additional_attr: - res_str += "%s: %s\n" % (attr_name, - cpt.to_text(getattr(self, attr_name))) + res_str += "%s: %s\n" % ( + attr_name, + cpt.to_text(getattr(self, attr_name)), + ) else: res_str = Variable.to_string(self, throw_on_error, False) return res_str @@ -6642,8 +6990,8 @@ def to_string(self, throw_on_error, with_details=False): class ParamBase(core.VarBase): """ - ParamBase is derived from Tensor( Which is the concept in Dygraph Mode). - A ParamBase is a persistable Tensor, and will be updated by optimizers + ParamBase is derived from Tensor( Which is the concept in Dygraph Mode). + A ParamBase is a persistable Tensor, and will be updated by optimizers after each iteration. The training of a neural network is essentially the updating of its ParamBase. @@ -6661,7 +7009,7 @@ class ParamBase(core.VarBase): be applied on the ParamBase. Default: None do_model_average(bool): True if the model average strategy will be applied on this ParamBase. - need_clip (bool): Whether the parameter gradient need to be cliped + need_clip (bool): Whether the parameter gradient need to be cliped in optimizer. Default is True. 
""" @@ -6674,13 +7022,15 @@ def __init__(self, shape, dtype, **kwargs): if len(shape) == 0: raise ValueError( - "The dimensions of shape for Parameter must be greater than 0") + "The dimensions of shape for Parameter must be greater than 0" + ) for each in shape: if each < 0: raise ValueError( "Each dimension of shape for Parameter must be greater than 0, but received %s" - % list(shape)) + % list(shape) + ) if dtype is not None: if not isinstance(dtype, core.VarDesc.VarType): @@ -6688,10 +7038,13 @@ def __init__(self, shape, dtype, **kwargs): name = kwargs.get('name', unique_name.generate('_param_base')) - super(ParamBase, - self).__init__(dtype if dtype else core.VarDesc.VarType.FP32, - list(shape) if shape else [], name, - core.VarDesc.VarType.LOD_TENSOR, True) + super(ParamBase, self).__init__( + dtype if dtype else core.VarDesc.VarType.FP32, + list(shape) if shape else [], + name, + core.VarDesc.VarType.LOD_TENSOR, + True, + ) trainable = kwargs.get('trainable', True) self.stop_gradient = not trainable @@ -6718,7 +7071,8 @@ def trainable(self, trainable): else: raise ValueError( "The type of trainable MUST be bool, but the type is ", - type(trainable)) + type(trainable), + ) def __str__(self): """ @@ -6739,7 +7093,8 @@ def __str__(self): # [-0.54217887, 0.48439729, 0.34082305]]) """ return "Parameter containing:\n{tensor}".format( - tensor=super(ParamBase, self).__str__()) + tensor=super(ParamBase, self).__str__() + ) def __deepcopy__(self, memo): """ @@ -6788,8 +7143,8 @@ def _copy_to(self, device, blocking): class EagerParamBase(_core_eager_eagertensor): """ - EagerParamBase is derived from Tensor( Which is the concept in Eager-Dygraph Mode). - A EagerParamBase is a persistable Tensor, and will be updated by optimizers + EagerParamBase is derived from Tensor( Which is the concept in Eager-Dygraph Mode). + A EagerParamBase is a persistable Tensor, and will be updated by optimizers after each iteration. The training of a neural network is essentially the updating of its EagerParamBase. @@ -6807,7 +7162,7 @@ class EagerParamBase(_core_eager_eagertensor): be applied on the EagerParamBase. Default: None do_model_average(bool): True if the model average strategy will be applied on this EagerParamBase. - need_clip (bool): Whether the parameter gradient need to be cliped + need_clip (bool): Whether the parameter gradient need to be cliped in optimizer. Default is True. 
""" @@ -6820,13 +7175,15 @@ def __init__(self, shape, dtype, **kwargs): if len(shape) == 0: raise ValueError( - "The dimensions of shape for Parameter must be greater than 0") + "The dimensions of shape for Parameter must be greater than 0" + ) for each in shape: if each < 0: raise ValueError( "Each dimension of shape for Parameter must be greater than 0, but received %s" - % list(shape)) + % list(shape) + ) if dtype is not None: if not isinstance(dtype, core.VarDesc.VarType): @@ -6837,10 +7194,13 @@ def __init__(self, shape, dtype, **kwargs): if isinstance(shape, core.eager.Tensor): shape = shape.numpy() - super(EagerParamBase, - self).__init__(dtype if dtype else core.VarDesc.VarType.FP32, - list(shape) if shape else [], name, - core.VarDesc.VarType.LOD_TENSOR, True) + super(EagerParamBase, self).__init__( + dtype if dtype else core.VarDesc.VarType.FP32, + list(shape) if shape else [], + name, + core.VarDesc.VarType.LOD_TENSOR, + True, + ) self.retain_grads() trainable = kwargs.get('trainable', True) @@ -6864,7 +7224,9 @@ def set_init_func(self, obj): @dygraph_only def initialize(self): - assert self._init_func is not None, "Required self._init_func is not None, but received None." + assert ( + self._init_func is not None + ), "Required self._init_func is not None, but received None." self._init_func() # clear function handle to release resource self._init_func = None @@ -6880,13 +7242,16 @@ def trainable(self, trainable): else: raise ValueError( "The type of trainable MUST be bool, but the type is ", - type(trainable)) + type(trainable), + ) def _create_init_op(self, block): """ Call init_op_creator function to create initializer operation in block. """ - assert self._init_op_creator is not None, "Required self._init_op_creator is not None, but received None." + assert ( + self._init_op_creator is not None + ), "Required self._init_op_creator is not None, but received None." self._init_op_creator(block) def __str__(self): @@ -6908,7 +7273,8 @@ def __str__(self): # [-0.54217887, 0.48439729, 0.34082305]]) """ return "Parameter containing:\n{tensor}".format( - tensor=super(EagerParamBase, self).__str__()) + tensor=super(EagerParamBase, self).__str__() + ) def __deepcopy__(self, memo): """ @@ -6960,7 +7326,7 @@ def default_startup_program(): Get default/global startup program. The :code:`paddle.nn` function will append the initialization operators into startup program. - The :code:`startup_program` will initialize the parameters by the OPs. + The :code:`startup_program` will initialize the parameters by the OPs. This method will return the default or the current startup program. Users can use :ref:`api_paddle_fluid_framework_program_guard` to switch :ref:`api_paddle_fluid_framework_Program` . @@ -6968,7 +7334,7 @@ def default_startup_program(): Returns: Program: current default startup program. - Returns type: + Returns type: Examples: .. code-block:: python @@ -6986,13 +7352,13 @@ def default_startup_program(): def default_main_program(): """ - This API can be used to get ``default main program`` which store the + This API can be used to get ``default main program`` which store the descriptions of Ops and tensors. - For example ``z = paddle.add(x, y)`` will create a new ``add`` - Op and a new ``z`` tensor, and they will be recorded in ``default main program`` . + For example ``z = paddle.add(x, y)`` will create a new ``add`` + Op and a new ``z`` tensor, and they will be recorded in ``default main program`` . 
- The ``default main program`` is the default value for ``Program`` parameter in + The ``default main program`` is the default value for ``Program`` parameter in a lot of APIs. For example, the :code:`Executor.run()` will execute the :code:`default_main_program` when the program is not specified. @@ -7062,8 +7428,8 @@ def program_guard(main_program, startup_program=None): Args: main_program(Program): New main program inside ``with`` statement. - startup_program(Program, optional): New startup program inside ``with`` - statement. :code:`None` means not changing startup program, + startup_program(Program, optional): New startup program inside ``with`` + statement. :code:`None` means not changing startup program, default_startup_program is still used. Default: None. @@ -7095,12 +7461,18 @@ def program_guard(main_program, startup_program=None): """ from .data_feeder import check_type - check_type(main_program, 'main_program', Program, - 'paddle.static.program_guard') + + check_type( + main_program, 'main_program', Program, 'paddle.static.program_guard' + ) main_program = switch_main_program(main_program) if startup_program is not None: - check_type(startup_program, 'startup_program', Program, - 'paddle.static.program_guard') + check_type( + startup_program, + 'startup_program', + Program, + 'paddle.static.program_guard', + ) # Tag the program __is_start_up as True startup_program._is_start_up_program_ = True startup_program = switch_startup_program(startup_program) @@ -7170,7 +7542,7 @@ def switch_device(device): @signature_safe_contextmanager def device_guard(device=None): """ - + Note: The API only supports static mode. @@ -7178,7 +7550,7 @@ def device_guard(device=None): Args: device(str|None): Specify the device to use in the context. It should be ``cpu``, - ``gpu`` or ``gpu:x``, where ``x`` is the index of the GPUs. + ``gpu`` or ``gpu:x``, where ``x`` is the index of the GPUs. When it is set to 'cpu' or 'gpu', all OPs created in the context will be placed on CPUPlace or CUDAPlace. When 'gpu' is set and the program runs on single-card, the device index will be the same as the device on which the @@ -7186,9 +7558,9 @@ def device_guard(device=None): assigned devices. Examples: - + .. code-block:: python - + # required: gpu import paddle @@ -7223,7 +7595,8 @@ def device_guard(device=None): if device not in ['cpu', 'gpu', 'npu', 'xpu', 'mlu', '', None]: raise ValueError( "The Attr(device) should be 'cpu' 'npu' 'xpu' 'mlu' or 'gpu', and it can also be empty string or None " - "when there is no need to specify device. But received %s" % device) + "when there is no need to specify device. But received %s" % device + ) if index: device = ":".join([device, index]) pre_device = switch_device(device) @@ -7253,9 +7626,11 @@ def _cuda_graph_guard(cuda_graph_attr=None): cuda_graph_attr(str|None): The cuda graph attr with the format of: cuda_graph_capture_mode;memory_pool_id;cuda_graph_id """ - assert not _non_static_mode( + assert ( + not _non_static_mode() ), "cuda_graph_guard only works under static mode" - assert core.is_compiled_with_cuda( + assert ( + core.is_compiled_with_cuda() ), "cuda_graph_guard context can be only used when Paddle is compiled with cuda" pre_mode = _switch_cuda_graph_mode(cuda_graph_attr) try: @@ -7285,7 +7660,8 @@ def set_flags(flags): _global_flags()[key] = value else: raise ValueError( - "Flag %s cannot set its value through this function." % (key)) + "Flag %s cannot set its value through this function." 
% (key) + ) def get_flags(flags): @@ -7312,22 +7688,24 @@ def get_flags(flags): flags_value = {} if isinstance(flags, (list, tuple)): for key in flags: - if (_global_flags().is_public(key)): + if _global_flags().is_public(key): value = _global_flags()[key] temp = {key: value} flags_value.update(temp) else: raise ValueError( - 'Flag %s cannot get its value through this function.' % - (key)) + 'Flag %s cannot get its value through this function.' + % (key) + ) elif isinstance(flags, str): - if (_global_flags().is_public(flags)): + if _global_flags().is_public(flags): value = _global_flags()[flags] temp = {flags: value} flags_value.update(temp) else: raise ValueError( - 'Flag %s cannot get its value through this function.' % (flags)) + 'Flag %s cannot get its value through this function.' % (flags) + ) else: raise TypeError('Flags in get_flags should be a list, tuple or string.') return flags_value @@ -7337,20 +7715,32 @@ def _get_paddle_place(place): "convert the string to paddle Place" if place is None: return place - if isinstance(place, (core.Place, core.XPUPlace, core.CPUPlace, - core.CUDAPinnedPlace, core.CUDAPlace, core.NPUPlace, - core.IPUPlace, core.MLUPlace, core.CustomPlace)): + if isinstance( + place, + ( + core.Place, + core.XPUPlace, + core.CPUPlace, + core.CUDAPinnedPlace, + core.CUDAPlace, + core.NPUPlace, + core.IPUPlace, + core.MLUPlace, + core.CustomPlace, + ), + ): return place if not isinstance(place, str): raise ValueError( - "place only support string which is 'Place' and so on.") + "place only support string which is 'Place' and so on." + ) place = place.lower() - if (place == "cpu"): + if place == "cpu": return core.CPUPlace() - if (place == "device"): + if place == "device": return core.Place() # GPU @@ -7358,8 +7748,9 @@ def _get_paddle_place(place): if place == "gpu_pinned" or place == "gpu" or avaliable_gpu_place: if not core.is_compiled_with_cuda(): raise ValueError( - "The device should not be {}, since PaddlePaddle is " \ - "not compiled with CUDA".format(avaliable_gpu_place)) + "The device should not be {}, since PaddlePaddle is " + "not compiled with CUDA".format(avaliable_gpu_place) + ) if place == "gpu_pinned": return core.CUDAPinnedPlace() elif place == "gpu": @@ -7375,8 +7766,9 @@ def _get_paddle_place(place): if avaliable_xpu_place: if not core.is_compiled_with_xpu(): raise ValueError( - "The device should not be {}, since PaddlePaddle is " \ - "not compiled with XPU".format(avaliable_xpu_place)) + "The device should not be {}, since PaddlePaddle is " + "not compiled with XPU".format(avaliable_xpu_place) + ) place_info_list = place.split(':', 1) device_id = place_info_list[1] device_id = int(device_id) @@ -7387,8 +7779,9 @@ def _get_paddle_place(place): if avaliable_npu_place: if not core.is_compiled_with_npu(): raise ValueError( - "The device should not be {}, since PaddlePaddle is " \ - "not compiled with NPU".format(avaliable_npu_place)) + "The device should not be {}, since PaddlePaddle is " + "not compiled with NPU".format(avaliable_npu_place) + ) place_info_list = place.split(':', 1) device_id = place_info_list[1] device_id = int(device_id) @@ -7399,8 +7792,9 @@ def _get_paddle_place(place): if avaliable_ipu_place: if not core.is_compiled_with_ipu(): raise ValueError( - "The device should not be {}, since PaddlePaddle is " \ - "not compiled with IPU".format(avaliable_ipu_place)) + "The device should not be {}, since PaddlePaddle is " + "not compiled with IPU".format(avaliable_ipu_place) + ) place_info_list = place.split(':', 1) device_id = 
place_info_list[1] device_id = int(device_id) @@ -7411,16 +7805,19 @@ def _get_paddle_place(place): if avaliable_mlu_place: if not core.is_compiled_with_mlu(): raise ValueError( - "The device should not be {}, since PaddlePaddle is " \ - "not compiled with MLU".format(avaliable_mlu_place)) + "The device should not be {}, since PaddlePaddle is " + "not compiled with MLU".format(avaliable_mlu_place) + ) place_info_list = place.split(':', 1) device_id = place_info_list[1] device_id = int(device_id) return core.MLUPlace(device_id) raise ValueError( - "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlace, MLUPlace and NPUPlace, but received {}." - .format(place)) + "Paddle supports CPUPlace, CUDAPlace,CUDAPinnedPlace, XPUPlace, IPUPlace, MLUPlace and NPUPlace, but received {}.".format( + place + ) + ) def _get_paddle_place_list(places): diff --git a/python/paddle/fluid/inference/__init__.py b/python/paddle/fluid/inference/__init__.py index 6f7bc32a518367..d6b8b102487923 100644 --- a/python/paddle/fluid/inference/__init__.py +++ b/python/paddle/fluid/inference/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .wrapper import Config, DataType, PlaceType, PrecisionType, BackendType, Tensor, Predictor +from .wrapper import Config, DataType, PlaceType, PrecisionType, Tensor, Predictor from .wrapper import convert_to_mixed_precision -from ..core import create_predictor, get_version, get_num_bytes_of_data_type, PredictorPool, get_trt_compile_version, get_trt_runtime_version +from ..core import create_predictor, get_version, _get_phi_kernel_name, get_num_bytes_of_data_type, PredictorPool, get_trt_compile_version, get_trt_runtime_version diff --git a/python/paddle/fluid/inference/wrapper.py b/python/paddle/fluid/inference/wrapper.py index ec778c6339d687..83811012e52945 100644 --- a/python/paddle/fluid/inference/wrapper.py +++ b/python/paddle/fluid/inference/wrapper.py @@ -24,7 +24,6 @@ DataType = PaddleDType PlaceType = PaddlePlace PrecisionType = AnalysisConfig.Precision -BackendType = AnalysisConfig.Backend Config = AnalysisConfig Tensor = PaddleInferTensor Predictor = PaddleInferPredictor @@ -59,7 +58,7 @@ def convert_to_mixed_precision(model_file: str, mixed_model_file: str, mixed_params_file: str, mixed_precision: PrecisionType, - backend: BackendType, + backend: PlaceType, keep_io_types: bool = True, black_list: Set = set()): ''' @@ -71,7 +70,7 @@ def convert_to_mixed_precision(model_file: str, mixed_model_file: The storage path of the converted mixed-precision model. mixed_params_file: The storage path of the converted mixed-precision params. mixed_precision: The precision, e.g. PrecisionType.Half. - backend: The backend, e.g. BackendType.GPU. + backend: The backend, e.g. PlaceType.GPU. keep_io_types: Whether the model input and output dtype remains unchanged. black_list: Operators that do not convert precision. 
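A hedged usage sketch for the updated ``convert_to_mixed_precision`` signature; the model paths below are placeholders:

    from paddle.inference import PrecisionType, PlaceType, convert_to_mixed_precision

    convert_to_mixed_precision(
        model_file='inference.pdmodel',       # placeholder input model
        params_file='inference.pdiparams',
        mixed_model_file='mixed.pdmodel',     # placeholder output paths
        mixed_params_file='mixed.pdiparams',
        mixed_precision=PrecisionType.Half,
        backend=PlaceType.GPU,
        keep_io_types=True,
        black_list=set(),
    )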
''' diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index 26ed67f6e8ca25..5e59c8be062191 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -374,19 +374,6 @@ def forward(self, var, block=None): ["uint16", "float16", "float32", "float64"], "guassian_random") - # to be compatible of fp16 initalizers - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - out_dtype = VarDesc.VarType.FP32 - out_var = block.create_var(name=unique_name.generate(".".join( - ['normal_init', var.name, 'tmp'])), - shape=var.shape, - dtype=out_dtype, - type=VarDesc.VarType.LOD_TENSOR, - persistable=False) - else: - out_dtype = var.dtype - out_var = var - if self._seed == 0: self._seed = block.program.random_seed @@ -394,48 +381,29 @@ def forward(self, var, block=None): place = _current_expected_place() out_var = _C_ops.gaussian_random(var.shape, self._mean, self._std_dev, self._seed, - out_dtype, place) - - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _C_ops.cast(out_var, var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) + var.dtype, place) + out_var._share_underline_tensor_to(var) return None if _in_legacy_dygraph(): out_var = _legacy_C_ops.gaussian_random( - 'shape', var.shape, 'dtype', out_dtype, 'mean', self._mean, + 'shape', var.shape, 'dtype', var.dtype, 'mean', self._mean, 'std', self._std_dev, 'seed', self._seed, 'use_mkldnn', False) - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - var_tmp = _legacy_C_ops.cast(out_var, 'in_dtype', out_var.dtype, - 'out_dtype', var.dtype) - var_tmp._share_underline_tensor_to(var) - else: - out_var._share_underline_tensor_to(var) + out_var._share_underline_tensor_to(var) return None else: op = block.append_op(type="gaussian_random", - outputs={"Out": out_var}, + outputs={"Out": var}, attrs={ "shape": var.shape, - "dtype": out_dtype, + "dtype": var.dtype, "mean": self._mean, "std": self._std_dev, "seed": self._seed, "use_mkldnn": False }, stop_gradient=True) - - if var.dtype in [VarDesc.VarType.FP16, VarDesc.VarType.BF16]: - block.append_op(type="cast", - inputs={"X": out_var}, - outputs={"Out": var}, - attrs={ - "in_dtype": out_var.dtype, - "out_dtype": var.dtype - }) var.op = op return op @@ -695,7 +663,7 @@ def forward(self, var, block=None): outputs={"Out": out_var}, attrs={ "shape": out_var.shape, - "dtype": out_dtype, + "dtype": out_var.dtype, "mean": 0.0, "std": std, "seed": self._seed diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 0f69949018de71..7169a90a98f415 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -1232,7 +1232,7 @@ def save_inference_model(dirname, params_filename=None, export_for_deployment=True, program_only=False, - clip_extra=False): + clip_extra=True): """ Prune the given `main_program` to build a new program especially for inference, and then save it and all related parameters to given `dirname` . diff --git a/python/paddle/fluid/layer_helper_base.py b/python/paddle/fluid/layer_helper_base.py index 18b594d899c4c4..eeeea1a29090df 100644 --- a/python/paddle/fluid/layer_helper_base.py +++ b/python/paddle/fluid/layer_helper_base.py @@ -410,6 +410,30 @@ def create_variable_for_type_inference(self, persistable=False, stop_gradient=stop_gradient) + def create_sparse_variable_for_type_inference(self, + dtype, + stop_gradient=False, + shape=None): + """Create a temporary sparse variable that should be type inferred layer. 
+ + Note: + The default type will be set to SPARSE_COO. However, when + the var is used as operator output, its type will be updated + based on operator's `VarTypeInference` implementation in + infer_var_type. + """ + # set global dtype + if not dtype: + dtype = self.__dtype + return self.main_program.current_block().create_var( + name=unique_name.generate_with_ignorable_key(".".join( + [self.name, 'tmp'])), + dtype=dtype, + shape=shape, + type=core.VarDesc.VarType.SPARSE_COO, + persistable=False, + stop_gradient=stop_gradient) + def create_variable(self, *args, **kwargs): """Create Variable for this layers. Returns created Variable. diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index a9f2eaa40e2a54..49a779a2eb065c 100755 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -18,31 +18,73 @@ from .layer_function_generator import autodoc, templatedoc from .tensor import assign, cast, fill_constant from .. import core -from ..framework import Program, Variable, Operator, _non_static_mode, static_only, _in_legacy_dygraph, in_dygraph_mode +from ..framework import ( + Program, + Variable, + Operator, + _non_static_mode, + static_only, + _in_legacy_dygraph, + in_dygraph_mode, +) from ..layer_helper import LayerHelper, unique_name from .nn import logical_and, logical_not, logical_or -from .utils import assert_same_structure, map_structure, hold_mutable_vars, copy_mutable_vars, padding_to_same_structure, is_sequence, pack_sequence_as, flatten, to_sequence +from .utils import ( + assert_same_structure, + map_structure, + hold_mutable_vars, + copy_mutable_vars, + padding_to_same_structure, + is_sequence, + pack_sequence_as, + flatten, + to_sequence, +) import numpy import warnings import six from functools import reduce, partial -from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype +from ..data_feeder import ( + convert_dtype, + check_variable_and_dtype, + check_type, + check_dtype, +) from ... import compat as cpt from ..backward import _infer_var_data_type_shape_ from paddle import _C_ops, _legacy_C_ops __all__ = [ - 'While', 'Switch', 'increment', 'array_write', 'create_array', 'less_than', - 'less_equal', 'greater_than', 'greater_equal', 'equal', 'not_equal', - 'array_read', 'array_length', 'cond', 'IfElse', 'DynamicRNN', 'StaticRNN', - 'reorder_lod_tensor_by_rank', 'Print', 'Assert', 'is_empty', 'case', - 'switch_case', 'while_loop' + 'While', + 'Switch', + 'increment', + 'array_write', + 'create_array', + 'less_than', + 'less_equal', + 'greater_than', + 'greater_equal', + 'equal', + 'not_equal', + 'array_read', + 'array_length', + 'cond', + 'IfElse', + 'DynamicRNN', + 'StaticRNN', + 'reorder_lod_tensor_by_rank', + 'Print', + 'Assert', + 'is_empty', + 'case', + 'switch_case', + 'while_loop', ] def select_output(input, outputs, mask): """ - **select_output** + **select_output** This API takes in one input and multiple outputs and an integer mask. It selects the output specified by the mask and copy the input to selected output. It is useful in control flow. 
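``select_output`` and its counterpart ``select_input`` are the plumbing behind ``paddle.static.nn.cond``, which expects both branches to return results of compatible type and structure; a minimal static-mode sketch of that user-facing API:

    import paddle

    paddle.enable_static()
    a = paddle.full(shape=[1], fill_value=0.23, dtype='float32')
    b = paddle.full(shape=[1], fill_value=0.25, dtype='float32')
    # Both branches return a single Variable, so their outputs can be merged back.
    out = paddle.static.nn.cond(a < b, lambda: a + b, lambda: a * b)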
@@ -61,19 +103,38 @@ def select_output(input, outputs, mask): check_variable_and_dtype(mask, 'mask', ['int32'], 'select_output') check_type(outputs, 'outputs', (list, tuple), 'select_output') - helper.append_op(type='select_output', - inputs={ - 'X': input, - 'Mask': mask - }, - outputs={'Out': outputs}) + helper.append_op( + type='select_output', + inputs={'X': input, 'Mask': mask}, + outputs={'Out': outputs}, + ) return outputs +def _select_input_infer_shape(first_shape, second_shape): + """ + This function infer the output shape by following algorithm: + 1. if the dims is different, raise a error. + 2. compare axis one by one: + if a == b: we set axis to a + if a != b: we set axis to -1 + for compatibility,non declarative mode, we just return second_shape. + """ + if len(first_shape) != len(second_shape): + warnings.warn( + f"the input shapes of select_input should have the same rank, but get {first_shape}, {second_shape}" + ) + return second_shape + out_shape = list( + map(lambda a, b: a if a == b else -1, first_shape, second_shape) + ) + return out_shape + + def select_input(inputs, mask): """ **select_input** - + This API takes in multiple inputs and uses an integer mask to select one input to output. It is useful in control flow. @@ -89,83 +150,99 @@ def select_input(inputs, mask): check_type(inputs, 'inputs', (list, tuple), 'select_input') check_variable_and_dtype(mask, 'mask', ['int32'], 'select_input') - input_dtype = inputs[1].dtype - input_shape = inputs[1].shape - input_type = inputs[1].type - - out = helper.create_variable(dtype=input_dtype, - shape=input_shape, - type=input_type) - helper.append_op(type='select_input', - inputs={ - 'X': inputs, - 'Mask': mask - }, - outputs={'Out': out}) + # Select input should expand the shape. If it is - 1 and valid number, use - 1 first. If the dim is different, an error will be reported directly + # assert inputs[0].dtype == inputs[1].dtype, f"Expect the inputs should have the same dtype, but get {inputs[0].dtype} and {inputs[1].dtype}" + output_shape = _select_input_infer_shape(inputs[0].shape, inputs[1].shape) + output_dtype = inputs[1].dtype + output_type = inputs[1].type + + out = helper.create_variable( + dtype=output_dtype, shape=output_shape, type=output_type + ) + helper.append_op( + type='select_input', + inputs={'X': inputs, 'Mask': mask}, + outputs={'Out': out}, + ) return out -def select_input_with_buildin_type(inputs, mask): - from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import to_static_variable - from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, create_undefined_var_like +def select_input_with_buildin_type(inputs, mask, name): + from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import ( + to_static_variable, + ) + from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar + false_var, true_var = inputs if isinstance(false_var, UndefinedVar) and isinstance( - true_var, UndefinedVar): - """ None -> UndefinedVar, so the real value is a [None, UndefinedVar] or [None, None], we just return None. 
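The axis-by-axis rule in ``_select_input_infer_shape`` above can be restated as a small standalone helper (a re-implementation for illustration only, not the Paddle function itself):

    def merge_branch_shapes(first_shape, second_shape):
        # Different ranks: the helper only warns and falls back to the second shape.
        if len(first_shape) != len(second_shape):
            return list(second_shape)
        # Same rank: keep a dim where both branches agree, otherwise mark it dynamic (-1).
        return [a if a == b else -1 for a, b in zip(first_shape, second_shape)]

    print(merge_branch_shapes([3, -1, 5], [3, 4, 5]))   # [3, -1, 5]
    print(merge_branch_shapes([2, 8], [2, 8, 1]))       # [2, 8, 1] (ranks differ)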
- """ + true_var, UndefinedVar + ): + """None -> UndefinedVar, so the real value is a [None, UndefinedVar] or [None, None], we just return None.""" return None if isinstance(false_var, Variable) and isinstance(true_var, Variable): - return select_input(inputs, mask) + try: + return select_input(inputs, mask) + except Exception as e: + raise RuntimeError( + f"Exceptions throwed while doing select_input on {name}:\n{e}" + ) - elif (isinstance(false_var, (support_ret_buildin_type)) - and isinstance(false_var, type(true_var))): + elif isinstance(false_var, (support_ret_buildin_type)) and isinstance( + false_var, type(true_var) + ): if false_var == true_var: return false_var else: inputs = [ to_static_variable(false_var), - to_static_variable(true_var) + to_static_variable(true_var), ] # Deal with the situations like this: false_var is int and true_var is Variable - elif ((isinstance(false_var, support_ret_buildin_type) - and isinstance(true_var, Variable)) - or (isinstance(true_var, support_ret_buildin_type) - and isinstance(false_var, Variable))): + elif ( + isinstance(false_var, support_ret_buildin_type) + and isinstance(true_var, Variable) + ) or ( + isinstance(true_var, support_ret_buildin_type) + and isinstance(false_var, Variable) + ): inputs = [to_static_variable(false_var), to_static_variable(true_var)] warnings.warn( "Return results from different branches in cond are not same type: " "false_var returned by fasle_fn is '{}' and true_var of true_fn is " - "'{}'".format(type(false_var), type(true_var))) - elif ((isinstance(false_var, UndefinedVar) - and isinstance(true_var, (Variable, ) + support_ret_buildin_type)) - or (isinstance(true_var, UndefinedVar) - and isinstance(false_var, - (Variable, ) + support_ret_buildin_type))): + "'{}'".format(type(false_var), type(true_var)) + ) + elif ( + isinstance(false_var, UndefinedVar) + and isinstance(true_var, (Variable,) + support_ret_buildin_type) + ) or ( + isinstance(true_var, UndefinedVar) + and isinstance(false_var, (Variable,) + support_ret_buildin_type) + ): def create_var_if_not_undefined_var(a): - if isinstance(a, UndefinedVar): return a + if isinstance(a, UndefinedVar): + return a return to_static_variable(a) - def create_like_if_undefined_var(a, b): - if isinstance(a, UndefinedVar): return create_undefined_var_like(b) - return a - - # TODO(xiongkun): add warning here. - true_var, false_var = create_var_if_not_undefined_var( - true_var), create_var_if_not_undefined_var(false_var) - inputs = [ - create_like_if_undefined_var(false_var, true_var), - create_like_if_undefined_var(true_var, false_var) - ] + true_var, false_var = to_static_variable(true_var), to_static_variable( + false_var + ) + inputs = [false_var, true_var] else: raise TypeError( "Unsupported return type of true_fn and false_fn in cond: false_var " - "returned by fasle_fn is '{}' and true_var of true_fn is '{}'". 
- format(type(false_var), type(true_var))) - - return select_input(inputs, mask) + "returned by fasle_fn is '{}' and true_var of true_fn is '{}'".format( + type(false_var), type(true_var) + ) + ) + try: + return select_input(inputs, mask) + except Exception as e: + raise RuntimeError( + f"Exceptions throwed while doing select_input on {name}:\n{e}" + ) def split_lod_tensor(input, mask, level=0): @@ -202,23 +279,26 @@ def split_lod_tensor(input, mask, level=0): input=x, mask=y, level=level) """ - check_type(input, 'input', (Variable, list, tuple, type(None)), - 'fluid.layers.split_lod_tensor') + check_type( + input, + 'input', + (Variable, list, tuple, type(None)), + 'fluid.layers.split_lod_tensor', + ) check_type(mask, 'mask', (Variable, list), 'fluid.layers.split_lod_tensor') check_type(level, 'level', int, 'fluid.layers.split_lod_tensor') helper = LayerHelper('split_lod_tensor', **locals()) out_true = helper.create_variable_for_type_inference(dtype=input.dtype) out_false = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type='split_lod_tensor', - inputs={ - 'X': input, - 'Mask': mask, - }, - outputs={ - 'OutTrue': out_true, - 'OutFalse': out_false - }, - attrs={'level': level}) + helper.append_op( + type='split_lod_tensor', + inputs={ + 'X': input, + 'Mask': mask, + }, + outputs={'OutTrue': out_true, 'OutFalse': out_false}, + attrs={'level': level}, + ) return out_true, out_false @@ -260,37 +340,48 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0): in_true=out_true, in_false=out_false, mask=y, x=x, level=level) """ helper = LayerHelper('merge_lod_tensor', **locals()) - check_type(x, 'x', (Variable, list, tuple, type(None)), - 'fluid.layers.merge_lod_tensor') + check_type( + x, + 'x', + (Variable, list, tuple, type(None)), + 'fluid.layers.merge_lod_tensor', + ) check_type(mask, 'mask', (Variable, list), 'fluid.layers.merge_lod_tensor') - check_type(in_true, 'in_true', (Variable, list, tuple, type(None)), - 'fluid.layers.merge_lod_tensor') - check_type(in_false, 'in_false', (Variable, list, tuple, type(None)), - 'fluid.layers.merge_lod_tensor') + check_type( + in_true, + 'in_true', + (Variable, list, tuple, type(None)), + 'fluid.layers.merge_lod_tensor', + ) + check_type( + in_false, + 'in_false', + (Variable, list, tuple, type(None)), + 'fluid.layers.merge_lod_tensor', + ) out = helper.create_variable_for_type_inference(dtype=in_true.dtype) - helper.append_op(type='merge_lod_tensor', - inputs={ - 'X': x, - 'Mask': mask, - 'InTrue': in_true, - 'InFalse': in_false - }, - outputs={'Out': out}, - attrs={'level': level}) + helper.append_op( + type='merge_lod_tensor', + inputs={'X': x, 'Mask': mask, 'InTrue': in_true, 'InFalse': in_false}, + outputs={'Out': out}, + attrs={'level': level}, + ) return out @static_only -def Print(input, - first_n=-1, - message=None, - summarize=20, - print_tensor_name=True, - print_tensor_type=True, - print_tensor_shape=True, - print_tensor_layout=True, - print_tensor_lod=True, - print_phase='both'): +def Print( + input, + first_n=-1, + message=None, + summarize=20, + print_tensor_name=True, + print_tensor_type=True, + print_tensor_shape=True, + print_tensor_layout=True, + print_tensor_lod=True, + print_phase='both', +): ''' :api_attr: Static Graph @@ -314,7 +405,7 @@ def Print(input, print_tensor_layout (bool, optional): Print the tensor layout. Default: True. print_tensor_lod (bool, optional): Print the tensor lod. Default: True. print_phase (str): Which phase to displace, including 'forward', - 'backward' and 'both'. 
Default: 'both'. If set to 'backward', will + 'backward' and 'both'. Default: 'both'. If set to 'backward', will only print the gradients of input tensor; If set to 'both', will both print the input tensor itself and the gradients of input tensor. @@ -328,11 +419,11 @@ def Print(input, Examples: .. code-block:: python - + import paddle paddle.enable_static() - + x = paddle.full(shape=[2, 3], fill_value=3, dtype='int64') out = paddle.static.Print(x, message="The content of input layer:") @@ -348,26 +439,31 @@ def Print(input, # - dtype: long # - data: [3 3 3 3 3 3] ''' - check_variable_and_dtype(input, 'input', - ['float32', 'float64', 'int32', 'int64', 'bool'], - 'fluid.layers.Print') + check_variable_and_dtype( + input, + 'input', + ['float32', 'float64', 'int32', 'int64', 'bool'], + 'fluid.layers.Print', + ) helper = LayerHelper('print' + "_" + input.name, **locals()) output = helper.create_variable_for_type_inference(input.dtype) - helper.append_op(type='print', - inputs={'In': input}, - outputs={'Out': output}, - attrs={ - 'first_n': first_n, - 'summarize': summarize, - 'message': message or "", - 'print_tensor_name': print_tensor_name, - 'print_tensor_type': print_tensor_type, - 'print_tensor_shape': print_tensor_shape, - 'print_tensor_layout': print_tensor_layout, - 'print_tensor_lod': print_tensor_lod, - 'print_phase': print_phase.upper() - }) + helper.append_op( + type='print', + inputs={'In': input}, + outputs={'Out': output}, + attrs={ + 'first_n': first_n, + 'summarize': summarize, + 'message': message or "", + 'print_tensor_name': print_tensor_name, + 'print_tensor_type': print_tensor_type, + 'print_tensor_shape': print_tensor_shape, + 'print_tensor_layout': print_tensor_layout, + 'print_tensor_lod': print_tensor_lod, + 'print_phase': print_phase.upper(), + }, + ) return output @@ -434,12 +530,11 @@ def Assert(cond, data=None, summarize=20, name=None): layer_name = name if name else ('assert_' + cond.name) helper = LayerHelper(layer_name, **locals()) - op = helper.append_op(type="assert", - inputs={ - "Cond": cond, - "Data": [] if data is None else list(data) - }, - attrs={"summarize": summarize}) + op = helper.append_op( + type="assert", + inputs={"Cond": cond, "Data": [] if data is None else list(data)}, + attrs={"summarize": summarize}, + ) return op @@ -489,8 +584,9 @@ def __exit__(self, exc_type, exc_val, exc_tb): return False self.rnn.status = StaticRNN.AFTER_RNN_BLOCK self.rnn._complete_op() - return super(BlockGuardWithCompletion, - self).__exit__(exc_type, exc_val, exc_tb) + return super(BlockGuardWithCompletion, self).__exit__( + exc_type, exc_val, exc_tb + ) class StaticRNNMemoryLink(object): @@ -556,12 +652,13 @@ class StaticRNN(object): hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') # use hidden to update prev rnn.update_memory(prev, hidden) - # mark hidden as output + # mark hidden as output rnn.step_output(hidden) # get StaticrNN final output result = rnn() """ + BEFORE_RNN_BLOCK = 0 IN_RNN_BLOCK = 1 AFTER_RNN_BLOCK = 2 @@ -587,13 +684,15 @@ def _assert_in_rnn_block_(self, method): if self.status != StaticRNN.IN_RNN_BLOCK: raise ValueError("You must invoke {0} in rnn block".format(method)) - def memory(self, - init=None, - shape=None, - batch_ref=None, - init_value=0.0, - init_batch_dim_idx=0, - ref_batch_dim_idx=1): + def memory( + self, + init=None, + shape=None, + batch_ref=None, + init_value=0.0, + init_batch_dim_idx=0, + ref_batch_dim_idx=1, + ): """ Create a memory variable for static rnn. 
If the :code:`init` is not None, :code:`memory` will be initialized by @@ -619,97 +718,118 @@ def memory(self, Examples 1: .. code-block:: python - import paddle.fluid as fluid - import paddle.fluid.layers as layers - - vocab_size, hidden_size=10000, 200 - x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') - # create word sequence - x_emb = layers.embedding( - input=x, - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False) - # transform batch size to dim 1 - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) - - rnn = fluid.layers.StaticRNN() - with rnn.step(): - # mark created x_emb as input, each step process a word - word = rnn.step_input(x_emb) - # create prev memory parameter, batch size comes from word - prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word) - hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') - # use hidden to update prev - rnn.update_memory(prev, hidden) + import paddle.fluid as fluid + import paddle.fluid.layers as layers + + vocab_size, hidden_size=10000, 200 + x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') + # create word sequence + x_emb = layers.embedding( + input=x, + size=[vocab_size, hidden_size], + dtype='float32', + is_sparse=False) + # transform batch size to dim 1 + x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + + rnn = fluid.layers.StaticRNN() + with rnn.step(): + # mark created x_emb as input, each step process a word + word = rnn.step_input(x_emb) + # create prev memory parameter, batch size comes from word + prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word) + hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') + # use hidden to update prev + rnn.update_memory(prev, hidden) Examples 2: .. code-block:: python - import paddle.fluid as fluid - import paddle.fluid.layers as layers - vocab_size, hidden_size=10000, 200 - x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') - # create word sequence - x_emb = layers.embedding( - input=x, - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False) - # transform batch size to dim 1 - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) - boot_memory = fluid.layers.data(name='boot', shape=[hidden_size], dtype='float32', lod_level=1) - rnn = fluid.layers.StaticRNN() - with rnn.step(): - # mark created x_emb as input, each step process a word - word = rnn.step_input(x_emb) - # init memory - prev = rnn.memory(init=boot_memory) - hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') - # update hidden with prev - rnn.update_memory(prev, hidden) + import paddle.fluid as fluid + import paddle.fluid.layers as layers + vocab_size, hidden_size=10000, 200 + x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') + # create word sequence + x_emb = layers.embedding( + input=x, + size=[vocab_size, hidden_size], + dtype='float32', + is_sparse=False) + # transform batch size to dim 1 + x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + boot_memory = fluid.layers.data(name='boot', shape=[hidden_size], dtype='float32', lod_level=1) + rnn = fluid.layers.StaticRNN() + with rnn.step(): + # mark created x_emb as input, each step process a word + word = rnn.step_input(x_emb) + # init memory + prev = rnn.memory(init=boot_memory) + hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') + # update hidden with prev + rnn.update_memory(prev, hidden) """ self._assert_in_rnn_block_('memory') - check_type(init, "init", (Variable, type(None)), - "fluid.layers.StaticRNN.memory") - 
check_type(shape, "shape", (list, tuple, type(None)), - "fluid.layers.StaticRNN.memory") - check_type(batch_ref, "batch_ref", (Variable, type(None)), - "fluid.layers.StaticRNN.memory") + check_type( + init, + "init", + (Variable, type(None)), + "fluid.layers.StaticRNN.memory", + ) + check_type( + shape, + "shape", + (list, tuple, type(None)), + "fluid.layers.StaticRNN.memory", + ) + check_type( + batch_ref, + "batch_ref", + (Variable, type(None)), + "fluid.layers.StaticRNN.memory", + ) if init is None: if shape is None or batch_ref is None: raise ValueError( - "if init is None, memory at least need shape and batch_ref") + "if init is None, memory at least need shape and batch_ref" + ) parent_block = self._parent_block() - var_name = unique_name.generate_with_ignorable_key("@".join( - [self.helper.name, "memory_boot"])) - boot_var = parent_block.create_var(name=var_name, - shape=shape, - dtype=batch_ref.dtype, - persistable=False) - - parent_block.append_op(type="fill_constant_batch_size_like", - inputs={'Input': [batch_ref]}, - outputs={'Out': [boot_var]}, - attrs={ - 'value': init_value, - 'shape': boot_var.shape, - 'dtype': boot_var.dtype, - 'input_dim_idx': ref_batch_dim_idx, - 'output_dim_idx': init_batch_dim_idx - }) + var_name = unique_name.generate_with_ignorable_key( + "@".join([self.helper.name, "memory_boot"]) + ) + boot_var = parent_block.create_var( + name=var_name, + shape=shape, + dtype=batch_ref.dtype, + persistable=False, + ) + + parent_block.append_op( + type="fill_constant_batch_size_like", + inputs={'Input': [batch_ref]}, + outputs={'Out': [boot_var]}, + attrs={ + 'value': init_value, + 'shape': boot_var.shape, + 'dtype': boot_var.dtype, + 'input_dim_idx': ref_batch_dim_idx, + 'output_dim_idx': init_batch_dim_idx, + }, + ) return self.memory(init=boot_var) else: pre_mem = self.helper.create_variable( - name=unique_name.generate_with_ignorable_key("@".join( - [self.helper.name, "mem"])), + name=unique_name.generate_with_ignorable_key( + "@".join([self.helper.name, "mem"]) + ), dtype=init.dtype, - shape=init.shape) - self.memories[pre_mem.name] = StaticRNNMemoryLink(init=init, - pre_mem=pre_mem) + shape=init.shape, + ) + self.memories[pre_mem.name] = StaticRNNMemoryLink( + init=init, pre_mem=pre_mem + ) return pre_mem def step_input(self, x): @@ -726,29 +846,29 @@ def step_input(self, x): Examples: .. 
code-block:: python - import paddle.fluid as fluid - import paddle.fluid.layers as layers - - vocab_size, hidden_size=10000, 200 - x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') - # create word sequence - x_emb = layers.embedding( - input=x, - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False) - # transform batch size to dim 1 - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) - - rnn = fluid.layers.StaticRNN() - with rnn.step(): - # mark created x_emb as input, each step process a word - word = rnn.step_input(x_emb) - # create prev memory parameter, batch size comes from word - prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word) - hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') - # use hidden to update prev - rnn.update_memory(prev, hidden) + import paddle.fluid as fluid + import paddle.fluid.layers as layers + + vocab_size, hidden_size=10000, 200 + x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') + # create word sequence + x_emb = layers.embedding( + input=x, + size=[vocab_size, hidden_size], + dtype='float32', + is_sparse=False) + # transform batch size to dim 1 + x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + + rnn = fluid.layers.StaticRNN() + with rnn.step(): + # mark created x_emb as input, each step process a word + word = rnn.step_input(x_emb) + # create prev memory parameter, batch size comes from word + prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word) + hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') + # use hidden to update prev + rnn.update_memory(prev, hidden) """ self._assert_in_rnn_block_('step_input') @@ -758,10 +878,9 @@ def step_input(self, x): elif x.shape[0] != -1 and self.seq_len != x.shape[0]: raise ValueError("Static RNN only take fix seq_len input") - ipt = self.helper.create_variable(name=x.name, - dtype=x.dtype, - shape=list(x.shape[1:]), - type=x.type) + ipt = self.helper.create_variable( + name=x.name, dtype=x.dtype, shape=list(x.shape[1:]), type=x.type + ) self.inputs.append(ipt) return ipt @@ -778,47 +897,50 @@ def step_output(self, o): Examples: .. 
code-block:: python - import paddle.fluid as fluid - import paddle.fluid.layers as layers - - vocab_size, hidden_size=10000, 200 - x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') - # create word sequence - x_emb = layers.embedding( - input=x, - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False) - # transform batch size to dim 1 - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) - - rnn = fluid.layers.StaticRNN() - with rnn.step(): - # mark created x_emb as input, each step process a word - word = rnn.step_input(x_emb) - # create prev memory parameter, batch size comes from word - prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word) - hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') - # use hidden to update prev - rnn.update_memory(prev, hidden) - rnn.step_output(hidden) - - result = rnn() + import paddle.fluid as fluid + import paddle.fluid.layers as layers + + vocab_size, hidden_size=10000, 200 + x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') + # create word sequence + x_emb = layers.embedding( + input=x, + size=[vocab_size, hidden_size], + dtype='float32', + is_sparse=False) + # transform batch size to dim 1 + x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + + rnn = fluid.layers.StaticRNN() + with rnn.step(): + # mark created x_emb as input, each step process a word + word = rnn.step_input(x_emb) + # create prev memory parameter, batch size comes from word + prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word) + hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') + # use hidden to update prev + rnn.update_memory(prev, hidden) + rnn.step_output(hidden) + + result = rnn() """ self._assert_in_rnn_block_('step_output') check_type(o, "o", Variable, "fluid.layers.StaticRNN.step_output") tmp_o = self.helper.create_variable_for_type_inference(dtype=o.dtype) - self.helper.append_op(type='rnn_memory_helper', - inputs={'X': [o]}, - outputs={'Out': tmp_o}, - attrs={'dtype': o.dtype}) - - out_var = self._parent_block().create_var(name=tmp_o.name, - shape=[self.seq_len] + - list(tmp_o.shape), - dtype=tmp_o.dtype) + self.helper.append_op( + type='rnn_memory_helper', + inputs={'X': [o]}, + outputs={'Out': tmp_o}, + attrs={'dtype': o.dtype}, + ) + + out_var = self._parent_block().create_var( + name=tmp_o.name, + shape=[self.seq_len] + list(tmp_o.shape), + dtype=tmp_o.dtype, + ) self.outputs.append(out_var) @@ -835,33 +957,33 @@ def output(self, *outputs): Examples: .. 
code-block:: python - import paddle.fluid as fluid - import paddle.fluid.layers as layers - - vocab_size, hidden_size=10000, 200 - x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') - # create word sequence - x_emb = layers.embedding( - input=x, - size=[vocab_size, hidden_size], - dtype='float32', - is_sparse=False) - # transform batch size to dim 1 - x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) - - rnn = fluid.layers.StaticRNN() - with rnn.step(): - # mark created x_emb as input, each step process a word - word = rnn.step_input(x_emb) - # create prev memory parameter, batch size comes from word - prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word) - hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') - # use hidden to update prev - rnn.update_memory(prev, hidden) - # mark each step's hidden and word as output - rnn.output(hidden, word) - - result = rnn() + import paddle.fluid as fluid + import paddle.fluid.layers as layers + + vocab_size, hidden_size=10000, 200 + x = fluid.data(name="x", shape=[None, 1, 1], dtype='int64') + # create word sequence + x_emb = layers.embedding( + input=x, + size=[vocab_size, hidden_size], + dtype='float32', + is_sparse=False) + # transform batch size to dim 1 + x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + + rnn = fluid.layers.StaticRNN() + with rnn.step(): + # mark created x_emb as input, each step process a word + word = rnn.step_input(x_emb) + # create prev memory parameter, batch size comes from word + prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word) + hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') + # use hidden to update prev + rnn.update_memory(prev, hidden) + # mark each step's hidden and word as output + rnn.output(hidden, word) + + result = rnn() """ for each in outputs: self.step_output(each) @@ -934,7 +1056,8 @@ def _complete_op(self): ] step_scope = parent_block.create_var( - type=core.VarDesc.VarType.STEP_SCOPES) + type=core.VarDesc.VarType.STEP_SCOPES + ) inlinks = [parent_block.var(i.name) for i in self.inputs] outlinks = self.outputs @@ -946,39 +1069,41 @@ def _complete_op(self): for _, mem in six.iteritems(self.memories): boot_memories.append(mem.init) pre_memories.append(mem.pre_mem.name) - assert mem.mem is not None, "%s should be updated in every step." % ( - mem.init.name) + assert ( + mem.mem is not None + ), "%s should be updated in every step." 
% (mem.init.name) mem_var = rnn_block.var(mem.mem.name) assert isinstance(mem_var, Variable) new_mem = self.helper.create_variable_for_type_inference( - dtype=mem_var.dtype) - rnn_block.append_op(type='rnn_memory_helper', - inputs={'X': [mem_var]}, - outputs={'Out': [new_mem]}, - attrs={'dtype': mem_var.dtype}) + dtype=mem_var.dtype + ) + rnn_block.append_op( + type='rnn_memory_helper', + inputs={'X': [mem_var]}, + outputs={'Out': [new_mem]}, + attrs={'dtype': mem_var.dtype}, + ) memories.append(new_mem.name) - parent_block.append_op(type='recurrent', - inputs={ - 'inputs': inlinks, - 'initial_states': boot_memories, - 'parameters': parameters - }, - outputs={ - 'outputs': outlinks, - 'step_scopes': [step_scope] - }, - attrs={ - 'has_states': len(pre_memories) > 0, - 'ex_states': pre_memories, - 'states': memories, - 'sub_block': rnn_block - }) + parent_block.append_op( + type='recurrent', + inputs={ + 'inputs': inlinks, + 'initial_states': boot_memories, + 'parameters': parameters, + }, + outputs={'outputs': outlinks, 'step_scopes': [step_scope]}, + attrs={ + 'has_states': len(pre_memories) > 0, + 'ex_states': pre_memories, + 'states': memories, + 'sub_block': rnn_block, + }, + ) class WhileGuard(BlockGuard): - def __init__(self, while_op): if not isinstance(while_op, While): raise TypeError("WhileGuard takes a while op") @@ -997,8 +1122,9 @@ def __exit__(self, exc_type, exc_val, exc_tb): return super(WhileGuard, self).__exit__(exc_type, exc_val, exc_tb) -def get_inputs_outputs_in_block(current_block, inner_inputs, inner_outputs, - helper): +def get_inputs_outputs_in_block( + current_block, inner_inputs, inner_outputs, helper +): """ Find inputs and outputs in current control flow block. :param current_block: Current control flow block. @@ -1029,7 +1155,8 @@ def is_ignore_vars(op, var_name): for iname in op.input_names: for in_var_name in op.input(iname): if in_var_name not in inner_outputs and not is_ignore_vars( - op, in_var_name): + op, in_var_name + ): inner_inputs.add(in_var_name) for oname in op.output_names: @@ -1045,8 +1172,11 @@ def is_ignore_vars(op, var_name): current_block_var = None if current_block.has_var(in_var_name): current_block_var = current_block.var(in_var_name) - if not parent_block_var and current_block_var and \ - current_block_var.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + if ( + not parent_block_var + and current_block_var + and current_block_var.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY + ): remove_inner_inputs.add(in_var_name) inner_inputs = inner_inputs - remove_inner_inputs @@ -1057,7 +1187,7 @@ def is_ignore_vars(op, var_name): class While(object): """ :api_attr: Static Graph - + while loop control flow. Repeat while body until cond is False. Note: @@ -1077,7 +1207,7 @@ class While(object): Examples 1: .. code-block:: python - + import paddle.fluid as fluid import numpy as np @@ -1136,8 +1266,10 @@ def __init__(self, cond, is_test=False, name=None): check_variable_and_dtype(cond, 'cond', ['bool'], 'fluid.layers.While') if reduce(lambda a, b: a * b, cond.shape, 1) != 1: raise TypeError( - "condition expected shape as [1], but given shape as {0}.". 
- format(list(cond.shape))) + "condition expected shape as [1], but given shape as {0}.".format( + list(cond.shape) + ) + ) self.cond_var = cond self.is_test = is_test @@ -1148,12 +1280,14 @@ def _complete(self): main_program = self.helper.main_program while_block = main_program.current_block() parent_block = main_program.block( - main_program.current_block().parent_idx) + main_program.current_block().parent_idx + ) inner_outputs = {self.cond_var.name} x_name_list = set() x_name_list, inner_outputs = get_inputs_outputs_in_block( - while_block, x_name_list, inner_outputs, self.helper) + while_block, x_name_list, inner_outputs, self.helper + ) out_vars = [] for inner_out_name in inner_outputs: @@ -1167,23 +1301,21 @@ def _complete(self): x_name_list -= {self.cond_var.name} step_scope = parent_block.create_var( - type=core.VarDesc.VarType.STEP_SCOPES) + type=core.VarDesc.VarType.STEP_SCOPES + ) parent_block.append_op( type='while', inputs={ - 'X': - [parent_block._var_recursive(x_name) for x_name in x_name_list], - 'Condition': [self.cond_var] - }, - outputs={ - 'Out': out_vars, - 'StepScopes': [step_scope] + 'X': [ + parent_block._var_recursive(x_name) + for x_name in x_name_list + ], + 'Condition': [self.cond_var], }, - attrs={ - 'sub_block': while_block, - "is_test": self.is_test - }) + outputs={'Out': out_vars, 'StepScopes': [step_scope]}, + attrs={'sub_block': while_block, "is_test": self.is_test}, + ) support_ret_buildin_type = (bool, float, six.integer_types) @@ -1195,14 +1327,17 @@ def assign_skip_lod_tensor_array(input, output): """ def has_shape_diff(x_var, y_var): - if len(x_var.shape) != len(y_var.shape): return True + if len(x_var.shape) != len(y_var.shape): + return True for x_dim, y_dim in zip(x_var.shape, y_var.shape): - if x_dim != y_dim and -1 not in [x_dim, y_dim]: return True + if x_dim != y_dim and -1 not in [x_dim, y_dim]: + return True return False if not isinstance(input, (Variable, core.VarBase)): if isinstance(output, Variable) and isinstance( - input, support_ret_buildin_type): + input, support_ret_buildin_type + ): assign(input, output) else: output = input @@ -1211,15 +1346,21 @@ def has_shape_diff(x_var, y_var): if input.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: main_program = input.block.program parent_block = main_program.block( - main_program.current_block().parent_idx) + main_program.current_block().parent_idx + ) if parent_block and not parent_block._find_var_recursive(input.name): assign(input, output) else: - if isinstance(output, Variable) and isinstance( - input, Variable) and has_shape_diff(input, output): + if ( + isinstance(output, Variable) + and isinstance(input, Variable) + and has_shape_diff(input, output) + ): warnings.warn( - "In dy2static mode, we attemp to assign a variable with shape {} into a variable with shape{}, which is not always right." - .format(input.shape, output.shape)) + "In dy2static mode, we attemp to assign a variable with shape {} into a variable with shape{}, which is not always right.".format( + input.shape, output.shape + ) + ) assign(input, output) @@ -1235,7 +1376,7 @@ def while_loop(cond, body, loop_vars, is_test=False, name=None): Args: cond(Callable): A callable returning a boolean tensor controlling whether to continue looping. And ``cond`` takes - as many arguments as ``loop_vars`` . + as many arguments as ``loop_vars`` . body(Callable): A callable returning a tuple or list of tensors or LoDTensorArrays of the same arity (length and structure) and types as ``loops_vars`` . 
And ``body`` takes as many arguments as ``loop_vars`` . loop_vars(list|tuple): A list or tuple of tensors or LoDTensorArrays that is passed to both ``cond`` and ``body`` . @@ -1265,7 +1406,7 @@ def body(i, ten): i = paddle.full(shape=[1], fill_value=0, dtype='int64') # loop counter ten = paddle.full(shape=[1], fill_value=10, dtype='int64') # loop length i, ten = paddle.static.nn.while_loop(cond, body, [i, ten]) - + exe = paddle.static.Executor(paddle.CPUPlace()) res = exe.run(main_program, feed={}, fetch_list=[i]) print(res) # [array([10])] @@ -1281,23 +1422,26 @@ def body(i, ten): raise ValueError("loop_vars in while_loop should not be empty") pre_cond = cond(*loop_vars) - check_variable_and_dtype(pre_cond, 'var of cond returned', ['bool'], - 'fluid.layers.while_loop') + check_variable_and_dtype( + pre_cond, 'var of cond returned', ['bool'], 'fluid.layers.while_loop' + ) if reduce(lambda a, b: a * b, pre_cond.shape, 1) != 1: raise TypeError( "the shape of the variable returned by cond should be [1]," - "but given shape as {0}.".format(list(pre_cond.shape))) + "but given shape as {0}.".format(list(pre_cond.shape)) + ) if _non_static_mode(): now_cond = pre_cond.numpy()[0] - while (now_cond): + while now_cond: output_vars = body(*loop_vars) if not isinstance(output_vars, (list, tuple)): output_vars = [output_vars] if len(output_vars) != len(loop_vars): raise ValueError( "body in while_loop should return the same arity " - "(length and structure) and types as loop_vars") + "(length and structure) and types as loop_vars" + ) now_cond = cond(*output_vars).numpy()[0] map_structure(assign_skip_lod_tensor_array, output_vars, loop_vars) return loop_vars @@ -1322,7 +1466,8 @@ def body(i, ten): except ValueError as e: raise ValueError( "body in while_loop should return the same arity " - "(length and structure) as loop_vars: {0}".format(e)) + "(length and structure) as loop_vars: {0}".format(e) + ) now_cond = cond(*output_vars) map_structure(assign_skip_lod_tensor_array, output_vars, loop_vars) assign(now_cond, pre_cond) @@ -1330,22 +1475,27 @@ def body(i, ten): def _deal_with_undefined_var(output_vars, loop_vars): - """ Deal with undefined var cases, We create undefined variable based on the results of body(). - In Dy2Static, we use undefined var to represent the var created in control flow. This function - expand the loop_vars and replace original loop_vars. - 1. UndefinedVar = Variable # create a variable - 2. UndefinedVar = None # create a undefined var with RETURN_NO_VALUE_MAGIC_NUM - 3. UndefinedVar = List(int) # create a list of variable - 4. UndefinedVar = value # create a variable + """Deal with undefined var cases, We create undefined variable based on the results of body(). + In Dy2Static, we use undefined var to represent the var created in control flow. This function + expand the loop_vars and replace original loop_vars. + 1. UndefinedVar = Variable # create a variable + 2. UndefinedVar = None # create a undefined var with RETURN_NO_VALUE_MAGIC_NUM + 3. UndefinedVar = List(int) # create a list of variable + 4. 
UndefinedVar = value # create a variable """ - from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar, create_undefined_variable + from paddle.fluid.dygraph.dygraph_to_static.utils import ( + UndefinedVar, + create_undefined_variable, + ) def create_var_like(o_var): - if isinstance(o_var, - (Variable, ) + support_ret_buildin_type) or o_var is None: + if ( + isinstance(o_var, (Variable,) + support_ret_buildin_type) + or o_var is None + ): return create_undefined_variable() if is_sequence(o_var): - """ + """ Create a complex container class inside the body of while, including Python list and python Dict """ return map_structure(lambda x: create_undefined_variable(), o_var) @@ -1413,16 +1563,21 @@ def lod_rank_table(x, level=0): check_type(x, 'x', (Variable, list), 'lod_rank_table') if isinstance(x, (list)): for i, input_x in enumerate(x): - check_type(input_x, 'input[' + str(i) + ']', Variable, - 'lod_rank_table') + check_type( + input_x, 'input[' + str(i) + ']', Variable, 'lod_rank_table' + ) helper = LayerHelper("lod_rank_table", **locals()) - table = helper.create_variable(type=core.VarDesc.VarType.LOD_RANK_TABLE, - name=unique_name.generate("lod_rank_table")) - helper.append_op(type='lod_rank_table', - inputs={'X': x}, - outputs={'Out': table}, - attrs={'level': level}) + table = helper.create_variable( + type=core.VarDesc.VarType.LOD_RANK_TABLE, + name=unique_name.generate("lod_rank_table"), + ) + helper.append_op( + type='lod_rank_table', + inputs={'X': x}, + outputs={'Out': table}, + attrs={'level': level}, + ) return table @@ -1445,9 +1600,11 @@ def max_sequence_len(rank_table): """ helper = LayerHelper("max_seqence_len", **locals()) res = helper.create_variable_for_type_inference(dtype="int64") - helper.append_op(type="max_sequence_len", - inputs={"RankTable": rank_table}, - outputs={"Out": res}) + helper.append_op( + type="max_sequence_len", + inputs={"RankTable": rank_table}, + outputs={"Out": res}, + ) return res @@ -1483,24 +1640,32 @@ def lod_tensor_to_array(x, table): check_type(x, 'x', (Variable, list), 'lod_tensor_to_array') if isinstance(x, (list)): for i, input_x in enumerate(x): - check_type(input_x, 'input[' + str(i) + ']', Variable, - 'lod_tensor_to_array') + check_type( + input_x, + 'input[' + str(i) + ']', + Variable, + 'lod_tensor_to_array', + ) check_type(table, 'table', (Variable, list), 'lod_tensor_to_array') if isinstance(table, (list)): for i, table_x in enumerate(table): - check_type(table_x, 'table[' + str(i) + ']', Variable, - 'lod_tensor_to_array') + check_type( + table_x, + 'table[' + str(i) + ']', + Variable, + 'lod_tensor_to_array', + ) helper = LayerHelper("lod_tensor_to_array", **locals()) array = helper.create_variable( name=unique_name.generate("lod_tensor_to_array"), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=x.dtype) - helper.append_op(type='lod_tensor_to_array', - inputs={ - 'X': x, - 'RankTable': table - }, - outputs={'Out': array}) + dtype=x.dtype, + ) + helper.append_op( + type='lod_tensor_to_array', + inputs={'X': x, 'RankTable': table}, + outputs={'Out': array}, + ) return array @@ -1529,22 +1694,29 @@ def array_to_lod_tensor(x, table): check_type(x, 'x', (Variable, list), 'array_to_lod_tensor') if isinstance(x, (list)): for i, input_x in enumerate(x): - check_type(input_x, 'input[' + str(i) + ']', Variable, - 'array_to_lod_tensor') + check_type( + input_x, + 'input[' + str(i) + ']', + Variable, + 'array_to_lod_tensor', + ) check_type(table, 'table', (Variable, list), 'array_to_lod_tensor') if isinstance(table, (list)): 
for i, table_x in enumerate(table): - check_type(table_x, 'table[' + str(i) + ']', Variable, - 'array_to_lod_tensor') + check_type( + table_x, + 'table[' + str(i) + ']', + Variable, + 'array_to_lod_tensor', + ) helper = LayerHelper("array_to_lod_tensor", **locals()) tmp = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="array_to_lod_tensor", - inputs={ - 'X': x, - 'RankTable': table - }, - outputs={'Out': tmp}) + helper.append_op( + type="array_to_lod_tensor", + inputs={'X': x, 'RankTable': table}, + outputs={'Out': tmp}, + ) return tmp @@ -1572,17 +1744,20 @@ def increment(x, value=1.0, in_place=True): if in_dygraph_mode(): return _C_ops.increment_(x, value) - check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], - 'increment') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'increment' + ) helper = LayerHelper("increment", **locals()) if not in_place: out = helper.create_variable_for_type_inference(dtype=x.dtype) else: out = x - helper.append_op(type='increment', - inputs={'X': [x]}, - outputs={'Out': [out]}, - attrs={'step': float(value)}) + helper.append_op( + type='increment', + inputs={'X': [x]}, + outputs={'Out': [out]}, + attrs={'step': float(value)}, + ) return out @@ -1598,8 +1773,8 @@ def array_write(x, i, array=None): Tensor or LoDTensor. Data type: float32, float64, int32, int64. i (Variable): 1-D Tensor with shape [1], which represents the position into which ``x`` is written. Data type: int64. - array (LoDTensorArray, optional): The LoDTensorArray into which ``x`` is written. - The default value is None, when a new LoDTensorArray will be created and returned + array (LoDTensorArray, optional): The LoDTensorArray into which ``x`` is written. + The default value is None, when a new LoDTensorArray will be created and returned as a result. Returns: @@ -1631,8 +1806,8 @@ def array_write(x, i, array=None): # the output is 2-D Tensor with shape [3,2], which is tmp above. # dtype is the corresponding C++ data type, which may vary in different environments. - # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, - # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, + # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, + # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, # and '__int64' on Windows. They both represent 64-bit integer variables. 
""" @@ -1650,8 +1825,8 @@ def array_write(x, i, array=None): if array is None: array = create_array(x.dtype) assert isinstance( - array, - list), "The 'array' in array_write must be a list in dygraph mode" + array, list + ), "The 'array' in array_write must be a list in dygraph mode" assert i <= len( array ), "The index 'i' should not be greater than the length of 'array' in dygraph mode" @@ -1665,29 +1840,31 @@ def array_write(x, i, array=None): check_type(x, 'x', (Variable), 'array_write') helper = LayerHelper('array_write', **locals()) if array is not None: - if not isinstance( - array, Variable - ) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: + if ( + not isinstance(array, Variable) + or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY + ): raise TypeError( - "array should be tensor array vairable in array_write Op") + "array should be tensor array vairable in array_write Op" + ) if array is None: array = helper.create_variable( name="{0}.out".format(helper.name), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=x.dtype) - helper.append_op(type='write_to_array', - inputs={ - 'X': [x], - 'I': [i] - }, - outputs={'Out': [array]}) + dtype=x.dtype, + ) + helper.append_op( + type='write_to_array', + inputs={'X': [x], 'I': [i]}, + outputs={'Out': [array]}, + ) return array def create_array(dtype, initialized_list=None): """ This OP creates an LOD_TENSOR_ARRAY. It is used as - the input of :ref:`api_fluid_layers_array_read` and + the input of :ref:`api_fluid_layers_array_read` and :ref:`api_fluid_layers_array_write`. Also it can be used with :ref:`api_fluid_layers_While` to create RNN network. @@ -1711,16 +1888,20 @@ def create_array(dtype, initialized_list=None): if initialized_list is not None: if not isinstance(initialized_list, (list, tuple)): raise TypeError( - "Require type(initialized_list) should be list/tuple, but received {}" - .format(type(initialized_list))) + "Require type(initialized_list) should be list/tuple, but received {}".format( + type(initialized_list) + ) + ) array = list(initialized_list) # NOTE: Only support plain list like [x, y,...], not support nested list in static mode. for val in array: if not isinstance(val, Variable): raise TypeError( - "All values in `initialized_list` should be Variable, but recevied {}." 
- .format(type(val))) + "All values in `initialized_list` should be Variable, but recevied {}.".format( + type(val) + ) + ) if _non_static_mode(): return array @@ -1729,7 +1910,8 @@ def create_array(dtype, initialized_list=None): tensor_array = helper.create_variable( name="{0}.out".format(helper.name), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=dtype) + dtype=dtype, + ) for val in array: array_write(x=val, i=array_length(tensor_array), array=tensor_array) @@ -1766,10 +1948,12 @@ def less_than(x, y, force_cpu=None, cond=None, name=None): print(result) # [True, False, False, False] """ - check_variable_and_dtype(x, "x", ["float32", "float64", "int32", "int64"], - "less_than") - check_variable_and_dtype(y, "y", ["float32", "float64", "int32", "int64"], - "less_than") + check_variable_and_dtype( + x, "x", ["float32", "float64", "int32", "int64"], "less_than" + ) + check_variable_and_dtype( + y, "y", ["float32", "float64", "int32", "int64"], "less_than" + ) if cond is not None: check_type(cond, "cond", Variable, "less_than") if force_cpu != None: @@ -1784,13 +1968,12 @@ def less_than(x, y, force_cpu=None, cond=None, name=None): if force_cpu is not None: attrs['force_cpu'] = force_cpu - helper.append_op(type='less_than', - inputs={ - 'X': [x], - 'Y': [y] - }, - outputs={'Out': [cond]}, - attrs=attrs) + helper.append_op( + type='less_than', + inputs={'X': [x], 'Y': [y]}, + outputs={'Out': [cond]}, + attrs=attrs, + ) return cond @@ -1798,13 +1981,13 @@ def less_than(x, y, force_cpu=None, cond=None, name=None): def less_equal(x, y, cond=None, name=None): """ :alias_main: paddle.less_equal - :alias: paddle.less_equal,paddle.tensor.less_equal,paddle.tensor.logic.less_equal - :old_api: paddle.fluid.layers.less_equal + :alias: paddle.less_equal,paddle.tensor.less_equal,paddle.tensor.logic.less_equal + :old_api: paddle.fluid.layers.less_equal This OP returns the truth value of :math:`x <= y` elementwise, which is equivalent function to the overloaded operator `<=`. Args: - x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. + x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. y(Variable): Second input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. cond(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of *less_equal*. if cond is None, a new Varibale will be created to store the result. 
@@ -1825,10 +2008,12 @@ def less_equal(x, y, cond=None, name=None): out1 = label<= limit #out1=[True, False] """ - check_variable_and_dtype(x, "x", ["float32", "float64", "int32", "int64"], - "less_equal") - check_variable_and_dtype(y, "y", ["float32", "float64", "int32", "int64"], - "less_equal") + check_variable_and_dtype( + x, "x", ["float32", "float64", "int32", "int64"], "less_equal" + ) + check_variable_and_dtype( + y, "y", ["float32", "float64", "int32", "int64"], "less_equal" + ) if cond is not None: check_type(cond, "cond", Variable, "less_equal") @@ -1839,13 +2024,12 @@ def less_equal(x, y, cond=None, name=None): attrs = dict() - helper.append_op(type='less_equal', - inputs={ - 'X': [x], - 'Y': [y] - }, - outputs={'Out': [cond]}, - attrs=attrs) + helper.append_op( + type='less_equal', + inputs={'X': [x], 'Y': [y]}, + outputs={'Out': [cond]}, + attrs=attrs, + ) return cond @@ -1853,13 +2037,13 @@ def less_equal(x, y, cond=None, name=None): def greater_than(x, y, cond=None, name=None): """ :alias_main: paddle.greater_than - :alias: paddle.greater_than,paddle.tensor.greater_than,paddle.tensor.logic.greater_than - :old_api: paddle.fluid.layers.greater_than + :alias: paddle.greater_than,paddle.tensor.greater_than,paddle.tensor.logic.greater_than + :old_api: paddle.fluid.layers.greater_than This OP returns the truth value of :math:`x > y` elementwise, which is equivalent function to the overloaded operator `>`. Args: - x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. + x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. y(Variable): Second input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. cond(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of *greater_than*. if cond is None, a new Varibale will be created to store the result. 
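The fluid comparison layers reformatted in these hunks have Paddle 2.x counterparts (see their `:alias_main:` lines). A minimal sketch of the 2.x-style usage in eager mode, with illustrative values:

```python
import paddle

label = paddle.to_tensor([2, 3], dtype='int64')
limit = paddle.to_tensor([3, 2], dtype='int64')

print(paddle.less_than(label, limit))      # [True , False]
print(paddle.less_equal(label, limit))     # [True , False]
print(paddle.greater_than(label, limit))   # [False, True ]
print(paddle.greater_equal(label, limit))  # [False, True ]
print(paddle.equal(label, limit))          # [False, False]
print(paddle.not_equal(label, limit))      # [True , True ]
```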
@@ -1879,10 +2063,12 @@ def greater_than(x, y, cond=None, name=None): out = fluid.layers.greater_than(x=label, y=limit) #out=[False, True] out1 = label > limit #out1=[False, True] """ - check_variable_and_dtype(x, "x", ["float32", "float64", "int32", "int64"], - "greater_than") - check_variable_and_dtype(y, "y", ["float32", "float64", "int32", "int64"], - "greater_than") + check_variable_and_dtype( + x, "x", ["float32", "float64", "int32", "int64"], "greater_than" + ) + check_variable_and_dtype( + y, "y", ["float32", "float64", "int32", "int64"], "greater_than" + ) if cond is not None: check_type(cond, "cond", Variable, "greater_than") @@ -1896,13 +2082,12 @@ def greater_than(x, y, cond=None, name=None): if in_dygraph_mode(): return _C_ops.greater_than(x, y, -1) else: - helper.append_op(type='greater_than', - inputs={ - 'X': [x], - 'Y': [y] - }, - outputs={'Out': [cond]}, - attrs=attrs) + helper.append_op( + type='greater_than', + inputs={'X': [x], 'Y': [y]}, + outputs={'Out': [cond]}, + attrs=attrs, + ) return cond @@ -1910,13 +2095,13 @@ def greater_than(x, y, cond=None, name=None): def greater_equal(x, y, cond=None, name=None): """ :alias_main: paddle.greater_equal - :alias: paddle.greater_equal,paddle.tensor.greater_equal,paddle.tensor.logic.greater_equal - :old_api: paddle.fluid.layers.greater_equal + :alias: paddle.greater_equal,paddle.tensor.greater_equal,paddle.tensor.logic.greater_equal + :old_api: paddle.fluid.layers.greater_equal This OP returns the truth value of :math:`x >= y` elementwise, which is equivalent function to the overloaded operator `>=`. Args: - x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. + x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. y(Variable): Second input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. cond(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of *greater_equal*. if cond is None, a new Varibale will be created to store the result. @@ -1938,10 +2123,12 @@ def greater_equal(x, y, cond=None, name=None): out_1 = label >= limit #out1=[True, False] """ - check_variable_and_dtype(x, "x", ["float32", "float64", "int32", "int64"], - "greater_equal") - check_variable_and_dtype(y, "y", ["float32", "float64", "int32", "int64"], - "greater_equal") + check_variable_and_dtype( + x, "x", ["float32", "float64", "int32", "int64"], "greater_equal" + ) + check_variable_and_dtype( + y, "y", ["float32", "float64", "int32", "int64"], "greater_equal" + ) if cond is not None: check_type(cond, "cond", Variable, "greater_equal") @@ -1952,13 +2139,12 @@ def greater_equal(x, y, cond=None, name=None): attrs = dict() - helper.append_op(type='greater_equal', - inputs={ - 'X': [x], - 'Y': [y] - }, - outputs={'Out': [cond]}, - attrs=attrs) + helper.append_op( + type='greater_equal', + inputs={'X': [x], 'Y': [y]}, + outputs={'Out': [cond]}, + attrs=attrs, + ) return cond @@ -1969,7 +2155,7 @@ def equal(x, y, cond=None, name=None): Args: x(Variable): Tensor, data type is float32, float64, int32, int64. y(Variable): Tensor, data type is float32, float64, int32, int64. - cond(Variable, optional): Optional output which can be any created + cond(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of *equal*. 
if cond is None, a new Varibale will be created to store the result. name(str, optional): The default value is None. Normally there is no need for @@ -1995,10 +2181,12 @@ def equal(x, y, cond=None, name=None): default_axis = -1 return _C_ops.equal(x, y, default_axis) - check_variable_and_dtype(x, "x", ["float32", "float64", "int32", "int64"], - "equal") - check_variable_and_dtype(y, "y", ["float32", "float64", "int32", "int64"], - "equal") + check_variable_and_dtype( + x, "x", ["float32", "float64", "int32", "int64"], "equal" + ) + check_variable_and_dtype( + y, "y", ["float32", "float64", "int32", "int64"], "equal" + ) if cond is not None: check_type(cond, "cond", Variable, "equal") @@ -2007,25 +2195,22 @@ def equal(x, y, cond=None, name=None): cond = helper.create_variable_for_type_inference(dtype='bool') cond.stop_gradient = True - helper.append_op(type='equal', - inputs={ - 'X': [x], - 'Y': [y] - }, - outputs={'Out': [cond]}) + helper.append_op( + type='equal', inputs={'X': [x], 'Y': [y]}, outputs={'Out': [cond]} + ) return cond def not_equal(x, y, cond=None, name=None): """ :alias_main: paddle.not_equal - :alias: paddle.not_equal,paddle.tensor.not_equal,paddle.tensor.logic.not_equal - :old_api: paddle.fluid.layers.not_equal + :alias: paddle.not_equal,paddle.tensor.not_equal,paddle.tensor.logic.not_equal + :old_api: paddle.fluid.layers.not_equal This OP returns the truth value of :math:`x != y` elementwise, which is equivalent function to the overloaded operator `!=`. Args: - x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. + x(Variable): First input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. y(Variable): Second input to compare which is N-D tensor. The input data type should be float32, float64, int32, int64. cond(Variable, optional): Optional output which can be any created Variable that meets the requirements to store the result of *not_equal*. if cond is None, a new Varibale will be created to store the result. @@ -2039,15 +2224,17 @@ def not_equal(x, y, cond=None, name=None): .. code-block:: python import paddle.fluid as fluid - + label = fluid.layers.data(name='label', shape=[1], dtype='int64') limit = fluid.layers.fill_constant(shape=[1], value=1, dtype='int64') out = fluid.layers.not_equal(x=label, y=limit) """ - check_variable_and_dtype(x, "x", ["float32", "float64", "int32", "int64"], - "not_equal") - check_variable_and_dtype(y, "y", ["float32", "float64", "int32", "int64"], - "not_equal") + check_variable_and_dtype( + x, "x", ["float32", "float64", "int32", "int64"], "not_equal" + ) + check_variable_and_dtype( + y, "y", ["float32", "float64", "int32", "int64"], "not_equal" + ) if cond is not None: check_type(cond, "cond", Variable, "not_equal") @@ -2056,20 +2243,17 @@ def not_equal(x, y, cond=None, name=None): cond = helper.create_variable_for_type_inference(dtype='bool') cond.stop_gradient = True - helper.append_op(type='not_equal', - inputs={ - 'X': [x], - 'Y': [y] - }, - outputs={'Out': [cond]}) + helper.append_op( + type='not_equal', inputs={'X': [x], 'Y': [y]}, outputs={'Out': [cond]} + ) return cond def array_read(array, i): """ - This OP is used to read data at the specified position from the input array + This OP is used to read data at the specified position from the input array :ref:`api_fluid_LoDTensorArray` . ``array`` is the input array and ``i`` - is the specified read position. 
This OP is often used together with + is the specified read position. This OP is often used together with :ref:`api_fluid_layers_array_write` OP. Case 1: @@ -2122,14 +2306,14 @@ def array_read(array, i): # the output is 2-D Tensor with shape [3,2]. # dtype is the corresponding C++ data type, which may vary in different environments. - # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, - # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, + # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, + # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, # and '__int64' on Windows. They both represent 64-bit integer variables. """ if _non_static_mode(): assert isinstance( - array, - list), "The 'array' in array_read must be list in dygraph mode" + array, list + ), "The 'array' in array_read must be list in dygraph mode" assert isinstance( i, Variable ), "The index 'i' in array_read must be Variable in dygraph mode" @@ -2141,17 +2325,17 @@ def array_read(array, i): check_variable_and_dtype(i, 'i', ['int64'], 'array_read') helper = LayerHelper('array_read', **locals()) - if not isinstance( - array, - Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: + if ( + not isinstance(array, Variable) + or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY + ): raise TypeError("array should be tensor array vairable") out = helper.create_variable_for_type_inference(dtype=array.dtype) - helper.append_op(type='read_from_array', - inputs={ - 'X': [array], - 'I': [i] - }, - outputs={'Out': [out]}) + helper.append_op( + type='read_from_array', + inputs={'X': [array], 'I': [i]}, + outputs={'Out': [out]}, + ) return out @@ -2185,21 +2369,19 @@ def shrink_memory(x, i, table): check_type(i, 'i', Variable, 'shrink_memory') check_type(table, 'table', Variable, 'shrink_memory') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='shrink_rnn_memory', - inputs={ - 'X': [x], - 'I': [i], - 'RankTable': [table] - }, - outputs={'Out': [out]}, - attrs={}) + helper.append_op( + type='shrink_rnn_memory', + inputs={'X': [x], 'I': [i], 'RankTable': [table]}, + outputs={'Out': [out]}, + attrs={}, + ) return out def array_length(array): """ This OP is used to get the length of the input array :ref:`api_fluid_LoDTensorArray` . - It can be used together with :ref:`api_fluid_layers_array_read` , :ref:`api_fluid_layers_array_write` , + It can be used together with :ref:`api_fluid_layers_array_read` , :ref:`api_fluid_layers_array_write` , :ref:`api_fluid_layers_While` OP to traverse, read and write LoDTensorArray. Args: @@ -2233,33 +2415,35 @@ def array_length(array): # shape: [1,] # dtype: l # data: 11, - + # 1-D Tensor with shape [1], whose value is 11. It means that the length of LoDTensorArray # is 11. # dtype is the corresponding C++ data type, which may vary in different environments. - # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, - # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, + # Eg: if the data type of tensor is int64, then the corresponding C++ data type is int64_t, + # so the dtype value is typeid(int64_t).Name(), which is 'x' on MacOS, 'l' on Linux, # and '__int64' on Windows. They both represent 64-bit integer variables. 
""" if _non_static_mode(): assert isinstance( - array, - list), "The 'array' in array_write must be a list in dygraph mode" + array, list + ), "The 'array' in array_write must be a list in dygraph mode" return len(array) - if not isinstance( - array, - Variable) or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: + if ( + not isinstance(array, Variable) + or array.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY + ): raise TypeError( - "array should be tensor array vairable in array_length Op") + "array should be tensor array vairable in array_length Op" + ) helper = LayerHelper('array_length', **locals()) tmp = helper.create_variable_for_type_inference(dtype='int64') tmp.stop_gradient = True - helper.append_op(type='lod_array_length', - inputs={'X': [array]}, - outputs={'Out': [tmp]}) + helper.append_op( + type='lod_array_length', inputs={'X': [array]}, outputs={'Out': [tmp]} + ) return tmp @@ -2281,8 +2465,9 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): self.block.complete() - return super(ConditionalBlockGuard, - self).__exit__(exc_type, exc_val, exc_tb) + return super(ConditionalBlockGuard, self).__exit__( + exc_type, exc_val, exc_tb + ) class ConditionalBlock(object): @@ -2328,10 +2513,9 @@ def complete(self): intermediate = set() params = set() - params, intermediate = get_inputs_outputs_in_block(inside_block, - params, - intermediate, - helper=self.helper) + params, intermediate = get_inputs_outputs_in_block( + inside_block, params, intermediate, helper=self.helper + ) # Todo(liym27) Here assume that all params are in recursive parent block # but when minimize() called in control flow, some params may be in @@ -2347,25 +2531,25 @@ def complete(self): out_list.append(inner_var) step_scope = parent_block.create_var( - type=core.VarDesc.VarType.STEP_SCOPES) + type=core.VarDesc.VarType.STEP_SCOPES + ) conditional_block_op = parent_block.append_op( type='conditional_block', inputs={ 'Cond': self.inputs, 'Input': param_list, }, - outputs={ - 'Out': out_list, - 'Scope': [step_scope] - }, + outputs={'Out': out_list, 'Scope': [step_scope]}, attrs={ 'sub_block': inside_block, - 'is_scalar_condition': self.is_scalar_condition - }) + 'is_scalar_condition': self.is_scalar_condition, + }, + ) if self.need_append_conditional_block_grad(inside_block): - self.append_conditional_block_grad(parent_block, inside_block, - conditional_block_op) + self.append_conditional_block_grad( + parent_block, inside_block, conditional_block_op + ) def need_append_conditional_block_grad(self, inside_block): grad_sub_block_idx = inside_block.backward_block_idx @@ -2373,10 +2557,13 @@ def need_append_conditional_block_grad(self, inside_block): # if inside_block have grad_block and grad_block is not itself, # we will append conditional block grad. - return grad_sub_block_idx != -1 and grad_sub_block_idx != inside_block_idx + return ( + grad_sub_block_idx != -1 and grad_sub_block_idx != inside_block_idx + ) - def append_conditional_block_grad(self, parent_block, inside_block, - conditional_block_op): + def append_conditional_block_grad( + self, parent_block, inside_block, conditional_block_op + ): ''' Append op `conditional_block_grad` manually. 
When `optimizer.minimize/append_backward` is called in Paddle control flow, @@ -2415,8 +2602,8 @@ def append_conditional_block_grad(self, parent_block, inside_block, param_list.append(cpt.to_text(inner_var.name)) grad_op_desc, op_grad_to_var = core.get_grad_op_desc( - conditional_block_op.desc, cpt.to_text(set()), - [grad_sub_block.desc]) + conditional_block_op.desc, cpt.to_text(set()), [grad_sub_block.desc] + ) # append op_desc in grad_op_descs to target_block op_role_attr_name = core.op_proto_and_checker_maker.kOpRoleAttrName() @@ -2426,13 +2613,18 @@ def append_conditional_block_grad(self, parent_block, inside_block, new_op_desc._set_attr(op_role_attr_name, backward) # set input and output manually new_op_desc.set_input('Input', param_list) - new_op_desc.set_output('Input@GRAD', - [param + "@GRAD" for param in param_list]) + new_op_desc.set_output( + 'Input@GRAD', [param + "@GRAD" for param in param_list] + ) new_vars = set() for grad_var_name in new_op_desc.output_arg_names(): - if grad_sub_block.desc.has_var_recursive(cpt.to_bytes( - grad_var_name)) or grad_var_name == core.empty_var_name(): + if ( + grad_sub_block.desc.has_var_recursive( + cpt.to_bytes(grad_var_name) + ) + or grad_var_name == core.empty_var_name() + ): continue grad_sub_block.desc.var(cpt.to_bytes(grad_var_name)) new_vars.add(grad_var_name) @@ -2455,16 +2647,20 @@ def copy_var_to_parent_block(var, layer_helper): return var prog = layer_helper.main_program parent_idx = prog.current_block().parent_idx - assert parent_idx >= 0, "Got wrong parent block index when assigning var to parent scope in control_flow" + assert ( + parent_idx >= 0 + ), "Got wrong parent block index when assigning var to parent scope in control_flow" parent_block = prog.block(parent_idx) - if var.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY \ - and parent_block._find_var_recursive(var.name): + if ( + var.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY + and parent_block._find_var_recursive(var.name) + ): parent_block_var = var else: - parent_block_var = parent_block.create_var(dtype=var.dtype, - shape=var.shape, - type=var.type) + parent_block_var = parent_block.create_var( + dtype=var.dtype, shape=var.shape, type=var.type + ) assign(var, parent_block_var) return parent_block_var @@ -2480,8 +2676,8 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): or both return ``None`` if user doens't like to return anything. A nest structure of tensors in PaddlePaddle is tensor(s), or tuple of tensors, or list of tensors. - - Note: + + Note: 1. The tuples or lists returned by ``true_fn`` and ``false_fn`` must have the same shape because of dataflow model of PaddlePaddle while the tensors in the tuples or the lists can have different shapes. @@ -2489,7 +2685,7 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): 2. This API could be used under both static mode or dygraph mode. If it is in dygraph mode, the API only runs one branch based on condition. - 3. If it is in static mode, any tensors or operations created outside + 3. If it is in static mode, any tensors or operations created outside or inside of ``true_fn`` and ``false_fn`` will be in net building regardless of which branch is selected at runtime. This has frequently surprised users who expected a lazy semantics. For example: @@ -2518,9 +2714,9 @@ def cond(pred, true_fn=None, false_fn=None, name=None, return_names=None): name(str, optional): The default value is ``None`` . Normally users don't have to set this parameter. 
For more information, please refer to :ref:`api_guide_Name` . - return_names(sequence of string, optional): The default value is ``None`` . - Normally users don't have to set this parameters. A sequence of strings - to represents the name of returned vars. The structure of sequence must + return_names(sequence of string, optional): The default value is ``None`` . + Normally users don't have to set this parameters. A sequence of strings + to represents the name of returned vars. The structure of sequence must be same with return values of true_fn and false_fn. Returns: @@ -2566,7 +2762,7 @@ def false_func(): # ret is a tuple containing 2 tensors # ret[0] = [[1 1]] # ret[1] = [[ True True True] - # [ True True True]] + # [ True True True]] """ if _non_static_mode(): @@ -2577,15 +2773,19 @@ def false_func(): if true_fn is not None: if not callable(true_fn): raise TypeError( - "The true_fn in cond must be callable, but received {}". - format(type(true_fn).__name__)) + "The true_fn in cond must be callable, but received {}".format( + type(true_fn).__name__ + ) + ) return true_fn() else: if false_fn is not None: if not callable(false_fn): raise TypeError( - "The false_fn in cond must be callable, but received {}" - .format(type(false_fn).__name__)) + "The false_fn in cond must be callable, but received {}".format( + type(false_fn).__name__ + ) + ) return false_fn() return None @@ -2599,25 +2799,32 @@ def false_func(): if not callable(true_fn): raise TypeError( "The true_fn in cond must be callable, but received {}".format( - type(true_fn).__name__)) + type(true_fn).__name__ + ) + ) true_cond_block = ConditionalBlock([pred], is_scalar_condition=True) with true_cond_block.block(): origin_true_output = true_fn() if origin_true_output is not None: - true_output = map_structure(copy_to_parent_func, - origin_true_output) + true_output = map_structure( + copy_to_parent_func, origin_true_output + ) if false_fn is not None: if not callable(false_fn): raise TypeError( "The false_fn in cond must be callable, but received {}".format( - type(false_fn).__name__)) - false_cond_block = ConditionalBlock([logical_not(pred)], - is_scalar_condition=True) + type(false_fn).__name__ + ) + ) + false_cond_block = ConditionalBlock( + [logical_not(pred)], is_scalar_condition=True + ) with false_cond_block.block(): origin_false_output = false_fn() if origin_false_output is not None: - false_output = map_structure(copy_to_parent_func, - origin_false_output) + false_output = map_structure( + copy_to_parent_func, origin_false_output + ) if true_output is None and false_output is None: return None @@ -2625,42 +2832,109 @@ def false_func(): if true_output is None: raise ValueError( "Incompatible return values of true_fn and false_fn in cond: " - "true_fn returns None while false_fn returns non-None") + "true_fn returns None while false_fn returns non-None" + ) if false_output is None: raise ValueError( "Incompatible return values of true_fn and false_fn in cond: " - "true_fn returns non-None while false_fn returns None") + "true_fn returns non-None while false_fn returns None" + ) # Merge ture and false output if they are not None if return_names is None: - return_names = ["no name"] * len(to_sequence(true_output)) + is_dy2staic = False + return_names = ["no name"] * len(_to_sequence_except_dict(true_output)) else: - """ + """ dy2static will set the return_names and expand the return values to UndefinedVar. 
""" + is_dy2staic = True + + # TODO: expand_undefined_var will replace None to Undefinedvar(), to fix cases like: + # a = None + # if condition: + # a = 1 + # Because we can not use variable to express 'None' true_output, false_output = expand_undefined_var( - true_output, false_output, return_names) - true_output, false_output = change_none_to_undefinedvar( - true_output, false_output) - if len(to_sequence(true_output)) != len(to_sequence(false_output)): + true_output, false_output, return_names + ) + + if len(_to_sequence_except_dict(true_output)) != len( + _to_sequence_except_dict(false_output) + ): raise ValueError( - "true fn returns {} vars, but false fn returns {} vars, which is not equals" - .format(len(to_sequence(true_output)), - len(to_sequence(false_output)))) - for true_out, false_out, return_name in zip(to_sequence(true_output), - to_sequence(false_output), - to_sequence(return_names)): + "true fn returns {} vars, but false fn returns {} vars, which is not equals".format( + len(_to_sequence_except_dict(true_output)), + len(_to_sequence_except_dict(false_output)), + ) + ) + for true_out, false_out, return_name in zip( + _to_sequence_except_dict(true_output), + _to_sequence_except_dict(false_output), + _to_sequence_except_dict(return_names), + ): try: assert_same_structure(true_out, false_out, check_types=False) except ValueError as e: raise ValueError( - "Incompatible return values of `{}` in true_fn and false_fn in cond: {}" - .format(return_name, e)) + "Incompatible return values of `{}` in true_fn and false_fn in cond: {}".format( + return_name, e + ) + ) + + def check_ret_none(seq_true, seq_false, seq_names): + for f_true, f_false, f_name in zip(seq_true, seq_false, seq_names): + f_true = flatten(f_true) + f_false = flatten(f_false) + for idx in range(len(f_true)): + if ( + f_true[idx] is None + and f_false[idx] is not None + or f_false[idx] is None + and f_true[idx] is not None + ): + warnings.warn( + "In cond : Var '{}' or part of it is set differently in ifelse branchs, " + "<{}, {}> in true branch and <{}, {}> in false branch. 
Set var to " + "'None' in ifelse block might lead to error.".format( + f_name, + type(f_true[idx]), + f_true[idx], + type(f_false[idx]), + f_false[idx], + ) + ) + + check_ret_none( + _to_sequence_except_dict(true_output), + _to_sequence_except_dict(false_output), + _to_sequence_except_dict(return_names), + ) + + if is_dy2staic: + true_output, false_output = change_none_to_undefinedvar( + true_output, false_output + ) mask = cast(pred, dtype='int32') - merge_func = lambda false_var, true_var: select_input_with_buildin_type( - [false_var, true_var], mask) - merged_output = map_structure(merge_func, false_output, true_output) + merge_func = ( + lambda name, false_var, true_var: select_input_with_buildin_type( + [false_var, true_var], mask, name + ) + ) + + def merge_every_var_list(false_vars, true_vars, name): + return map_structure(partial(merge_func, name), false_vars, true_vars) + + merged_output = list( + map( + merge_every_var_list, + _to_sequence_except_dict(false_output), + _to_sequence_except_dict(true_output), + _to_sequence_except_dict(return_names), + ) + ) + merged_output = pack_sequence_as(false_output, flatten(merged_output)) return merged_output @@ -2668,7 +2942,8 @@ def change_none_to_undefinedvar(nest1, nest2): from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar def map_fn(x): - if x is None: return UndefinedVar("padding") + if x is None: + return UndefinedVar("padding") return x nest1_out = pack_sequence_as(nest1, list(map(map_fn, flatten(nest1)))) @@ -2676,42 +2951,100 @@ def map_fn(x): return nest1_out, nest2_out +def _to_sequence_except_dict(x): + """ + In this function, dict is not viewed as sequence. + """ + if isinstance(x, dict): + return [x] + return to_sequence(x) + + +def _is_sequence_except_dict(x): + """ + In this function, dict is not viewed as sequence. + """ + if isinstance(x, dict): + return False + return is_sequence(x) + + def expand_undefined_var(nest1, nest2, names): - """ TODO: make this function recursively. - nest1: Var1, (UndefinedVar, [1,2,3]) - nest2: Var2, ([1,2,3,4], UndefinedVar) - In this case, we should not expand recursively. + """TODO: make this function recursively. + nest1: Var1, (UndefinedVar, [1,2,3]) + nest2: Var2, ([1,2,3,4], UndefinedVar) + In this case, we should not expand recursively. """ from paddle.fluid.dygraph.dygraph_to_static.utils import UndefinedVar - from paddle.fluid.dygraph.dygraph_to_static.return_transformer import RETURN_VALUE_PREFIX + from paddle.fluid.dygraph.dygraph_to_static.return_transformer import ( + RETURN_VALUE_PREFIX, + ) def pack_undefined_var_as(seq): - return pack_sequence_as(seq, - [UndefinedVar("padding") for i in flatten(seq)]) - - def map_fn(n1, n2, name): - if not name.startswith(RETURN_VALUE_PREFIX) and (isinstance( - n1, UndefinedVar) or n1 is None): + return pack_sequence_as( + seq, [UndefinedVar("padding") for i in flatten(seq)] + ) + + def map_fn(n1, n2, name, order): + if not name.startswith(RETURN_VALUE_PREFIX) and ( + isinstance(n1, UndefinedVar) or n1 is None + ): + if n1 is None and n2 is not None: + if order == 0: + warnings.warn( + "In cond : Var '{}' or part of it is set differently in ifelse branchs, " + "<{}, {}> in true branch and <{}, {}> in false branch. Set var to " + "'None' in ifelse block might lead to error.".format( + name, type(n1), n1, type(n2), n2 + ) + ) + else: + warnings.warn( + "In cond : Var '{}' or part of it is set differently in ifelse branchs, " + "<{}, {}> in true branch and <{}, {}> in false branch. 
Set var to " + "'None' in ifelse block might lead to error.".format( + name, type(n2), n2, type(n1), n1 + ) + ) return pack_undefined_var_as(n2) return n1 nest1_out = list( - map(map_fn, to_sequence(nest1), to_sequence(nest2), to_sequence(names))) + map( + map_fn, + _to_sequence_except_dict(nest1), + _to_sequence_except_dict(nest2), + _to_sequence_except_dict(names), + [0 for i in _to_sequence_except_dict(names)], + ) + ) nest2_out = list( - map(map_fn, to_sequence(nest2), to_sequence(nest1), to_sequence(names))) - if not is_sequence(nest1): nest1_out = nest1_out[0] - if not is_sequence(nest2): nest2_out = nest2_out[0] + map( + map_fn, + _to_sequence_except_dict(nest2), + _to_sequence_except_dict(nest1), + _to_sequence_except_dict(names), + [1 for i in _to_sequence_except_dict(names)], + ) + ) + if not _is_sequence_except_dict(nest1): + nest1_out = nest1_out[0] + if not _is_sequence_except_dict(nest2): + nest2_out = nest2_out[0] return nest1_out, nest2_out def _error_message(what, arg_name, op_name, right_value, error_value): - error_message = "{what} of '{arg_name}' in {op_name} must be " \ + error_message = ( + "{what} of '{arg_name}' in {op_name} must be " "{right_value}, but received: {error_value}.".format( - what=what, - arg_name=arg_name, - op_name=op_name, - right_value=right_value, - error_value=error_value) + what=what, + arg_name=arg_name, + op_name=op_name, + right_value=right_value, + error_value=error_value, + ) + ) return error_message @@ -2792,24 +3125,42 @@ def _case_check_args(pred_fn_pairs, default): for pred_fn in pred_fn_pairs: if not isinstance(pred_fn, tuple): raise TypeError( - _error_message("The elements' type", "pred_fn_pairs", - "case", tuple, type(pred_fn))) + _error_message( + "The elements' type", + "pred_fn_pairs", + "case", + tuple, + type(pred_fn), + ) + ) if len(pred_fn) != 2: raise TypeError( - _error_message("The tuple's size", "pred_fn_pairs", "case", - "2", - str(len(pred_fn)) + "-tuple")) + _error_message( + "The tuple's size", + "pred_fn_pairs", + "case", + "2", + str(len(pred_fn)) + "-tuple", + ) + ) pred, fn = pred_fn if not isinstance(pred, Variable): raise TypeError( - _error_message("The pred's type", "pred_fn_pairs", "case", - "boolean Variable", type(pred))) + _error_message( + "The pred's type", + "pred_fn_pairs", + "case", + "boolean Variable", + type(pred), + ) + ) if not callable(fn): raise TypeError( "The fn for {} of pred_fn_pairs in Op(case) must" - " be callable.".format(pred.name)) + " be callable.".format(pred.name) + ) if default is None: default_index = len(pred_fn_pairs) - 1 # pick the last one @@ -2835,11 +3186,11 @@ class Switch(object): """ :api_attr: Static Graph - This class is used to implement Switch branch control function. - Switch branch contains several case branches and one default branch. - Switch control flow checks whether the case branch conditions are satisfied in turn, - and only executes the statement after the first case branch that satisfies the conditions. - If there is no case branch that satisfies the condition, + This class is used to implement Switch branch control function. + Switch branch contains several case branches and one default branch. + Switch control flow checks whether the case branch conditions are satisfied in turn, + and only executes the statement after the first case branch that satisfies the conditions. + If there is no case branch that satisfies the condition, only the statement following the default branch is executed. 
Note: @@ -2848,7 +3199,7 @@ class Switch(object): Member Functions: case(condition): The case branch of Switch whose parameter cond is a scalar Variable of bool type. Only if the cond of the current case branch is True and the cond of the previous case branch is False, the statement after the case branch will be executed, and the statement after the case branch will not be executed. - + default(): The default branch of Switch. When cond of all case branches is False, the statement after default branch is executed. Case and default functions can only be used inside the scope of Switch, as shown below: @@ -2870,7 +3221,7 @@ class Switch(object): Examples: .. code-block:: python - + import paddle.fluid as fluid lr = fluid.layers.create_global_var( @@ -2911,8 +3262,11 @@ def case(self, condition): raise ValueError("case should be called inside with") check_variable_and_dtype( - condition, 'condition', ['bool'], - 'the member function case of fluid.layers.Switch') + condition, + 'condition', + ['bool'], + 'the member function case of fluid.layers.Switch', + ) if len(self.pre_not_conditions) == 0: cond_block = ConditionalBlock([condition], is_scalar_condition=True) @@ -2921,12 +3275,14 @@ def case(self, condition): else: pre_cond_num = len(self.pre_not_conditions) pre_not_cond = self.pre_not_conditions[pre_cond_num - 1] - new_not_cond = logical_and(x=pre_not_cond, - y=logical_not(x=condition)) + new_not_cond = logical_and( + x=pre_not_cond, y=logical_not(x=condition) + ) self.pre_not_conditions.append(new_not_cond) cond_block = ConditionalBlock( [logical_and(x=pre_not_cond, y=condition)], - is_scalar_condition=True) + is_scalar_condition=True, + ) return ConditionalBlockGuard(cond_block) @@ -2936,7 +3292,8 @@ def default(self): raise ValueError("there should be at least one condition") cond_block = ConditionalBlock( [self.pre_not_conditions[pre_cond_num - 1]], - is_scalar_condition=True) + is_scalar_condition=True, + ) return ConditionalBlockGuard(cond_block) def __enter__(self): @@ -2956,7 +3313,6 @@ def __exit__(self, exc_type, exc_val, exc_tb): class IfElseBlockGuard(object): - def __init__(self, is_true, ifelse): if not isinstance(ifelse, IfElse): raise TypeError("ifelse must be an instance of IfElse class") @@ -2977,7 +3333,11 @@ def __init__(self, is_true, ifelse): self.cond_block = self.cond_block.block() def __enter__(self): - self.ie.status = IfElse.IN_IF_ELSE_TRUE_BLOCKS if self.is_true else IfElse.IN_IF_ELSE_FALSE_BLOCKS + self.ie.status = ( + IfElse.IN_IF_ELSE_TRUE_BLOCKS + if self.is_true + else IfElse.IN_IF_ELSE_FALSE_BLOCKS + ) self.cond_block.__enter__() def __exit__(self, exc_type, exc_val, exc_tb): @@ -3004,7 +3364,7 @@ class IfElse(object): IfElse OP is different from other OPs in usage, which may cause some users confusion. Here is a simple example to illustrate this OP. .. code-block:: python - + # The following code completes the function: subtract 10 from the data greater than 0 in x, add 10 to the data less than 0 in x, and sum all the data. import numpy as np import paddle.fluid as fluid @@ -3014,7 +3374,7 @@ class IfElse(object): x_d = np.array([[3], [1], [-2], [-3]]).astype(np.float32) y_d = np.zeros((4, 1)).astype(np.float32) - + # Compare the size of x, y pairs of elements, output cond, cond is shape [4, 1], data type bool 2-D tensor. # Based on the input data x_d, y_d, it can be inferred that the data in cond are [[true], [true], [false], [false]]. 
cond = fluid.layers.greater_than(x, y) @@ -3033,7 +3393,7 @@ class IfElse(object): ie.output(out_1) # According to cond condition, the data processed in the two blocks are merged. The output here is output, the type is List, and the element type in List is Variable. - output = ie() # [array([[-7.], [-9.], [ 8.], [ 7.]], dtype=float32)] + output = ie() # [array([[-7.], [-9.], [ 8.], [ 7.]], dtype=float32)] # Get the first Variable in the output List and add all elements. out = fluid.layers.reduce_sum(output[0]) @@ -3043,7 +3403,7 @@ class IfElse(object): res = exe.run(fluid.default_main_program(), feed={"x":x_d, "y":y_d}, fetch_list=[out]) print(res) - # [array([-1.], dtype=float32)] + # [array([-1.], dtype=float32)] Args: cond (Variable): cond is a 2-D Tensor with shape [N, 1] and data type bool, representing the corresponding execution conditions of N input data. The data type is bool. @@ -3054,7 +3414,7 @@ class IfElse(object): Internal Functions: The block is constructed by calling the ``with ie. true_block()`` function in the object, and the computational logic under condition true is put into the block. If no corresponding block is constructed, the input data in the corresponding conditional dimension is unchanged. - + The block is constructed by calling the ``with ie. false_block()`` function in the object, and the computational logic under condition false is put into the block. If no corresponding block is constructed, the input data in the corresponding conditional dimension is unchanged. ``Out = ie. input (x)`` will take out the data of the corresponding conditional dimension in X and put it into out, supporting the internal processing of multiple inputs in block. @@ -3064,6 +3424,7 @@ class IfElse(object): There is a ``call ()`` function inside the object, that is, by calling ``output = ie ()``, all the outputs inside the block of False are fused as the whole output, the output type is a list, and the type of each element in the list is Variable. 
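For reference, the result that the IfElse docstring example above computes ([-1.]) can be reproduced much more compactly with elementwise selection; a minimal eager-mode sketch using `paddle.where`:

```python
import numpy as np
import paddle

x = paddle.to_tensor(np.array([[3], [1], [-2], [-3]], dtype=np.float32))
y = paddle.zeros([4, 1])

# Subtract 10 where x > y, add 10 elsewhere, then sum all entries.
out = paddle.where(x > y, x - 10, x + 10).sum()
print(out.numpy())  # sums to -1., matching the IfElse example above
```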
""" + OUT_IF_ELSE_BLOCKS = 0 IN_IF_ELSE_TRUE_BLOCKS = 1 IN_IF_ELSE_FALSE_BLOCKS = 2 @@ -3085,24 +3446,27 @@ def input(self, x): if id(x) not in self.input_table: parent_block = self._parent_block() out_true = parent_block.create_var( - name=unique_name.generate_with_ignorable_key('ifelse_input' + - self.helper.name), - dtype=x.dtype) + name=unique_name.generate_with_ignorable_key( + 'ifelse_input' + self.helper.name + ), + dtype=x.dtype, + ) out_false = parent_block.create_var( - name=unique_name.generate_with_ignorable_key('ifelse_input' + - self.helper.name), - dtype=x.dtype) - parent_block.append_op(type='split_lod_tensor', - inputs={ - 'X': x, - 'Mask': self.cond, - }, - outputs={ - 'OutTrue': out_true, - 'OutFalse': out_false - }, - attrs={'level': 0}) + name=unique_name.generate_with_ignorable_key( + 'ifelse_input' + self.helper.name + ), + dtype=x.dtype, + ) + parent_block.append_op( + type='split_lod_tensor', + inputs={ + 'X': x, + 'Mask': self.cond, + }, + outputs={'OutTrue': out_true, 'OutFalse': out_false}, + attrs={'level': 0}, + ) self.input_table[id(x)] = (out_true, out_false) else: out_true, out_false = self.input_table[id(x)] @@ -3126,17 +3490,21 @@ def output(self, *outs): if self.status == self.OUT_IF_ELSE_BLOCKS: raise ValueError("output can only be invoked in the sub-block") - out_table = self.output_table[1 if self.status == - self.IN_IF_ELSE_TRUE_BLOCKS else 0] + out_table = self.output_table[ + 1 if self.status == self.IN_IF_ELSE_TRUE_BLOCKS else 0 + ] parent_block = self._parent_block() for each_out in outs: - check_type(each_out, "each output", Variable, - "fluid.layers.IfElse.output") + check_type( + each_out, "each output", Variable, "fluid.layers.IfElse.output" + ) # create outside tensor outside_out = parent_block.create_var( - name=unique_name.generate_with_ignorable_key("_".join( - [self.helper.name, 'output'])), - dtype=each_out.dtype) + name=unique_name.generate_with_ignorable_key( + "_".join([self.helper.name, 'output']) + ), + dtype=each_out.dtype, + ) out_table.append(outside_out) # assign local var to outside @@ -3147,8 +3515,9 @@ def __call__(self): raise ValueError("IfElse::__call__ must be out of sub-block") false_len, true_len = list(map(len, self.output_table)) if false_len == 0 and true_len == 0: - raise ValueError("Must invoke true_block/false_block before " - "__call__") + raise ValueError( + "Must invoke true_block/false_block before " "__call__" + ) elif false_len != true_len and false_len != 0 and true_len != 0: raise ValueError("The output side must be same") elif false_len == 0 or true_len == 0: @@ -3159,11 +3528,14 @@ def __call__(self): rlist = [] for false_var, true_var in zip(*self.output_table): rlist.append( - merge_lod_tensor(in_true=true_var, - in_false=false_var, - mask=self.cond, - x=self.cond, - level=0)) + merge_lod_tensor( + in_true=true_var, + in_false=false_var, + mask=self.cond, + x=self.cond, + level=0, + ) + ) return rlist @@ -3234,6 +3606,7 @@ class DynamicRNN(object): # Get RNN's result of the last time step last = fluid.layers.sequence_last_step(out) """ + BEFORE_RNN = 0 IN_RNN = 1 AFTER_RNN = 2 @@ -3351,39 +3724,44 @@ def step_input(self, x, level=0): if self.lod_rank_table is None: self.lod_rank_table = parent_block.create_var( name=unique_name.generate('lod_rank_table'), - type=core.VarDesc.VarType.LOD_RANK_TABLE) + type=core.VarDesc.VarType.LOD_RANK_TABLE, + ) self.lod_rank_table.stop_gradient = True - parent_block.append_op(type='lod_rank_table', - inputs={"X": x}, - outputs={"Out": self.lod_rank_table}, - 
attrs={"level": level}) + parent_block.append_op( + type='lod_rank_table', + inputs={"X": x}, + outputs={"Out": self.lod_rank_table}, + attrs={"level": level}, + ) self.max_seq_len = parent_block.create_var( name=unique_name.generate('dynamic_rnn_max_seq_len'), - dtype='int64') + dtype='int64', + ) self.max_seq_len.stop_gradient = False - parent_block.append_op(type='max_sequence_len', - inputs={'RankTable': self.lod_rank_table}, - outputs={"Out": self.max_seq_len}) + parent_block.append_op( + type='max_sequence_len', + inputs={'RankTable': self.lod_rank_table}, + outputs={"Out": self.max_seq_len}, + ) self.cond.stop_gradient = True - parent_block.append_op(type='less_than', - inputs={ - 'X': self.step_idx, - 'Y': self.max_seq_len - }, - outputs={'Out': self.cond}, - attrs={'force_cpu': True}) + parent_block.append_op( + type='less_than', + inputs={'X': self.step_idx, 'Y': self.max_seq_len}, + outputs={'Out': self.cond}, + attrs={'force_cpu': True}, + ) input_array = parent_block.create_var( name=unique_name.generate('dynamic_rnn_input_array'), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=x.dtype) + dtype=x.dtype, + ) self.input_array.append((input_array, x.dtype)) - parent_block.append_op(type='lod_tensor_to_array', - inputs={ - 'X': x, - 'RankTable': self.lod_rank_table - }, - outputs={'Out': input_array}) + parent_block.append_op( + type='lod_tensor_to_array', + inputs={'X': x, 'RankTable': self.lod_rank_table}, + outputs={'Out': input_array}, + ) return array_read(array=input_array, i=self.step_idx) def static_input(self, x): @@ -3516,18 +3894,19 @@ def static_input(self, x): check_type(x, 'x', Variable, 'fluid.layers.DynamicRNN.static_input()') if self.lod_rank_table is None: raise RuntimeError( - "static_input() must be called after step_input().") + "static_input() must be called after step_input()." 
+ ) parent_block = self._parent_block_() x_reordered = parent_block.create_var( name=unique_name.generate("dynamic_rnn_static_input_reordered"), type=core.VarDesc.VarType.LOD_TENSOR, - dtype=x.dtype) - parent_block.append_op(type='reorder_lod_tensor_by_rank', - inputs={ - 'X': [x], - 'RankTable': [self.lod_rank_table] - }, - outputs={'Out': [x_reordered]}) + dtype=x.dtype, + ) + parent_block.append_op( + type='reorder_lod_tensor_by_rank', + inputs={'X': [x], 'RankTable': [self.lod_rank_table]}, + outputs={'Out': [x_reordered]}, + ) return shrink_memory(x_reordered, self.step_idx, self.lod_rank_table) @signature_safe_contextmanager @@ -3542,10 +3921,9 @@ def block(self): """ if self.status != DynamicRNN.BEFORE_RNN: raise ValueError("rnn.block() can only be invoke once") - self.step_idx = fill_constant(shape=[1], - dtype='int64', - value=0, - force_cpu=True) + self.step_idx = fill_constant( + shape=[1], dtype='int64', value=0, force_cpu=True + ) self.step_idx.stop_gradient = False self.status = DynamicRNN.IN_RNN with self.while_op.block(): @@ -3555,15 +3933,18 @@ def block(self): for new_mem, mem_array in self.mem_link: array_write(x=new_mem, i=self.step_idx, array=mem_array) - less_than(x=self.step_idx, - y=self.max_seq_len, - force_cpu=True, - cond=self.cond) + less_than( + x=self.step_idx, + y=self.max_seq_len, + force_cpu=True, + cond=self.cond, + ) self.status = DynamicRNN.AFTER_RNN for each_array in self.output_array: self.outputs.append( - array_to_lod_tensor(x=each_array, table=self.lod_rank_table)) + array_to_lod_tensor(x=each_array, table=self.lod_rank_table) + ) def __call__(self, *args, **kwargs): """ @@ -3579,19 +3960,25 @@ def __call__(self, *args, **kwargs): ValueError: When :code:`__call__()` is called before :code:`block()` . """ if self.status != DynamicRNN.AFTER_RNN: - raise ValueError(("Output of the dynamic RNN can only be visited " - "outside the rnn block.")) + raise ValueError( + ( + "Output of the dynamic RNN can only be visited " + "outside the rnn block." + ) + ) if len(self.outputs) == 1: return self.outputs[0] else: return self.outputs - def memory(self, - init=None, - shape=None, - value=0.0, - need_reorder=False, - dtype='float32'): + def memory( + self, + init=None, + shape=None, + value=0.0, + need_reorder=False, + dtype='float32', + ): r""" Create a memory Variable for DynamicRNN to deliver data cross time steps. It can be initialized by an existing Tensor or a constant Tensor of given @@ -3680,11 +4067,16 @@ def memory(self, self._assert_in_rnn_block_('memory') self._init_zero_idx_() if shape is not None: - check_type(shape, 'shape', (list, tuple), - 'fluid.layers.DynamicRNN.memory()') + check_type( + shape, + 'shape', + (list, tuple), + 'fluid.layers.DynamicRNN.memory()', + ) if init is not None: - check_type(init, 'init', Variable, - 'fluid.layers.DynamicRNN.memory()') + check_type( + init, 'init', Variable, 'fluid.layers.DynamicRNN.memory()' + ) parent_block = self._parent_block_() init_tensor = init if need_reorder == True: @@ -3692,32 +4084,36 @@ def memory(self, raise ValueError( 'If set need_reorder to True, make sure step_input be ' 'invoked before ' - 'memory(init=init, need_reordered=True, ...).') + 'memory(init=init, need_reordered=True, ...).' 
+ ) init_reordered = parent_block.create_var( name=unique_name.generate('dynamic_rnn_mem_init_reordered'), type=core.VarDesc.VarType.LOD_TENSOR, - dtype=init.dtype) - parent_block.append_op(type='reorder_lod_tensor_by_rank', - inputs={ - 'X': [init_tensor], - 'RankTable': [self.lod_rank_table] - }, - outputs={'Out': [init_reordered]}) + dtype=init.dtype, + ) + parent_block.append_op( + type='reorder_lod_tensor_by_rank', + inputs={ + 'X': [init_tensor], + 'RankTable': [self.lod_rank_table], + }, + outputs={'Out': [init_reordered]}, + ) init_tensor = init_reordered mem_array = parent_block.create_var( name=unique_name.generate('dynamic_rnn_mem_array'), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=init.dtype) - parent_block.append_op(type='write_to_array', - inputs={ - 'X': init_tensor, - 'I': self.zero_idx - }, - outputs={'Out': mem_array}) + dtype=init.dtype, + ) + parent_block.append_op( + type='write_to_array', + inputs={'X': init_tensor, 'I': self.zero_idx}, + outputs={'Out': mem_array}, + ) retv = array_read(array=mem_array, i=self.step_idx) - retv = shrink_memory(x=retv, - i=self.step_idx, - table=self.lod_rank_table) + retv = shrink_memory( + x=retv, i=self.step_idx, table=self.lod_rank_table + ) self.mem_dict[retv.name] = mem_array return retv else: @@ -3727,24 +4123,27 @@ def memory(self, ) parent_block = self._parent_block_() init = parent_block.create_var( - name=unique_name.generate('mem_init'), dtype=dtype) + name=unique_name.generate('mem_init'), dtype=dtype + ) arr, dtype = self.input_array[0] - in0 = parent_block.create_var(name=unique_name.generate('in0'), - dtype=dtype) - parent_block.append_op(type='read_from_array', - inputs={ - 'X': [arr], - 'I': [self.zero_idx] - }, - outputs={'Out': [in0]}) - parent_block.append_op(type='fill_constant_batch_size_like', - inputs={'Input': [in0]}, - outputs={'Out': [init]}, - attrs={ - 'shape': [-1] + shape, - 'value': float(value), - 'dtype': init.dtype - }) + in0 = parent_block.create_var( + name=unique_name.generate('in0'), dtype=dtype + ) + parent_block.append_op( + type='read_from_array', + inputs={'X': [arr], 'I': [self.zero_idx]}, + outputs={'Out': [in0]}, + ) + parent_block.append_op( + type='fill_constant_batch_size_like', + inputs={'Input': [in0]}, + outputs={'Out': [init]}, + attrs={ + 'shape': [-1] + shape, + 'value': float(value), + 'dtype': init.dtype, + }, + ) return self.memory(init=init) def update_memory(self, ex_mem, new_mem): @@ -3758,7 +4157,7 @@ def update_memory(self, ex_mem, new_mem): Returns: None - + Raises: ValueError: When :code:`update_memory()` is called outside :code:`block()` . TypeError: When :code:`ex_mem` or :code:`new_mem` is not a Variable. @@ -3766,10 +4165,18 @@ def update_memory(self, ex_mem, new_mem): ValueError: When :code:`update_memory()` is called before :code:`step_input()` . 
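The DynamicRNN methods being reformatted here (step_input, memory, update_memory, output) are meant to be used together inside rnn.block(). A minimal static-graph sketch, not part of the patch, assuming the legacy fluid API:

.. code-block:: python

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    sentence = fluid.data(
        name='sentence', shape=[None, 32], dtype='float32', lod_level=1)
    drnn = fluid.layers.DynamicRNN()
    with drnn.block():
        word = drnn.step_input(sentence)   # slice the LoDTensor per time step
        prev = drnn.memory(shape=[200])    # constant-initialized memory
        hidden = fluid.layers.fc(input=[word, prev], size=200, act='relu')
        drnn.update_memory(prev, hidden)   # hidden becomes next step's memory
        drnn.output(hidden)
    rnn_output = drnn()                    # per-step outputs as one LoDTensor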
""" self._assert_in_rnn_block_('update_memory') - check_type(ex_mem, 'ex_mem', Variable, - 'fluid.layers.DynamicRNN.update_memory()') - check_type(new_mem, 'new_mem', Variable, - 'fluid.layers.DynamicRNN.update_memory()') + check_type( + ex_mem, + 'ex_mem', + Variable, + 'fluid.layers.DynamicRNN.update_memory()', + ) + check_type( + new_mem, + 'new_mem', + Variable, + 'fluid.layers.DynamicRNN.update_memory()', + ) mem_array = self.mem_dict.get(ex_mem.name, None) if mem_array is None: @@ -3796,13 +4203,16 @@ def output(self, *outputs): self._assert_in_rnn_block_('output') parent_block = self._parent_block_() for each in outputs: - check_type(each, "outputs", Variable, - "fluid.layers.DynamicRNN.output") + check_type( + each, "outputs", Variable, "fluid.layers.DynamicRNN.output" + ) outside_array = parent_block.create_var( - name=unique_name.generate_with_ignorable_key("_".join( - [self.helper.name, "output_array", each.name])), + name=unique_name.generate_with_ignorable_key( + "_".join([self.helper.name, "output_array", each.name]) + ), type=core.VarDesc.VarType.LOD_TENSOR_ARRAY, - dtype=each.dtype) + dtype=each.dtype, + ) array_write(x=each, i=self.step_idx, array=outside_array) self.output_array.append(outside_array) @@ -3810,16 +4220,19 @@ def _init_zero_idx_(self): if self.zero_idx is None: parent_block = self._parent_block_() self.zero_idx = parent_block.create_var( - name=unique_name.generate('zero_idx'), dtype='int64') - parent_block.append_op(type='fill_constant', - inputs={}, - outputs={'Out': [self.zero_idx]}, - attrs={ - 'shape': [1], - 'dtype': self.zero_idx.dtype, - 'value': float(0), - 'force_cpu': True - }) + name=unique_name.generate('zero_idx'), dtype='int64' + ) + parent_block.append_op( + type='fill_constant', + inputs={}, + outputs={'Out': [self.zero_idx]}, + attrs={ + 'shape': [1], + 'dtype': self.zero_idx.dtype, + 'value': float(0), + 'force_cpu': True, + }, + ) def _parent_block_(self): prog = self.helper.main_program @@ -3832,7 +4245,8 @@ def _parent_block_(self): def _assert_in_rnn_block_(self, method): if self.status != DynamicRNN.IN_RNN: raise ValueError( - "{0} can only be invoked inside rnn block.".format(method)) + "{0} can only be invoked inside rnn block.".format(method) + ) def switch_case(branch_index, branch_fns, default=None, name=None): @@ -3909,44 +4323,71 @@ def fn_3(): def _check_args(branch_index, branch_fns, default): - check_variable_and_dtype(branch_index, 'branch_index', - ['uint8', 'int32', 'int64'], 'switch_case') + check_variable_and_dtype( + branch_index, + 'branch_index', + ['uint8', 'int32', 'int64'], + 'switch_case', + ) if convert_dtype(branch_index.dtype) != "int64": branch_index = cast(branch_index, "int64") check_type(branch_fns, 'branch_fns', (list, tuple, dict), 'switch_case') - branch_fns = branch_fns.items() if isinstance(branch_fns, - dict) else branch_fns + branch_fns = ( + branch_fns.items() if isinstance(branch_fns, dict) else branch_fns + ) - branch_fns = list(enumerate(branch_fns)) if all( - callable(fn) for fn in branch_fns) else branch_fns + branch_fns = ( + list(enumerate(branch_fns)) + if all(callable(fn) for fn in branch_fns) + else branch_fns + ) keys_of_fns = [] for index_fn_pair in branch_fns: if not isinstance(index_fn_pair, tuple): raise TypeError( - _error_message("The elements' type", "branch_fns", - "switch_case", tuple, type(branch_fns))) + _error_message( + "The elements' type", + "branch_fns", + "switch_case", + tuple, + type(branch_fns), + ) + ) if len(index_fn_pair) != 2: raise TypeError( - 
_error_message("The tuple's size", "branch_fns", - "switch_case", "2", - str(len(index_fn_pair)) + "-tuple")) + _error_message( + "The tuple's size", + "branch_fns", + "switch_case", + "2", + str(len(index_fn_pair)) + "-tuple", + ) + ) key, fn = index_fn_pair if not isinstance(key, int): raise TypeError( - _error_message("The key's type", "branch_fns", - "switch_case", int, type(key))) + _error_message( + "The key's type", + "branch_fns", + "switch_case", + int, + type(key), + ) + ) if key in keys_of_fns: raise ValueError( - "The key in 'branch_fns' must be unique, but '{}' appears more than once." - .format(key)) + "The key in 'branch_fns' must be unique, but '{}' appears more than once.".format( + key + ) + ) else: keys_of_fns.append(key) @@ -3954,7 +4395,12 @@ def _check_args(branch_index, branch_fns, default): raise TypeError( _error_message( "The type of function for key {}".format(key), - "branch_fns", "switch_case", "callable", type(fn))) + "branch_fns", + "switch_case", + "callable", + type(fn), + ) + ) if default is None: default = sorted(branch_fns)[-1][1] @@ -3987,7 +4433,7 @@ def reorder_lod_tensor_by_rank(x, rank_table): Args: x(${x_type}): ${x_comment}. rank_table(${rank_table_type}): ${rank_table_comment}. - + Returns: out(${out_type}): ${out_comment}. @@ -4005,20 +4451,20 @@ def reorder_lod_tensor_by_rank(x, rank_table): """ check_type(x, 'x', (Variable), 'reorder_lod_tensor_by_rank') - check_type(rank_table, 'rank_table', (Variable), - 'reorder_lod_tensor_by_rank') + check_type( + rank_table, 'rank_table', (Variable), 'reorder_lod_tensor_by_rank' + ) if rank_table.type != core.VarDesc.VarType.LOD_RANK_TABLE: raise TypeError("The type of rank_table should be LOD_RANK_TABLE.") helper = LayerHelper('reorder_lod_tensor_by_rank', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='reorder_lod_tensor_by_rank', - inputs={ - 'X': [x], - 'RankTable': [rank_table] - }, - outputs={'Out': [out]}) + helper.append_op( + type='reorder_lod_tensor_by_rank', + inputs={'X': [x], 'RankTable': [rank_table]}, + outputs={'Out': [out]}, + ) return out @@ -4057,14 +4503,16 @@ def is_empty(x, name=None): if _in_legacy_dygraph(): return _legacy_C_ops.is_empty(x) - check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], - 'is_empty') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'is_empty' + ) check_type(name, "name", (str, type(None)), "is_empty") helper = LayerHelper("is_empty", **locals()) cond = helper.create_variable_for_type_inference(dtype='bool') cond.stop_gradient = True - helper.append_op(type='is_empty', - inputs={'X': [x]}, - outputs={'Out': [cond]}) + + helper.append_op( + type='is_empty', inputs={'X': [x]}, outputs={'Out': [cond]} + ) return cond diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3300a9fc4920b7..8766e3982d673e 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -836,7 +836,7 @@ def box_coder(prior_box, **Box Coder Layer** Encode/Decode the target bounding box with the priorbox information. - + The Encoding schema described below: .. math:: @@ -845,78 +845,78 @@ def box_coder(prior_box, oy = (ty - py) / ph / pyv - ow = \log(\abs(tw / pw)) / pwv + ow = \log(\abs(tw / pw)) / pwv - oh = \log(\abs(th / ph)) / phv + oh = \log(\abs(th / ph)) / phv The Decoding schema described below: - + .. 
math:: - + ox = (pw * pxv * tx * + px) - tw / 2 oy = (ph * pyv * ty * + py) - th / 2 ow = \exp(pwv * tw) * pw + tw / 2 - oh = \exp(phv * th) * ph + th / 2 + oh = \exp(phv * th) * ph + th / 2 - where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, - width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote - the priorbox's (anchor) center coordinates, width and height. `pxv`, - `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, - `ow`, `oh` denote the encoded/decoded coordinates, width and height. + where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, + width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote + the priorbox's (anchor) center coordinates, width and height. `pxv`, + `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, + `ow`, `oh` denote the encoded/decoded coordinates, width and height. - During Box Decoding, two modes for broadcast are supported. Say target - box has shape [N, M, 4], and the shape of prior box can be [N, 4] or - [M, 4]. Then prior box will broadcast to target box along the - assigned axis. + During Box Decoding, two modes for broadcast are supported. Say target + box has shape [N, M, 4], and the shape of prior box can be [N, 4] or + [M, 4]. Then prior box will broadcast to target box along the + assigned axis. Args: - prior_box(Variable): Box list prior_box is a 2-D Tensor with shape + prior_box(Variable): Box list prior_box is a 2-D Tensor with shape [M, 4] holds M boxes and data type is float32 or float64. Each box - is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the + is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the left top coordinate of the anchor box, if the input is image feature - map, they are close to the origin of the coordinate system. - [xmax, ymax] is the right bottom coordinate of the anchor box. - prior_box_var(List|Variable|None): prior_box_var supports three types - of input. One is variable with shape [M, 4] which holds M group and - data type is float32 or float64. The second is list consist of - 4 elements shared by all boxes and data type is float32 or float64. - Other is None and not involved in calculation. - target_box(Variable): This input can be a 2-D LoDTensor with shape - [N, 4] when code_type is 'encode_center_size'. This input also can - be a 3-D Tensor with shape [N, M, 4] when code_type is - 'decode_center_size'. Each box is represented as - [xmin, ymin, xmax, ymax]. The data type is float32 or float64. - This tensor can contain LoD information to represent a batch of inputs. + map, they are close to the origin of the coordinate system. + [xmax, ymax] is the right bottom coordinate of the anchor box. + prior_box_var(List|Variable|None): prior_box_var supports three types + of input. One is variable with shape [M, 4] which holds M group and + data type is float32 or float64. The second is list consist of + 4 elements shared by all boxes and data type is float32 or float64. + Other is None and not involved in calculation. + target_box(Variable): This input can be a 2-D LoDTensor with shape + [N, 4] when code_type is 'encode_center_size'. This input also can + be a 3-D Tensor with shape [N, M, 4] when code_type is + 'decode_center_size'. Each box is represented as + [xmin, ymin, xmax, ymax]. The data type is float32 or float64. + This tensor can contain LoD information to represent a batch of inputs. code_type(str): The code type used with the target box. 
It can be - `encode_center_size` or `decode_center_size`. `encode_center_size` + `encode_center_size` or `decode_center_size`. `encode_center_size` by default. box_normalized(bool): Whether treat the priorbox as a normalized box. Set true by default. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. - axis(int): Which axis in PriorBox to broadcast for box decode, - for example, if axis is 0 and TargetBox has shape [N, M, 4] and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + axis(int): Which axis in PriorBox to broadcast for box decode, + for example, if axis is 0 and TargetBox has shape [N, M, 4] and PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4] - for decoding. It is only valid when code type is - `decode_center_size`. Set 0 by default. + for decoding. It is only valid when code type is + `decode_center_size`. Set 0 by default. Returns: Variable: - output_box(Variable): When code_type is 'encode_center_size', the - output tensor of box_coder_op with shape [N, M, 4] representing the - result of N target boxes encoded with M Prior boxes and variances. - When code_type is 'decode_center_size', N represents the batch size + output_box(Variable): When code_type is 'encode_center_size', the + output tensor of box_coder_op with shape [N, M, 4] representing the + result of N target boxes encoded with M Prior boxes and variances. + When code_type is 'decode_center_size', N represents the batch size and M represents the number of decoded boxes. Examples: - + .. code-block:: python - + import paddle.fluid as fluid import paddle paddle.enable_static() @@ -945,45 +945,13 @@ def box_coder(prior_box, box_normalized=False, axis=1) """ - check_variable_and_dtype(prior_box, 'prior_box', ['float32', 'float64'], - 'box_coder') - check_variable_and_dtype(target_box, 'target_box', ['float32', 'float64'], - 'box_coder') - if in_dygraph_mode(): - if isinstance(prior_box_var, Variable): - box_coder_op = _C_ops.box_coder(prior_box, prior_box_var, - target_box, code_type, - box_normalized, axis, []) - elif isinstance(prior_box_var, list): - box_coder_op = _C_ops.box_coder(prior_box, None, target_box, - code_type, box_normalized, axis, - prior_box_var) - else: - raise TypeError( - "Input variance of box_coder must be Variable or lisz") - return box_coder_op - helper = LayerHelper("box_coder", **locals()) - - output_box = helper.create_variable_for_type_inference( - dtype=prior_box.dtype) - - inputs = {"PriorBox": prior_box, "TargetBox": target_box} - attrs = { - "code_type": code_type, - "box_normalized": box_normalized, - "axis": axis - } - if isinstance(prior_box_var, Variable): - inputs['PriorBoxVar'] = prior_box_var - elif isinstance(prior_box_var, list): - attrs['variance'] = prior_box_var - else: - raise TypeError("Input variance of box_coder must be Variable or lisz") - helper.append_op(type="box_coder", - inputs=inputs, - attrs=attrs, - outputs={"OutputBox": output_box}) - return output_box + return paddle.vision.ops.box_coder(prior_box=prior_box, + prior_box_var=prior_box_var, + target_box=target_box, + code_type=code_type, + box_normalized=box_normalized, + axis=axis, + name=name) @templatedoc() @@ -1872,16 +1840,16 @@ def prior_box( place = fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - + # prepare a batch of data input_data = 
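After this rewrite fluid.layers.box_coder is only a thin wrapper over paddle.vision.ops.box_coder. A dygraph sketch of calling the new target directly (random data, with shapes chosen to exercise the decode broadcast described above); not part of the patch:

.. code-block:: python

    import paddle

    paddle.disable_static()
    prior_box = paddle.rand([80, 4], dtype='float32')
    prior_box_var = paddle.rand([80, 4], dtype='float32')
    target_box = paddle.rand([20, 80, 4], dtype='float32')
    # decode 20 x 80 target boxes against 80 prior boxes
    decoded = paddle.vision.ops.box_coder(
        prior_box=prior_box,
        prior_box_var=prior_box_var,
        target_box=target_box,
        code_type='decode_center_size',
        box_normalized=False)
    print(decoded.shape)  # [20, 80, 4]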
np.random.rand(1,3,6,9).astype("float32") image_data = np.random.rand(1,3,9,12).astype("float32") - + box_out, var_out = exe.run(fluid.default_main_program(), feed={"input":input_data,"image":image_data}, fetch_list=[box,var], return_numpy=True) - + # print(box_out.shape) # (6, 9, 1, 4) # print(var_out.shape) @@ -1905,68 +1873,19 @@ def prior_box( # [6L, 9L, 1L, 4L] """ - - if in_dygraph_mode(): - step_w, step_h = steps - if max_sizes == None: - max_sizes = [] - return _C_ops.prior_box(input, image, min_sizes, aspect_ratios, - variance, max_sizes, flip, clip, step_w, step_h, - offset, min_max_aspect_ratios_order) - helper = LayerHelper("prior_box", **locals()) - dtype = helper.input_dtype() - check_variable_and_dtype(input, 'input', - ['uint8', 'int8', 'float32', 'float64'], - 'prior_box') - - def _is_list_or_tuple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) - - if not _is_list_or_tuple_(min_sizes): - min_sizes = [min_sizes] - if not _is_list_or_tuple_(aspect_ratios): - aspect_ratios = [aspect_ratios] - if not (_is_list_or_tuple_(steps) and len(steps) == 2): - raise ValueError('steps should be a list or tuple ', - 'with length 2, (step_width, step_height).') - - min_sizes = list(map(float, min_sizes)) - aspect_ratios = list(map(float, aspect_ratios)) - steps = list(map(float, steps)) - - attrs = { - 'min_sizes': min_sizes, - 'aspect_ratios': aspect_ratios, - 'variances': variance, - 'flip': flip, - 'clip': clip, - 'step_w': steps[0], - 'step_h': steps[1], - 'offset': offset, - 'min_max_aspect_ratios_order': min_max_aspect_ratios_order - } - if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0: - if not _is_list_or_tuple_(max_sizes): - max_sizes = [max_sizes] - attrs['max_sizes'] = max_sizes - - box = helper.create_variable_for_type_inference(dtype) - var = helper.create_variable_for_type_inference(dtype) - helper.append_op( - type="prior_box", - inputs={ - "Input": input, - "Image": image - }, - outputs={ - "Boxes": box, - "Variances": var - }, - attrs=attrs, - ) - box.stop_gradient = True - var.stop_gradient = True - return box, var + return paddle.vision.ops.prior_box( + input=input, + image=image, + min_sizes=min_sizes, + max_sizes=max_sizes, + aspect_ratios=aspect_ratios, + variance=variance, + flip=flip, + clip=clip, + steps=steps, + offset=offset, + min_max_aspect_ratios_order=min_max_aspect_ratios_order, + name=name) def density_prior_box(input, diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py index 3721b97368af14..cfa74f3505bc97 100644 --- a/python/paddle/fluid/layers/math_op_patch.py +++ b/python/paddle/fluid/layers/math_op_patch.py @@ -80,6 +80,10 @@ def create_new_tmp_var(block, dtype): tmp_name = unique_tmp_name() return block.create_var(name=tmp_name, dtype=dtype) + def create_new_tmp_sparse_var(block, dtype, type): + tmp_name = unique_tmp_name() + return block.create_var(name=tmp_name, dtype=dtype, type=type) + def create_tensor(block, value, dtype, shape): value = float(value) var = create_new_tmp_var(block, dtype) @@ -131,22 +135,34 @@ def create_tensor_with_batchsize(ref_var, value, dtype): @static_only def cpu(self): - """ - Variable should not have cpu() and cuda() interface. - But this interface can greatly facilitate dy2static. - We do nothing here. + """ + Variable should not have cpu() and cuda() interface. + But this interface can greatly facilitate dy2static. + We do nothing here. 
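prior_box is reduced to a forwarding call as well; a minimal dygraph sketch of paddle.vision.ops.prior_box with the 6x9 feature map and 9x12 image used in the docstring above (not part of the patch):

.. code-block:: python

    import paddle

    paddle.disable_static()
    x = paddle.rand([1, 3, 6, 9], dtype='float32')     # feature map
    img = paddle.rand([1, 3, 9, 12], dtype='float32')  # input image
    box, var = paddle.vision.ops.prior_box(
        input=x, image=img, min_sizes=[100.0], clip=True, flip=True)
    print(box.shape, var.shape)  # [6, 9, 1, 4] [6, 9, 1, 4]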
""" return self @static_only def cuda(self): - """ - Variable should not have cpu() and cuda() interface. - But this interface can greatly facilitate dy2static. - We do nothing here. + """ + Variable should not have cpu() and cuda() interface. + But this interface can greatly facilitate dy2static. + We do nothing here. """ return self + @static_only + def place(self): + """ + Variable don't have 'place' interface in static mode + But this interface can greatly facilitate dy2static. + So we give a warnning here and return None. + """ + warnings.warn( + "Variable do not have 'place' interface for static mode, try not to use it. None will be returned." + ) + return None + def astype(self, dtype): """ **Notes**: @@ -227,12 +243,30 @@ def append(self, var): .format(self.type)) array_write(x=var, i=array_length(self), array=self) + @static_only + def _item(self): + """ + In order to be compatible with the item interface introduced by the dynamic graph, it does nothing but returns self. + It will check that the shape must be a 1-D tensor + """ + if len(self.shape) > 1: + raise TypeError( + "Required input var should be 1-D Variable, but received {}". + format(self.shape)) + return self + @static_only def pop(self, *args): """ - **Notes**: - **The type variable must be LoD Tensor Array. - + The type variable must be LoD Tensor Array. + When self is LoDTensorArray, calling pop is similar to Python's pop on list. + This interface is used to simplify dygraph to static graph operations. + + Args: + self(Variable): The source variable, which must be LOD_TENSOR_ARRAY + *args: optional, a int means index. + Returns: + Variable: self[index] """ from paddle.fluid.dygraph.dygraph_to_static.convert_operators import _run_paddle_pop if self.type != core.VarDesc.VarType.LOD_TENSOR_ARRAY: @@ -403,13 +437,42 @@ def __impl__(self, other_var): __impl__.__name__ = method_name return __impl__ + def values(var): + block = current_block(var) + out = create_new_tmp_var(block, var.dtype) + block.append_op(type="sparse_values", + inputs={"x": [var]}, + outputs={"out": [out]}, + attrs={}) + return out + + def indices(var): + block = current_block(var) + out = create_new_tmp_var(block, var.dtype) + block.append_op(type="sparse_indices", + inputs={"x": [var]}, + outputs={"out": [out]}, + attrs={}) + return out + + def to_dense(var): + block = current_block(var) + out = create_new_tmp_var(block, var.dtype) + block.append_op(type="sparse_to_dense", + inputs={"x": [var]}, + outputs={"out": [out]}, + attrs={}) + return out + variable_methods = [ # b=-a ('__neg__', _neg_), ('astype', astype), ('cpu', cpu), ('cuda', cuda), + ('place', place), ('append', append), + ('item', _item), ('pop', pop), ('dim', lambda x: len(x.shape)), ('ndimension', lambda x: len(x.shape)), @@ -453,7 +516,10 @@ def __impl__(self, other_var): ('__lt__', _binary_creator_('__lt__', 'less_than', False, None)), ('__le__', _binary_creator_('__le__', 'less_equal', False, None)), ('__gt__', _binary_creator_('__gt__', 'greater_than', False, None)), - ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)) + ('__ge__', _binary_creator_('__ge__', 'greater_equal', False, None)), + ('values', values), + ('indices', indices), + ('to_dense', to_dense), ] global _already_patch_variable diff --git a/python/paddle/fluid/layers/metric_op.py b/python/paddle/fluid/layers/metric_op.py index 736213340e9027..1df224ed050482 100755 --- a/python/paddle/fluid/layers/metric_op.py +++ b/python/paddle/fluid/layers/metric_op.py @@ -20,7 +20,13 @@ import warnings from 
..layer_helper import LayerHelper from ..initializer import Normal, Constant -from ..framework import Variable, _non_static_mode, _varbase_creator, _in_legacy_dygraph, in_dygraph_mode +from ..framework import ( + Variable, + _non_static_mode, + _varbase_creator, + _in_legacy_dygraph, + in_dygraph_mode, +) from .. import core from ..param_attr import ParamAttr from . import nn @@ -33,22 +39,29 @@ def accuracy(input, label, k=1, correct=None, total=None): """ + accuracy layer. Refer to the https://en.wikipedia.org/wiki/Precision_and_recall This function computes the accuracy using the input and label. If the correct label occurs in top k predictions, then correct will increment by one. - Note: the dtype of accuracy is determined by input. the input and label dtype can be different. + + Note: + the dtype of accuracy is determined by input. the input and label dtype can be different. + Args: input(Tensor): The input of accuracy layer, which is the predictions of network. A Tensor with type float32,float64. The shape is ``[sample_number, class_dim]`` . label(Tensor): The label of dataset. Tensor with type int32,int64. The shape is ``[sample_number, 1]`` . - k(int): The top k predictions for each class will be checked. Data type is int64 or int32. - correct(Tensor): The correct predictions count. A Tensor with type int64 or int32. - total(Tensor): The total entries count. A tensor with type int64 or int32. + k(int, optional): The top k predictions for each class will be checked. Data type is int64 or int32. Default is 1. + correct(Tensor, optional): The correct predictions count. A Tensor with type int64 or int32. Default is None. + total(Tensor, optional): The total entries count. A tensor with type int64 or int32. Default is None. + Returns: - Tensor: The correct rate. A Tensor with type float32. + Tensor, The correct rate. A Tensor with type float32. + Examples: .. 
code-block:: python + import numpy as np import paddle import paddle.static as static @@ -68,6 +81,7 @@ def accuracy(input, label, k=1, correct=None, total=None): fetch_list=[result[0]]) print(output) #[array([0.], dtype=float32)] + """ if _non_static_mode(): if correct is None: @@ -76,15 +90,18 @@ def accuracy(input, label, k=1, correct=None, total=None): total = _varbase_creator(dtype="int32") _k = k.numpy().item(0) if isinstance(k, Variable) else k - topk_out, topk_indices = _legacy_C_ops.top_k_v2(input, 'k', _k, - 'sorted', False) - _acc, _, _ = _legacy_C_ops.accuracy(topk_out, topk_indices, label, - correct, total) + topk_out, topk_indices = _legacy_C_ops.top_k_v2( + input, 'k', _k, 'sorted', False + ) + _acc, _, _ = _legacy_C_ops.accuracy( + topk_out, topk_indices, label, correct, total + ) return _acc helper = LayerHelper("accuracy", **locals()) - check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], - 'accuracy') + check_variable_and_dtype( + input, 'input', ['float16', 'float32', 'float64'], 'accuracy' + ) topk_out = helper.create_variable_for_type_inference(dtype=input.dtype) topk_indices = helper.create_variable_for_type_inference(dtype="int64") inputs = {"X": [input]} @@ -93,39 +110,38 @@ def accuracy(input, label, k=1, correct=None, total=None): else: attrs = {'k': k} attrs['sorted'] = False - helper.append_op(type="top_k_v2", - inputs=inputs, - attrs=attrs, - outputs={ - "Out": [topk_out], - "Indices": [topk_indices] - }) + helper.append_op( + type="top_k_v2", + inputs=inputs, + attrs=attrs, + outputs={"Out": [topk_out], "Indices": [topk_indices]}, + ) acc_out = helper.create_variable_for_type_inference(dtype="float32") if correct is None: correct = helper.create_variable_for_type_inference(dtype="int32") if total is None: total = helper.create_variable_for_type_inference(dtype="int32") - helper.append_op(type="accuracy", - inputs={ - "Out": [topk_out], - "Indices": [topk_indices], - "Label": [label] - }, - outputs={ - "Accuracy": [acc_out], - "Correct": [correct], - "Total": [total], - }) + helper.append_op( + type="accuracy", + inputs={"Out": [topk_out], "Indices": [topk_indices], "Label": [label]}, + outputs={ + "Accuracy": [acc_out], + "Correct": [correct], + "Total": [total], + }, + ) return acc_out -def auc(input, - label, - curve='ROC', - num_thresholds=2**12 - 1, - topk=1, - slide_steps=1, - ins_tag_weight=None): +def auc( + input, + label, + curve='ROC', + num_thresholds=2**12 - 1, + topk=1, + slide_steps=1, + ins_tag_weight=None, +): """ **Area Under the Curve (AUC) Layer** @@ -216,13 +232,14 @@ def auc(input, helper = LayerHelper("auc", **locals()) if ins_tag_weight is None: - ins_tag_weight = tensor.fill_constant(shape=[1, 1], - dtype="float32", - value=1.0) + ins_tag_weight = tensor.fill_constant( + shape=[1, 1], dtype="float32", value=1.0 + ) check_variable_and_dtype(input, 'input', ['float32', 'float64'], 'auc') check_variable_and_dtype(label, 'label', ['int32', 'int64'], 'auc') - check_variable_and_dtype(ins_tag_weight, 'ins_tag_weight', - ['float32', 'float64'], 'auc') + check_variable_and_dtype( + ins_tag_weight, 'ins_tag_weight', ['float32', 'float64'], 'auc' + ) auc_out = helper.create_variable_for_type_inference(dtype="float64") batch_auc_out = helper.create_variable_for_type_inference(dtype="float64") # make tp, tn, fp, fn persistable, so that can accumulate all batches. 
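fluid.layers.auc keeps persistent stat buffers (created just below), so it reports both an accumulated AUC and a per-batch AUC. A minimal static-graph sketch, not part of the patch:

.. code-block:: python

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    data = fluid.data(name="input", shape=[-1, 32, 32], dtype="float32")
    label = fluid.data(name="label", shape=[-1, 1], dtype="int64")
    fc_out = fluid.layers.fc(input=data, size=2)
    predict = fluid.layers.softmax(input=fc_out)
    # returns (global_auc, batch_auc, [stat variables accumulated across batches])
    auc_out, batch_auc_out, state = fluid.layers.auc(input=predict, label=label)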
@@ -236,62 +253,71 @@ def auc(input, batch_stat_pos = helper.create_global_variable( persistable=True, dtype='int64', - shape=[(1 + slide_steps) * (num_thresholds + 1) + 1]) + shape=[(1 + slide_steps) * (num_thresholds + 1) + 1], + ) batch_stat_neg = helper.create_global_variable( persistable=True, dtype='int64', - shape=[(1 + slide_steps) * (num_thresholds + 1) + 1]) + shape=[(1 + slide_steps) * (num_thresholds + 1) + 1], + ) # for global auc # Needn't maintain the batch id - stat_pos = helper.create_global_variable(persistable=True, - dtype='int64', - shape=[1, num_thresholds + 1]) - stat_neg = helper.create_global_variable(persistable=True, - dtype='int64', - shape=[1, num_thresholds + 1]) + stat_pos = helper.create_global_variable( + persistable=True, dtype='int64', shape=[1, num_thresholds + 1] + ) + stat_neg = helper.create_global_variable( + persistable=True, dtype='int64', shape=[1, num_thresholds + 1] + ) for var in [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg]: - helper.set_variable_initializer(var, Constant(value=0.0, - force_cpu=False)) + helper.set_variable_initializer( + var, Constant(value=0.0, force_cpu=False) + ) - #"InsTagWeight": [ins_tag_weight] + # "InsTagWeight": [ins_tag_weight] # Batch AUC - helper.append_op(type="auc", - inputs={ - "Predict": [input], - "Label": [label], - "StatPos": [batch_stat_pos], - "StatNeg": [batch_stat_neg] - }, - attrs={ - "curve": curve, - "num_thresholds": num_thresholds, - "slide_steps": slide_steps - }, - outputs={ - "AUC": [batch_auc_out], - "StatPosOut": [batch_stat_pos], - "StatNegOut": [batch_stat_neg] - }) + helper.append_op( + type="auc", + inputs={ + "Predict": [input], + "Label": [label], + "StatPos": [batch_stat_pos], + "StatNeg": [batch_stat_neg], + }, + attrs={ + "curve": curve, + "num_thresholds": num_thresholds, + "slide_steps": slide_steps, + }, + outputs={ + "AUC": [batch_auc_out], + "StatPosOut": [batch_stat_pos], + "StatNegOut": [batch_stat_neg], + }, + ) # Global AUC - helper.append_op(type="auc", - inputs={ - "Predict": [input], - "Label": [label], - "StatPos": [stat_pos], - "StatNeg": [stat_neg] - }, - attrs={ - "curve": curve, - "num_thresholds": num_thresholds, - "slide_steps": 0 - }, - outputs={ - "AUC": [auc_out], - "StatPosOut": [stat_pos], - "StatNegOut": [stat_neg] - }) - return auc_out, batch_auc_out, [ - batch_stat_pos, batch_stat_neg, stat_pos, stat_neg - ] + helper.append_op( + type="auc", + inputs={ + "Predict": [input], + "Label": [label], + "StatPos": [stat_pos], + "StatNeg": [stat_neg], + }, + attrs={ + "curve": curve, + "num_thresholds": num_thresholds, + "slide_steps": 0, + }, + outputs={ + "AUC": [auc_out], + "StatPosOut": [stat_pos], + "StatNegOut": [stat_neg], + }, + ) + return ( + auc_out, + batch_auc_out, + [batch_stat_pos, batch_stat_neg, stat_pos, stat_neg], + ) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index b4330f1c4a78bf..49180f8c9670fb 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -27,18 +27,39 @@ from ..layer_helper import LayerHelper from paddle.fluid.framework import _in_legacy_dygraph from ..initializer import Normal, Constant, NumpyArrayInitializer -from ..framework import Variable, OpProtoHolder, _non_static_mode, dygraph_only, _dygraph_tracer, default_main_program, _varbase_creator, static_only, _global_flags, _in_legacy_dygraph, in_dygraph_mode +from ..framework import ( + Variable, + OpProtoHolder, + _non_static_mode, + dygraph_only, + _dygraph_tracer, + default_main_program, + 
_varbase_creator, + static_only, + _global_flags, + _in_legacy_dygraph, + in_dygraph_mode, +) from ..framework import _current_expected_place from .. import dygraph_utils from ..param_attr import ParamAttr -from .layer_function_generator import autodoc, templatedoc, _generate_doc_string_ +from .layer_function_generator import ( + autodoc, + templatedoc, + _generate_doc_string_, +) from .tensor import concat, assign, fill_constant, zeros, tensor_array_to_tensor from . import utils from .. import unique_name from functools import reduce from .. import core from ...utils import deprecated -from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype +from ..data_feeder import ( + convert_dtype, + check_variable_and_dtype, + check_type, + check_dtype, +) import paddle from paddle.utils import deprecated from paddle import _C_ops, _legacy_C_ops @@ -210,13 +231,9 @@ @dygraph_only -def _elementwise_op_in_dygraph(x, - y, - axis=-1, - act=None, - use_mkldnn=False, - op_name=None): - +def _elementwise_op_in_dygraph( + x, y, axis=-1, act=None, use_mkldnn=False, op_name=None +): def is_inplace(op_name): return op_name[-1] == "_" @@ -227,24 +244,27 @@ def is_inplace(op_name): if in_dygraph_mode(): op = getattr( _C_ops, - OP_NAMEMAPPING[op_name] if not is_inplace(op_name) else op_name) + OP_NAMEMAPPING[op_name] if not is_inplace(op_name) else op_name, + ) out = op(x, y) if _in_legacy_dygraph(): op = getattr(_legacy_C_ops, op_name) out = op(x, y, 'axis', axis, 'use_mkldnn', use_mkldnn) - return dygraph_utils._append_activation_in_dygraph(out, - act, - use_mkldnn=use_mkldnn) - - -def fc(input, - size, - num_flatten_dims=1, - param_attr=None, - bias_attr=None, - act=None, - name=None): + return dygraph_utils._append_activation_in_dygraph( + out, act, use_mkldnn=use_mkldnn + ) + + +def fc( + input, + size, + num_flatten_dims=1, + param_attr=None, + bias_attr=None, + act=None, + name=None, +): r""" :api_attr: Static Graph @@ -362,8 +382,9 @@ def fc(input, for i, input_x in enumerate(input): check_type(input_x, 'input[' + str(i) + ']', Variable, 'fc') dtype = helper.input_dtype() - check_dtype(dtype, 'input', ['float16', 'uint16', 'float32', 'float64'], - 'fc') + check_dtype( + dtype, 'input', ['float16', 'uint16', 'float32', 'float64'], 'fc' + ) mul_results = [] for input_var, param_attr in helper.iter_inputs_and_params(): input_shape = input_var.shape @@ -373,31 +394,28 @@ def fc(input, reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) ] + [size] - w = helper.create_parameter(attr=param_attr, - shape=param_shape, - dtype=dtype, - is_bias=False) + w = helper.create_parameter( + attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False + ) tmp = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="mul", - inputs={ - "X": input_var, - "Y": w - }, - outputs={"Out": tmp}, - attrs={ - "x_num_col_dims": num_flatten_dims, - "y_num_col_dims": 1 - }) + helper.append_op( + type="mul", + inputs={"X": input_var, "Y": w}, + outputs={"Out": tmp}, + attrs={"x_num_col_dims": num_flatten_dims, "y_num_col_dims": 1}, + ) mul_results.append(tmp) if len(mul_results) == 1: pre_bias = mul_results[0] else: pre_bias = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="sum", - inputs={"X": mul_results}, - outputs={"Out": pre_bias}, - attrs={"use_mkldnn": False}) + helper.append_op( + type="sum", + inputs={"X": mul_results}, + outputs={"Out": pre_bias}, + attrs={"use_mkldnn": False}, + ) # add bias pre_activation = 
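fc above still lowers to one mul op per input, followed by an optional sum, bias add and activation. A short static-graph sketch with two inputs (not part of the patch):

.. code-block:: python

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    a = fluid.data(name="a", shape=[None, 32], dtype="float32")
    b = fluid.data(name="b", shape=[None, 16], dtype="float32")
    # one weight matrix per input; the mul results are summed before bias/act
    out = fluid.layers.fc(input=[a, b], size=1000, act="tanh")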
helper.append_bias_op(pre_bias, dim_start=num_flatten_dims) # add activation @@ -405,13 +423,15 @@ def fc(input, @deprecated(since="2.0.0", update_to="paddle.nn.functional.embedding") -def embedding(input, - size, - is_sparse=False, - is_distributed=False, - padding_idx=None, - param_attr=None, - dtype='float32'): +def embedding( + input, + size, + is_sparse=False, + is_distributed=False, + padding_idx=None, + param_attr=None, + dtype='float32', +): r""" :api_attr: Static Graph @@ -507,7 +527,7 @@ def embedding(input, import numpy as np import paddle paddle.enable_static() - + data = fluid.data(name='x', shape=[None, 1], dtype='int64') # example 1 @@ -524,10 +544,15 @@ def embedding(input, """ helper = LayerHelper('embedding', **locals()) - check_variable_and_dtype(input, 'input', ['int64'], - 'fluid.layers.embedding') - check_dtype(dtype, 'dtype', ['uint16', 'float16', 'float32', 'float64'], - 'fluid.layers.embedding') + check_variable_and_dtype( + input, 'input', ['int64'], 'fluid.layers.embedding' + ) + check_dtype( + dtype, + 'dtype', + ['uint16', 'float16', 'float32', 'float64'], + 'fluid.layers.embedding', + ) if is_distributed: is_distributed = False @@ -537,37 +562,42 @@ def embedding(input, remote_prefetch = True if is_sparse else False - w = helper.create_parameter(attr=helper.param_attr, - shape=size, - dtype=dtype, - is_bias=False) + w = helper.create_parameter( + attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False + ) tmp = helper.create_variable_for_type_inference(dtype) - padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( - size[0] + padding_idx) - helper.append_op(type='lookup_table', - inputs={ - 'Ids': input, - 'W': w - }, - outputs={'Out': tmp}, - attrs={ - 'is_sparse': is_sparse, - 'is_distributed': is_distributed, - 'remote_prefetch': remote_prefetch, - 'padding_idx': padding_idx - }) + padding_idx = ( + -1 + if padding_idx is None + else padding_idx + if padding_idx >= 0 + else (size[0] + padding_idx) + ) + helper.append_op( + type='lookup_table', + inputs={'Ids': input, 'W': w}, + outputs={'Out': tmp}, + attrs={ + 'is_sparse': is_sparse, + 'is_distributed': is_distributed, + 'remote_prefetch': remote_prefetch, + 'padding_idx': padding_idx, + }, + ) return tmp -def _pull_sparse(input, - size, - table_id, - accessor_class, - name="embedding", - ctr_label_name="", - padding_id=0, - dtype='float32', - scale_sparse_grad=True): +def _pull_sparse( + input, + size, + table_id, + accessor_class, + name="embedding", + ctr_label_name="", + padding_id=0, + dtype='float32', + scale_sparse_grad=True, +): r""" **Pull Fleet Sparse Layer** @@ -614,35 +644,34 @@ def _pull_sparse(input, 'ScaleSparseGrad': scale_sparse_grad, 'InputNames': input_names, # this is only for compatible with embedding op - 'is_distributed': True + 'is_distributed': True, } # this is only for compatible with embedding op - w, _ = helper.create_or_get_global_variable(name=name, - shape=[size], - dtype=dtype, - is_bias=False, - persistable=True) - helper.append_op(type='pull_sparse', - inputs={ - 'Ids': inputs, - 'W': w - }, - outputs={'Out': outs}, - attrs=attrs) + w, _ = helper.create_or_get_global_variable( + name=name, shape=[size], dtype=dtype, is_bias=False, persistable=True + ) + helper.append_op( + type='pull_sparse', + inputs={'Ids': inputs, 'W': w}, + outputs={'Out': outs}, + attrs=attrs, + ) if len(outs) == 1: return outs[0] return outs -def _pull_sparse_v2(input, - size, - table_id, - accessor_class, - name="embedding", - ctr_label_name="", - padding_id=0, 
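A brief sketch of the (deprecated) fluid.layers.embedding path reformatted above, including the padding_idx normalization it performs (a negative index is wrapped to size[0] + padding_idx); not part of the patch:

.. code-block:: python

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    ids = fluid.data(name='x', shape=[None, 1], dtype='int64')
    # 128-entry vocabulary, 64-dim vectors; padding_idx=-1 resolves to row 127,
    # which is looked up as zeros and excluded from training
    emb = fluid.layers.embedding(input=ids, size=[128, 64], padding_idx=-1)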
- dtype='float32', - scale_sparse_grad=True): +def _pull_sparse_v2( + input, + size, + table_id, + accessor_class, + name="embedding", + ctr_label_name="", + padding_id=0, + dtype='float32', + scale_sparse_grad=True, +): r""" **Pull Fleet Sparse Layer** @@ -689,31 +718,26 @@ def _pull_sparse_v2(input, 'ScaleSparseGrad': scale_sparse_grad, 'InputNames': input_names, # this is only for compatible with embedding op - 'is_distributed': True + 'is_distributed': True, } # this is only for compatible with embedding op - w, _ = helper.create_or_get_global_variable(name=name, - shape=[size], - dtype=dtype, - is_bias=False, - persistable=True) - helper.append_op(type='pull_sparse_v2', - inputs={ - 'Ids': inputs, - 'W': w - }, - outputs={'Out': outs}, - attrs=attrs) + w, _ = helper.create_or_get_global_variable( + name=name, shape=[size], dtype=dtype, is_bias=False, persistable=True + ) + helper.append_op( + type='pull_sparse_v2', + inputs={'Ids': inputs, 'W': w}, + outputs={'Out': outs}, + attrs=attrs, + ) if len(outs) == 1: return outs[0] return outs -def _pull_gpups_sparse(input, - size, - dtype='float32', - is_distributed=False, - is_sparse=False): +def _pull_gpups_sparse( + input, size, dtype='float32', is_distributed=False, is_sparse=False +): r""" **Pull GpuPS Sparse Layer** @@ -747,39 +771,36 @@ def _pull_gpups_sparse(input, helper = LayerHelper('pull_gpups_sparse', **locals()) if dtype != 'float32': raise ValueError( - "GpuPS only support float type embedding now, and your type is: " + - dtype) + "GpuPS only support float type embedding now, and your type is: " + + dtype + ) helper.input_dtype() inputs = helper.multiple_input() outs = [ helper.create_variable_for_type_inference(dtype) for i in range(len(inputs)) ] - w = helper.create_parameter(attr=helper.param_attr, - shape=[size[0]], - dtype=dtype, - is_bias=False) - helper.append_op(type='pull_gpups_sparse', - inputs={ - 'Ids': inputs, - 'W': w - }, - outputs={'Out': outs}, - attrs={ - 'size': size, - 'is_distributed': is_distributed, - 'is_sparse': is_sparse - }) + w = helper.create_parameter( + attr=helper.param_attr, shape=[size[0]], dtype=dtype, is_bias=False + ) + helper.append_op( + type='pull_gpups_sparse', + inputs={'Ids': inputs, 'W': w}, + outputs={'Out': outs}, + attrs={ + 'size': size, + 'is_distributed': is_distributed, + 'is_sparse': is_sparse, + }, + ) if len(outs) == 1: return outs[0] return outs -def _pull_box_sparse(input, - size, - dtype='float32', - is_distributed=False, - is_sparse=False): +def _pull_box_sparse( + input, size, dtype='float32', is_distributed=False, is_sparse=False +): r""" **Pull Box Sparse Layer** @@ -809,29 +830,28 @@ def _pull_box_sparse(input, helper = LayerHelper('pull_box_sparse', **locals()) if dtype != 'float32': raise ValueError( - "BoxPS only support float type embedding now, and your type is: " + - dtype) + "BoxPS only support float type embedding now, and your type is: " + + dtype + ) helper.input_dtype() inputs = helper.multiple_input() outs = [ helper.create_variable_for_type_inference(dtype) for i in range(len(inputs)) ] - w = helper.create_parameter(attr=helper.param_attr, - shape=[size], - dtype=dtype, - is_bias=False) - helper.append_op(type='pull_box_sparse', - inputs={ - 'Ids': inputs, - 'W': w - }, - outputs={'Out': outs}, - attrs={ - 'size': size, - 'is_distributed': is_distributed, - 'is_sparse': is_sparse - }) + w = helper.create_parameter( + attr=helper.param_attr, shape=[size], dtype=dtype, is_bias=False + ) + helper.append_op( + type='pull_box_sparse', + inputs={'Ids': 
inputs, 'W': w}, + outputs={'Out': outs}, + attrs={ + 'size': size, + 'is_distributed': is_distributed, + 'is_sparse': is_sparse, + }, + ) if len(outs) == 1: return outs[0] return outs @@ -927,37 +947,46 @@ def linear_chain_crf(input, label, param_attr=None, length=None): print(transition) """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'linear_chain_crf') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'linear_chain_crf' + ) check_variable_and_dtype(label, 'label', ['int64'], 'linear_chain_crf') helper = LayerHelper('linear_chain_crf', **locals()) size = input.shape[2] if length else input.shape[1] - transition = helper.create_parameter(attr=helper.param_attr, - shape=[size + 2, size], - dtype=helper.input_dtype()) + transition = helper.create_parameter( + attr=helper.param_attr, + shape=[size + 2, size], + dtype=helper.input_dtype(), + ) alpha = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype() + ) emission_exps = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype() + ) transition_exps = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype() + ) log_likelihood = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype() + ) this_inputs = { "Emission": [input], "Transition": transition, - "Label": [label] + "Label": [label], } if length: this_inputs['Length'] = [length] - helper.append_op(type='linear_chain_crf', - inputs=this_inputs, - outputs={ - "Alpha": [alpha], - "EmissionExps": [emission_exps], - "TransitionExps": transition_exps, - "LogLikelihood": log_likelihood - }) + helper.append_op( + type='linear_chain_crf', + inputs=this_inputs, + outputs={ + "Alpha": [alpha], + "EmissionExps": [emission_exps], + "TransitionExps": transition_exps, + "LogLikelihood": log_likelihood, + }, + ) return log_likelihood @@ -1013,18 +1042,22 @@ def crf_decoding(input, param_attr, label=None, length=None): crf_decode = paddle.static.nn.crf_decoding(input=emission, length=length, param_attr=paddle.ParamAttr(name="crfw_pad")) """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'crf_decoding') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'crf_decoding' + ) helper = LayerHelper('crf_decoding', **locals()) transition = helper.get_parameter(param_attr.name) viterbi_path = helper.create_variable_for_type_inference( - dtype=core.VarDesc.VarType.INT64) + dtype=core.VarDesc.VarType.INT64 + ) inputs = {"Emission": [input], "Transition": transition, "Label": label} if length: inputs['Length'] = length - helper.append_op(type='crf_decoding', - inputs=inputs, - outputs={"ViterbiPath": [viterbi_path]}) + helper.append_op( + type='crf_decoding', + inputs=inputs, + outputs={"ViterbiPath": [viterbi_path]}, + ) return viterbi_path @@ -1058,26 +1091,23 @@ def cos_sim(X, Y): out = helper.create_variable_for_type_inference(dtype=X.dtype) xnorm = helper.create_variable_for_type_inference(dtype=X.dtype) ynorm = helper.create_variable_for_type_inference(dtype=X.dtype) - helper.append_op(type='cos_sim', - inputs={ - 'X': [X], - 'Y': [Y] - }, - outputs={ - 'Out': [out], - 'XNorm': [xnorm], - 'YNorm': [ynorm] - }) + helper.append_op( + type='cos_sim', + inputs={'X': [X], 'Y': [Y]}, + outputs={'Out': [out], 'XNorm': [xnorm], 'YNorm': [ynorm]}, + ) return out @deprecated(since="2.0.0", update_to="paddle.nn.functional.dropout") -def dropout(x, - 
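cos_sim above emits a single cos_sim op and returns only Out (the two norm outputs stay internal). A minimal sketch, not part of the patch:

.. code-block:: python

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()
    x = fluid.data(name='x', shape=[3, 7], dtype='float32')
    y = fluid.data(name='y', shape=[1, 7], dtype='float32')
    # y is broadcast along the first dimension; out has shape [3, 1]
    out = fluid.layers.cos_sim(x, y)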
dropout_prob, - is_test=None, - seed=None, - name=None, - dropout_implementation="downgrade_in_infer"): +def dropout( + x, + dropout_prob, + is_test=None, + seed=None, + name=None, + dropout_implementation="downgrade_in_infer", +): """ Computes dropout. @@ -1093,7 +1123,7 @@ def dropout(x, Args: x (Variable): The input tensor variable. The data type is float16 or float32 or float64. dropout_prob (float): Probability of setting units to zero. - is_test (bool): A flag indicating whether it is in test phrase or not. + is_test (bool): A flag indicating whether it is in test phrase or not. Default None, in dynamic graph, it use global tracer mode; in static graph, it means False. seed (int): A Python integer used to create random seeds. If this parameter is set to None, a random seed is used. @@ -1128,30 +1158,39 @@ def dropout(x, import paddle import paddle.fluid as fluid - + paddle.enable_static() x = fluid.data(name="data", shape=[None, 32, 32], dtype="float32") dropped = fluid.layers.dropout(x, dropout_prob=0.5) """ if not isinstance(dropout_prob, (float, int, Variable)): raise TypeError( - "dropout_prob argument should be a number(int|float) or Variable") + "dropout_prob argument should be a number(int|float) or Variable" + ) # fast return for p == 0 if isinstance(dropout_prob, (int, float)) and dropout_prob == 0: return x if _non_static_mode(): - if (seed is None - or seed == 0) and default_main_program().random_seed != 0: + if ( + seed is None or seed == 0 + ) and default_main_program().random_seed != 0: seed = default_main_program().random_seed if is_test is None: is_test = not _dygraph_tracer()._train_mode - out, mask = _legacy_C_ops.dropout(x, 'dropout_prob', dropout_prob, - 'is_test', is_test, 'fix_seed', seed - is not None, 'seed', - seed if seed is not None else 0, - 'dropout_implementation', - dropout_implementation) + out, mask = _legacy_C_ops.dropout( + x, + 'dropout_prob', + dropout_prob, + 'is_test', + is_test, + 'fix_seed', + seed is not None, + 'seed', + seed if seed is not None else 0, + 'dropout_implementation', + dropout_implementation, + ) return out def get_attrs(prog, dropout_prob, is_test, seed): @@ -1159,8 +1198,10 @@ def get_attrs(prog, dropout_prob, is_test, seed): seed = prog.random_seed if isinstance(dropout_prob, Variable) and not dropout_prob.shape != [1]: raise TypeError( - "Required dropout_prob.shape == [1] if type(dropout_prob) is Variable, but received dropout_prob.shape = {}" - .format(dropout_prob.shape)) + "Required dropout_prob.shape == [1] if type(dropout_prob) is Variable, but received dropout_prob.shape = {}".format( + dropout_prob.shape + ) + ) attrs = { 'dropout_prob': dropout_prob, 'is_test': is_test, @@ -1171,32 +1212,35 @@ def get_attrs(prog, dropout_prob, is_test, seed): return attrs helper = LayerHelper('dropout', **locals()) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'dropout') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'dropout' + ) out = helper.create_variable_for_type_inference(dtype=x.dtype) mask = helper.create_variable_for_type_inference( - dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True + ) attrs = get_attrs(helper.main_program, dropout_prob, is_test, seed) - helper.append_op(type='dropout', - inputs={'X': [x]}, - outputs={ - 'Out': [out], - 'Mask': [mask] - }, - attrs=attrs) + helper.append_op( + type='dropout', + inputs={'X': [x]}, + outputs={'Out': [out], 'Mask': [mask]}, + attrs=attrs, + ) return out 
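The two dropout_implementation modes accepted above differ only in where the 1/(1-p) rescaling happens. A hedged numeric sketch (p = 0.5, one illustrative mask draw), not part of the patch:

.. code-block:: python

    import numpy as np

    p = 0.5
    x = np.array([1.0, 2.0, 3.0, 4.0])
    mask = np.array([1.0, 0.0, 1.0, 0.0])  # Bernoulli(1 - p) sample

    # 'downgrade_in_infer' (default): training keeps the raw masked values,
    # inference scales by (1 - p)
    train_down = x * mask          # [1., 0., 3., 0.]
    infer_down = x * (1 - p)       # [0.5, 1., 1.5, 2.]

    # 'upscale_in_train': training rescales by 1 / (1 - p), inference is identity
    train_up = x * mask / (1 - p)  # [2., 0., 6., 0.]
    infer_up = x                   # [1., 2., 3., 4.]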
@templatedoc() -def chunk_eval(input, - label, - chunk_scheme, - num_chunk_types, - excluded_chunk_types=None, - seq_length=None): +def chunk_eval( + input, + label, + chunk_scheme, + num_chunk_types, + excluded_chunk_types=None, + seq_length=None, +): r""" This operator computes the precision, recall and F1-score for chunk detection. It is often used in sequence tagging tasks, such as Named Entity Recognition(NER). @@ -1315,30 +1359,39 @@ def chunk_eval(input, num_infer_chunks = helper.create_variable_for_type_inference(dtype="int64") num_label_chunks = helper.create_variable_for_type_inference(dtype="int64") num_correct_chunks = helper.create_variable_for_type_inference( - dtype="int64") + dtype="int64" + ) this_input = {"Inference": [input], "Label": [label]} if seq_length is not None: this_input["SeqLength"] = [seq_length] - helper.append_op(type="chunk_eval", - inputs=this_input, - outputs={ - "Precision": [precision], - "Recall": [recall], - "F1-Score": [f1_score], - "NumInferChunks": [num_infer_chunks], - "NumLabelChunks": [num_label_chunks], - "NumCorrectChunks": [num_correct_chunks] - }, - attrs={ - "num_chunk_types": num_chunk_types, - "chunk_scheme": chunk_scheme, - "excluded_chunk_types": excluded_chunk_types or [] - }) - return (precision, recall, f1_score, num_infer_chunks, num_label_chunks, - num_correct_chunks) + helper.append_op( + type="chunk_eval", + inputs=this_input, + outputs={ + "Precision": [precision], + "Recall": [recall], + "F1-Score": [f1_score], + "NumInferChunks": [num_infer_chunks], + "NumLabelChunks": [num_label_chunks], + "NumCorrectChunks": [num_correct_chunks], + }, + attrs={ + "num_chunk_types": num_chunk_types, + "chunk_scheme": chunk_scheme, + "excluded_chunk_types": excluded_chunk_types or [], + }, + ) + return ( + precision, + recall, + f1_score, + num_infer_chunks, + num_label_chunks, + num_correct_chunks, + ) @deprecated(since="2.0.0", update_to="paddle.nn.functional.softmax") @@ -1459,38 +1512,44 @@ def softmax(input, use_cudnn=True, name=None, axis=-1): return _C_ops.softmax(input, axis) if _non_static_mode(): - return _legacy_C_ops.softmax(input, 'axis', axis, 'use_cudnn', - use_cudnn) + return _legacy_C_ops.softmax( + input, 'axis', axis, 'use_cudnn', use_cudnn + ) inputs = {"X": [input]} attrs = {"axis": axis, "use_cudnn": use_cudnn} helper = LayerHelper('softmax', **locals()) - check_variable_and_dtype(input, 'input/x', - ['float16', 'float32', 'float64'], 'softmax') + check_variable_and_dtype( + input, 'input/x', ['float16', 'float32', 'float64'], 'softmax' + ) dtype = helper.input_dtype() softmax_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="softmax", - inputs={"X": input}, - outputs={"Out": softmax_out}, - attrs=attrs) + helper.append_op( + type="softmax", + inputs={"X": input}, + outputs={"Out": softmax_out}, + attrs=attrs, + ) return softmax_out -def conv2d(input, - num_filters, - filter_size, - stride=1, - padding=0, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - name=None, - data_format="NCHW"): +def conv2d( + input, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None, + data_format="NCHW", +): r""" :api_attr: Static Graph @@ -1626,27 +1685,34 @@ def conv2d(input, print(conv2d.shape) # [-1, 2, 30, 30] """ - check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], - 'conv2d') + check_variable_and_dtype( + input, 'input', 
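fluid.layers.softmax above is deprecated in favor of paddle.nn.functional.softmax; the replacement is a one-liner in dygraph (not part of the patch):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    paddle.disable_static()
    x = paddle.to_tensor([[1.0, 2.0, 3.0],
                          [3.0, 2.0, 1.0]])
    out = F.softmax(x, axis=-1)  # each row sums to 1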
['float16', 'float32', 'float64'], 'conv2d' + ) if len(input.shape) != 4: - raise ValueError("Input size should be 4, " - "but received {}".format(len(input.shape))) + raise ValueError( + "Input size should be 4, " + "but received {}".format(len(input.shape)) + ) num_channels = input.shape[1] if not isinstance(use_cudnn, bool): - raise ValueError("Attr(use_cudnn) should be True or False. Received " - "Attr(use_cudnn): %s. " % str(use_cudnn)) + raise ValueError( + "Attr(use_cudnn) should be True or False. Received " + "Attr(use_cudnn): %s. " % str(use_cudnn) + ) if data_format not in ["NCHW", "NHWC"]: raise ValueError( "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " - "Attr(data_format): %s." % str(data_format)) + "Attr(data_format): %s." % str(data_format) + ) - channel_last = (data_format == "NHWC") + channel_last = data_format == "NHWC" num_channels = input.shape[3] if channel_last else input.shape[1] if num_channels < 0: raise ValueError( "The channel dimmention of the input(%s) should be defined. " - "Received: %s." % (str(input.shape), str(num_channels))) + "Received: %s." % (str(input.shape), str(num_channels)) + ) assert param_attr is not False, "param_attr should not be False here." if groups is None: @@ -1654,27 +1720,35 @@ def conv2d(input, elif groups <= 0: raise ValueError( "the groups of input must be greater than 0, " - "but received the groups of input is {}".format(groups)) + "but received the groups of input is {}".format(groups) + ) else: if num_channels % groups != 0: raise ValueError( "the channel of input must be divisible by groups," "received: the channel of input is {}, the shape of input is {}" - ", the groups is {}".format(num_channels, input.shape, groups)) + ", the groups is {}".format(num_channels, input.shape, groups) + ) num_filter_channels = num_channels // groups l_type = 'conv2d' - if (num_channels == groups and num_filters % num_channels == 0 - and not use_cudnn): + if ( + num_channels == groups + and num_filters % num_channels == 0 + and not use_cudnn + ): l_type = 'depthwise_conv2d' - if (num_channels == groups and num_filters % num_channels == 0 - and core.is_compiled_with_rocm()): + if ( + num_channels == groups + and num_filters % num_channels == 0 + and core.is_compiled_with_rocm() + ): l_type = 'depthwise_conv2d' # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" if core.is_compiled_with_npu(): - if (num_channels == groups and num_channels == num_filters): + if num_channels == groups and num_channels == num_filters: l_type = 'depthwise_conv2d' else: l_type = 'conv2d' @@ -1688,7 +1762,6 @@ def conv2d(input, # padding def _update_padding(padding, data_format): - def is_list_or_tuple(ele): if isinstance(ele, list) or isinstance(ele, tuple): return True @@ -1699,14 +1772,16 @@ def is_list_or_tuple(ele): if not (padding[0] == [0, 0] and padding[1] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[2:4] padding = [ele for a_list in padding for ele in a_list] elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"): if not (padding[0] == [0, 0] and padding[3] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." 
% str(padding) + ) padding = padding[1:3] padding = [ele for a_list in padding for ele in a_list] padding = utils.convert_to_list(padding, 4, 'padding') @@ -1723,8 +1798,9 @@ def is_list_or_tuple(ele): padding = padding.upper() if padding not in ["SAME", "VALID"]: raise ValueError( - "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." % - str(padding)) + "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." + % str(padding) + ) if padding == "VALID": padding_algorithm = "VALID" padding = [0, 0] @@ -1742,39 +1818,47 @@ def _get_default_param_initializer(): raise ValueError( "Invalid filter number, excepted number is larger than 0, but" " received {}, please check the input shape and " - "filter size.".format(filter_elem_num)) - std = (2.0 / filter_elem_num)**0.5 + "filter size.".format(filter_elem_num) + ) + std = (2.0 / filter_elem_num) ** 0.5 return Normal(0.0, std, 0) filter_param = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype, - default_initializer=_get_default_param_initializer()) + default_initializer=_get_default_param_initializer(), + ) pre_bias = helper.create_variable_for_type_inference(dtype) - if (core.is_compiled_with_cuda() and paddle.fluid.get_flags( - "FLAGS_conv2d_disable_cudnn")["FLAGS_conv2d_disable_cudnn"]): + if ( + core.is_compiled_with_cuda() + and paddle.fluid.get_flags("FLAGS_conv2d_disable_cudnn")[ + "FLAGS_conv2d_disable_cudnn" + ] + ): use_cudnn = False - helper.append_op(type=l_type, - inputs={ - 'Input': input, - 'Filter': filter_param, - }, - outputs={"Output": pre_bias}, - attrs={ - 'strides': stride, - 'paddings': padding, - 'dilations': dilation, - 'groups': groups, - 'use_cudnn': use_cudnn, - 'use_mkldnn': False, - 'fuse_relu_before_depthwise_conv': False, - "padding_algorithm": padding_algorithm, - "data_format": data_format, - }) + helper.append_op( + type=l_type, + inputs={ + 'Input': input, + 'Filter': filter_param, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'use_mkldnn': False, + 'fuse_relu_before_depthwise_conv': False, + "padding_algorithm": padding_algorithm, + "data_format": data_format, + }, + ) if data_format == 'NCHW': pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) @@ -1784,19 +1868,21 @@ def _get_default_param_initializer(): return helper.append_activation(pre_act) -def conv3d(input, - num_filters, - filter_size, - stride=1, - padding=0, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - name=None, - data_format="NCDHW"): +def conv3d( + input, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None, + data_format="NCDHW", +): r""" :api_attr: Static Graph @@ -1939,37 +2025,46 @@ def conv3d(input, dtype = helper.input_dtype() if not isinstance(use_cudnn, bool): - raise ValueError("Attr(use_cudnn) should be True or False. Received " - "Attr(use_cudnn): %s. " % str(use_cudnn)) + raise ValueError( + "Attr(use_cudnn) should be True or False. Received " + "Attr(use_cudnn): %s. " % str(use_cudnn) + ) if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " - "Attr(data_format): %s." % str(data_format)) + "Attr(data_format): %s." 
% str(data_format) + ) - channel_last = (data_format == "NDHWC") + channel_last = data_format == "NDHWC" if len(input.shape) != 5: raise ValueError( - "Input should be 5D tensor, but received input with the shape of {}" - .format(input.shape)) + "Input should be 5D tensor, but received input with the shape of {}".format( + input.shape + ) + ) num_channels = input.shape[4] if channel_last else input.shape[1] if num_channels < 0: raise ValueError( "The channel dimmention of the input(%s) should be defined. " - "Received: %s." % (str(input.shape), str(num_channels))) + "Received: %s." % (str(input.shape), str(num_channels)) + ) if groups is None: num_filter_channels = num_channels elif groups <= 0: raise ValueError( - "the groups of conv3d should be greater than 0. Received groups: {}" - .format(groups)) + "the groups of conv3d should be greater than 0. Received groups: {}".format( + groups + ) + ) else: if num_channels % groups != 0: raise ValueError( "The number of input channels must be divisible by Attr(groups). " - "Received: number of channels(%s), groups(%s)." % - (str(num_channels), str(groups))) + "Received: number of channels(%s), groups(%s)." + % (str(num_channels), str(groups)) + ) num_filter_channels = num_channels // groups filter_size = utils.convert_to_list(filter_size, 3, 'filter_size') @@ -1977,7 +2072,6 @@ def conv3d(input, dilation = utils.convert_to_list(dilation, 3, 'dilation') def _update_padding(padding, data_format): - def is_list_or_tuple(ele): if isinstance(ele, list) or isinstance(ele, tuple): return True @@ -1988,14 +2082,16 @@ def is_list_or_tuple(ele): if not (padding[0] == [0, 0] and padding[1] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[2:5] padding = [ele for a_list in padding for ele in a_list] elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): if not (padding[0] == [0, 0] and padding[4] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[1:4] padding = [ele for a_list in padding for ele in a_list] padding = utils.convert_to_list(padding, 6, 'padding') @@ -2015,8 +2111,9 @@ def is_list_or_tuple(ele): padding = padding.upper() if padding not in ["SAME", "VALID"]: raise ValueError( - "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." % - str(padding)) + "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." 
+ % str(padding) + ) if padding == "VALID": padding_algorithm = "VALID" padding = [0, 0, 0] @@ -2030,41 +2127,46 @@ def is_list_or_tuple(ele): filter_shape = [num_filters, num_filter_channels] + filter_size def _get_default_param_initializer(): - filter_elem_num = filter_size[0] * filter_size[1] * filter_size[ - 2] * num_channels + filter_elem_num = ( + filter_size[0] * filter_size[1] * filter_size[2] * num_channels + ) if filter_elem_num <= 0: raise ValueError( "Invalid filter number, excepted number is larger than 0, but" " received {}, please check the input shape and " - "filter size.".format(filter_elem_num)) + "filter size.".format(filter_elem_num) + ) - std = (2.0 / filter_elem_num)**0.5 + std = (2.0 / filter_elem_num) ** 0.5 return Normal(0.0, std, 0) filter_param = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype, - default_initializer=_get_default_param_initializer()) + default_initializer=_get_default_param_initializer(), + ) pre_bias = helper.create_variable_for_type_inference(dtype) - helper.append_op(type=l_type, - inputs={ - 'Input': input, - 'Filter': filter_param, - }, - outputs={"Output": pre_bias}, - attrs={ - 'strides': stride, - 'paddings': padding, - 'dilations': dilation, - 'groups': groups, - 'use_cudnn': use_cudnn, - 'use_mkldnn': False, - "padding_algorithm": padding_algorithm, - "data_format": data_format, - }) + helper.append_op( + type=l_type, + inputs={ + 'Input': input, + 'Filter': filter_param, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'use_mkldnn': False, + "padding_algorithm": padding_algorithm, + "data_format": data_format, + }, + ) if data_format == 'NCDHW': pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) @@ -2075,17 +2177,19 @@ def _get_default_param_initializer(): @templatedoc() -def pool2d(input, - pool_size=-1, - pool_type="max", - pool_stride=1, - pool_padding=0, - global_pooling=False, - use_cudnn=True, - ceil_mode=False, - name=None, - exclusive=True, - data_format="NCHW"): +def pool2d( + input, + pool_size=-1, + pool_type="max", + pool_stride=1, + pool_padding=0, + global_pooling=False, + use_cudnn=True, + ceil_mode=False, + name=None, + exclusive=True, + data_format="NCHW", +): """ ${comment} @@ -2196,27 +2300,31 @@ def pool2d(input, if pool_type not in ["max", "avg"]: raise ValueError( "Unknown Attr(pool_type): '%s'. It can only be 'max' or 'avg'.", - str(pool_type)) + str(pool_type), + ) if global_pooling is False and pool_size == -1: raise ValueError( "When Attr(global_pooling) is False, Attr(pool_size) must be passed " - "and be a valid value. Received pool_size: %s." % str(pool_size)) + "and be a valid value. Received pool_size: %s." % str(pool_size) + ) if not isinstance(use_cudnn, bool): - raise TypeError("Attr(use_cudnn) should be True or False. Received " - "Attr(use_cudnn): %s." % str(use_cudnn)) + raise TypeError( + "Attr(use_cudnn) should be True or False. Received " + "Attr(use_cudnn): %s." % str(use_cudnn) + ) if data_format not in ["NCHW", "NHWC"]: raise ValueError( "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " - "Attr(data_format): %s." % str(data_format)) + "Attr(data_format): %s." 
% str(data_format) + ) pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') pool_stride = utils.convert_to_list(pool_stride, 2, 'pool_stride') def update_padding(padding, data_format): - def is_list_or_tuple(ele): if isinstance(ele, list) or isinstance(ele, tuple): return True @@ -2227,14 +2335,16 @@ def is_list_or_tuple(ele): if not (padding[0] == [0, 0] and padding[1] == [0, 0]): raise ValueError( "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[2:4] padding = [ele for a_list in padding for ele in a_list] elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"): if not (padding[0] == [0, 0] and padding[3] == [0, 0]): raise ValueError( "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[1:3] padding = [ele for a_list in padding for ele in a_list] padding = utils.convert_to_list(padding, 4, 'padding') @@ -2252,61 +2362,77 @@ def is_list_or_tuple(ele): if pool_padding not in ["SAME", "VALID"]: raise ValueError( "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." - % str(pool_padding)) + % str(pool_padding) + ) if pool_padding == "VALID": padding_algorithm = "VALID" pool_padding = [0, 0] if ceil_mode != False: raise ValueError( "When Attr(pool_padding) is \"VALID\", Attr(ceil_mode) must be False. " - "Received ceil_mode: True.") + "Received ceil_mode: True." + ) elif pool_padding == "SAME": padding_algorithm = "SAME" pool_padding = [0, 0] pool_padding = update_padding(pool_padding, data_format) if in_dygraph_mode(): - return _C_ops.pool2d(input, pool_size, pool_stride, pool_padding, - ceil_mode, exclusive, data_format, pool_type, - global_pooling, False, padding_algorithm, - use_cudnn) + return _C_ops.pool2d( + input, + pool_size, + pool_stride, + pool_padding, + ceil_mode, + exclusive, + data_format, + pool_type, + global_pooling, + False, + padding_algorithm, + use_cudnn, + ) op_type = 'pool2d' helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type=op_type, - inputs={"X": input}, - outputs={"Out": pool_out}, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "global_pooling": global_pooling, - "strides": pool_stride, - "paddings": pool_padding, - "padding_algorithm": padding_algorithm, - "use_cudnn": use_cudnn, - "ceil_mode": ceil_mode, - "use_mkldnn": False, - "exclusive": exclusive, - "data_format": data_format, - }) + helper.append_op( + type=op_type, + inputs={"X": input}, + outputs={"Out": pool_out}, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "global_pooling": global_pooling, + "strides": pool_stride, + "paddings": pool_padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": use_cudnn, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": exclusive, + "data_format": data_format, + }, + ) return pool_out @templatedoc() -def pool3d(input, - pool_size=-1, - pool_type="max", - pool_stride=1, - pool_padding=0, - global_pooling=False, - use_cudnn=True, - ceil_mode=False, - name=None, - exclusive=True, - data_format="NCDHW"): +def pool3d( + input, + pool_size=-1, + pool_type="max", + pool_stride=1, + pool_padding=0, + global_pooling=False, + use_cudnn=True, + ceil_mode=False, + name=None, + exclusive=True, + data_format="NCDHW", +): """ ${comment} @@ -2423,28 +2549,32 @@ def pool3d(input, if 
pool_type not in ["max", "avg"]: raise ValueError( "Unknown Attr(pool_type): '%s'. It can only be 'max' or 'avg'.", - str(pool_type)) + str(pool_type), + ) if global_pooling is False and pool_size == -1: raise ValueError( "When Attr(global_pooling) is False, Attr(pool_size) must be passed " - "and be a valid value. Received Attr(pool_size): %s." % - str(pool_size)) + "and be a valid value. Received Attr(pool_size): %s." + % str(pool_size) + ) if not isinstance(use_cudnn, bool): - raise TypeError("Attr(use_cudnn) should be True or False. Received " - "Attr(use_cudnn): %s. " % str(use_cudnn)) + raise TypeError( + "Attr(use_cudnn) should be True or False. Received " + "Attr(use_cudnn): %s. " % str(use_cudnn) + ) if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " - "Attr(data_format): %s" % str(data_format)) + "Attr(data_format): %s" % str(data_format) + ) pool_size = utils.convert_to_list(pool_size, 3, 'pool_size') pool_stride = utils.convert_to_list(pool_stride, 3, 'pool_stride') def update_padding(padding, data_format): - def is_list_or_tuple(ele): if isinstance(ele, (list, tuple)): return True @@ -2455,14 +2585,16 @@ def is_list_or_tuple(ele): if not (padding[0] == [0, 0] and padding[1] == [0, 0]): raise ValueError( "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[2:5] padding = [ele for a_list in padding for ele in a_list] elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): if not (padding[0] == [0, 0] and padding[4] == [0, 0]): raise ValueError( "Non-zero pool_padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[1:4] padding = [ele for a_list in padding for ele in a_list] padding = utils.convert_to_list(padding, 6, 'padding') @@ -2484,14 +2616,16 @@ def is_list_or_tuple(ele): if pool_padding not in ["SAME", "VALID"]: raise ValueError( "Unknown Attr(pool_padding): '%s'. It can only be 'SAME' or 'VALID'." - % str(pool_padding)) + % str(pool_padding) + ) if pool_padding == "VALID": padding_algorithm = "VALID" pool_padding = [0, 0, 0] if ceil_mode != False: raise ValueError( "When Attr(pool_padding) is \"VALID\", ceil_mode must be False. " - "Received ceil_mode: True.") + "Received ceil_mode: True." 
+ ) elif pool_padding == "SAME": padding_algorithm = "SAME" pool_padding = [0, 0, 0] @@ -2503,33 +2637,33 @@ def is_list_or_tuple(ele): dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type=op_type, - inputs={"X": input}, - outputs={"Out": pool_out}, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "global_pooling": global_pooling, - "strides": pool_stride, - "paddings": pool_padding, - "padding_algorithm": padding_algorithm, - "use_cudnn": use_cudnn, - "ceil_mode": ceil_mode, - "use_mkldnn": False, - "exclusive": exclusive, - "data_format": data_format, - }) + helper.append_op( + type=op_type, + inputs={"X": input}, + outputs={"Out": pool_out}, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "global_pooling": global_pooling, + "strides": pool_stride, + "paddings": pool_padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": use_cudnn, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": exclusive, + "data_format": data_format, + }, + ) return pool_out @deprecated(since="2.0.0") @templatedoc(op_type="pool2d") -def adaptive_pool2d(input, - pool_size, - pool_type="max", - require_index=False, - name=None): +def adaptive_pool2d( + input, pool_size, pool_type="max", require_index=False, name=None +): r""" This operation calculates the output based on the input, pool_size, @@ -2626,19 +2760,24 @@ def adaptive_pool2d(input, pool_type='max') """ check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'adaptive_pool2d') + input, + 'input', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'adaptive_pool2d', + ) check_type(pool_type, 'pool_type', str, 'adaptive_pool2d') check_type(pool_size, 'pool_size', (int, list, tuple), 'adaptive_pool2d') check_type(require_index, 'require_index', bool, 'adaptive_pool2d') if pool_type not in ["max", "avg"]: raise ValueError( "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", - str(pool_type)) + str(pool_type), + ) if pool_type == "avg" and require_index: raise ValueError( - "invalid setting 'require_index' true when 'pool_type' is 'avg'.") + "invalid setting 'require_index' true when 'pool_type' is 'avg'." 
+ ) pool_size = utils.convert_to_list(pool_size, 2, 'pool_size') @@ -2656,25 +2795,25 @@ def adaptive_pool2d(input, mask = helper.create_variable_for_type_inference(dtype) outputs["Mask"] = mask - helper.append_op(type=l_type, - inputs={"X": input}, - outputs=outputs, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "adaptive": True, - }) + helper.append_op( + type=l_type, + inputs={"X": input}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, + }, + ) return (pool_out, mask) if require_index else pool_out @deprecated(since="2.0.0") @templatedoc(op_type="pool3d") -def adaptive_pool3d(input, - pool_size, - pool_type="max", - require_index=False, - name=None): +def adaptive_pool3d( + input, pool_size, pool_type="max", require_index=False, name=None +): r""" This operation calculates the output based on the input, pool_size, @@ -2785,19 +2924,24 @@ def adaptive_pool3d(input, pool_type='max') """ check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'adaptive_pool3d') + input, + 'input', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'adaptive_pool3d', + ) check_type(pool_type, 'pool_type', str, 'adaptive_pool3d') check_type(pool_size, 'pool_size', (int, list, tuple), 'adaptive_pool3d') check_type(require_index, 'require_index', bool, 'adaptive_pool3d') if pool_type not in ["max", "avg"]: raise ValueError( "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", - str(pool_type)) + str(pool_type), + ) if pool_type == "avg" and require_index: raise ValueError( - "invalid setting 'require_index' true when 'pool_type' is 'avg'.") + "invalid setting 'require_index' true when 'pool_type' is 'avg'." + ) pool_size = utils.convert_to_list(pool_size, 3, 'pool_size') @@ -2815,32 +2959,36 @@ def adaptive_pool3d(input, mask = helper.create_variable_for_type_inference(dtype) outputs["Mask"] = mask - helper.append_op(type=l_type, - inputs={"X": input}, - outputs=outputs, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "adaptive": True, - }) + helper.append_op( + type=l_type, + inputs={"X": input}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, + }, + ) return (pool_out, mask) if require_index else pool_out -def batch_norm(input, - act=None, - is_test=False, - momentum=0.9, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - data_layout='NCHW', - in_place=False, - name=None, - moving_mean_name=None, - moving_variance_name=None, - do_model_average_for_mean_and_var=True, - use_global_stats=False): +def batch_norm( + input, + act=None, + is_test=False, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + data_layout='NCHW', + in_place=False, + name=None, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=True, + use_global_stats=False, +): r""" :api_attr: Static Graph @@ -2955,11 +3103,14 @@ def batch_norm(input, print(hidden2.shape) # [3, 200] """ - assert bias_attr is not False, "bias_attr should not be False in batch_norm." + assert ( + bias_attr is not False + ), "bias_attr should not be False in batch_norm." 
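    # A condensed reminder of the semantics (from the batch_norm docstring): the
    # running statistics written to mean_out / variance_out below follow the
    # usual momentum update,
    #     moving_mean = moving_mean * momentum + batch_mean * (1.0 - momentum)
    #     moving_var  = moving_var  * momentum + batch_var  * (1.0 - momentum)
    # so with the default momentum=0.9 each batch contributes 10% of the update.
    # Minimal static-graph usage, assuming paddle.enable_static() was called:
    #     x = paddle.static.data(name='x', shape=[-1, 3, 32, 32], dtype='float32')
    #     out = paddle.static.nn.batch_norm(input=x, momentum=0.9, epsilon=1e-5)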
helper = LayerHelper('batch_norm', **locals()) - check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], - 'batch_norm') + check_variable_and_dtype( + input, 'input', ['float16', 'float32', 'float64'], 'batch_norm' + ) dtype = helper.input_dtype() # use fp32 for bn parameter @@ -2978,31 +3129,38 @@ def batch_norm(input, param_shape = [channel_num] # create parameter - scale = helper.create_parameter(attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - default_initializer=Constant(1.0)) - bias = helper.create_parameter(attr=helper.bias_attr, - shape=param_shape, - dtype=dtype, - is_bias=True) - - mean = helper.create_parameter(attr=ParamAttr( - name=moving_mean_name, - initializer=Constant(0.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var), - shape=param_shape, - dtype=dtype) + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0), + ) + bias = helper.create_parameter( + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True + ) + + mean = helper.create_parameter( + attr=ParamAttr( + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var, + ), + shape=param_shape, + dtype=dtype, + ) mean.stop_gradient = True - variance = helper.create_parameter(attr=ParamAttr( - name=moving_variance_name, - initializer=Constant(1.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var), - shape=param_shape, - dtype=dtype) + variance = helper.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var, + ), + shape=param_shape, + dtype=dtype, + ) variance.stop_gradient = True # create output @@ -3022,38 +3180,81 @@ def batch_norm(input, attrs_ = () if attrs_has_momentum: - attrs_ = ('momentum', momentum, 'epsilon', epsilon, 'is_test', - is_test, 'data_layout', data_layout, 'use_mkldnn', False, - 'fuse_with_relu', False, 'use_global_stats', - use_global_stats) + attrs_ = ( + 'momentum', + momentum, + 'epsilon', + epsilon, + 'is_test', + is_test, + 'data_layout', + data_layout, + 'use_mkldnn', + False, + 'fuse_with_relu', + False, + 'use_global_stats', + use_global_stats, + ) else: - attrs_ = ('epsilon', epsilon, 'is_test', is_test, 'data_layout', - data_layout, 'use_mkldnn', False, 'fuse_with_relu', False, - 'use_global_stats', use_global_stats) + attrs_ = ( + 'epsilon', + epsilon, + 'is_test', + is_test, + 'data_layout', + data_layout, + 'use_mkldnn', + False, + 'fuse_with_relu', + False, + 'use_global_stats', + use_global_stats, + ) if inputs_has_MomemtumTensor: batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( - input, scale, bias, mean, variance, momentum, mean_out, - variance_out, *attrs_) + input, + scale, + bias, + mean, + variance, + momentum, + mean_out, + variance_out, + *attrs_, + ) else: batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( - input, scale, bias, mean, variance, None, mean_out, - variance_out, *attrs_) + input, + scale, + bias, + mean, + variance, + None, + mean_out, + variance_out, + *attrs_, + ) - return dygraph_utils._append_activation_in_dygraph(batch_norm_out, - act=act, - use_mkldnn=False) + return dygraph_utils._append_activation_in_dygraph( + batch_norm_out, act=act, use_mkldnn=False + ) - saved_mean = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) + saved_mean = 
helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) saved_variance = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True + ) reserve_space = None if not is_test: reserve_space = helper.create_variable_for_type_inference( - dtype=helper.input_dtype(), stop_gradient=True) + dtype=helper.input_dtype(), stop_gradient=True + ) - batch_norm_out = input if in_place else \ - helper.create_variable_for_type_inference(dtype) + batch_norm_out = ( + input if in_place else helper.create_variable_for_type_inference(dtype) + ) inputs = { "X": input, @@ -3062,7 +3263,7 @@ def batch_norm(input, "Mean": mean, "Variance": variance, "MeanOut": mean_out, - "VarianceOut": variance_out + "VarianceOut": variance_out, } attrs = { "epsilon": epsilon, @@ -3070,7 +3271,7 @@ def batch_norm(input, "data_layout": data_layout, "use_mkldnn": False, "fuse_with_relu": False, - "use_global_stats": use_global_stats + "use_global_stats": use_global_stats, } if isinstance(momentum, Variable): inputs['MomemtumTensor'] = momentum @@ -3082,33 +3283,34 @@ def batch_norm(input, "MeanOut": mean_out, "VarianceOut": variance_out, "SavedMean": saved_mean, - "SavedVariance": saved_variance + "SavedVariance": saved_variance, } if reserve_space is not None: outputs["ReserveSpace"] = reserve_space - helper.append_op(type="batch_norm", - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs + ) return helper.append_activation(batch_norm_out) -def inplace_abn(input, - act=None, - is_test=False, - momentum=0.9, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - data_layout='NCHW', - name=None, - moving_mean_name=None, - moving_variance_name=None, - do_model_average_for_mean_and_var=True, - use_global_stats=False, - act_alpha=1.0): +def inplace_abn( + input, + act=None, + is_test=False, + momentum=0.9, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + data_layout='NCHW', + name=None, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=True, + use_global_stats=False, + act_alpha=1.0, +): r""" **In-place Activation Batch Normalization Layer** @@ -3142,14 +3344,14 @@ def inplace_abn(input, numerical stability. Default is 1e-5. param_attr(ParamAttr|None): The parameter attribute for Parameter `scale` of inplace_abn. If it is set to None or one attribute of ParamAttr, inplace_abn - will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. - If the Initializer of the param_attr is not set, the parameter is initialized - with Xavier. Default: None. + will create ParamAttr as param_attr, the name of scale can be set in ParamAttr. + If the Initializer of the param_attr is not set, the parameter is initialized + with Xavier. Default: None. bias_attr(ParamAttr|None): The parameter attribute for the bias of inplace_abn. If it is set to None or one attribute of ParamAttr, inplace_abn - will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. - If the Initializer of the bias_attr is not set, the bias is initialized zero. - Default: None. + will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. + If the Initializer of the bias_attr is not set, the bias is initialized zero. + Default: None. data_layout (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCHW"`, `"NHWC"`. 
The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: @@ -3187,14 +3389,18 @@ def inplace_abn(input, hidden3 = fluid.layers.inplace_abn(input=hidden2, act='leaky_relu', act_alpha=0.2) """ - assert act in [None, 'identity', 'leaky_relu', 'elu'], \ - "inplace_abn only support act as None, 'identity', " \ + assert act in [None, 'identity', 'leaky_relu', 'elu'], ( + "inplace_abn only support act as None, 'identity', " "'leaky_relu', 'elu' currently" - assert bias_attr is not False, "bias_attr should not be False in inplace_abn." + ) + assert ( + bias_attr is not False + ), "bias_attr should not be False in inplace_abn." helper = LayerHelper('inplace_abn', **locals()) - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'inplace_abn') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'inplace_abn' + ) dtype = helper.input_dtype() input_shape = input.shape @@ -3209,31 +3415,38 @@ def inplace_abn(input, param_shape = [channel_num] # create parameter - scale = helper.create_parameter(attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - default_initializer=Constant(1.0)) - bias = helper.create_parameter(attr=helper.bias_attr, - shape=param_shape, - dtype=dtype, - is_bias=True) - - mean = helper.create_parameter(attr=ParamAttr( - name=moving_mean_name, - initializer=Constant(0.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var), - shape=param_shape, - dtype=dtype) + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0), + ) + bias = helper.create_parameter( + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True + ) + + mean = helper.create_parameter( + attr=ParamAttr( + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var, + ), + shape=param_shape, + dtype=dtype, + ) mean.stop_gradient = True - variance = helper.create_parameter(attr=ParamAttr( - name=moving_variance_name, - initializer=Constant(1.0), - trainable=False, - do_model_average=do_model_average_for_mean_and_var), - shape=param_shape, - dtype=dtype) + variance = helper.create_parameter( + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False, + do_model_average=do_model_average_for_mean_and_var, + ), + shape=param_shape, + dtype=dtype, + ) variance.stop_gradient = True # create output @@ -3255,39 +3468,88 @@ def inplace_abn(input, attrs__ = () if attrs_has_momentum: - attrs__ = ('momentum', momentum, 'epsilon', epsilon, 'is_test', - is_test, 'data_layout', data_layout, 'use_mkldnn', False, - 'fuse_with_relu', False, 'use_global_stats', - use_global_stats, 'activation', act, 'alpha', act_alpha) + attrs__ = ( + 'momentum', + momentum, + 'epsilon', + epsilon, + 'is_test', + is_test, + 'data_layout', + data_layout, + 'use_mkldnn', + False, + 'fuse_with_relu', + False, + 'use_global_stats', + use_global_stats, + 'activation', + act, + 'alpha', + act_alpha, + ) else: - attrs__ = ('epsilon', epsilon, 'is_test', is_test, 'data_layout', - data_layout, 'use_mkldnn', False, 'fuse_with_relu', - False, 'use_global_stats', use_global_stats, - 'activation', act, 'alpha', act_alpha) + attrs__ = ( + 'epsilon', + epsilon, + 'is_test', + is_test, + 'data_layout', + data_layout, + 'use_mkldnn', + False, + 'fuse_with_relu', + False, + 'use_global_stats', + use_global_stats, + 'activation', + act, + 'alpha', + act_alpha, + ) if inputs_has_MomemtumTensor: 
batch_norm_out, _, _, _, _, _ = _legacy_C_ops.inplace_abn_( - input, scale, bias, mean, variance, momentum, mean_out, - variance_out, *attrs__) + input, + scale, + bias, + mean, + variance, + momentum, + mean_out, + variance_out, + *attrs__, + ) return batch_norm_out else: batch_norm_out, _, _, _, _, _ = _legacy_C_ops.inplace_abn_( - input, scale, bias, mean, variance, None, mean_out, - variance_out, *attrs__) + input, + scale, + bias, + mean, + variance, + None, + mean_out, + variance_out, + *attrs__, + ) return batch_norm_out - saved_mean = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) + saved_mean = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) saved_variance = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True + ) reserve_space = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True + ) inputs = { "X": input, "Scale": scale, "Bias": bias, "Mean": mean, - "Variance": variance + "Variance": variance, } attrs = { "epsilon": epsilon, @@ -3308,24 +3570,21 @@ def inplace_abn(input, "MeanOut": mean_out, "VarianceOut": variance_out, "SavedMean": saved_mean, - "SavedVariance": saved_variance + "SavedVariance": saved_variance, } if reserve_space is not None: outputs["ReserveSpace"] = reserve_space - helper.append_op(type="inplace_abn", - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type="inplace_abn", inputs=inputs, outputs=outputs, attrs=attrs + ) return batch_norm_out -def instance_norm(input, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - name=None): +def instance_norm( + input, epsilon=1e-05, param_attr=None, bias_attr=None, name=None +): r""" :api_attr: Static Graph @@ -3389,10 +3648,13 @@ def instance_norm(input, hidden1 = paddle.static.nn.fc(x, size=200) hidden2 = paddle.static.nn.instance_norm(hidden1) """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'instance_norm') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'instance_norm' + ) if param_attr is False: - assert bias_attr is False, "param_attr and bias_attr must be set to Fasle at the same time in instance_norm" + assert ( + bias_attr is False + ), "param_attr and bias_attr must be set to Fasle at the same time in instance_norm" helper = LayerHelper('instance_norm', **locals()) dtype = helper.input_dtype() @@ -3404,29 +3666,37 @@ def instance_norm(input, input_shape = input.shape if len(input.shape) < 2 or len(input.shape) > 5: raise ValueError( - 'expected 2D or 3D or 4D or 5D input (got {}D input, input shape is: {})' - .format(len(input.shape), input_shape)) + 'expected 2D or 3D or 4D or 5D input (got {}D input, input shape is: {})'.format( + len(input.shape), input_shape + ) + ) channel_num = input_shape[1] param_shape = [channel_num] if param_attr != False and bias_attr != False: # create parameter - scale = helper.create_parameter(attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - default_initializer=Constant(1.0)) - bias = helper.create_parameter(attr=helper.bias_attr, - shape=param_shape, - dtype=dtype, - is_bias=True, - default_initializer=Constant(0.0)) + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0), + ) + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=param_shape, + dtype=dtype, + is_bias=True, + default_initializer=Constant(0.0), + ) # create 
output - saved_mean = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) + saved_mean = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) saved_variance = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True + ) instance_norm_out = helper.create_variable_for_type_inference(dtype) @@ -3435,35 +3705,39 @@ def instance_norm(input, inputs["Scale"] = scale inputs["Bias"] = bias - helper.append_op(type="instance_norm", - inputs=inputs, - outputs={ - "Y": instance_norm_out, - "SavedMean": saved_mean, - "SavedVariance": saved_variance - }, - attrs={ - "epsilon": epsilon, - }) + helper.append_op( + type="instance_norm", + inputs=inputs, + outputs={ + "Y": instance_norm_out, + "SavedMean": saved_mean, + "SavedVariance": saved_variance, + }, + attrs={ + "epsilon": epsilon, + }, + ) return instance_norm_out @static_only -def data_norm(input, - act=None, - epsilon=1e-05, - param_attr=None, - data_layout='NCHW', - in_place=False, - name=None, - moving_mean_name=None, - moving_variance_name=None, - do_model_average_for_mean_and_var=True, - slot_dim=-1, - sync_stats=False, - summary_decay_rate=0.9999999, - enable_scale_and_shift=False): +def data_norm( + input, + act=None, + epsilon=1e-05, + param_attr=None, + data_layout='NCHW', + in_place=False, + name=None, + moving_mean_name=None, + moving_variance_name=None, + do_model_average_for_mean_and_var=True, + slot_dim=-1, + sync_stats=False, + summary_decay_rate=0.9999999, + enable_scale_and_shift=False, +): r""" :api_attr: Static Graph @@ -3561,39 +3835,54 @@ def data_norm(input, if name == None: name = "dn" if enable_scale_and_shift: - scale_w = helper.create_parameter(attr=ParamAttr( - name=name + '.scale_w', - initializer=Constant(value=float(scale_w_default)), - trainable=True), - shape=param_shape, - dtype=input.dtype) - bias = helper.create_parameter(attr=ParamAttr( - name=name + '.bias', - initializer=Constant(value=float(bias_default)), - trainable=True), - shape=param_shape, - dtype=input.dtype) + scale_w = helper.create_parameter( + attr=ParamAttr( + name=name + '.scale_w', + initializer=Constant(value=float(scale_w_default)), + trainable=True, + ), + shape=param_shape, + dtype=input.dtype, + ) + bias = helper.create_parameter( + attr=ParamAttr( + name=name + '.bias', + initializer=Constant(value=float(bias_default)), + trainable=True, + ), + shape=param_shape, + dtype=input.dtype, + ) # create parameter - batch_size = helper.create_parameter(attr=ParamAttr( - name=name + '.batch_size', - initializer=Constant(value=float(batch_size_default)), - trainable=True), - shape=param_shape, - dtype=input.dtype) - - batch_sum = helper.create_parameter(attr=ParamAttr( - name=name + '.batch_sum', - initializer=Constant(value=float(batch_sum_default)), - trainable=True), - shape=param_shape, - dtype=input.dtype) - - batch_square_sum = helper.create_parameter(attr=ParamAttr( - name=name + '.batch_square_sum', - initializer=Constant(value=float(batch_square_sum_default)), - trainable=True), - shape=param_shape, - dtype=input.dtype) + batch_size = helper.create_parameter( + attr=ParamAttr( + name=name + '.batch_size', + initializer=Constant(value=float(batch_size_default)), + trainable=True, + ), + shape=param_shape, + dtype=input.dtype, + ) + + batch_sum = helper.create_parameter( + attr=ParamAttr( + name=name + '.batch_sum', + initializer=Constant(value=float(batch_sum_default)), + trainable=True, + ), + shape=param_shape, + dtype=input.dtype, + ) 
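    # batch_size, batch_sum and (below) batch_square_sum are persistable
    # per-feature accumulators: element counts, sums, and sums of squares.
    # The data_norm op derives its running mean from them (roughly
    # batch_sum / batch_size) together with a per-feature scale; the exact
    # scale expression is defined by the data_norm kernel and is not restated here.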
+ + batch_square_sum = helper.create_parameter( + attr=ParamAttr( + name=name + '.batch_square_sum', + initializer=Constant(value=float(batch_square_sum_default)), + trainable=True, + ), + shape=param_shape, + dtype=input.dtype, + ) means = helper.create_variable(dtype=dtype, stop_gradient=True) scales = helper.create_variable(dtype=dtype, stop_gradient=True) @@ -3604,7 +3893,7 @@ def data_norm(input, "X": input, "BatchSize": batch_size, "BatchSum": batch_sum, - "BatchSquareSum": batch_square_sum + "BatchSquareSum": batch_square_sum, } attrs = { "epsilon": epsilon, @@ -3619,31 +3908,35 @@ def data_norm(input, if enable_scale_and_shift: inputs["scale_w"] = scale_w inputs["bias"] = bias - helper.append_op(type="data_norm", - inputs=inputs, - outputs={ - "Y": data_norm_out, - "Means": means, - "Scales": scales, - "BatchSize": batch_size, - "BatchSum": batch_sum, - "BatchSquareSum": batch_square_sum - }, - attrs=attrs) + helper.append_op( + type="data_norm", + inputs=inputs, + outputs={ + "Y": data_norm_out, + "Means": means, + "Scales": scales, + "BatchSize": batch_size, + "BatchSum": batch_sum, + "BatchSquareSum": batch_square_sum, + }, + attrs=attrs, + ) return helper.append_activation(data_norm_out) @templatedoc() -def layer_norm(input, - scale=True, - shift=True, - begin_norm_axis=1, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - act=None, - name=None): +def layer_norm( + input, + scale=True, + shift=True, + begin_norm_axis=1, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None, + name=None, +): r""" :api_attr: Static Graph @@ -3706,11 +3999,13 @@ def layer_norm(input, output = paddle.static.nn.layer_norm(input=x, begin_norm_axis=1) print(output.shape) # [8, 32, 32] """ - assert _non_static_mode( - ) is not True, "please use LayerNorm instead of layer_norm in dygraph mode!" + assert ( + _non_static_mode() is not True + ), "please use LayerNorm instead of layer_norm in dygraph mode!" helper = LayerHelper('layer_norm', **locals()) - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'layer_norm') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'layer_norm' + ) dtype = helper.input_dtype() # create intput and parameters @@ -3718,57 +4013,65 @@ def layer_norm(input, input_shape = input.shape param_shape = [reduce(lambda x, y: x * y, input_shape[begin_norm_axis:])] if scale: - assert param_attr is not False, "param_attr should not be False when using scale." - scale = helper.create_parameter(attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - default_initializer=Constant(1.0)) + assert ( + param_attr is not False + ), "param_attr should not be False when using scale." + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0), + ) inputs['Scale'] = scale else: if param_attr: warnings.warn("param_attr is only available with scale is True.") if shift: - assert bias_attr is not False, "bias_attr should not be False when using shift." - bias = helper.create_parameter(attr=helper.bias_attr, - shape=param_shape, - dtype=dtype, - is_bias=True) + assert ( + bias_attr is not False + ), "bias_attr should not be False when using shift." 
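        # When scale and shift are both enabled, the op appended below computes,
        # per normalized instance,
        #     y = scale * (x - mean) / sqrt(variance + epsilon) + bias
        # with mean/variance taken over the trailing dimensions starting at
        # begin_norm_axis (a condensed form of the formula in the docstring above).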
+ bias = helper.create_parameter( + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True + ) inputs['Bias'] = bias else: if bias_attr: warnings.warn("bias_attr is only available with shift is True.") # create output - mean_out = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) - variance_out = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) + mean_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + variance_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) layer_norm_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="layer_norm", - inputs=inputs, - outputs={ - "Y": layer_norm_out, - "Mean": mean_out, - "Variance": variance_out, - }, - attrs={ - "epsilon": epsilon, - "begin_norm_axis": begin_norm_axis - }) + helper.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": layer_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": epsilon, "begin_norm_axis": begin_norm_axis}, + ) return helper.append_activation(layer_norm_out) @templatedoc() -def group_norm(input, - groups, - epsilon=1e-05, - param_attr=None, - bias_attr=None, - act=None, - data_layout='NCHW', - name=None): +def group_norm( + input, + groups, + epsilon=1e-05, + param_attr=None, + bias_attr=None, + act=None, + data_layout='NCHW', + name=None, +): """ :api_attr: Static Graph @@ -3806,15 +4109,16 @@ def group_norm(input, import paddle paddle.enable_static() - + data = paddle.static.data(name='data', shape=[2, 8, 32, 32], dtype='float32') x = paddle.static.nn.group_norm(input=data, groups=4) print(x.shape) # [2, 8, 32, 32] """ helper = LayerHelper('group_norm', **locals()) dtype = helper.input_dtype() - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'group_norm') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'group_norm' + ) # create intput and parameters inputs = {'X': input} input_shape = input.shape @@ -3825,20 +4129,23 @@ def group_norm(input, if data_layout != 'NCHW' and data_layout != 'NHWC': raise ValueError( "Param(data_layout) of Op(fluid.layers.group_norm) got wrong value: received " - + data_layout + " but only NCHW or NHWC supported.") + + data_layout + + " but only NCHW or NHWC supported." 
+ ) channel_num = input_shape[1] if data_layout == 'NCHW' else input_shape[-1] param_shape = [channel_num] if param_attr: - scale = helper.create_parameter(attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - default_initializer=Constant(1.0)) + scale = helper.create_parameter( + attr=helper.param_attr, + shape=param_shape, + dtype=dtype, + default_initializer=Constant(1.0), + ) inputs['Scale'] = scale if bias_attr: - bias = helper.create_parameter(attr=helper.bias_attr, - shape=param_shape, - dtype=dtype, - is_bias=True) + bias = helper.create_parameter( + attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True + ) inputs['Bias'] = bias # create output @@ -3846,18 +4153,20 @@ def group_norm(input, variance_out = helper.create_variable(dtype=dtype, stop_gradient=True) group_norm_out = helper.create_variable(dtype=dtype) - helper.append_op(type="group_norm", - inputs=inputs, - outputs={ - "Y": group_norm_out, - "Mean": mean_out, - "Variance": variance_out, - }, - attrs={ - "epsilon": epsilon, - "groups": groups, - "data_layout": data_layout - }) + helper.append_op( + type="group_norm", + inputs=inputs, + outputs={ + "Y": group_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={ + "epsilon": epsilon, + "groups": groups, + "data_layout": data_layout, + }, + ) return helper.append_activation(group_norm_out) @@ -3926,8 +4235,9 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None): print(x.shape) # [2, 8, 32, 32] """ helper = LayerHelper('spectral_norm', **locals()) - check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], - 'spectral_norm') + check_variable_and_dtype( + weight, 'weight', ['float32', 'float64'], 'spectral_norm' + ) check_type(dim, 'dim', int, 'spectral_norm') check_type(power_iters, 'power_iters', int, 'spectral_norm') check_type(eps, 'eps', float, 'spectral_norm') @@ -3936,21 +4246,27 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None): # create intput and parameters input_shape = weight.shape assert weight.numel() > 0, "Any dimension of input cannot be equal to 0." 
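    # Schematic view of what the spectral_norm op does with the u/v buffers
    # created below (per the op docstring): for power_iters steps,
    #     v <- normalize(W^T @ u);  u <- normalize(W @ v)
    # then sigma = u^T @ W @ v and out = weight / sigma, where W is `weight`
    # reshaped to [h, w] with dimension `dim` permuted to the front.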
- assert dim < len(input_shape), ("The input `dim` should be less than the " - "rank of `weight`, but received dim=" - "{}".format(dim)) + assert dim < len(input_shape), ( + "The input `dim` should be less than the " + "rank of `weight`, but received dim=" + "{}".format(dim) + ) h = input_shape[dim] w = np.prod(input_shape) // h - u = helper.create_parameter(attr=ParamAttr(), - shape=[h], - dtype=dtype, - default_initializer=Normal(0., 1.)) + u = helper.create_parameter( + attr=ParamAttr(), + shape=[h], + dtype=dtype, + default_initializer=Normal(0.0, 1.0), + ) u.stop_gradient = True - v = helper.create_parameter(attr=ParamAttr(), - shape=[w], - dtype=dtype, - default_initializer=Normal(0., 1.)) + v = helper.create_parameter( + attr=ParamAttr(), + shape=[w], + dtype=dtype, + default_initializer=Normal(0.0, 1.0), + ) v.stop_gradient = True if in_dygraph_mode(): @@ -3963,34 +4279,38 @@ def spectral_norm(weight, dim=0, power_iters=1, eps=1e-12, name=None): # create output out = helper.create_variable(dtype=dtype) - helper.append_op(type="spectral_norm", - inputs=inputs, - outputs={ - "Out": out, - }, - attrs={ - "dim": dim, - "power_iters": power_iters, - "eps": eps, - }) + helper.append_op( + type="spectral_norm", + inputs=inputs, + outputs={ + "Out": out, + }, + attrs={ + "dim": dim, + "power_iters": power_iters, + "eps": eps, + }, + ) return out -def conv2d_transpose(input, - num_filters, - output_size=None, - filter_size=None, - padding=0, - stride=1, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - name=None, - data_format='NCHW'): +def conv2d_transpose( + input, + num_filters, + output_size=None, + filter_size=None, + padding=0, + stride=1, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None, + data_format='NCHW', +): r""" :api_attr: Static Graph @@ -4146,20 +4466,29 @@ def conv2d_transpose(input, conv2d_transpose = paddle.static.nn.conv2d_transpose(input=data, num_filters=2, filter_size=3) print(conv2d_transpose.shape) # [-1, 2, 34, 34] """ - assert param_attr is not False, "param_attr should not be False in conv2d_transpose." + assert ( + param_attr is not False + ), "param_attr should not be False in conv2d_transpose." if len(input.shape) != 4: - raise ValueError("Input size should be 4, " - "but received {}".format(len(input.shape))) + raise ValueError( + "Input size should be 4, " + "but received {}".format(len(input.shape)) + ) if data_format not in ['NCHW', 'NHWC']: raise ValueError( "Attr(data_format) of Op(fluid.layers.conv2d_transpose) got wrong value: received " - + data_format + " but only NCHW or NHWC supported.") + + data_format + + " but only NCHW or NHWC supported." + ) input_channel = input.shape[1] if data_format == 'NCHW' else input.shape[-1] op_type = 'conv2d_transpose' - if (input_channel == groups and num_filters == input_channel - and not use_cudnn): + if ( + input_channel == groups + and num_filters == input_channel + and not use_cudnn + ): op_type = 'depthwise_conv2d_transpose' helper = LayerHelper(op_type, **locals()) @@ -4173,7 +4502,6 @@ def conv2d_transpose(input, raise ValueError("use_cudnn should be True or False") def _update_padding(padding, data_format): - def is_list_or_tuple(ele): if isinstance(ele, list) or isinstance(ele, tuple): return True @@ -4184,14 +4512,16 @@ def is_list_or_tuple(ele): if not (padding[0] == [0, 0] and padding[1] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." 
% str(padding)) + "is not supported." % str(padding) + ) padding = padding[2:4] padding = [ele for a_list in padding for ele in a_list] elif is_list_or_tuple(padding[0]) and (data_format == "NHWC"): if not (padding[0] == [0, 0] and padding[3] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[1:3] padding = [ele for a_list in padding for ele in a_list] padding = utils.convert_to_list(padding, 4, 'padding') @@ -4205,8 +4535,9 @@ def is_list_or_tuple(ele): padding = padding.upper() if padding not in ["SAME", "VALID"]: raise ValueError( - "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." % - str(padding)) + "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." + % str(padding) + ) if padding == "VALID": padding_algorithm = "VALID" padding = [0, 0, 0, 0] @@ -4226,44 +4557,63 @@ def is_list_or_tuple(ele): elif isinstance(output_size, int): output_size = utils.convert_to_list(output_size, 2, 'output_size') elif isinstance(output_size, Variable): - check_dtype(output_size.dtype, 'output_size', ['int32', 'int64'], - 'conv2d_transpose') - if len(output_size.shape) == 1 and (output_size.shape[0] == 1 - or output_size.shape[0] == 2): + check_dtype( + output_size.dtype, + 'output_size', + ['int32', 'int64'], + 'conv2d_transpose', + ) + if len(output_size.shape) == 1 and ( + output_size.shape[0] == 1 or output_size.shape[0] == 2 + ): if output_size.shape[0] == 1: output_size = [output_size, output_size] else: raise ValueError("output_size must contain one or two integers.") else: raise ValueError( - "output_size should be int, list[int] or tuple[int] or Tensor") + "output_size should be int, list[int] or tuple[int] or Tensor" + ) if filter_size is None: if output_size is []: raise ValueError("output_size must be set when filter_size is None") if not _non_static_mode(): - if isinstance(output_size, - Variable) or utils._contain_var(output_size): + if isinstance(output_size, Variable) or utils._contain_var( + output_size + ): raise ValueError( "filter_size should not be None when output_size is Variable or contain Variable in static mode." 
) else: output_size = utils.convert_shape_to_list(output_size) if len(output_size) == 1: - output_size = utils.convert_to_list(output_size[0], 2, - 'output_size') + output_size = utils.convert_to_list( + output_size[0], 2, 'output_size' + ) h_in = input.shape[2] if data_format == 'NCHW' else input.shape[1] w_in = input.shape[3] if data_format == 'NCHW' else input.shape[2] - filter_size_h = (output_size[0] - (h_in - 1) * stride[0] + padding[0] + - padding[1] - 1) // dilation[0] + 1 - filter_size_w = (output_size[1] - (w_in - 1) * stride[1] + padding[2] + - padding[3] - 1) // dilation[1] + 1 + filter_size_h = ( + output_size[0] + - (h_in - 1) * stride[0] + + padding[0] + + padding[1] + - 1 + ) // dilation[0] + 1 + filter_size_w = ( + output_size[1] + - (w_in - 1) * stride[1] + + padding[2] + + padding[3] + - 1 + ) // dilation[1] + 1 filter_size = [filter_size_h, filter_size_w] else: - filter_size = utils.convert_to_list(filter_size, 2, - 'conv2d_transpose.filter_size') + filter_size = utils.convert_to_list( + filter_size, 2, 'conv2d_transpose.filter_size' + ) if len(padding) == 4 and utils._is_symmetric_padding(padding, 2): padding = [padding[0], padding[2]] @@ -4273,31 +4623,31 @@ def is_list_or_tuple(ele): elif groups <= 0: raise ValueError( "the groups of input must be greater than 0, " - "but received the groups of input is {}".format(groups)) + "but received the groups of input is {}".format(groups) + ) filter_shape = [input_channel, num_filters // groups] + filter_size - img_filter = helper.create_parameter(dtype=input.dtype, - shape=filter_shape, - attr=helper.param_attr) + img_filter = helper.create_parameter( + dtype=input.dtype, shape=filter_shape, attr=helper.param_attr + ) pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type=op_type, - inputs={ - 'Input': [input], - 'Filter': [img_filter] - }, - outputs={'Output': pre_bias}, - attrs={ - 'output_size': output_size, - 'strides': stride, - 'paddings': padding, - 'padding_algorithm': padding_algorithm, - 'dilations': dilation, - 'groups': groups, - 'use_cudnn': use_cudnn, - 'data_format': data_format - }) + helper.append_op( + type=op_type, + inputs={'Input': [input], 'Filter': [img_filter]}, + outputs={'Output': pre_bias}, + attrs={ + 'output_size': output_size, + 'strides': stride, + 'paddings': padding, + 'padding_algorithm': padding_algorithm, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'data_format': data_format, + }, + ) if data_format == 'NCHW': pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) @@ -4307,20 +4657,22 @@ def is_list_or_tuple(ele): return out -def conv3d_transpose(input, - num_filters, - output_size=None, - filter_size=None, - padding=0, - stride=1, - dilation=1, - groups=None, - param_attr=None, - bias_attr=None, - use_cudnn=True, - act=None, - name=None, - data_format='NCDHW'): +def conv3d_transpose( + input, + num_filters, + output_size=None, + filter_size=None, + padding=0, + stride=1, + dilation=1, + groups=None, + param_attr=None, + bias_attr=None, + use_cudnn=True, + act=None, + name=None, + data_format='NCDHW', +): r""" :api_attr: Static Graph @@ -4484,11 +4836,15 @@ def conv3d_transpose(input, output = exe.run(feed={"data": x}, fetch_list=[res]) print(output) """ - assert param_attr is not False, "param_attr should not be False in conv3d_transpose." + assert ( + param_attr is not False + ), "param_attr should not be False in conv3d_transpose." 
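    # For reference, a transposed convolution's minimal spatial output size is
    #     out = (in - 1) * stride - pad_begin - pad_end + dilation * (k - 1) + 1
    # and the filter-size derivation further down inverts this relation to
    # recover k when output_size is given but filter_size is None.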
if data_format not in ['NCDHW', 'NDHWC']: raise ValueError( "Param(data_format) of Op(fluid.layers.conv3d_transpose) got wrong value: received " - + data_format + " but only NCDHW or NDHWC supported.") + + data_format + + " but only NCDHW or NDHWC supported." + ) l_type = "conv3d_transpose" helper = LayerHelper(l_type, **locals()) @@ -4496,9 +4852,13 @@ def conv3d_transpose(input, raise TypeError("Input of conv3d_transpose must be Variable") if len(input.shape) != 5: raise ValueError( - "Input should be 5D tensor, but received input with the shape of {}" - .format(input.shape)) - input_channel = input.shape[1] if data_format == 'NCDHW' else input.shape[-1] + "Input should be 5D tensor, but received input with the shape of {}".format( + input.shape + ) + ) + input_channel = ( + input.shape[1] if data_format == 'NCDHW' else input.shape[-1] + ) stride = utils.convert_to_list(stride, 3, 'stride') dilation = utils.convert_to_list(dilation, 3, 'dilation') @@ -4507,7 +4867,6 @@ def conv3d_transpose(input, raise ValueError("use_cudnn should be True or False") def _update_padding(padding, data_format): - def is_list_or_tuple(ele): if isinstance(ele, list) or isinstance(ele, tuple): return True @@ -4518,14 +4877,16 @@ def is_list_or_tuple(ele): if not (padding[0] == [0, 0] and padding[1] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[2:5] padding = [ele for a_list in padding for ele in a_list] elif is_list_or_tuple(padding[0]) and (data_format == "NDHWC"): if not (padding[0] == [0, 0] and padding[4] == [0, 0]): raise ValueError( "Non-zero padding(%s) in the batch or channel dimensions " - "is not supported." % str(padding)) + "is not supported." % str(padding) + ) padding = padding[1:4] padding = [ele for a_list in padding for ele in a_list] padding = utils.convert_to_list(padding, 6, 'padding') @@ -4536,8 +4897,12 @@ def is_list_or_tuple(ele): else: padding = utils.convert_to_list(padding, 3, 'padding') padding = [ - padding[0], padding[0], padding[1], padding[1], padding[2], - padding[2] + padding[0], + padding[0], + padding[1], + padding[1], + padding[2], + padding[2], ] return padding @@ -4546,8 +4911,9 @@ def is_list_or_tuple(ele): padding = padding.upper() if padding not in ["SAME", "VALID"]: raise ValueError( - "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." % - str(padding)) + "Unknown padding: '%s'. It can only be 'SAME' or 'VALID'." 
+ % str(padding) + ) if padding == "VALID": padding_algorithm = "VALID" padding = [0, 0, 0, 0, 0, 0] @@ -4567,16 +4933,32 @@ def is_list_or_tuple(ele): h_in = input.shape[3] if data_format == 'NCDHW' else input.shape[2] w_in = input.shape[4] if data_format == 'NCDHW' else input.shape[3] - filter_size_d = (output_size[0] - (d_in - 1) * stride[0] + padding[0] + - padding[1] - 1) // dilation[0] + 1 - filter_size_h = (output_size[1] - (h_in - 1) * stride[1] + padding[2] + - padding[3] - 1) // dilation[1] + 1 - filter_size_w = (output_size[2] - (w_in - 1) * stride[2] + padding[4] + - padding[5] - 1) // dilation[2] + 1 + filter_size_d = ( + output_size[0] + - (d_in - 1) * stride[0] + + padding[0] + + padding[1] + - 1 + ) // dilation[0] + 1 + filter_size_h = ( + output_size[1] + - (h_in - 1) * stride[1] + + padding[2] + + padding[3] + - 1 + ) // dilation[1] + 1 + filter_size_w = ( + output_size[2] + - (w_in - 1) * stride[2] + + padding[4] + + padding[5] + - 1 + ) // dilation[2] + 1 filter_size = [filter_size_d, filter_size_h, filter_size_w] else: - filter_size = utils.convert_to_list(filter_size, 3, - 'conv3d_transpose.filter_size') + filter_size = utils.convert_to_list( + filter_size, 3, 'conv3d_transpose.filter_size' + ) if len(padding) == 6 and utils._is_symmetric_padding(padding, 3): padding = [padding[0], padding[2], padding[4]] @@ -4591,18 +4973,22 @@ def is_list_or_tuple(ele): groups = 1 if groups is None else groups if groups <= 0: raise ValueError( - "the groups of conv3d_transpose should be greater than 0. Received groups: {}" - .format(groups)) + "the groups of conv3d_transpose should be greater than 0. Received groups: {}".format( + groups + ) + ) if num_filters % groups != 0: raise ValueError( "Attr(num_filters) must be divisible by groups," "Received: Attr(num_filters) is {}, the groups is {}".format( - num_filters, groups)) + num_filters, groups + ) + ) filter_shape = [input_channel, num_filters // groups] + filter_size - img_filter = helper.create_parameter(dtype=input.dtype, - shape=filter_shape, - attr=helper.param_attr) + img_filter = helper.create_parameter( + dtype=input.dtype, shape=filter_shape, attr=helper.param_attr + ) if data_format == 'NCDHW': data_format = 'NCHW' @@ -4610,22 +4996,21 @@ def is_list_or_tuple(ele): data_format = 'NHWC' pre_bias = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type=l_type, - inputs={ - 'Input': [input], - 'Filter': [img_filter] - }, - outputs={'Output': pre_bias}, - attrs={ - 'output_size': output_size, - 'strides': stride, - 'paddings': padding, - 'padding_algorithm': padding_algorithm, - 'dilations': dilation, - 'groups': groups, - 'use_cudnn': use_cudnn, - 'data_format': data_format - }) + helper.append_op( + type=l_type, + inputs={'Input': [input], 'Filter': [img_filter]}, + outputs={'Output': pre_bias}, + attrs={ + 'output_size': output_size, + 'strides': stride, + 'paddings': padding, + 'padding_algorithm': padding_algorithm, + 'dilations': dilation, + 'groups': groups, + 'use_cudnn': use_cudnn, + 'data_format': data_format, + }, + ) if data_format == 'NCHW': pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) @@ -4691,37 +5076,47 @@ def reduce_sum(input, dim=None, keep_dim=False, name=None): dim = [dim] if in_dygraph_mode(): - reduce_all = True if dim == None or dim == [] or len(dim) == len( - input.shape) else False + reduce_all = ( + True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False + ) dim = dim if dim != None and dim != [] else [0] if reduce_all: 
return _C_ops.sum(input, [], None, keep_dim) else: return _C_ops.sum(input, dim, None, keep_dim) elif _in_legacy_dygraph(): - reduce_all = True if dim == None or dim == [] or len(dim) == len( - input.shape) else False + reduce_all = ( + True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False + ) dim = dim if dim != None and dim != [] else [0] - return _legacy_C_ops.reduce_sum(input, 'dim', dim, 'keep_dim', keep_dim, - 'reduce_all', reduce_all) + return _legacy_C_ops.reduce_sum( + input, 'dim', dim, 'keep_dim', keep_dim, 'reduce_all', reduce_all + ) attrs = { - 'dim': - dim if dim != None and dim != [] else [0], - 'keep_dim': - keep_dim, - 'reduce_all': - True - if dim == None or dim == [] or len(dim) == len(input.shape) else False + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, } check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'reduce_sum') + input, + 'input', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'reduce_sum', + ) helper = LayerHelper('reduce_sum', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) - helper.append_op(type='reduce_sum', - inputs={'X': input}, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='reduce_sum', + inputs={'X': input}, + outputs={'Out': out}, + attrs=attrs, + ) return out @@ -4839,18 +5234,18 @@ def reduce_max(input, dim=None, keep_dim=False, name=None): if in_dygraph_mode(): return _C_ops.max(input, dim if dim != None else [], keep_dim) - helper.append_op(type='reduce_max', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'dim': - dim if dim != None and dim != [] else [0], - 'keep_dim': - keep_dim, - 'reduce_all': - True if dim == None or dim == [] - or len(dim) == len(input.shape) else False - }) + helper.append_op( + type='reduce_max', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, + }, + ) return out @@ -4911,18 +5306,18 @@ def reduce_min(input, dim=None, keep_dim=False, name=None): if in_dygraph_mode(): return _C_ops.min(input, dim if dim != None else [], keep_dim) - helper.append_op(type='reduce_min', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'dim': - dim if dim != None and dim != [] else [0], - 'keep_dim': - keep_dim, - 'reduce_all': - True if dim == None or dim == [] - or len(dim) == len(input.shape) else False - }) + helper.append_op( + type='reduce_min', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, + }, + ) return out @@ -4983,30 +5378,37 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): dim = [dim] else: raise TypeError( - "The type of axis must be int, list or tuple, but received {}". 
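Each reduce_* hunk above repeats the same expression when filling the `reduce_all` attribute. Isolated as a sketch (the helper name is made up), the rule is simply: reduce over everything when no axes were given or when the axes cover every dimension of the input:

.. code-block:: python

    def should_reduce_all(dim, input_rank):
        """Mirror the reduce_all expression used by reduce_sum/max/min/prod/all/any."""
        return dim is None or dim == [] or len(dim) == input_rank

    print(should_reduce_all(None, 3))        # True  - no axes supplied
    print(should_reduce_all([0, 2], 3))      # False - only a subset of the axes
    print(should_reduce_all([0, 1, 2], 3))   # True  - every axis listed explicitly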
- format(type(dim))) + "The type of axis must be int, list or tuple, but received {}".format( + type(dim) + ) + ) if in_dygraph_mode(): return _C_ops.reduce_prod( - input, dim if dim != None and dim != [] else [0], keep_dim, True if - dim == None or dim == [] or len(dim) == len(input.shape) else False) + input, + dim if dim != None and dim != [] else [0], + keep_dim, + True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, + ) helper = LayerHelper('reduce_prod', **locals()) - check_variable_and_dtype(input, 'input', - ['float32', 'float64', 'int32', 'int64'], - 'reduce_prod') + check_variable_and_dtype( + input, 'input', ['float32', 'float64', 'int32', 'int64'], 'reduce_prod' + ) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) - helper.append_op(type='reduce_prod', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'dim': - dim if dim != None and dim != [] else [0], - 'keep_dim': - keep_dim, - 'reduce_all': - True if dim == None or dim == [] - or len(dim) == len(input.shape) else False - }) + helper.append_op( + type='reduce_prod', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, + }, + ) return out @@ -5063,18 +5465,18 @@ def reduce_all(input, dim=None, keep_dim=False, name=None): check_variable_and_dtype(input, 'input', ('bool'), 'reduce_all') helper = LayerHelper('reduce_all', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) - helper.append_op(type='reduce_all', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'dim': - dim if dim != None and dim != [] else [0], - 'keep_dim': - keep_dim, - 'reduce_all': - True if dim == None or dim == [] - or len(dim) == len(input.shape) else False - }) + helper.append_op( + type='reduce_all', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, + }, + ) return out @@ -5126,18 +5528,18 @@ def reduce_any(input, dim=None, keep_dim=False, name=None): out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) if dim is not None and not isinstance(dim, list): dim = [dim] - helper.append_op(type='reduce_any', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'dim': - dim if dim != None and dim != [] else [0], - 'keep_dim': - keep_dim, - 'reduce_all': - True if dim == None or dim == [] - or len(dim) == len(input.shape) else False - }) + helper.append_op( + type='reduce_any', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None and dim != [] else [0], + 'keep_dim': keep_dim, + 'reduce_all': True + if dim == None or dim == [] or len(dim) == len(input.shape) + else False, + }, + ) return out @@ -5147,15 +5549,15 @@ def split(input, num_or_sections, dim=-1, name=None): Args: input (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, int32 or int64. - num_or_sections (int|list|tuple): If ``num_or_sections`` is int, then the ``num_or_sections`` + num_or_sections (int|list|tuple): If ``num_or_sections`` is int, then the ``num_or_sections`` indicates the number of equal sized sub-Tensors that the ``input`` - will be divided into. If ``num_or_sections`` is a list or tuple, the length of it + will be divided into. 
If ``num_or_sections`` is a list or tuple, the length of it indicates the number of sub-Tensors and the elements in it indicate the sizes of sub-Tensors' dimension orderly. The length of the list mustn't be larger than the ``input`` 's size of specified dim. dim (int|Tensor, optional): The dimension along which to split, it can be a scalar with type ``int`` or a ``Tensor`` with shape [1] and data type ``int32`` or ``int64``. If :math:`dim < 0`, the dimension to split along is :math:`rank(input) + dim`. Default is -1. - name (str, optional): The default value is None. Normally there is no need for user to set this property. + name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Returns: @@ -5184,7 +5586,7 @@ def split(input, num_or_sections, dim=-1, name=None): # out0.shape [3, 2, 5] # out1.shape [3, 3, 5] # out2.shape [3, 4, 5] - + # dim is negative, the real dim is (rank(input) + axis) which real # value is 1. out0, out1, out2 = fluid.layers.split(input, num_or_sections=3, dim=-2) @@ -5212,15 +5614,17 @@ def split(input, num_or_sections, dim=-1, name=None): if utils._contain_var(num_or_sections): for index, item in enumerate(num_or_sections): if isinstance(item, Variable): - num_or_sections[index] = num_or_sections[index].numpy( - )[0] + num_or_sections[index] = num_or_sections[index].numpy()[ + 0 + ] attrs += ('sections', list(num_or_sections)) else: attrs += ('sections', list(num_or_sections)) else: raise TypeError( "The type of 'num_or_sections' in split must be int, list or tuple in imperative mode, but " - "received %s." % (type(num_or_sections))) + "received %s." % (type(num_or_sections)) + ) if in_dygraph_mode(): if isinstance(num_or_sections, int): return _C_ops.split_with_num(input, num_or_sections, dim) @@ -5232,8 +5636,11 @@ def split(input, num_or_sections, dim=-1, name=None): return out check_variable_and_dtype( - input, 'input', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], 'split') + input, + 'input', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'split', + ) check_type(num_or_sections, 'num_or_sections', (list, int, tuple), 'split') check_type(dim, 'dim', (int, Variable), 'split') if isinstance(dim, Variable): @@ -5253,19 +5660,18 @@ def _get_SectionsTensorList(one_list): dim_size.stop_gradient = True tensor_list.append(dim_size) else: - assert (isinstance(dim_size, int)) + assert isinstance(dim_size, int) if dim_size == -1: assert unk_dim_idx == -1, ( "Only one value of 'num_or_section' in split can " - "be -1. But received num_or_section[%d] is also -1." % - idx) + "be -1. But received num_or_section[%d] is also -1." + % idx + ) unk_dim_idx = idx temp_out = helper.create_variable_for_type_inference('int32') - fill_constant([1], - 'int32', - dim_size, - force_cpu=True, - out=temp_out) + fill_constant( + [1], 'int32', dim_size, force_cpu=True, out=temp_out + ) tensor_list.append(temp_out) return tensor_list @@ -5280,31 +5686,37 @@ def _get_SectionsTensorList(one_list): if isinstance(num_or_sections, int): assert num_or_sections > 1, 'num_or_sections must be more than 1.' if isinstance(dim, int) and input_shape[dim] > 0: - assert input_shape[dim] % num_or_sections ==0, \ - "The input's size along the split dimension " \ - "must be evenly divisible by Attr(num_or_sections). " \ - "But %d is not evenly divisible by %d. 
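The split hunk below this docstring enforces that at most one entry of a list-form `num_or_sections` may be -1. For orientation, a small illustrative sketch of how such a sections list resolves against the size of the split dimension; the actual resolution happens inside the split op, so this helper is only an illustration of the documented rule:

.. code-block:: python

    def resolve_sections(sections, dim_size):
        """Replace a single -1 entry with the remaining length along the split dim."""
        assert sections.count(-1) <= 1, "only one section may be -1"
        known = sum(s for s in sections if s != -1)
        return [dim_size - known if s == -1 else s for s in sections]

    # splitting a dimension of size 9 into [2, 3, -1] yields pieces of 2, 3 and 4
    print(resolve_sections([2, 3, -1], 9))  # [2, 3, 4]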
" % (num_or_sections,input_shape[dim]) + assert input_shape[dim] % num_or_sections == 0, ( + "The input's size along the split dimension " + "must be evenly divisible by Attr(num_or_sections). " + "But %d is not evenly divisible by %d. " + % (num_or_sections, input_shape[dim]) + ) num = num_or_sections else: if isinstance(dim, int) and input_shape[dim] > 0: - assert len(num_or_sections) <= input_shape[ - dim], 'len(num_or_sections) must not be more than input.shape[dim].' + assert ( + len(num_or_sections) <= input_shape[dim] + ), 'len(num_or_sections) must not be more than input.shape[dim].' num = len(num_or_sections) attrs['sections'] = list( - map(lambda ele: -1 - if isinstance(ele, Variable) else ele, num_or_sections)) + map( + lambda ele: -1 if isinstance(ele, Variable) else ele, + num_or_sections, + ) + ) if utils._contain_var(num_or_sections): inputs['SectionsTensorList'] = _get_SectionsTensorList( - num_or_sections) + num_or_sections + ) outs = [ helper.create_variable_for_type_inference(dtype=helper.input_dtype()) for i in range(num) ] - helper.append_op(type='split', - inputs=inputs, - outputs={'Out': outs}, - attrs=attrs) + helper.append_op( + type='split', inputs=inputs, outputs={'Out': outs}, attrs=attrs + ) return outs @@ -5355,8 +5767,9 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): if in_dygraph_mode(): out, _ = _C_ops.norm(x, 1 if axis is None else axis, epsilon, False) elif _in_legacy_dygraph(): - _, out = _legacy_C_ops.norm(x, 'axis', 1 if axis is None else axis, - 'epsilon', epsilon) + _, out = _legacy_C_ops.norm( + x, 'axis', 1 if axis is None else axis, 'epsilon', epsilon + ) return out check_variable_and_dtype(x, "X", ("float16", "float32", "float64"), "norm") @@ -5364,16 +5777,15 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): helper = LayerHelper("l2_normalize", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) norm = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="norm", - inputs={"X": x}, - outputs={ - "Out": out, - "Norm": norm - }, - attrs={ - "axis": 1 if axis is None else axis, - "epsilon": epsilon, - }) + helper.append_op( + type="norm", + inputs={"X": x}, + outputs={"Out": out, "Norm": norm}, + attrs={ + "axis": 1 if axis is None else axis, + "epsilon": epsilon, + }, + ) return out @@ -5454,16 +5866,25 @@ def matmul(x, y, transpose_x=False, transpose_y=False, alpha=1.0, name=None): """ if _non_static_mode(): out = _varbase_creator(dtype=x.dtype) - _legacy_C_ops.matmul(x, y, out, 'transpose_X', transpose_x, - 'transpose_Y', transpose_y, 'alpha', float(alpha)) + _legacy_C_ops.matmul( + x, + y, + out, + 'transpose_X', + transpose_x, + 'transpose_Y', + transpose_y, + 'alpha', + float(alpha), + ) return out def __check_input(x, y): var_names = {'x': x, 'y': y} for name, val in var_names.items(): - check_variable_and_dtype(val, name, - ['float16', 'float32', 'float64'], - 'matmul') + check_variable_and_dtype( + val, name, ['float16', 'float32', 'float64'], 'matmul' + ) x_shape = list(x.shape) y_shape = list(y.shape) if len(x_shape) == 1: @@ -5477,11 +5898,12 @@ def __check_input(x, y): if transpose_y: y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2] if x_shape[-1] != y_shape[-2]: - assert (x_shape[-1] == -1) or (y_shape[-2] == -1), \ - "After performing an optional transpose, Input X's width should be " \ - "equal to Y's width for multiplication " \ - "prerequisites. 
But received X's shape: %s, Y's shape: %s\n" % \ - (x_shape, y_shape) + assert (x_shape[-1] == -1) or (y_shape[-2] == -1), ( + "After performing an optional transpose, Input X's width should be " + "equal to Y's width for multiplication " + "prerequisites. But received X's shape: %s, Y's shape: %s\n" + % (x_shape, y_shape) + ) if len(y_shape) > 2 and len(x_shape) > 2: for i, dim_x in enumerate(x_shape[:-2]): @@ -5493,7 +5915,8 @@ def __check_input(x, y): "When the matrix is larger than 2 dimensions, the higher " "dimensional values of the two matrices need to be equal. " "But received x_shape[%d] != y_shape[%d]. X's shape: %s, " - "Y's shape: %s.\n" % (i, i, x_shape, y_shape)) + "Y's shape: %s.\n" % (i, i, x_shape, y_shape) + ) attrs = { 'transpose_X': transpose_x, @@ -5505,21 +5928,20 @@ def __check_input(x, y): helper = LayerHelper('matmul', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='matmul', - inputs={ - 'X': x, - 'Y': y - }, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='matmul', + inputs={'X': x, 'Y': y}, + outputs={'Out': out}, + attrs=attrs, + ) return out def topk(input, k, name=None): """ :alias_main: paddle.topk - :alias: paddle.topk,paddle.tensor.topk,paddle.tensor.search.topk - :old_api: paddle.fluid.layers.topk + :alias: paddle.topk,paddle.tensor.topk,paddle.tensor.search.topk + :old_api: paddle.fluid.layers.topk This OP is used to find values and indices of the k largest entries for the last dimension. @@ -5604,23 +6026,20 @@ def topk(input, k, name=None): values = helper.create_variable_for_type_inference(dtype=input.dtype) indices = helper.create_variable_for_type_inference(dtype="int64") - helper.append_op(type="top_k", - inputs=inputs, - outputs={ - "Out": [values], - "Indices": [indices] - }, - attrs=attrs) + helper.append_op( + type="top_k", + inputs=inputs, + outputs={"Out": [values], "Indices": [indices]}, + attrs=attrs, + ) values.stop_gradient = True indices.stop_gradient = True return values, indices -def ctc_greedy_decoder(input, - blank, - input_length=None, - padding_value=0, - name=None): +def ctc_greedy_decoder( + input, blank, input_length=None, padding_value=0, name=None +): r""" This op is used to decode sequences by greedy policy by the following steps: @@ -5746,8 +6165,9 @@ def ctc_greedy_decoder(input, input_length=x_pad_len) """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'ctc_greedy_decoder') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'ctc_greedy_decoder' + ) helper = LayerHelper("ctc_greedy_decoder", **locals()) _, topk_indices = topk(input, k=1) @@ -5756,32 +6176,27 @@ def ctc_greedy_decoder(input, ctc_out = helper.create_variable_for_type_inference(dtype="int64") if input_length is None: - helper.append_op(type="ctc_align", - inputs={"Input": [topk_indices]}, - outputs={"Output": [ctc_out]}, - attrs={ - "merge_repeated": True, - "blank": blank - }) + helper.append_op( + type="ctc_align", + inputs={"Input": [topk_indices]}, + outputs={"Output": [ctc_out]}, + attrs={"merge_repeated": True, "blank": blank}, + ) return ctc_out else: ctc_out_len = helper.create_variable_for_type_inference(dtype="int64") ctc_input = squeeze(topk_indices, [2]) - helper.append_op(type="ctc_align", - inputs={ - "Input": [ctc_input], - "InputLength": [input_length] - }, - outputs={ - "Output": [ctc_out], - "OutputLength": [ctc_out_len] - }, - attrs={ - "merge_repeated": True, - "blank": blank, - "padding_value": padding_value - }) + 
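The reflowed assertion above spells out matmul's shape prerequisite: after the optional transposes, X's last dimension must match Y's second-to-last. A stand-alone sketch of that check (function name hypothetical; it assumes both operands are at least 2-D, and -1 is treated as "unknown until run time" exactly as in the diff):

.. code-block:: python

    def check_matmul_shapes(x_shape, y_shape, transpose_x=False, transpose_y=False):
        """Raise if two static shapes (rank >= 2) cannot be matrix-multiplied."""
        x_shape, y_shape = list(x_shape), list(y_shape)
        if transpose_x:
            x_shape[-2], x_shape[-1] = x_shape[-1], x_shape[-2]
        if transpose_y:
            y_shape[-2], y_shape[-1] = y_shape[-1], y_shape[-2]
        # -1 marks a dimension that is only known at run time, so it is skipped
        if x_shape[-1] != y_shape[-2] and -1 not in (x_shape[-1], y_shape[-2]):
            raise ValueError(
                "X's width must equal Y's height: got %s vs %s" % (x_shape, y_shape)
            )

    check_matmul_shapes([4, 3], [3, 5])                    # fine
    check_matmul_shapes([4, 3], [5, 3], transpose_y=True)  # fine after transpose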
helper.append_op( + type="ctc_align", + inputs={"Input": [ctc_input], "InputLength": [input_length]}, + outputs={"Output": [ctc_out], "OutputLength": [ctc_out_len]}, + attrs={ + "merge_repeated": True, + "blank": blank, + "padding_value": padding_value, + }, + ) return ctc_out, ctc_out_len @@ -5842,10 +6257,21 @@ def transpose(x, perm, name=None): out, _ = _legacy_C_ops.transpose2(x, 'axis', perm) return out - check_variable_and_dtype(x, 'x', [ - 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', - 'complex128' - ], 'transpose') + check_variable_and_dtype( + x, + 'x', + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + 'transpose', + ) check_type(perm, 'perm', (list, tuple), 'transpose') if isinstance(perm, tuple): perm = list(perm) @@ -5854,34 +6280,37 @@ def transpose(x, perm, name=None): "Input(perm) is the permutation of dimensions of Input(x), " "its length should be equal to dimensions of Input(x), " "but received dimension of Input(x) is %s, " - "the length of Input(perm) is %s." % (len(x.shape), len(perm))) + "the length of Input(perm) is %s." % (len(x.shape), len(perm)) + ) for idx, dim in enumerate(perm): if dim >= len(x.shape): raise ValueError( "Each element in Input(perm) should be less than Input(x)'s dimension, " "but %d-th element in Input(perm) is %d which exceeds Input(x)'s " - "dimension %d." % (idx, perm[idx], len(x.shape))) + "dimension %d." % (idx, perm[idx], len(x.shape)) + ) helper = LayerHelper('transpose', **locals()) out = helper.create_variable_for_type_inference(x.dtype) x_shape = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='transpose2', - inputs={'X': [x]}, - outputs={ - 'Out': [out], - 'XShape': [x_shape] - }, - attrs={'axis': perm}) + helper.append_op( + type='transpose2', + inputs={'X': [x]}, + outputs={'Out': [out], 'XShape': [x_shape]}, + attrs={'axis': perm}, + ) return out -def im2sequence(input, - filter_size=1, - stride=1, - padding=0, - input_image_size=None, - out_stride=1, - name=None): +def im2sequence( + input, + filter_size=1, + stride=1, + padding=0, + input_image_size=None, + out_stride=1, + name=None, +): r""" :api_attr: Static Graph @@ -5995,8 +6424,9 @@ def im2sequence(input, """ - assert not _non_static_mode(), ( - "sequence layer is not supported in dygraph mode yet.") + assert ( + not _non_static_mode() + ), "sequence layer is not supported in dygraph mode yet." 
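The transpose hunk above validates `perm` in two steps: its length must equal the input rank, and every entry must index an existing dimension. A compact sketch of the same validation (the helper name is invented; like the original it does not check for duplicates or negative entries):

.. code-block:: python

    def validate_perm(perm, rank):
        """Check a transpose permutation the same way the layer does."""
        if len(perm) != rank:
            raise ValueError(
                "perm length %d must equal input rank %d" % (len(perm), rank)
            )
        for idx, dim in enumerate(perm):
            if dim >= rank:
                raise ValueError(
                    "perm[%d] = %d exceeds input rank %d" % (idx, dim, rank)
                )

    validate_perm([2, 0, 1], 3)  # a valid permutation of a 3-D tensor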
check_variable_and_dtype(input, 'input', ['float32'], 'im2sequence') @@ -6018,10 +6448,9 @@ def im2sequence(input, attrs["out_stride"] = out_stride helper = LayerHelper('im2sequence', **locals()) out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) - helper.append_op(type='im2sequence', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return out @@ -6061,16 +6490,15 @@ def row_conv(input, future_context_size, param_attr=None, act=None): check_variable_and_dtype(input, 'input', ['float32'], 'row_conv') dtype = helper.input_dtype() filter_shape = [future_context_size + 1, input.shape[-1]] - filter_param = helper.create_parameter(attr=helper.param_attr, - shape=filter_shape, - dtype=dtype) + filter_param = helper.create_parameter( + attr=helper.param_attr, shape=filter_shape, dtype=dtype + ) out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='row_conv', - inputs={ - 'X': [input], - 'Filter': [filter_param] - }, - outputs={'Out': [out]}) + helper.append_op( + type='row_conv', + inputs={'X': [input], 'Filter': [filter_param]}, + outputs={'Out': [out]}, + ) return helper.append_activation(out) @@ -6136,20 +6564,23 @@ def multiplex(inputs, index, name=None): check_type(inputs, 'inputs', (list), 'multiplex') if len(inputs) < 2: raise ValueError( - "inputs should be a list object with at least 2 elements.") + "inputs should be a list object with at least 2 elements." + ) for id, x in enumerate(inputs): - check_variable_and_dtype(x, 'input[' + str(id) + ']', - ['float32', 'float64', 'int32', 'int64'], - 'multiplex') + check_variable_and_dtype( + x, + 'input[' + str(id) + ']', + ['float32', 'float64', 'int32', 'int64'], + 'multiplex', + ) check_variable_and_dtype(index, "index", ['int32', 'int64'], 'multiplex') out = helper.create_variable_for_type_inference(inputs[0].dtype) - helper.append_op(type='multiplex', - inputs={ - 'X': inputs, - 'Ids': index - }, - outputs={'Out': [out]}) + helper.append_op( + type='multiplex', + inputs={'X': inputs, 'Ids': index}, + outputs={'Out': [out]}, + ) return out @@ -6216,18 +6647,17 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None): diff = helper.create_variable_for_type_inference(dtype=x.dtype) loss = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='smooth_l1_loss', - inputs={ - 'X': x, - 'Y': y, - 'InsideWeight': inside_weight, - 'OutsideWeight': outside_weight - }, - outputs={ - 'Diff': diff, - 'Out': loss - }, - attrs={'sigma': sigma if sigma is not None else 1.0}) + helper.append_op( + type='smooth_l1_loss', + inputs={ + 'X': x, + 'Y': y, + 'InsideWeight': inside_weight, + 'OutsideWeight': outside_weight, + }, + outputs={'Diff': diff, 'Out': loss}, + attrs={'sigma': sigma if sigma is not None else 1.0}, + ) return loss @@ -6319,10 +6749,12 @@ def one_hot(input, depth, allow_out_of_range=False): if isinstance(depth, Variable): depth = depth.numpy() assert depth.shape == ( - 1, ), "depth of type Variable should have shape [1]" + 1, + ), "depth of type Variable should have shape [1]" depth = depth.item(0) - out = _legacy_C_ops.one_hot(input, 'depth', depth, 'allow_out_of_range', - allow_out_of_range) + out = _legacy_C_ops.one_hot( + input, 'depth', depth, 'allow_out_of_range', allow_out_of_range + ) out.stop_gradient = True return out @@ -6339,10 +6771,9 @@ def one_hot(input, depth, allow_out_of_range=False): depth.stop_gradient = True inputs = {'X': 
input, 'depth_tensor': depth} attrs = {'allow_out_of_range': allow_out_of_range} - helper.append_op(type="one_hot", - inputs=inputs, - attrs=attrs, - outputs={'Out': one_hot_out}) + helper.append_op( + type="one_hot", inputs=inputs, attrs=attrs, outputs={'Out': one_hot_out} + ) one_hot_out.stop_gradient = True return one_hot_out @@ -6380,16 +6811,18 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): dtype='int64', shape=[1], persistable=True, - belong_to_optimizer=True) + belong_to_optimizer=True, + ) if is_new_var: - helper.set_variable_initializer(counter, - initializer=Constant(value=begin - 1, - force_cpu=True)) + helper.set_variable_initializer( + counter, initializer=Constant(value=begin - 1, force_cpu=True) + ) helper.main_program.global_block()._prepend_op( type='increment', inputs={'X': [counter]}, outputs={'Out': [counter]}, - attrs={'step': float(step)}) + attrs={'step': float(step)}, + ) counter.stop_gradient = True return counter @@ -6493,7 +6926,7 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): """ if in_dygraph_mode(): tmp_tensor_type = core.eager.Tensor - #TODO(zhiqiu): enable inplace in dygraph mode. + # TODO(zhiqiu): enable inplace in dygraph mode. if inplace: warnings.warn( "Inplace on reshape is not allowed and will be discarded in dygraph mode currently." @@ -6511,7 +6944,8 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): else: raise ValueError( "shape must be an instance of `list`, `tuple` or `Variable`," - " got '{}.'".format(type(shape))) + " got '{}.'".format(type(shape)) + ) return dygraph_utils._append_activation_in_dygraph(out, act) else: @@ -6533,14 +6967,26 @@ def reshape(x, shape, actual_shape=None, act=None, inplace=False, name=None): else: raise ValueError( "shape must be an instance of `list`, `tuple` or `Variable`," - " got '{}.'".format(type(shape))) + " got '{}.'".format(type(shape)) + ) return dygraph_utils._append_activation_in_dygraph(out, act) - check_variable_and_dtype(x, 'x', [ - 'float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'bool', - 'uint16' - ], 'reshape') + check_variable_and_dtype( + x, + 'x', + [ + 'float16', + 'float32', + 'float64', + 'int16', + 'int32', + 'int64', + 'bool', + 'uint16', + ], + 'reshape', + ) check_type(shape, 'shape', (list, tuple, Variable), 'reshape') check_type(actual_shape, 'actual_shape', (Variable, type(None)), 'reshape') @@ -6564,20 +7010,23 @@ def get_attr_shape(list_shape): "\t# z.shape is [-1, -1, 4]\n\n" " If your target shape in Reshape represents dynamic shape, " "please turn it into a Tensor under @to_static. See above example for details." - % dim_idx) + % dim_idx + ) unk_dim_idx = dim_idx elif dim_size == 0: assert dim_idx < len(x.shape), ( "The index of 0 in `shape` must be less than " "the input tensor X's dimensions. " - "But received shape[%d] = 0, X's dimensions = %d." % - (dim_idx, len(x.shape))) + "But received shape[%d] = 0, X's dimensions = %d." + % (dim_idx, len(x.shape)) + ) else: assert dim_size > 0, ( "Each dimension value of 'shape' in reshape must not " "be negative except one unknown dimension. " - "But received shape[%d] = %s." % - (dim_idx, str(dim_size))) + "But received shape[%d] = %s." 
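For context on the one_hot hunk above, a hedged numpy reference of the encoding the op performs. The `allow_out_of_range` behaviour is deliberately not modelled here, since the diff does not spell it out; this sketch assumes every id lies in [0, depth):

.. code-block:: python

    import numpy as np

    def one_hot_ref(ids, depth):
        """Reference one-hot encoding of an integer id vector: [N] -> [N, depth]."""
        out = np.zeros((len(ids), depth), dtype=np.float32)
        out[np.arange(len(ids)), ids] = 1.0
        return out

    print(one_hot_ref([1, 1, 3, 0], depth=4))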
+ % (dim_idx, str(dim_size)) + ) return attrs_shape inputs = {"X": x} @@ -6586,8 +7035,10 @@ def get_attr_shape(list_shape): shape.stop_gradient = True inputs["Shape"] = shape elif isinstance(shape, (list, tuple)): - assert len(shape) > 0, ("The size of 'shape' in reshape can't be zero, " - "but received %s." % len(shape)) + assert len(shape) > 0, ( + "The size of 'shape' in reshape can't be zero, " + "but received %s." % len(shape) + ) attrs["shape"] = get_attr_shape(shape) if utils._contain_var(shape): inputs['ShapeTensor'] = utils._convert_to_tensor_list(shape) @@ -6595,16 +7046,18 @@ def get_attr_shape(list_shape): actual_shape.stop_gradient = True inputs["Shape"] = actual_shape - out = x if inplace else helper.create_variable_for_type_inference( - dtype=x.dtype) + out = ( + x + if inplace + else helper.create_variable_for_type_inference(dtype=x.dtype) + ) x_shape = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="reshape2", - inputs=inputs, - attrs=attrs, - outputs={ - "Out": out, - "XShape": x_shape - }) + helper.append_op( + type="reshape2", + inputs=inputs, + attrs=attrs, + outputs={"Out": out, "XShape": x_shape}, + ) return helper.append_activation(out) @@ -6669,10 +7122,22 @@ def squeeze(input, axes, name=None): return out helper = LayerHelper("squeeze", **locals()) - check_variable_and_dtype(input, 'input', [ - 'float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64', - 'complex64', 'complex128' - ], 'squeeze') + check_variable_and_dtype( + input, + 'input', + [ + 'float16', + 'float32', + 'float64', + 'bool', + 'int8', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + 'squeeze', + ) check_type(axes, 'axis/axes', (list, tuple, Variable), 'squeeze') attrs = {} @@ -6686,13 +7151,12 @@ def squeeze(input, axes, name=None): attrs["axes"] = axes out = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type="squeeze2", - inputs={"X": input}, - attrs=attrs, - outputs={ - "Out": out, - "XShape": x_shape - }) + helper.append_op( + type="squeeze2", + inputs={"X": input}, + attrs=attrs, + outputs={"Out": out, "XShape": x_shape}, + ) return out @@ -6742,18 +7206,23 @@ def unsqueeze(input, axes, name=None): return _C_ops.unsqueeze(input, axes) check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') - check_variable_and_dtype(input, 'input', [ - 'float16', - 'float32', - 'float64', - 'bool', - 'int8', - 'int16', - 'int32', - 'int64', - 'complex64', - 'complex128', - ], 'unsqueeze') + check_variable_and_dtype( + input, + 'input', + [ + 'float16', + 'float32', + 'float64', + 'bool', + 'int8', + 'int16', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + 'unsqueeze', + ) helper = LayerHelper("unsqueeze2", **locals()) inputs = {"X": input} attrs = {} @@ -6771,13 +7240,12 @@ def unsqueeze(input, axes, name=None): out = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type="unsqueeze2", - inputs=inputs, - attrs=attrs, - outputs={ - "Out": out, - "XShape": x_shape - }) + helper.append_op( + type="unsqueeze2", + inputs=inputs, + attrs=attrs, + outputs={"Out": out, "XShape": x_shape}, + ) return out @@ -6841,10 +7309,10 @@ def lod_reset(x, y=None, target_lod=None): out.dims = [6, 1] Args: - x (Variable): Input variable which could be a Tensor or LoDTensor. + x (Variable): Input variable which could be a Tensor or LoDTensor. 
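The reshape hunk above rejects more than one -1 and requires every 0 to refer to an existing input dimension. For orientation, a sketch of how such a target shape resolves under the documented rules (-1 is inferred, 0 copies the corresponding input dimension); the real resolution is done by the reshape2 op, so the helper name and code here are only illustrative:

.. code-block:: python

    def resolve_reshape(input_shape, target):
        """Resolve -1 (inferred) and 0 (copy input dim) entries of a reshape target."""
        assert target.count(-1) <= 1, "at most one dimension may be -1"
        out = []
        for i, d in enumerate(target):
            if d == 0:
                assert i < len(input_shape), "0 must refer to an existing input dim"
                out.append(input_shape[i])
            else:
                out.append(d)
        if -1 in out:
            known = 1
            for d in out:
                if d != -1:
                    known *= d
            total = 1
            for d in input_shape:
                total *= d
            out[out.index(-1)] = total // known
        return out

    print(resolve_reshape([2, 3, 4], [0, -1]))  # [2, 12]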
The data type should be int32, int64, float32 or float64. - y (Variable, optional): If provided, output's LoD would be derived from :attr:`y`. - If y's lod level>0, the data type can be any type. + y (Variable, optional): If provided, output's LoD would be derived from :attr:`y`. + If y's lod level>0, the data type can be any type. If y's lod level=0, the data type should be int32. target_lod (list|tuple, optional): One level LoD which should be considered as target LoD when :attr:`y` not provided. @@ -6863,24 +7331,24 @@ def lod_reset(x, y=None, target_lod=None): y = fluid.layers.data(name='y', shape=[10, 20], lod_level=2) out = fluid.layers.lod_reset(x=x, y=y) """ - check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], - 'lod_reset') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'lod_reset' + ) helper = LayerHelper("lod_reset", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) if y is not None: check_type(y, 'y', (Variable), 'lod_reset') - #TODO: check y.lod_level = 0 dtype - helper.append_op(type="lod_reset", - inputs={ - 'X': x, - 'Y': y - }, - outputs={'Out': out}) + # TODO: check y.lod_level = 0 dtype + helper.append_op( + type="lod_reset", inputs={'X': x, 'Y': y}, outputs={'Out': out} + ) elif target_lod is not None: - helper.append_op(type="lod_reset", - inputs={'X': x}, - attrs={'target_lod': target_lod}, - outputs={'Out': out}) + helper.append_op( + type="lod_reset", + inputs={'X': x}, + attrs={'target_lod': target_lod}, + outputs={'Out': out}, + ) else: raise ValueError("y and target_lod should not be both none.") return out @@ -6907,9 +7375,9 @@ def lod_append(x, level): x.dims = [6, 1] Args: - x (Variable): Input variable which could be a tensor or LoDTensor. + x (Variable): Input variable which could be a tensor or LoDTensor. The data type should be int32, int64, float32 or float64. - level (list|tuple|Variable, optional): The LoD level to be appended into LoD of x. + level (list|tuple|Variable, optional): The LoD level to be appended into LoD of x. If level is variable and its lod level>0, the data type can be any type. If level is variable and its lod level=0, the data type should be int32. 
Returns: @@ -6934,8 +7402,9 @@ def lod_append(x, level): if (not isinstance(level, Iterable)) and (not isinstance(level, Variable)): raise ValueError("Input(level) must be list, tuple or Variable.") - check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], - 'lod_append') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'lod_append' + ) helper = LayerHelper("lod_append", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -6945,27 +7414,22 @@ def lod_append(x, level): if isinstance(level, Variable): inputs['Y'] = level - #TODO: check y.lod_level = 0 dtype + # TODO: check y.lod_level = 0 dtype else: attrs['target_lod'] = level - helper.append_op(type="lod_reset", - inputs=inputs, - attrs=attrs, - outputs={'Out': out}) + helper.append_op( + type="lod_reset", inputs=inputs, attrs=attrs, outputs={'Out': out} + ) return out -def lrn(input, - n=5, - k=1.0, - alpha=1e-4, - beta=0.75, - name=None, - data_format='NCHW'): +def lrn( + input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None, data_format='NCHW' +): r""" :alias_main: paddle.nn.functional.lrn - :alias: paddle.nn.functional.lrn,paddle.nn.functional.norm.lrn - :old_api: paddle.fluid.layers.lrn + :alias: paddle.nn.functional.lrn,paddle.nn.functional.norm.lrn + :old_api: paddle.fluid.layers.lrn This operator implements the Local Response Normalization Layer. This layer performs a type of "lateral inhibition" by normalizing over local input regions. @@ -7023,38 +7487,44 @@ def lrn(input, if dims != 4: raise ValueError( - "Input's dimension size of Op(lrn) must be 4, but received %d." % - (dims)) + "Input's dimension size of Op(lrn) must be 4, but received %d." + % (dims) + ) if data_format not in ['NCHW', 'NHWC']: raise ValueError( - "Attr(data_format) of Op(lrn) got wrong value: received " + - data_format + " but only NCHW or NHWC supported.") + "Attr(data_format) of Op(lrn) got wrong value: received " + + data_format + + " but only NCHW or NHWC supported." + ) - mid_out = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) + mid_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) lrn_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="lrn", - inputs={"X": input}, - outputs={ - "Out": lrn_out, - "MidOut": mid_out, - }, - attrs={ - "n": n, - "k": k, - "alpha": alpha, - "beta": beta, - "data_format": data_format - }) + helper.append_op( + type="lrn", + inputs={"X": input}, + outputs={ + "Out": lrn_out, + "MidOut": mid_out, + }, + attrs={ + "n": n, + "k": k, + "alpha": alpha, + "beta": beta, + "data_format": data_format, + }, + ) return lrn_out -def pad(x, paddings, pad_value=0., name=None): +def pad(x, paddings, pad_value=0.0, name=None): r""" :alias_main: paddle.nn.functional.pad - :alias: paddle.nn.functional.pad,paddle.nn.functional.common.pad - :old_api: paddle.fluid.layers.pad + :alias: paddle.nn.functional.pad,paddle.nn.functional.common.pad + :old_api: paddle.fluid.layers.pad This op will pad a tensor with a constant value given by :attr:`pad_value`, and the padded shape is specified by :attr:`paddings`. @@ -7105,10 +7575,20 @@ def pad(x, paddings, pad_value=0., name=None): x = fluid.data(name='data', shape=[300, 300], dtype='float32') out = fluid.layers.pad(x=x, paddings=[0, 1, 1, 2], pad_value=0.) 
""" - check_variable_and_dtype(x, 'x', [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', - 'complex128' - ], "pad") + check_variable_and_dtype( + x, + 'x', + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + "pad", + ) check_type(pad_value, 'pad_value', (float, int, Variable), 'pad') if isinstance(pad_value, int): @@ -7117,17 +7597,16 @@ def pad(x, paddings, pad_value=0., name=None): helper = LayerHelper('pad', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='pad', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'paddings': paddings, - 'pad_value': pad_value - }) + helper.append_op( + type='pad', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'paddings': paddings, 'pad_value': pad_value}, + ) return out -def pad_constant_like(x, y, pad_value=0., name=None): +def pad_constant_like(x, y, pad_value=0.0, name=None): r""" Pad :attr:`y` with :attr:`pad_value`, the number of values padded to the edges of each axis is specified by the difference of the shape @@ -7207,31 +7686,29 @@ def pad_constant_like(x, y, pad_value=0., name=None): # out is a rank 4 tensor variable, and out.shape = [2, 3 ,2 , 3] """ check_type(x, 'x', (Variable), 'pad_constant_like') - check_variable_and_dtype(y, 'y', ['float32', 'float64', 'int32', 'int64'], - "pad_constant_like") + check_variable_and_dtype( + y, 'y', ['float32', 'float64', 'int32', 'int64'], "pad_constant_like" + ) helper = LayerHelper('pad_constant_like', **locals()) dtype = helper.input_dtype(input_param_name='y') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='pad_constant_like', - inputs={ - 'X': x, - 'Y': y - }, - outputs={'Out': out}, - attrs={'pad_value': float(pad_value)}) + helper.append_op( + type='pad_constant_like', + inputs={'X': x, 'Y': y}, + outputs={'Out': out}, + attrs={'pad_value': float(pad_value)}, + ) return out -def label_smooth(label, - prior_dist=None, - epsilon=0.1, - dtype="float32", - name=None): +def label_smooth( + label, prior_dist=None, epsilon=0.1, dtype="float32", name=None +): r""" :alias_main: paddle.nn.functional.label_smooth - :alias: paddle.nn.functional.label_smooth,paddle.nn.functional.common.label_smooth - :old_api: paddle.fluid.layers.label_smooth + :alias: paddle.nn.functional.label_smooth,paddle.nn.functional.common.label_smooth + :old_api: paddle.fluid.layers.label_smooth Label smoothing is a mechanism to regularize the classifier layer and is called label-smoothing regularization (LSR). @@ -7288,37 +7765,42 @@ def label_smooth(label, if in_dygraph_mode(): return _C_ops.label_smooth(label, prior_dist, float(epsilon)) - if epsilon > 1. 
or epsilon < 0.: + if epsilon > 1.0 or epsilon < 0.0: raise ValueError("The value of epsilon must be between 0 and 1.") if _non_static_mode(): - return _legacy_C_ops.label_smooth(label, prior_dist, 'epsilon', - float(epsilon)) + return _legacy_C_ops.label_smooth( + label, prior_dist, 'epsilon', float(epsilon) + ) - check_variable_and_dtype(label, 'label', ['float32', 'float64'], - 'label_smooth') + check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'label_smooth' + ) helper = LayerHelper("label_smooth", **locals()) label.stop_gradient = True smooth_label = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="label_smooth", - inputs={ - "X": label, - "PriorDist": prior_dist - } if prior_dist else {"X": label}, - outputs={"Out": smooth_label}, - attrs={"epsilon": float(epsilon)}) + helper.append_op( + type="label_smooth", + inputs={"X": label, "PriorDist": prior_dist} + if prior_dist + else {"X": label}, + outputs={"Out": smooth_label}, + attrs={"epsilon": float(epsilon)}, + ) return smooth_label @templatedoc() -def roi_pool(input, - rois, - pooled_height=1, - pooled_width=1, - spatial_scale=1.0, - rois_num=None, - name=None): +def roi_pool( + input, + rois, + pooled_height=1, + pooled_width=1, + spatial_scale=1.0, + rois_num=None, + name=None, +): """ This operator implements the roi_pooling layer. @@ -7384,10 +7866,20 @@ def roi_pool(input, print(np.array(out).shape) # (2, 1, 1, 1) """ if _non_static_mode(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." + assert ( + rois_num is not None + ), "rois_num should not be None in dygraph mode." pool_out, argmaxes = _legacy_C_ops.roi_pool( - input, rois, rois_num, "pooled_height", pooled_height, - "pooled_width", pooled_width, "spatial_scale", spatial_scale) + input, + rois, + rois_num, + "pooled_height", + pooled_height, + "pooled_width", + pooled_width, + "spatial_scale", + spatial_scale, + ) return pool_out, argmaxes check_variable_and_dtype(input, 'input', ['float32'], 'roi_pool') @@ -7403,29 +7895,30 @@ def roi_pool(input, } if rois_num is not None: inputs['RoisNum'] = rois_num - helper.append_op(type="roi_pool", - inputs=inputs, - outputs={ - "Out": pool_out, - "Argmax": argmaxes - }, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale - }) + helper.append_op( + type="roi_pool", + inputs=inputs, + outputs={"Out": pool_out, "Argmax": argmaxes}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale, + }, + ) return pool_out @templatedoc() -def roi_align(input, - rois, - pooled_height=1, - pooled_width=1, - spatial_scale=1.0, - sampling_ratio=-1, - rois_num=None, - name=None): +def roi_align( + input, + rois, + pooled_height=1, + pooled_width=1, + spatial_scale=1.0, + sampling_ratio=-1, + rois_num=None, + name=None, +): """ ${comment} @@ -7473,21 +7966,41 @@ def roi_align(input, rois_num=rois_num) """ if in_dygraph_mode(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." - return _C_ops.roi_align(input, rois, rois_num, pooled_height, - pooled_width, spatial_scale, sampling_ratio, - False) + assert ( + rois_num is not None + ), "rois_num should not be None in dygraph mode." + return _C_ops.roi_align( + input, + rois, + rois_num, + pooled_height, + pooled_width, + spatial_scale, + sampling_ratio, + False, + ) if _in_legacy_dygraph(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." 
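label_smooth above only validates that epsilon lies in [0, 1] before dispatching to the op. As a reminder of what label-smoothing regularization computes, here is a hedged numpy sketch; assuming a uniform prior when `prior_dist` is not given is this sketch's reading, while the real op also accepts an explicit prior distribution as shown in the diff:

.. code-block:: python

    import numpy as np

    def label_smooth_ref(onehot, epsilon=0.1, prior=None):
        """Reference LSR: (1 - eps) * label + eps * prior (uniform if prior is None)."""
        num_classes = onehot.shape[-1]
        if prior is None:
            prior = np.full((num_classes,), 1.0 / num_classes)
        return (1.0 - epsilon) * onehot + epsilon * prior

    onehot = np.array([[0.0, 1.0, 0.0]])
    print(label_smooth_ref(onehot, epsilon=0.1))  # [[0.0333..., 0.9333..., 0.0333...]]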
- align_out = _legacy_C_ops.roi_align(input, rois, rois_num, - "pooled_height", pooled_height, - "pooled_width", pooled_width, - "spatial_scale", spatial_scale, - "sampling_ratio", sampling_ratio) + assert ( + rois_num is not None + ), "rois_num should not be None in dygraph mode." + align_out = _legacy_C_ops.roi_align( + input, + rois, + rois_num, + "pooled_height", + pooled_height, + "pooled_width", + pooled_width, + "spatial_scale", + spatial_scale, + "sampling_ratio", + sampling_ratio, + ) return align_out - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'roi_align') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'roi_align' + ) check_variable_and_dtype(rois, 'rois', ['float32', 'float64'], 'roi_align') helper = LayerHelper('roi_align', **locals()) dtype = helper.input_dtype() @@ -7498,15 +8011,17 @@ def roi_align(input, } if rois_num is not None: inputs['RoisNum'] = rois_num - helper.append_op(type="roi_align", - inputs=inputs, - outputs={"Out": align_out}, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale, - "sampling_ratio": sampling_ratio - }) + helper.append_op( + type="roi_align", + inputs=inputs, + outputs={"Out": align_out}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale, + "sampling_ratio": sampling_ratio, + }, + ) return align_out @@ -7552,21 +8067,22 @@ def dice_loss(input, label, epsilon=0.00001, name=None): predictions = F.softmax(x) loss = F.dice_loss(input=predictions, label=label) """ - return paddle.nn.functional.dice_loss(input, - label, - epsilon=epsilon, - name=name) - - -def image_resize(input, - out_shape=None, - scale=None, - name=None, - resample='BILINEAR', - actual_shape=None, - align_corners=True, - align_mode=1, - data_format='NCHW'): + return paddle.nn.functional.dice_loss( + input, label, epsilon=epsilon, name=name + ) + + +def image_resize( + input, + out_shape=None, + scale=None, + name=None, + resample='BILINEAR', + actual_shape=None, + align_corners=True, + align_mode=1, + data_format='NCHW', +): """ This op resizes a batch of images. @@ -7581,19 +8097,19 @@ def image_resize(input, future and only use :attr:`out_shape` instead. Supporting resample methods: - 'LINEAR' : Linear interpolation + 'LINEAR' : Linear interpolation 'BILINEAR' : Bilinear interpolation 'TRILINEAR' : Trilinear interpolation 'NEAREST' : Nearest neighbor interpolation - + 'BICUBIC' : Bicubic interpolation - - Linear interpolation is the method of using a line connecting two known quantities + + Linear interpolation is the method of using a line connecting two known quantities to determine the value of an unknown quantity between the two known quantities. - + Nearest neighbor interpolation is to perform nearest neighbor interpolation in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. @@ -7608,7 +8124,7 @@ def image_resize(input, interpolating functions of three variables (e.g. D-direction, H-direction and W-direction in this op) on a rectilinear 3D grid. The linear interpolation is performed on three directions. - + Bicubic interpolation is an extension of cubic interpolation for interpolating data points on a two-dimensional regular grid. 
The interpolated surface is smoother than corresponding surfaces obtained by bilinear interpolation or @@ -7707,7 +8223,7 @@ def image_resize(input, output: (N,C,D_out,H_out,W_out) where: D_out = D_{in} * scale_{factor} - + Trilinear interpolation: if: align_corners = False , align_mode = 0 @@ -7722,20 +8238,20 @@ def image_resize(input, D_out = D_{in} * scale_{factor} H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - + For details of linear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Linear_interpolation. - + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. - + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation. - + For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation. - + For details of bicubic interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bicubic_interpolation @@ -7743,8 +8259,8 @@ def image_resize(input, input (Variable): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. out_shape (list|tuple|Variable|None): Output shape of image resize - layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) - when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. + layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) + when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list, each element can be an integer or a Tensor Variable of shape: [1]. If a Tensor Variable, its dimensions size should be a 1. scale(float|Variable|None): The multiplier for the input height or width. At @@ -7772,8 +8288,8 @@ def image_resize(input, input and output tensors are aligned, preserving the values at the corner pixels. Default: True - align_mode(int) : An optional for linear/bilinear/trilinear interpolation. Refer to the fomula in the - the example code above, it can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , + align_mode(int) : An optional for linear/bilinear/trilinear interpolation. Refer to the fomula in the + the example code above, it can be \'0\' for src_idx = scale*(dst_indx+0.5)-0.5 , can be \'1\' for src_idx = scale*dst_index. data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from:`NCW`, `NWC`, `"NCHW"`, `"NHWC"`, `"NCDHW"`, @@ -7806,65 +8322,65 @@ def image_resize(input, Examples: .. 
code-block:: python - #declarative mode - import paddle - import paddle.fluid as fluid - import numpy as np - paddle.enable_static() - input = fluid.data(name="input", shape=[None,3,6,10]) + #declarative mode + import paddle + import paddle.fluid as fluid + import numpy as np + paddle.enable_static() + input = fluid.data(name="input", shape=[None,3,6,10]) - #1 - output = fluid.layers.image_resize(input=input,out_shape=[12,12]) + #1 + output = fluid.layers.image_resize(input=input,out_shape=[12,12]) - #2 - #x = np.array([2]).astype("int32") - #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") - #fluid.layers.assign(input=x, output=dim1) - #output = fluid.layers.image_resize(input=input,out_shape=[12,dim1]) + #2 + #x = np.array([2]).astype("int32") + #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") + #fluid.layers.assign(input=x, output=dim1) + #output = fluid.layers.image_resize(input=input,out_shape=[12,dim1]) - #3 - #x = np.array([3,12]).astype("int32") - #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") - #fluid.layers.assign(input=x, output=shape_tensor) - #output = fluid.layers.image_resize(input=input,out_shape=shape_tensor) + #3 + #x = np.array([3,12]).astype("int32") + #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") + #fluid.layers.assign(input=x, output=shape_tensor) + #output = fluid.layers.image_resize(input=input,out_shape=shape_tensor) - #4 - #x = np.array([0.5]).astype("float32") - #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") - #fluid.layers.assign(x,scale_tensor) - #output = fluid.layers.image_resize(input=input,scale=scale_tensor) + #4 + #x = np.array([0.5]).astype("float32") + #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") + #fluid.layers.assign(x,scale_tensor) + #output = fluid.layers.image_resize(input=input,scale=scale_tensor) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - input_data = np.random.rand(2,3,6,10).astype("float32") + input_data = np.random.rand(2,3,6,10).astype("float32") - output_data = exe.run(fluid.default_main_program(), + output_data = exe.run(fluid.default_main_program(), feed={"input":input_data}, fetch_list=[output], return_numpy=True) - print(output_data[0].shape) + print(output_data[0].shape) - #1 - # (2, 3, 12, 12) - #2 - # (2, 3, 12, 2) - #3 - # (2, 3, 3, 12) - #4 - # (2, 3, 3, 5) + #1 + # (2, 3, 12, 12) + #2 + # (2, 3, 12, 2) + #3 + # (2, 3, 3, 12) + #4 + # (2, 3, 3, 5) - #imperative mode - import paddle.fluid.dygraph as dg + #imperative mode + import paddle.fluid.dygraph as dg - with dg.guard(place) as g: - input = dg.to_variable(input_data) - output = fluid.layers.image_resize(input=input, out_shape=[12,12]) - print(output.shape) + with dg.guard(place) as g: + input = dg.to_variable(input_data) + output = fluid.layers.image_resize(input=input, out_shape=[12,12]) + print(output.shape) - # [2L, 3L, 12L, 12L] + # [2L, 3L, 12L, 12L] """ resample_methods = { @@ -7878,7 +8394,8 @@ def image_resize(input, if resample not in resample_methods: raise ValueError( "The 'resample' of image_resize can only be 'LINEAR', 'BILINEAR', 'TRILINEAR' " - "or 'NEAREST' currently.") + "or 'NEAREST' currently." 
+ ) resample_type = resample_methods[resample] if resample == 'LINEAR' and len(input.shape) != 3: @@ -7900,19 +8417,25 @@ def image_resize(input, if len(input.shape) == 3 and data_format not in ['NCW', 'NWC']: raise ValueError( - "Got wrong value for param `data_format`: " + data_format + - " received but only `NCW` or `NWC` supported for 3-D input.") + "Got wrong value for param `data_format`: " + + data_format + + " received but only `NCW` or `NWC` supported for 3-D input." + ) elif len(input.shape) == 4 and data_format not in ['NCHW', 'NHWC']: raise ValueError( - "Got wrong value for param `data_format`: " + data_format + - " received but only `NCHW` or `NHWC` supported for 4-D input.") + "Got wrong value for param `data_format`: " + + data_format + + " received but only `NCHW` or `NHWC` supported for 4-D input." + ) elif len(input.shape) == 5 and data_format not in ['NCDHW', 'NDHWC']: raise ValueError( - "Got wrong value for param `data_format`: " + data_format + - " received but only `NCDHW` or `NDHWC` supported for 5-D input.") + "Got wrong value for param `data_format`: " + + data_format + + " received but only `NCDHW` or `NDHWC` supported for 5-D input." + ) def _is_list_or_turple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) + return isinstance(data, list) or isinstance(data, tuple) if data_format == 'NCHW' or data_format == 'NCDHW' or data_format == 'NCW': data_layout = 'NCHW' @@ -7927,7 +8450,7 @@ def _is_list_or_turple_(data): "interp_method": resample_type, "align_corners": align_corners, "align_mode": align_mode, - "data_layout": data_layout + "data_layout": data_layout, } if out_shape is not None: @@ -7945,16 +8468,17 @@ def _is_list_or_turple_(data): out_shape[i] = dim.numpy()[0] if not (_is_list_or_turple_(out_shape)): raise TypeError( - "out_shape should be a list or tuple or Variable.") + "out_shape should be a list or tuple or Variable." + ) # Validate the shape contain_var = False for dim_idx, dim_size in enumerate(out_shape): if isinstance(dim_size, Variable): contain_var = True continue - assert dim_size > 0, ( - "Each dimension size given in out_shape must be greater than 0." - ) + assert ( + dim_size > 0 + ), "Each dimension size given in out_shape must be greater than 0." if contain_var: new_size_tensor = [] @@ -7965,22 +8489,22 @@ def _is_list_or_turple_(data): new_size_tensor.append(dim) size_list.append(-1) else: - assert (isinstance(dim, int)) + assert isinstance(dim, int) temp_out = helper.create_variable_for_type_inference( - 'int32') - fill_constant([1], - 'int32', - dim, - force_cpu=True, - out=temp_out) + 'int32' + ) + fill_constant( + [1], 'int32', dim, force_cpu=True, out=temp_out + ) new_size_tensor.append(temp_out) size_list.append(dim) inputs['SizeTensor'] = new_size_tensor if len(input.shape) == 3: if len(out_shape) != 1: - raise ValueError("out_shape length should be 1 for " - "input 3-D tensor.") + raise ValueError( + "out_shape length should be 1 for " "input 3-D tensor." + ) if contain_var: attrs['out_w'] = size_list[0] else: @@ -7988,8 +8512,9 @@ def _is_list_or_turple_(data): attrs['out_w'] = out_shape[0] elif len(input.shape) == 4: if len(out_shape) != 2: - raise ValueError("out_shape length should be 2 for " - "input 4-D tensor.") + raise ValueError( + "out_shape length should be 2 for " "input 4-D tensor." 
+ ) if contain_var: attrs['out_h'] = size_list[0] attrs['out_w'] = size_list[1] @@ -7999,8 +8524,9 @@ def _is_list_or_turple_(data): attrs['out_w'] = out_shape[1] if len(input.shape) == 5: if len(out_shape) != 3: - raise ValueError("out_shape length should be 3 for " - "input 5-D tensor.") + raise ValueError( + "out_shape length should be 3 for " "input 5-D tensor." + ) if contain_var: attrs['out_d'] = size_list[0] attrs['out_h'] = size_list[1] @@ -8023,7 +8549,8 @@ def _is_list_or_turple_(data): attrs['scale'] = float(scale) else: raise TypeError( - "Attr(scale)'s type should be float, int or Variable.") + "Attr(scale)'s type should be float, int or Variable." + ) if isinstance(actual_shape, Variable): warnings.warn( @@ -8055,31 +8582,35 @@ def _is_list_or_turple_(data): return out out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='{}_interp'.format(resample_type), - inputs=inputs, - outputs={"Out": out}, - attrs=attrs) + helper.append_op( + type='{}_interp'.format(resample_type), + inputs=inputs, + outputs={"Out": out}, + attrs=attrs, + ) return out @templatedoc(op_type="linear_interp") -def resize_linear(input, - out_shape=None, - scale=None, - name=None, - actual_shape=None, - align_corners=True, - align_mode=1, - data_format='NCW'): +def resize_linear( + input, + out_shape=None, + scale=None, + name=None, + actual_shape=None, + align_corners=True, + align_mode=1, + data_format='NCW', +): """ This op resizes the input by performing linear interpolation based on given output shape which specified by actual_shape, out_shape and scale in priority order. - **Warning:** the parameter :attr:`actual_shape` will be deprecated in + **Warning:** the parameter :attr:`actual_shape` will be deprecated in the future and only use :attr:`out_shape` instead. - Align_corners and align_mode are optional parameters,the calculation + Align_corners and align_mode are optional parameters,the calculation method of interpolation can be selected by them. Example: @@ -8087,23 +8618,23 @@ def resize_linear(input, .. code-block:: text For scale: - + if align_corners = True && out_size > 1 : scale_factor = (in_size-1.0)/(out_size-1.0) - + else: - + scale_factor = float(in_size/out_size) Linear interpolation: if: align_corners = False , align_mode = 0 - + input : (N,C,W_in) output: (N,C,W_out) where: - + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: @@ -8116,12 +8647,12 @@ def resize_linear(input, input(Variable): 3-D Tensor(NCW), its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. out_shape(list|tuple|Variable|None): Output shape of resize linear - layer, the shape is (out_w,). Default: None. If a list, each - element can be an integer or a Tensor Variable with shape: [1]. If a + layer, the shape is (out_w,). Default: None. If a list, each + element can be an integer or a Tensor Variable with shape: [1]. If a Tensor Variable, its dimension size should be 1. scale(float|Variable|None): The multiplier for the input height or width. At - least one of :attr:`out_shape` or :attr:`scale` must be set. - And :attr:`out_shape` has a higher priority than :attr:`scale`. + least one of :attr:`out_shape` or :attr:`scale` must be set. + And :attr:`out_shape` has a higher priority than :attr:`scale`. Default: None. actual_shape(Variable): An optional input to specify output shape dynamically. If provided, image resize @@ -8129,75 +8660,86 @@ def resize_linear(input, :attr:`out_shape` and :attr:`scale` specifying shape. 
That is to say actual_shape has the highest priority. It is recommended to use - :attr:`out_shape` if you want to specify output - shape dynamically, because :attr:`actual_shape` - will be deprecated. When using actual_shape to - specify output shape, one of :attr:`out_shape` - and :attr:`scale` should also be set, otherwise + :attr:`out_shape` if you want to specify output + shape dynamically, because :attr:`actual_shape` + will be deprecated. When using actual_shape to + specify output shape, one of :attr:`out_shape` + and :attr:`scale` should also be set, otherwise errors would be occurred in graph constructing stage. Default: None align_corners(bool): ${align_corners_comment} align_mode(bool): ${align_mode_comment} - data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCW"`, `"NWC"`. The default is `"NCW"`. When it is `"NCW"`, the data is stored in the order of: `[batch_size, input_channels, input_width]`. - name(str, optional): The default value is None. Normally there is no need for user to set this property. + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: - Variable: 3-D tensor(NCW or NWC). - + Variable: 3-D tensor(NCW or NWC). + Examples: .. code-block:: python - - #declarative mode - import paddle.fluid as fluid - import numpy as np - input = fluid.data(name="input", shape=[None,3,100]) - output = fluid.layers.resize_linear(input=input,out_shape=[50,]) + #declarative mode + import paddle.fluid as fluid + import numpy as np + input = fluid.data(name="input", shape=[None,3,100]) + + output = fluid.layers.resize_linear(input=input,out_shape=[50,]) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - - input_data = np.random.rand(1,3,100).astype("float32") + input_data = np.random.rand(1,3,100).astype("float32") - output_data = exe.run(fluid.default_main_program(), + output_data = exe.run(fluid.default_main_program(), feed={"input":input_data}, fetch_list=[output], return_numpy=True) - - print(output_data[0].shape) - # (1, 3, 50) + print(output_data[0].shape) - #imperative mode - import paddle.fluid.dygraph as dg + # (1, 3, 50) - with dg.guard(place) as g: - input = dg.to_variable(input_data) - output = fluid.layers.resize_linear(input=input, out_shape=[50,]) - print(output.shape) + #imperative mode + import paddle.fluid.dygraph as dg - # [1L, 3L, 50L] + with dg.guard(place) as g: + input = dg.to_variable(input_data) + output = fluid.layers.resize_linear(input=input, out_shape=[50,]) + print(output.shape) + + # [1L, 3L, 50L] """ - return image_resize(input, out_shape, scale, name, 'LINEAR', actual_shape, - align_corners, align_mode, data_format) + return image_resize( + input, + out_shape, + scale, + name, + 'LINEAR', + actual_shape, + align_corners, + align_mode, + data_format, + ) @templatedoc(op_type="bilinear_interp") -def resize_bilinear(input, - out_shape=None, - scale=None, - name=None, - actual_shape=None, - align_corners=True, - align_mode=1, - data_format='NCHW'): +def resize_bilinear( + input, + out_shape=None, + scale=None, + name=None, + actual_shape=None, + align_corners=True, + 
align_mode=1, + data_format='NCHW', +): """ This op resizes the input by performing bilinear interpolation based on given @@ -8284,86 +8826,97 @@ def resize_bilinear(input, name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` Returns: - Variable: 4-D tensor(NCHW or NHWC). + Variable: 4-D tensor(NCHW or NHWC). Examples: .. code-block:: python - #declarative mode - import paddle.fluid as fluid - import numpy as np - import paddle - paddle.enable_static() - input = fluid.data(name="input", shape=[None,3,6,10]) + #declarative mode + import paddle.fluid as fluid + import numpy as np + import paddle + paddle.enable_static() + input = fluid.data(name="input", shape=[None,3,6,10]) - #1 - output = fluid.layers.resize_bilinear(input=input,out_shape=[12,12]) + #1 + output = fluid.layers.resize_bilinear(input=input,out_shape=[12,12]) - #2 - #x = np.array([2]).astype("int32") - #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") - #fluid.layers.assign(input=x, output=dim1) - #output = fluid.layers.resize_bilinear(input=input,out_shape=[12,dim1]) + #2 + #x = np.array([2]).astype("int32") + #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") + #fluid.layers.assign(input=x, output=dim1) + #output = fluid.layers.resize_bilinear(input=input,out_shape=[12,dim1]) - #3 - #x = np.array([3,12]).astype("int32") - #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") - #fluid.layers.assign(input=x, output=shape_tensor) - #output = fluid.layers.resize_bilinear(input=input,out_shape=shape_tensor) + #3 + #x = np.array([3,12]).astype("int32") + #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") + #fluid.layers.assign(input=x, output=shape_tensor) + #output = fluid.layers.resize_bilinear(input=input,out_shape=shape_tensor) - #4 - #x = np.array([0.5]).astype("float32") - #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") - #fluid.layers.assign(x,scale_tensor) - #output = fluid.layers.resize_bilinear(input=input,scale=scale_tensor) + #4 + #x = np.array([0.5]).astype("float32") + #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") + #fluid.layers.assign(x,scale_tensor) + #output = fluid.layers.resize_bilinear(input=input,scale=scale_tensor) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - input_data = np.random.rand(2,3,6,10).astype("float32") + input_data = np.random.rand(2,3,6,10).astype("float32") - output_data = exe.run(fluid.default_main_program(), + output_data = exe.run(fluid.default_main_program(), feed={"input":input_data}, fetch_list=[output], return_numpy=True) - print(output_data[0].shape) + print(output_data[0].shape) - #1 - # (2, 3, 12, 12) - #2 - # (2, 3, 12, 2) - #3 - # (2, 3, 3, 12) - #4 - # (2, 3, 3, 5) + #1 + # (2, 3, 12, 12) + #2 + # (2, 3, 12, 2) + #3 + # (2, 3, 3, 12) + #4 + # (2, 3, 3, 5) - #imperative mode - import paddle.fluid.dygraph as dg + #imperative mode + import paddle.fluid.dygraph as dg - with dg.guard(place) as g: - input = dg.to_variable(input_data) - output = fluid.layers.resize_bilinear(input=input, out_shape=[12,12]) - print(output.shape) + with dg.guard(place) as g: + input = dg.to_variable(input_data) + output = fluid.layers.resize_bilinear(input=input, out_shape=[12,12]) + print(output.shape) - # [2L, 3L, 12L, 12L] + # [2L, 3L, 12L, 
12L] """ - return image_resize(input, out_shape, scale, name, 'BILINEAR', actual_shape, - align_corners, align_mode, data_format) + return image_resize( + input, + out_shape, + scale, + name, + 'BILINEAR', + actual_shape, + align_corners, + align_mode, + data_format, + ) @templatedoc(op_type="trilinear_interp") -def resize_trilinear(input, - out_shape=None, - scale=None, - name=None, - actual_shape=None, - align_corners=True, - align_mode=1, - data_format='NCDHW'): +def resize_trilinear( + input, + out_shape=None, + scale=None, + name=None, + actual_shape=None, + align_corners=True, + align_mode=1, + data_format='NCDHW', +): """ This op resizes the input by performing trilinear interpolation based on given @@ -8455,82 +9008,93 @@ def resize_trilinear(input, Examples: .. code-block:: python - #declarative mode - import paddle.fluid as fluid - import paddle - import numpy as np - paddle.enable_static() - input = fluid.data(name="input", shape=[None,3,6,8,10]) + #declarative mode + import paddle.fluid as fluid + import paddle + import numpy as np + paddle.enable_static() + input = fluid.data(name="input", shape=[None,3,6,8,10]) - #1 - output = fluid.layers.resize_trilinear(input=input,out_shape=[12,12,12]) + #1 + output = fluid.layers.resize_trilinear(input=input,out_shape=[12,12,12]) - #2 - #x = np.array([2]).astype("int32") - #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") - #fluid.layers.assign(input=x, output=dim1) - #output = fluid.layers.resize_trilinear(input=input,out_shape=[12,dim1,4]) + #2 + #x = np.array([2]).astype("int32") + #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") + #fluid.layers.assign(input=x, output=dim1) + #output = fluid.layers.resize_trilinear(input=input,out_shape=[12,dim1,4]) - #3 - #x = np.array([3,12,12]).astype("int32") - #shape_tensor = fluid.data(name="shape_tensor", shape=[3], dtype="int32") - #fluid.layers.assign(input=x, output=shape_tensor) - #output = fluid.layers.resize_trilinear(input=input,out_shape=shape_tensor) + #3 + #x = np.array([3,12,12]).astype("int32") + #shape_tensor = fluid.data(name="shape_tensor", shape=[3], dtype="int32") + #fluid.layers.assign(input=x, output=shape_tensor) + #output = fluid.layers.resize_trilinear(input=input,out_shape=shape_tensor) - #4 - #x = np.array([0.5]).astype("float32") - #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") - #fluid.layers.assign(x,scale_tensor) - #output = fluid.layers.resize_trilinear(input=input,scale=scale_tensor) + #4 + #x = np.array([0.5]).astype("float32") + #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") + #fluid.layers.assign(x,scale_tensor) + #output = fluid.layers.resize_trilinear(input=input,scale=scale_tensor) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - input_data = np.random.rand(2,3,6,8,10).astype("float32") + input_data = np.random.rand(2,3,6,8,10).astype("float32") - output_data = exe.run(fluid.default_main_program(), + output_data = exe.run(fluid.default_main_program(), feed={"input":input_data}, fetch_list=[output], return_numpy=True) - print(output_data[0].shape) + print(output_data[0].shape) - #1 - # (2, 3, 12, 12, 12) - #2 - # (2, 3, 12, 2, 4) - #3 - # (2, 3, 3, 12, 12) - #4 - # (2, 3, 3, 4, 5) + #1 + # (2, 3, 12, 12, 12) + #2 + # (2, 3, 12, 2, 4) + #3 + # (2, 3, 3, 12, 12) + #4 + # (2, 3, 3, 4, 5) - #imperative mode - import paddle.fluid.dygraph as dg + 
#imperative mode + import paddle.fluid.dygraph as dg - with dg.guard(place) as g: - input = dg.to_variable(input_data) - output = fluid.layers.resize_trilinear(input=input, out_shape=[12,12,12]) - print(output.shape) + with dg.guard(place) as g: + input = dg.to_variable(input_data) + output = fluid.layers.resize_trilinear(input=input, out_shape=[12,12,12]) + print(output.shape) - # [2L, 3L, 12L, 12L, 12L] + # [2L, 3L, 12L, 12L, 12L] """ - return image_resize(input, out_shape, scale, name, 'TRILINEAR', - actual_shape, align_corners, align_mode, data_format) + return image_resize( + input, + out_shape, + scale, + name, + 'TRILINEAR', + actual_shape, + align_corners, + align_mode, + data_format, + ) @templatedoc(op_type="nearest_interp") -def resize_nearest(input, - out_shape=None, - scale=None, - name=None, - actual_shape=None, - align_corners=True, - data_format='NCHW'): +def resize_nearest( + input, + out_shape=None, + scale=None, + name=None, + actual_shape=None, + align_corners=True, + data_format='NCHW', +): """ This op resizes the input by performing nearest neighbor interpolation in both the @@ -8586,7 +9150,7 @@ def resize_nearest(input, And :attr:`out_shape` has a higher priority than :attr:`scale`. Default: None. name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` - actual_shape(Variable): An optional input to specify output shape + actual_shape(Variable): An optional input to specify output shape dynamically. If provided, image resize according to this given shape rather than :attr:`out_shape` and :attr:`scale` specifying @@ -8606,85 +9170,87 @@ def resize_nearest(input, `[batch_size, input_channels, input_height, input_width]`. Returns: - Variable: 4-D tensor(NCHW or NHWC). + Variable: 4-D tensor(NCHW or NHWC). Examples: .. 
code-block:: python - #declarative mode - import paddle.fluid as fluid - import numpy as np - import paddle - paddle.enable_static() + #declarative mode + import paddle.fluid as fluid + import numpy as np + import paddle + paddle.enable_static() - input = fluid.data(name="input", shape=[None,3,6,10]) + input = fluid.data(name="input", shape=[None,3,6,10]) - #1 - output = fluid.layers.resize_nearest(input=input,out_shape=[12,12]) + #1 + output = fluid.layers.resize_nearest(input=input,out_shape=[12,12]) - #2 - #x = np.array([2]).astype("int32") - #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") - #fluid.layers.assign(input=x, output=dim1) - #output = fluid.layers.resize_nearest(input=input,out_shape=[12,dim1]) + #2 + #x = np.array([2]).astype("int32") + #dim1 = fluid.data(name="dim1", shape=[1], dtype="int32") + #fluid.layers.assign(input=x, output=dim1) + #output = fluid.layers.resize_nearest(input=input,out_shape=[12,dim1]) - #3 - #x = np.array([3,12]).astype("int32") - #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") - #fluid.layers.assign(input=x, output=shape_tensor) - #output = fluid.layers.resize_nearest(input=input,out_shape=shape_tensor) + #3 + #x = np.array([3,12]).astype("int32") + #shape_tensor = fluid.data(name="shape_tensor", shape=[2], dtype="int32") + #fluid.layers.assign(input=x, output=shape_tensor) + #output = fluid.layers.resize_nearest(input=input,out_shape=shape_tensor) - #4 - #x = np.array([0.5]).astype("float32") - #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") - #fluid.layers.assign(x,scale_tensor) - #output = fluid.layers.resize_nearest(input=input,scale=scale_tensor) + #4 + #x = np.array([0.5]).astype("float32") + #scale_tensor = fluid.data(name="scale", shape=[1], dtype="float32") + #fluid.layers.assign(x,scale_tensor) + #output = fluid.layers.resize_nearest(input=input,scale=scale_tensor) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - input_data = np.random.rand(2,3,6,10).astype("float32") + input_data = np.random.rand(2,3,6,10).astype("float32") - output_data = exe.run(fluid.default_main_program(), + output_data = exe.run(fluid.default_main_program(), feed={"input":input_data}, fetch_list=[output], return_numpy=True) - print(output_data[0].shape) + print(output_data[0].shape) - #1 - # (2, 3, 12, 12) - #2 - # (2, 3, 12, 2) - #3 - # (2, 3, 3, 12) - #4 - # (2, 3, 3, 5) + #1 + # (2, 3, 12, 12) + #2 + # (2, 3, 12, 2) + #3 + # (2, 3, 3, 12) + #4 + # (2, 3, 3, 5) - #imperative mode - import paddle.fluid.dygraph as dg + #imperative mode + import paddle.fluid.dygraph as dg - with dg.guard(place) as g: - input = dg.to_variable(input_data) - output = fluid.layers.resize_nearest(input=input, out_shape=[12,12]) - print(output.shape) + with dg.guard(place) as g: + input = dg.to_variable(input_data) + output = fluid.layers.resize_nearest(input=input, out_shape=[12,12]) + print(output.shape) - # [2L, 3L, 12L, 12L] + # [2L, 3L, 12L, 12L] """ - return image_resize(input, - out_shape, - scale, - name, - 'NEAREST', - actual_shape, - align_corners, - align_mode=1, - data_format=data_format) + return image_resize( + input, + out_shape, + scale, + name, + 'NEAREST', + actual_shape, + align_corners, + align_mode=1, + data_format=data_format, + ) def image_resize_short(input, out_short_len, resample='BILINEAR'): @@ -8712,15 +9278,18 @@ def image_resize_short(input, 
out_short_len, resample='BILINEAR'): in_shape = input.shape if len(in_shape) != 4: raise ValueError( - "The rank of input must be 4 (num_batches, channels, in_h, in_w).") + "The rank of input must be 4 (num_batches, channels, in_h, in_w)." + ) hw = in_shape[2:4] short_idx = hw.index(min(hw)) long_idx = 1 - short_idx out_shape = list(hw) out_shape[short_idx] = out_short_len out_shape[long_idx] = int( - float(out_shape[long_idx]) * - (float(out_short_len) / float(hw[short_idx])) + 0.5) + float(out_shape[long_idx]) + * (float(out_short_len) / float(hw[short_idx])) + + 0.5 + ) return image_resize(input=input, out_shape=out_shape, resample=resample) @@ -8759,12 +9328,12 @@ def gather(input, index, overwrite=True): index (Tensor): The index input tensor with rank=1. Data type is int32 or int64. overwrite (bool, optional): The mode that updating the grad when has same index. If True, use the overwrite mode to update the grad of the same index, - if False, use the accumulate mode to update the grad of the same index. - Default value is True. + if False, use the accumulate mode to update the grad of the same index. + Default value is True. Returns: output (Tensor): The output is a tensor with the same rank as input. - + Examples: .. code-block:: python @@ -8781,19 +9350,21 @@ def gather(input, index, overwrite=True): return _legacy_C_ops.gather(input, index, None, 'overwrite', overwrite) check_variable_and_dtype( - input, 'x', - ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], 'gather') + input, + 'x', + ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], + 'gather', + ) check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather') helper = LayerHelper('gather', **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="gather", - inputs={ - "X": input, - "Index": index - }, - outputs={"Out": out}, - attrs={'overwrite': overwrite}) + helper.append_op( + type="gather", + inputs={"X": input, "Index": index}, + outputs={"Out": out}, + attrs={'overwrite': overwrite}, + ) return out @@ -8878,18 +9449,20 @@ def gather_nd(input, index, name=None): if _in_legacy_dygraph(): return _legacy_C_ops.gather_nd(input, index) check_variable_and_dtype( - input, 'input', - ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'], 'gather_np') + input, + 'input', + ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'], + 'gather_np', + ) check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather_np') helper = LayerHelper('gather_nd', **locals()) dtype = helper.input_dtype() output = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="gather_nd", - inputs={ - "X": input, - "Index": index - }, - outputs={"Out": output}) + helper.append_op( + type="gather_nd", + inputs={"X": input, "Index": index}, + outputs={"Out": output}, + ) return output @@ -8897,8 +9470,8 @@ def gather_nd(input, index, name=None): def scatter(input, index, updates, name=None, overwrite=True): """ :alias_main: paddle.scatter - :alias: paddle.scatter,paddle.tensor.scatter,paddle.tensor.manipulation.scatter - :old_api: paddle.fluid.layers.scatter + :alias: paddle.scatter,paddle.tensor.scatter,paddle.tensor.manipulation.scatter + :old_api: paddle.fluid.layers.scatter **Scatter Layer** @@ -8937,8 +9510,8 @@ def scatter(input, index, updates, name=None, overwrite=True): name(str, optional): The default value is None. Normally there is no need for user to set this property. 
For more information, please refer to :ref:`api_guide_Name` . overwrite (bool): The mode that updating the output when there are same indices. If True, use the overwrite mode to update the output of the same index, - if False, use the accumulate mode to update the output of the same index. - Default value is True. + if False, use the accumulate mode to update the output of the same index. + Default value is True. Returns: Variable(Tensor|LoDTensor): The output is a Tensor with the same shape as input. @@ -8974,14 +9547,12 @@ def scatter(input, index, updates, name=None, overwrite=True): helper = LayerHelper('scatter', **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="scatter", - inputs={ - "X": input, - "Ids": index, - "Updates": updates - }, - attrs={'overwrite': overwrite}, - outputs={"Out": out}) + helper.append_op( + type="scatter", + inputs={"X": input, "Ids": index, "Updates": updates}, + attrs={'overwrite': overwrite}, + outputs={"Out": out}, + ) return out @@ -9066,13 +9637,11 @@ def scatter_nd_add(ref, index, updates, name=None): helper = LayerHelper('scatter_nd_add', **locals()) dtype = helper.input_dtype(input_param_name='ref') output = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="scatter_nd_add", - inputs={ - "X": ref, - "Index": index, - "Updates": updates - }, - outputs={"Out": output}) + helper.append_op( + type="scatter_nd_add", + inputs={"X": ref, "Index": index, "Updates": updates}, + outputs={"Out": output}, + ) return output @@ -9149,9 +9718,9 @@ def random_crop(x, shape, seed=None): """ helper = LayerHelper("random_crop", **locals()) - check_variable_and_dtype(x, 'x', - ['float32', 'float64', 'uint8', 'int16', 'int32'], - 'random_crop') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'uint8', 'int16', 'int32'], 'random_crop' + ) check_type(shape, 'shape', (list, Variable), 'random_crop') dtype = x.dtype out = helper.create_variable_for_type_inference(dtype) @@ -9163,19 +9732,16 @@ def random_crop(x, shape, seed=None): seed = helper.create_variable( name=unique_name.generate("random_crop_seed"), dtype="int64", - persistable=True) + persistable=True, + ) elif not isinstance(seed, Variable): raise ValueError("'seed' must be a Variable or an int.") - helper.append_op(type="random_crop", - inputs={ - "X": x, - "Seed": seed - }, - outputs={ - "Out": out, - "SeedOut": seed - }, - attrs=op_attrs) + helper.append_op( + type="random_crop", + inputs={"X": x, "Seed": seed}, + outputs={"Out": out, "SeedOut": seed}, + attrs=op_attrs, + ) return out @@ -9246,8 +9812,7 @@ def relu(x, name=None): out1 = fluid.layers.relu(x1) print(out1.numpy()) # [[0. 0. ] - # [1. 2.6]] -""" + # [1. 
2.6]]""" if in_dygraph_mode(): return _C_ops.relu(x) @@ -9260,9 +9825,9 @@ def relu(x, name=None): helper = LayerHelper('relu', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="relu", - inputs={"X": helper.input('x')}, - outputs={"Out": out}) + helper.append_op( + type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out} + ) return out @@ -9332,10 +9897,9 @@ def selu(x, scale=None, alpha=None, name=None): if alpha is not None: attrs["alpha"] = alpha - helper.append_op(type="selu", - inputs={"X": x}, - outputs={"Out": out}, - attrs=attrs) + helper.append_op( + type="selu", inputs={"X": x}, outputs={"Out": out}, attrs=attrs + ) return out @@ -9386,23 +9950,23 @@ def mean_iou(input, label, num_classes): return _legacy_C_ops.mean_iou(input, label, 'num_classes', num_classes) helper = LayerHelper('mean_iou', **locals()) - check_variable_and_dtype(input, 'Predictions', ['int32', 'int64'], - 'mean_iou') + check_variable_and_dtype( + input, 'Predictions', ['int32', 'int64'], 'mean_iou' + ) check_variable_and_dtype(label, 'Labels', ['int32', 'int64'], 'mean_iou') out_mean_iou = helper.create_variable_for_type_inference(dtype='float32') out_wrong = helper.create_variable_for_type_inference(dtype='int32') out_correct = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op(type="mean_iou", - inputs={ - "Predictions": input, - "Labels": label - }, - outputs={ - "OutMeanIou": out_mean_iou, - "OutWrong": out_wrong, - "OutCorrect": out_correct - }, - attrs={"num_classes": num_classes}) + helper.append_op( + type="mean_iou", + inputs={"Predictions": input, "Labels": label}, + outputs={ + "OutMeanIou": out_mean_iou, + "OutWrong": out_wrong, + "OutCorrect": out_correct, + }, + attrs={"num_classes": num_classes}, + ) return out_mean_iou, out_wrong, out_correct @@ -9499,10 +10063,12 @@ def crop(x, shape=None, offsets=None, name=None): else: attrs['offsets'] = offsets - helper.append_op(type='crop', - inputs=ipts, - outputs={'Out': out}, - attrs=None if len(attrs) == 0 else attrs) + helper.append_op( + type='crop', + inputs=ipts, + outputs={'Out': out}, + attrs=None if len(attrs) == 0 else attrs, + ) return out @@ -9595,11 +10161,13 @@ def crop_tensor(x, shape=None, offsets=None, name=None): """ helper = LayerHelper('crop_tensor', **locals()) - check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], - 'crop_tensor') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'crop_tensor' + ) check_type(shape, 'shape', (list, tuple, Variable), 'crop_tensor') - check_type(offsets, 'offsets', (list, tuple, Variable, type(None)), - 'crop_tensor') + check_type( + offsets, 'offsets', (list, tuple, Variable, type(None)), 'crop_tensor' + ) if offsets is None: offsets = [0] * len(x.shape) @@ -9612,25 +10180,30 @@ def _attr_shape_check(shape_val): if not isinstance(shape_val, int): raise TypeError( "Attr(shape)'s dtype of Op(crop_tensor) should be int32, but received: %s." - % type(shape_val)) + % type(shape_val) + ) if shape_val == 0: raise ValueError( "Attr(shape) of Op(crop_tensor) should not be zero, but received: %s." - % str(shape_val)) + % str(shape_val) + ) if shape_val < -1: raise ValueError( "When the element in Attr(shape) of Op(crop_tensor) is negative, only -1 is supported, but received: %s." 
- % str(shape_val)) + % str(shape_val) + ) def _attr_offsets_check(offset_val): if not isinstance(offset_val, int): raise TypeError( "Attr(offsets)'s dtype of Op(crop_tensor) should be int32, but received: %s." - % type(offset_val)) + % type(offset_val) + ) if offset_val < 0: raise ValueError( "Attr(offsets) of Op(crop_tensor) should be greater or equal to zero, but received: %s." - % str(offset_val)) + % str(offset_val) + ) if isinstance(offsets, Variable): offsets.stop_gradient = True @@ -9671,11 +10244,9 @@ def _attr_offsets_check(offset_val): else: _attr_shape_check(dim_size) temp_out = helper.create_variable_for_type_inference('int32') - fill_constant([1], - 'int32', - dim_size, - force_cpu=True, - out=temp_out) + fill_constant( + [1], 'int32', dim_size, force_cpu=True, out=temp_out + ) new_shape_tensor.append(temp_out) shape_attr.append(dim_size) ipts['ShapeTensor'] = new_shape_tensor @@ -9685,18 +10256,20 @@ def _attr_offsets_check(offset_val): _attr_shape_check(dim_size) attrs['shape'] = shape - helper.append_op(type='crop_tensor', - inputs=ipts, - outputs={'Out': out}, - attrs=None if len(attrs) == 0 else attrs) + helper.append_op( + type='crop_tensor', + inputs=ipts, + outputs={'Out': out}, + attrs=None if len(attrs) == 0 else attrs, + ) return out def affine_grid(theta, out_shape, name=None): """ :alias_main: paddle.nn.functional.affine_grid - :alias: paddle.nn.functional.affine_grid,paddle.nn.functional.vision.affine_grid - :old_api: paddle.fluid.layers.affine_grid + :alias: paddle.nn.functional.affine_grid,paddle.nn.functional.vision.affine_grid + :old_api: paddle.fluid.layers.affine_grid It generates a grid of (x,y) coordinates using the parameters of the affine transformation that correspond to a set of points where @@ -9739,11 +10312,15 @@ def affine_grid(theta, out_shape, name=None): """ helper = LayerHelper('affine_grid') - check_variable_and_dtype(theta, 'theta', ['float32', 'float64'], - 'affine_grid') - - if not (isinstance(out_shape, list) or isinstance(out_shape, tuple) or \ - isinstance(out_shape, Variable)): + check_variable_and_dtype( + theta, 'theta', ['float32', 'float64'], 'affine_grid' + ) + + if not ( + isinstance(out_shape, list) + or isinstance(out_shape, tuple) + or isinstance(out_shape, Variable) + ): raise ValueError("The out_shape should be a list, tuple or Variable.") if not isinstance(theta, Variable): @@ -9754,27 +10331,32 @@ def affine_grid(theta, out_shape, name=None): attrs = {} if isinstance(out_shape, Variable): ipts['OutputShape'] = out_shape - check_variable_and_dtype(out_shape, 'out_shape', ['int32'], - 'affine_grid') + check_variable_and_dtype( + out_shape, 'out_shape', ['int32'], 'affine_grid' + ) else: attrs['output_shape'] = out_shape if core.is_compiled_with_rocm(): # ROCM platform do not have MIOPEN kernel for affine_grid attrs['use_cudnn'] = False - helper.append_op(type='affine_grid', - inputs=ipts, - outputs={'Output': out}, - attrs=None if len(attrs) == 0 else attrs) + helper.append_op( + type='affine_grid', + inputs=ipts, + outputs={'Output': out}, + attrs=None if len(attrs) == 0 else attrs, + ) return out -def pad2d(input, - paddings=[0, 0, 0, 0], - mode='constant', - pad_value=0.0, - data_format="NCHW", - name=None): +def pad2d( + input, + paddings=[0, 0, 0, 0], + mode='constant', + pad_value=0.0, + data_format="NCHW", + name=None, +): """ Pad 2-d images according to 'paddings' and 'mode'. @@ -9788,10 +10370,10 @@ def pad2d(input, Otherwise, it is a 1-D Tensor with shape [4]. Data type is int32. Default is [0, 0, 0, 0]. 
mode (str): Three modes: 'constant' (default), 'reflect', 'edge' . - When in 'constant' mode, this op uses a constant value to pad the input tensor. - When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor. - When in 'edge' mode, uses input boundaries to pad the input tensor. - Default is 'constant' + When in 'constant' mode, this op uses a constant value to pad the input tensor. + When in 'reflect' mode, uses reflection of the input boundaries to pad the input tensor. + When in 'edge' mode, uses input boundaries to pad the input tensor. + Default is 'constant' pad_value (float32): The value to fill the padded areas in 'constant' mode . Default is 0.0 data_format (str): An string from: "NHWC", "NCHW". Specify the data format of the input data. @@ -9799,7 +10381,7 @@ def pad2d(input, name (str, optional) : The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - Returns: + Returns: Tensor, a 4-D Tensor padded according to paddings and mode and data type is same as input. Examples: @@ -9862,15 +10444,29 @@ def pad2d(input, # [2. 1. 2. 3. 2.]]]] """ if _non_static_mode(): - _paddings = paddings.numpy().tolist() if isinstance( - paddings, Variable) else paddings - return _legacy_C_ops.pad2d(input, 'mode', mode, 'pad_value', pad_value, - 'data_format', data_format, 'paddings', - _paddings) + _paddings = ( + paddings.numpy().tolist() + if isinstance(paddings, Variable) + else paddings + ) + return _legacy_C_ops.pad2d( + input, + 'mode', + mode, + 'pad_value', + pad_value, + 'data_format', + data_format, + 'paddings', + _paddings, + ) check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - "pad2d") + input, + 'input', + ['float16', 'float32', 'float64', 'int32', 'int64'], + "pad2d", + ) attrs = {'mode': mode, 'pad_value': pad_value, 'data_format': data_format} inputs = {'X': [input]} @@ -9882,16 +10478,18 @@ def pad2d(input, helper = LayerHelper('pad2d', **locals()) - assert mode in ['reflect', 'edge', 'constant' - ], "mode should be one of constant, reflect, edge." + assert mode in [ + 'reflect', + 'edge', + 'constant', + ], "mode should be one of constant, reflect, edge." 
dtype = helper.input_dtype(input_param_name='input') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='pad2d', - inputs=inputs, - outputs={"Out": out}, - attrs=attrs) + helper.append_op( + type='pad2d', inputs=inputs, outputs={"Out": out}, attrs=attrs + ) return out @@ -9900,8 +10498,8 @@ def pad2d(input, def elu(x, alpha=1.0, name=None): """ :alias_main: paddle.nn.functional.elu - :alias: paddle.nn.functional.elu,paddle.nn.functional.activation.elu - :old_api: paddle.fluid.layers.elu + :alias: paddle.nn.functional.elu,paddle.nn.functional.activation.elu + :old_api: paddle.fluid.layers.elu ${comment} Args: @@ -9930,10 +10528,12 @@ def elu(x, alpha=1.0, name=None): helper = LayerHelper('elu', **locals()) check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='elu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'alpha': alpha}) + helper.append_op( + type='elu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'alpha': alpha}, + ) return out @@ -9971,13 +10571,15 @@ def relu6(x, threshold=6.0, name=None): helper = LayerHelper('relu6', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='relu6', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'threshold': threshold, - 'use_mkldnn': _global_flags()["FLAGS_use_mkldnn"] - }) + helper.append_op( + type='relu6', + inputs={'X': x}, + outputs={'Out': out}, + attrs={ + 'threshold': threshold, + 'use_mkldnn': _global_flags()["FLAGS_use_mkldnn"], + }, + ) return out @@ -10014,7 +10616,8 @@ def pow(x, factor=1.0, name=None): # y_2 is x^{3.0} """ check_variable_and_dtype( - x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], 'pow') + x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], 'pow' + ) helper = LayerHelper('pow', **locals()) inputs = {'X': x} @@ -10027,10 +10630,9 @@ def pow(x, factor=1.0, name=None): attrs['factor'] = factor out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='pow', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='pow', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return out @@ -10070,13 +10672,12 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): helper = LayerHelper('stanh', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='stanh', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'scale_a': scale_a, - 'scale_b': scale_b - }) + helper.append_op( + type='stanh', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'scale_a': scale_a, 'scale_b': scale_b}, + ) return out @@ -10109,18 +10710,18 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): if _non_static_mode(): return _legacy_C_ops.hard_sigmoid(x, 'slope', slope, 'offset', offset) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'hard_sigmoid') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'hard_sigmoid' + ) helper = LayerHelper('hard_sigmoid', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='hard_sigmoid', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'slope': slope, - 'offset': offset - }) + helper.append_op( + type='hard_sigmoid', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'slope': slope, 'offset': offset}, + ) return out @@ -10128,8 +10729,8 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, 
name=None): def swish(x, beta=1.0, name=None): r""" :alias_main: paddle.nn.functional.swish - :alias: paddle.nn.functional.swish,paddle.nn.functional.activation.swish - :old_api: paddle.fluid.layers.swish + :alias: paddle.nn.functional.swish,paddle.nn.functional.activation.swish + :old_api: paddle.fluid.layers.swish Elementwise swish activation function. See `Searching for Activation Functions `_ for more details. @@ -10201,16 +10802,19 @@ def swish(x, beta=1.0, name=None): helper = LayerHelper('swish', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='swish', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'slope': beta}) + helper.append_op( + type='swish', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'slope': beta}, + ) return out @deprecated(since="2.0.0", update_to="paddle.static.nn.prelu") def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): r""" + prelu activation. .. math:: @@ -10225,26 +10829,19 @@ def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): element: All elements do not share alpha. Each element has its own alpha. Parameters: - x (Tensor): The input Tensor or LoDTensor with data type float32. - mode (str): The mode for weight sharing. - - param_attr (ParamAttr|None, optional): The parameter attribute for the learnable \ - weight (alpha), it can be create by ParamAttr. None by default. \ - For detailed information, please refer to :ref:`api_fluid_ParamAttr`. - - name (str, optional): Name for the operation (optional, default is None). \ - For more information, please refer to :ref:`api_guide_Name`. - + param_attr (ParamAttr|None, optional): The parameter attribute for the learnable + weight (alpha), it can be create by ParamAttr. None by default. data_format(str, optional): Data format that specifies the layout of input. It may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default: "NCHW". + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: A tensor with the same shape and data type as x. + Tensor, A tensor with the same shape and data type as x. Examples: - .. code-block:: python import paddle @@ -10265,52 +10862,57 @@ def prelu(x, mode, param_attr=None, data_format="NCHW", name=None): if mode == 'channel': true_data_format = [ - 'NC', 'NCL', 'NCHW', 'NCDHW', 'NLC', 'NHWC', 'NDHWC' + 'NC', + 'NCL', + 'NCHW', + 'NCDHW', + 'NLC', + 'NHWC', + 'NDHWC', ] if data_format not in true_data_format: raise ValueError( "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', " - "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format)) + "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format) + ) data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC' - assert len( - x.shape - ) >= 2, "The size of input shape should be equal or larger than 2 in prelu() when mode is 'channel'" - #NOTE(zhiqiu): The alpha_shape should be [1, channel] + [1] * len(x.shape[2:]). + assert ( + len(x.shape) >= 2 + ), "The size of input shape should be equal or larger than 2 in prelu() when mode is 'channel'" + # NOTE(zhiqiu): The alpha_shape should be [1, channel] + [1] * len(x.shape[2:]). # To be consistent with Prelu, it is simplified. - #NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version. - #NOTE(GuoxiaWang): support NHWC data format + # NOTE(zhiqiu): Revert shape to [1, channel, 1, 1] for compatibility with saved model of old version. 
+ # NOTE(GuoxiaWang): support NHWC data format if data_format == 'NHWC': alpha_shape = [1, 1, 1, x.shape[-1]] else: alpha_shape = [1, x.shape[1], 1, 1] elif mode == 'element': - assert len( - x.shape - ) >= 1, "The size of input shape should be equal or larger than 1 in prelu() when mode is 'element'" + assert ( + len(x.shape) >= 1 + ), "The size of input shape should be equal or larger than 1 in prelu() when mode is 'element'" alpha_shape = [1] + list(x.shape)[1:] dtype = helper.input_dtype(input_param_name='x') - alpha = helper.create_parameter(attr=helper.param_attr, - shape=alpha_shape, - dtype=dtype, - is_bias=False, - default_initializer=Constant(0.25)) + alpha = helper.create_parameter( + attr=helper.param_attr, + shape=alpha_shape, + dtype=dtype, + is_bias=False, + default_initializer=Constant(0.25), + ) if in_dygraph_mode(): return _C_ops.prelu(x, alpha, data_format, mode) out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="prelu", - inputs={ - "X": x, - 'Alpha': alpha - }, - attrs={ - "mode": mode, - "data_format": data_format - }, - outputs={"Out": out}) + helper.append_op( + type="prelu", + inputs={"X": x, 'Alpha': alpha}, + attrs={"mode": mode, "data_format": data_format}, + outputs={"Out": out}, + ) return out @@ -10351,13 +10953,12 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None): helper = LayerHelper('brelu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='brelu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 't_min': t_min, - 't_max': t_max - }) + helper.append_op( + type='brelu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'t_min': t_min, 't_max': t_max}, + ) return out @@ -10424,15 +11025,18 @@ def soft_relu(x, threshold=40.0, name=None): res = exe.run(fluid.default_main_program(), feed={'x':img}, fetch_list=[output]) print(res) # [array([[0.6931472, 1.3132616], [2.126928 , 3.0485873]], dtype=float32)] """ - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'soft_relu') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'soft_relu' + ) helper = LayerHelper('soft_relu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='soft_relu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'threshold': threshold}) + helper.append_op( + type='soft_relu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold}, + ) return out @@ -10501,8 +11105,11 @@ def flatten(x, axis=1, name=None): # out shape is [16, 3] """ check_variable_and_dtype( - x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'], - 'flatten') + x, + 'x', + ['float32', 'float64', 'int8', 'int32', 'int64', 'uint8'], + 'flatten', + ) if _non_static_mode(): return _legacy_C_ops.flatten2(x, 'axis', axis)[0] @@ -10516,13 +11123,12 @@ def flatten(x, axis=1, name=None): out = helper.create_variable_for_type_inference(x.dtype) x_shape = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='flatten2', - inputs={"X": x}, - outputs={ - 'Out': out, - 'XShape': x_shape - }, - attrs={"axis": axis}) + helper.append_op( + type='flatten2', + inputs={"X": x}, + outputs={'Out': out, 'XShape': x_shape}, + attrs={"axis": axis}, + ) return out @@ -10582,10 +11188,10 @@ def stack(x, axis=0, name=None): Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`. Supported data types: float32, float64, int32, int64. axis (int, optional): The axis along which all inputs are stacked. 
``axis`` range is ``[-(R+1), R+1)``, - where ``R`` is the number of dimensions of the first input tensor ``x[0]``. + where ``R`` is the number of dimensions of the first input tensor ``x[0]``. If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0. name (str, optional): Please refer to :ref:`api_guide_Name`, Default None. - + Returns: Variable: The stacked Tensor, has same data type with input Tensors. Output dim is :math:`rank(x[0])+1`. @@ -10616,42 +11222,53 @@ def stack(x, axis=0, name=None): if not isinstance(x, list) and not isinstance(x, tuple): # NOTE:(zhiqiu) Only support Variable as input if the Variable is a LOD_TENSOR_ARRAY create by create_array, array_write, array_read, etc. # In that case, Variable is array of tensors indeed. - if isinstance(x, Variable) and x.desc.type( - ) == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + if ( + isinstance(x, Variable) + and x.desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY + ): x = [x] else: raise TypeError( - "The type of '%s' in %s must be %s, but received %s" % - ('x', 'stack', 'list[Tensor], tuple[Tensor] or TensorArray', - type(x))) + "The type of '%s' in %s must be %s, but received %s" + % ( + 'x', + 'stack', + 'list[Tensor], tuple[Tensor] or TensorArray', + type(x), + ) + ) helper = LayerHelper('stack', **locals()) out = helper.create_variable_for_type_inference(x[0].dtype) if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY: - assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \ - "number of the elements must be 1, but received %s." % len(x) + assert len(x) == 1, ( + "If the elements of 'x' in stack are Variable(LoDTensorArray), " + "number of the elements must be 1, but received %s." % len(x) + ) out_index = helper.create_variable_for_type_inference(dtype="int32") for i in x: - check_variable_and_dtype(i, 'x', \ - ['float16', 'float32', 'float64', 'int32', 'int64'], 'stack') - - helper.append_op(type='tensor_array_to_tensor', - inputs={'X': x[0]}, - outputs={ - 'Out': [out], - 'OutIndex': [out_index] - }, - attrs={ - 'axis': axis, - 'use_stack': True - }) + check_variable_and_dtype( + i, + 'x', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'stack', + ) + + helper.append_op( + type='tensor_array_to_tensor', + inputs={'X': x[0]}, + outputs={'Out': [out], 'OutIndex': [out_index]}, + attrs={'axis': axis, 'use_stack': True}, + ) else: - helper.append_op(type='stack', - inputs={'X': x}, - outputs={'Y': out}, - attrs={'axis': axis}) + helper.append_op( + type='stack', + inputs={'X': x}, + outputs={'Y': out}, + attrs={'axis': axis}, + ) return out @@ -10715,21 +11332,12 @@ def filter_by_instag(ins, ins_tag, filter_tag, is_lod, out_val_if_empty=0): out = helper.create_variable_for_type_inference(dtype=ins.dtype) loss_weight = helper.create_variable_for_type_inference(dtype=np.float64) mmap = helper.create_variable_for_type_inference(dtype=ins_tag.dtype) - helper.append_op(type='filter_by_instag', - inputs={ - 'Ins': ins, - 'Ins_tag': ins_tag, - 'Filter_tag': filter_tag - }, - outputs={ - 'Out': out, - 'LossWeight': loss_weight, - 'IndexMap': mmap - }, - attrs={ - 'is_lod': is_lod, - 'out_val_if_empty': out_val_if_empty - }) + helper.append_op( + type='filter_by_instag', + inputs={'Ins': ins, 'Ins_tag': ins_tag, 'Filter_tag': filter_tag}, + outputs={'Out': out, 'LossWeight': loss_weight, 'IndexMap': mmap}, + attrs={'is_lod': is_lod, 'out_val_if_empty': out_val_if_empty}, + ) return [out, loss_weight] @@ -10737,8 +11345,8 @@ def filter_by_instag(ins, ins_tag, filter_tag, 
is_lod, out_val_if_empty=0): def unstack(x, axis=0, num=None): """ :alias_main: paddle.unstack - :alias: paddle.unstack,paddle.tensor.unstack,paddle.tensor.manipulation.unstack - :old_api: paddle.fluid.layers.unstack + :alias: paddle.unstack,paddle.tensor.unstack,paddle.tensor.manipulation.unstack + :old_api: paddle.fluid.layers.unstack **UnStack Layer** @@ -10787,13 +11395,12 @@ def unstack(x, axis=0, num=None): for _ in range(num): outs.append(helper.create_variable_for_type_inference(x.dtype)) - helper.append_op(type='unstack', - inputs={'X': [x]}, - outputs={'Y': outs}, - attrs={ - 'axis': axis, - 'num': num - }) + helper.append_op( + type='unstack', + inputs={'X': [x]}, + outputs={'Y': outs}, + attrs={'axis': axis, 'num': num}, + ) return outs @@ -10801,8 +11408,8 @@ def unstack(x, axis=0, num=None): def expand(x, expand_times, name=None): """ :alias_main: paddle.expand - :alias: paddle.expand,paddle.tensor.expand,paddle.tensor.manipulation.expand - :old_api: paddle.fluid.layers.expand + :alias: paddle.expand,paddle.tensor.expand,paddle.tensor.manipulation.expand + :old_api: paddle.fluid.layers.expand This operation tiles ``x`` multiple times according to the parameter ``expand_times``. The times number for each dimension of ``x`` is set by the parameter ``expand_times``. @@ -10876,12 +11483,16 @@ def expand(x, expand_times, name=None): inputs = {"X": [x]} attrs = {} check_variable_and_dtype( - x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'expand') + x, + 'x', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'expand', + ) check_type(expand_times, 'expand_times', (list, tuple, Variable), 'expand') if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == True: raise ValueError( - "expand op bool date type must set the stop_gradient to be False") + "expand op bool date type must set the stop_gradient to be False" + ) helper = LayerHelper('expand', input=x, **locals()) @@ -10892,8 +11503,9 @@ def get_attr_expand_times(list_expand_times): attrs_expand_times.append(-1) else: attrs_expand_times.append(times) - assert times > 0, ( - "Each element given in expand_times must not be negative.") + assert ( + times > 0 + ), "Each element given in expand_times must not be negative." return attrs_expand_times if isinstance(expand_times, Variable): @@ -10903,14 +11515,14 @@ def get_attr_expand_times(list_expand_times): attrs['expand_times'] = get_attr_expand_times(expand_times) if utils._contain_var(expand_times): inputs['expand_times_tensor'] = utils._convert_to_tensor_list( - expand_times) + expand_times + ) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='expand', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='expand', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return out @@ -10918,9 +11530,9 @@ def get_attr_expand_times(list_expand_times): def expand_as(x, target_tensor, name=None): """ :alias_main: paddle.expand_as - :alias: paddle.expand_as,paddle.tensor.expand_as,paddle.tensor.manipulation.expand_as - :old_api: paddle.fluid.layers.expand_as - + :alias: paddle.expand_as,paddle.tensor.expand_as,paddle.tensor.manipulation.expand_as + :old_api: paddle.fluid.layers.expand_as + expand_as operator tiles to the input by given expand tensor. You should set expand tensor for each dimension by providing tensor 'target_tensor'. The rank of X should be in [1, 6]. 
Please note that size of 'target_tensor' must be the same @@ -10985,12 +11597,15 @@ def expand_as(x, target_tensor, name=None): if _non_static_mode(): return _legacy_C_ops.expand_as(x, target_tensor) - check_variable_and_dtype(x, 'x', - ['float32', 'float64', 'int32', 'int64', 'bool'], - 'expand_as') - check_variable_and_dtype(target_tensor, 'target_tensor', - ['float32', 'float64', 'int32', 'int64', 'bool'], - 'expand_as') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64', 'bool'], 'expand_as' + ) + check_variable_and_dtype( + target_tensor, + 'target_tensor', + ['float32', 'float64', 'int32', 'int64', 'bool'], + 'expand_as', + ) helper = LayerHelper('expand_as', input=x, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) @@ -11004,14 +11619,16 @@ def expand_as(x, target_tensor, name=None): @deprecated(since='1.8.0', update_to="paddle.uniform") @templatedoc() -def uniform_random_batch_size_like(input, - shape, - dtype='float32', - input_dim_idx=0, - output_dim_idx=0, - min=-1.0, - max=1.0, - seed=0): +def uniform_random_batch_size_like( + input, + shape, + dtype='float32', + input_dim_idx=0, + output_dim_idx=0, + min=-1.0, + max=1.0, + seed=0, +): """ This OP initializes a variable with random values sampled from a uniform distribution in the range [min, max). The input_dim_idx used to get the input dimension value which will be used to resize the output dimension. @@ -11074,39 +11691,46 @@ def uniform_random_batch_size_like(input, """ - check_variable_and_dtype(input, 'Input', ("float32", 'float64', "uint16"), - 'uniform_random_batch_size_like') + check_variable_and_dtype( + input, + 'Input', + ("float32", 'float64', "uint16"), + 'uniform_random_batch_size_like', + ) check_type(shape, 'shape', (list, tuple), 'uniform_random_batch_size_like') - check_dtype(dtype, 'dtype', ('float32', 'float64', "uint16"), - 'uniform_random_batch_size_like') + check_dtype( + dtype, + 'dtype', + ('float32', 'float64', "uint16"), + 'uniform_random_batch_size_like', + ) helper = LayerHelper('uniform_random_batch_size_like', **locals()) out = helper.create_variable_for_type_inference(dtype) c_dtype = convert_np_dtype_to_dtype_(dtype) - helper.append_op(type='uniform_random_batch_size_like', - inputs={'Input': input}, - outputs={'Out': out}, - attrs={ - 'shape': shape, - 'input_dim_idx': input_dim_idx, - 'output_dim_idx': output_dim_idx, - 'min': min, - 'max': max, - 'seed': seed, - 'dtype': c_dtype - }) + helper.append_op( + type='uniform_random_batch_size_like', + inputs={'Input': input}, + outputs={'Out': out}, + attrs={ + 'shape': shape, + 'input_dim_idx': input_dim_idx, + 'output_dim_idx': output_dim_idx, + 'min': min, + 'max': max, + 'seed': seed, + 'dtype': c_dtype, + }, + ) return out @deprecated(since="2.0.0", update_to="paddle.normal") @templatedoc() -def gaussian_random(shape, - mean=0.0, - std=1.0, - seed=0, - dtype='float32', - name=None): +def gaussian_random( + shape, mean=0.0, std=1.0, seed=0, dtype='float32', name=None +): """ This OP returns a Tensor filled with random values sampled from a Gaussian distribution, with ``shape`` and ``dtype``. @@ -11162,21 +11786,21 @@ def gaussian_random(shape, # result_3 is: # [[-0.12310527, 0.8187662, 1.923219 ] # [ 0.70721835, 0.5210541, -0.03214082]] - + .. 
code-block:: python - + # declarative mode # required: skiptest import numpy as np from paddle import fluid - + x = fluid.layers.gaussian_random((2, 3), std=2., seed=10) - + place = fluid.CPUPlace() exe = fluid.Executor(place) start = fluid.default_startup_program() main = fluid.default_main_program() - + exe.run(start) x_np, = exe.run(main, feed={}, fetch_list=[x]) @@ -11190,11 +11814,11 @@ def gaussian_random(shape, import numpy as np from paddle import fluid import paddle.fluid.dygraph as dg - + place = fluid.CPUPlace() with dg.guard(place) as g: x = fluid.layers.gaussian_random((2, 4), mean=2., dtype="float32", seed=10) - x_np = x.numpy() + x_np = x.numpy() x_np # array([[2.3060477 , 2.676496 , 3.9911983 , 0.9990833 ], # [2.8675377 , 2.2279181 , 0.79029655, 2.8447366 ]], dtype=float32) @@ -11205,15 +11829,24 @@ def gaussian_random(shape, if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) place = _current_expected_place() - return _C_ops.gaussian_random(shape, float(mean), float(std), seed, - dtype, place) + return _C_ops.gaussian_random( + shape, float(mean), float(std), seed, dtype, place + ) if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) - return _legacy_C_ops.gaussian_random('shape', shape, - 'mean', float(mean), 'std', - float(std), 'seed', seed, 'dtype', - dtype) + return _legacy_C_ops.gaussian_random( + 'shape', + shape, + 'mean', + float(mean), + 'std', + float(std), + 'seed', + seed, + 'dtype', + dtype, + ) check_type(shape, 'shape', (list, tuple, Variable), 'gaussian_random/randn') check_dtype(dtype, 'dtype', ['float32', 'float64'], 'gaussian_random/randn') @@ -11224,19 +11857,17 @@ def gaussian_random(shape, 'std': std, 'seed': seed, 'dtype': dtype, - 'use_mkldnn': False + 'use_mkldnn': False, } - utils.get_shape_tensor_inputs(inputs=inputs, - attrs=attrs, - shape=shape, - op_type='gaussian_random/randn') + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='gaussian_random/randn' + ) helper = LayerHelper('gaussian_random', **locals()) out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='gaussian_random', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='gaussian_random', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return out @@ -11270,28 +11901,28 @@ def sampling_id(x, min=0.0, max=1.0, seed=0, dtype='float32'): helper = LayerHelper('sampling_id', **locals()) out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='sampling_id', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'min': min, - 'max': max, - 'seed': seed - }) + helper.append_op( + type='sampling_id', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'min': min, 'max': max, 'seed': seed}, + ) return out @deprecated(since='1.8.0', update_to="paddle.normal") @templatedoc() -def gaussian_random_batch_size_like(input, - shape, - input_dim_idx=0, - output_dim_idx=0, - mean=0.0, - std=1.0, - seed=0, - dtype='float32'): +def gaussian_random_batch_size_like( + input, + shape, + input_dim_idx=0, + output_dim_idx=0, + mean=0.0, + std=1.0, + seed=0, + dtype='float32', +): """ ${comment} @@ -11322,26 +11953,40 @@ def gaussian_random_batch_size_like(input, """ helper = LayerHelper('gaussian_random_batch_size_like', **locals()) - check_type(input, 'input', (Variable), - 'fluid.layers.gaussian_random_batch_size_like') - check_type(shape, 'shape', (list, tuple), - 'fluid.layers.gaussian_random_batch_size_like') - check_dtype(dtype, 'dtype', ['float16', 'float32', 
'int'], - 'fluid.layers.gaussian_random_batch_size_like') + check_type( + input, + 'input', + (Variable), + 'fluid.layers.gaussian_random_batch_size_like', + ) + check_type( + shape, + 'shape', + (list, tuple), + 'fluid.layers.gaussian_random_batch_size_like', + ) + check_dtype( + dtype, + 'dtype', + ['float16', 'float32', 'int'], + 'fluid.layers.gaussian_random_batch_size_like', + ) out = helper.create_variable_for_type_inference(dtype) c_dtype = convert_np_dtype_to_dtype_(dtype) - helper.append_op(type='gaussian_random_batch_size_like', - inputs={'Input': input}, - outputs={'Out': out}, - attrs={ - 'shape': shape, - 'input_dim_idx': input_dim_idx, - 'output_dim_idx': output_dim_idx, - 'mean': mean, - 'std': std, - 'seed': seed, - 'dtype': c_dtype - }) + helper.append_op( + type='gaussian_random_batch_size_like', + inputs={'Input': input}, + outputs={'Out': out}, + attrs={ + 'shape': shape, + 'input_dim_idx': input_dim_idx, + 'output_dim_idx': output_dim_idx, + 'mean': mean, + 'std': std, + 'seed': seed, + 'dtype': c_dtype, + }, + ) return out @@ -11453,7 +12098,7 @@ def slice(input, axes, starts, ends): ends = [-1, 1000] # -1 denotes the reverse 0th position of dimension 0. Then: result = [ [2, 3, 4], ] # result = data[0:1, 1:4] - + Args: input (Tensor): A ``Tensor`` . The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``. axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to . @@ -11500,7 +12145,8 @@ def slice(input, axes, starts, ends): axes = list(axes) if len(axes) == 0: raise ValueError( - "Input axes should not be an empty list/tuple.") + "Input axes should not be an empty list/tuple." + ) for i in range(len(axes)): if axes[i] < 0: axes[i] = max(0, axes[i] + len(input.shape)) @@ -11509,8 +12155,10 @@ def slice(input, axes, starts, ends): else: raise ValueError( - "Input axes must be a python list or tuple, but reveived {}". - format(type(axes))) + "Input axes must be a python list or tuple, but reveived {}".format( + type(axes) + ) + ) infer_flags = list(1 for i in range(len(axes))) @@ -11518,7 +12166,8 @@ def slice(input, axes, starts, ends): if isinstance(starts, (list, tuple)): starts = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item + if isinstance(item, tmp_tensor_type) + else item for item in starts ] elif isinstance(starts, tmp_tensor_type): @@ -11528,7 +12177,9 @@ def slice(input, axes, starts, ends): if isinstance(ends, (list, tuple)): ends = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item for item in ends + if isinstance(item, tmp_tensor_type) + else item + for item in ends ] attrs += ('ends', ends) elif isinstance(ends, tmp_tensor_type): @@ -11546,7 +12197,8 @@ def slice(input, axes, starts, ends): axes = list(axes) if len(axes) == 0: raise ValueError( - "Input axes should not be an empty list/tuple.") + "Input axes should not be an empty list/tuple." 
+ ) for i in range(len(axes)): if axes[i] < 0: axes[i] = max(0, axes[i] + len(input.shape)) @@ -11555,8 +12207,10 @@ def slice(input, axes, starts, ends): else: raise ValueError( - "Input axes must be a python list or tuple, but reveived {}" - .format(type(axes))) + "Input axes must be a python list or tuple, but reveived {}".format( + type(axes) + ) + ) infer_flags = list(1 for i in range(len(axes))) @@ -11565,7 +12219,8 @@ def slice(input, axes, starts, ends): if isinstance(starts, (list, tuple)): starts = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item + if isinstance(item, tmp_tensor_type) + else item for item in starts ] attrs += ('starts', starts) @@ -11577,7 +12232,8 @@ def slice(input, axes, starts, ends): if isinstance(ends, (list, tuple)): ends = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item + if isinstance(item, tmp_tensor_type) + else item for item in ends ] attrs += ('ends', ends) @@ -11586,16 +12242,27 @@ def slice(input, axes, starts, ends): ends_tensor.stop_gradient = True infer_flags = list(-1 for i in range(len(axes))) - return _legacy_C_ops.slice(input, starts_tensor, ends_tensor, None, - None, 'axes', axes, 'infer_flags', - infer_flags, *attrs) + return _legacy_C_ops.slice( + input, + starts_tensor, + ends_tensor, + None, + None, + 'axes', + axes, + 'infer_flags', + infer_flags, + *attrs, + ) if not isinstance(starts, (list, tuple, Variable)): raise ValueError( - "Input starts must be an Variable, python list or tuple.") + "Input starts must be an Variable, python list or tuple." + ) if not isinstance(ends, (list, tuple, Variable)): raise ValueError( - "Input ends must be an Variable, python list or tuple.") + "Input ends must be an Variable, python list or tuple." + ) helper = LayerHelper('slice', **locals()) @@ -11642,11 +12309,11 @@ def slice(input, axes, starts, ends): # infer_flags attrs['infer_flags'] = infer_flags out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype('input')) - helper.append_op(type='slice', - inputs=inputs, - attrs=attrs, - outputs={'Out': out}) + dtype=helper.input_dtype('input') + ) + helper.append_op( + type='slice', inputs=inputs, attrs=attrs, outputs={'Out': out} + ) return out @@ -11655,8 +12322,8 @@ def slice(input, axes, starts, ends): def strided_slice(input, axes, starts, ends, strides): """ :alias_main: paddle.strided_slice - :alias: paddle.strided_slice,paddle.tensor.strided_slice,paddle.tensor.manipulation.strided_slice - :old_api: paddle.fluid.layers.strided_slice + :alias: paddle.strided_slice,paddle.tensor.strided_slice,paddle.tensor.manipulation.strided_slice + :old_api: paddle.fluid.layers.strided_slice This operator produces a slice of ``input`` along multiple axes. 
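The `slice` hunks above only re-wrap argument checks, but the semantics spelled out in the `slice` docstring carry over to `strided_slice` as well: `axes`, `starts` and `ends` behave like ordinary Python indexing, with `strides` layered on top. A rough NumPy illustration of the docstring's second case, plus a hypothetical strided variant (NumPy is used purely for illustration and is not part of this patch):

    import numpy as np

    data = np.array([[1, 2, 3, 4], [5, 6, 7, 8]])

    # slice: axes=[0, 1], starts=[0, 1], ends=[-1, 1000]
    # maps to data[0:-1, 1:1000]; out-of-range ends are clipped, -1 counts from the end
    print(data[0:-1, 1:1000])      # [[2 3 4]]

    # strided_slice additionally takes strides, e.g. strides=[1, 2] on the same axes,
    # which maps to data[0:-1:1, 1:1000:2] (a made-up example, not from the docstring)
    print(data[0:-1:1, 1:1000:2])  # [[2 4]]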
Similar to numpy: https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html @@ -11757,9 +12424,12 @@ def strided_slice(input, axes, starts, ends, strides): helper = LayerHelper('strided_slice', **locals()) - check_variable_and_dtype(input, 'input', - ['bool', 'float32', 'float64', 'int32', 'int64'], - 'strided_slice') + check_variable_and_dtype( + input, + 'input', + ['bool', 'float32', 'float64', 'int32', 'int64'], + 'strided_slice', + ) check_type(axes, 'axes', (list, tuple), 'strided_slice') check_type(starts, 'starts', (list, tuple, Variable), 'strided_slice') check_type(ends, 'ends', (list, tuple, Variable), 'strided_slice') @@ -11767,8 +12437,9 @@ def strided_slice(input, axes, starts, ends, strides): def check_list_elements_dtype(list_input, input_name): if isinstance(list_input, Variable): - check_dtype(list_input.dtype, input_name, ['int32'], - 'strided_slice') + check_dtype( + list_input.dtype, input_name, ['int32'], 'strided_slice' + ) else: for i, var in enumerate(list_input): var_name = input_name + '[' + str(i) + ']' @@ -11787,7 +12458,7 @@ def get_new_list_tensor(old_list): dim.stop_gradient = True new_list_tensor.append(dim) else: - assert (isinstance(dim, int)) + assert isinstance(dim, int) temp_out = helper.create_variable_for_type_inference('int32') fill_constant([1], 'int32', dim, force_cpu=True, out=temp_out) new_list_tensor.append(temp_out) @@ -11804,7 +12475,7 @@ def get_new_list_tensor(old_list): 'starts': starts, 'ends': ends, 'strides': strides, - 'infer_flags': infer_flags + 'infer_flags': infer_flags, } else: # starts @@ -11859,11 +12530,11 @@ def get_new_list_tensor(old_list): attrs['strides'] = strides attrs['infer_flags'] = infer_flags out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype('input')) - helper.append_op(type='strided_slice', - inputs=inputs, - attrs=attrs, - outputs={'Out': out}) + dtype=helper.input_dtype('input') + ) + helper.append_op( + type='strided_slice', inputs=inputs, attrs=attrs, outputs={'Out': out} + ) return out @@ -11871,8 +12542,8 @@ def get_new_list_tensor(old_list): def shape(input): """ :alias_main: paddle.shape - :alias: paddle.shape,paddle.tensor.shape,paddle.tensor.attribute.shape - :old_api: paddle.fluid.layers.shape + :alias: paddle.shape,paddle.tensor.shape,paddle.tensor.attribute.shape + :old_api: paddle.fluid.layers.shape **Shape Layer** @@ -11930,16 +12601,29 @@ def shape(input): out.stop_gradient = True return out - check_variable_and_dtype(input, 'input', [ - 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', - 'complex128' - ], 'shape') + check_variable_and_dtype( + input, + 'input', + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + 'shape', + ) helper = LayerHelper('shape', **locals()) out = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op(type='shape', - inputs={'Input': input}, - outputs={'Out': out}, - stop_gradient=True) + helper.append_op( + type='shape', + inputs={'Input': input}, + outputs={'Out': out}, + stop_gradient=True, + ) return out @@ -11987,7 +12671,7 @@ def size(input): Raises: TypeError: ``input`` must be a Tensor and the data type of ``input`` must be one of bool, float16, float32, float64, int32, int64. - + Examples: .. 
code-block:: python @@ -12007,8 +12691,11 @@ def size(input): return _legacy_C_ops.size(input) check_variable_and_dtype( - input, 'input', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], "size") + input, + 'input', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + "size", + ) helper = LayerHelper('size', **locals()) out = helper.create_variable_for_type_inference(dtype='int64') helper.append_op(type='size', inputs={'Input': input}, outputs={'Out': out}) @@ -12024,33 +12711,35 @@ def _elementwise_op(helper): assert x is not None, 'x cannot be None in {}'.format(op_type) assert y is not None, 'y cannot be None in {}'.format(op_type) check_variable_and_dtype( - x, 'x', ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'], - op_type) + x, + 'x', + ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'], + op_type, + ) check_variable_and_dtype( - y, 'y', ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'], - op_type) + y, + 'y', + ['float16', 'uint16', 'float32', 'float64', 'int32', 'int64'], + op_type, + ) axis = helper.kwargs.get('axis', -1) use_mkldnn = helper.kwargs.get('use_mkldnn', False) name = helper.kwargs.get('name', None) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type=op_type, - inputs={ - 'X': x, - 'Y': y - }, - outputs={'Out': out}, - attrs={ - 'axis': axis, - 'use_mkldnn': use_mkldnn - }) + helper.append_op( + type=op_type, + inputs={'X': x, 'Y': y}, + outputs={'Out': out}, + attrs={'axis': axis, 'use_mkldnn': use_mkldnn}, + ) return helper.append_activation(out) def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): """ - + Putting scale and bias to the input Tensor as following: ``bias_after_scale`` is True: @@ -12075,9 +12764,9 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): Tensor: Output tensor of scale operator, with shape and data type same as input. Examples: - + .. code-block:: python - + # scale as a float32 number import paddle @@ -12100,15 +12789,33 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): return dygraph_utils._append_activation_in_dygraph(out) if _non_static_mode(): _scale = scale.numpy().item(0) if isinstance(scale, Variable) else scale - out = _legacy_C_ops.scale(x, 'scale', float(_scale), 'bias', - float(bias), 'bias_after_scale', - bias_after_scale) + out = _legacy_C_ops.scale( + x, + 'scale', + float(_scale), + 'bias', + float(bias), + 'bias_after_scale', + bias_after_scale, + ) return dygraph_utils._append_activation_in_dygraph(out) - check_variable_and_dtype(x, "x", [ - 'float16', 'uint16', 'float32', 'float64', 'int8', 'int16', 'int32', - 'int64', 'uint8' - ], "scale") + check_variable_and_dtype( + x, + "x", + [ + 'float16', + 'uint16', + 'float32', + 'float64', + 'int8', + 'int16', + 'int32', + 'int64', + 'uint8', + ], + "scale", + ) inputs = {'X': [x]} attrs = { 'bias': float(bias), @@ -12121,91 +12828,90 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): helper = LayerHelper('scale', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='scale', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='scale', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return helper.append_activation(out) def elementwise_add(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_add(x, y) - # z = x + y + import paddle.fluid as fluid + import numpy as np + import paddle + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_add(x, y) + # z = x + y - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # [3., 8., 6.] + print(z_value) # [3., 8., 6.] - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_add(x, y, axis=1) - # z = x + y + def gen_data(): + return { + "x": np.ones((2, 3, 4, 5)).astype('float32'), + "y": np.zeros((3, 4)).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[3,4], dtype='float32') + z = fluid.layers.elementwise_add(x, y, axis=1) + # z = x + y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + print(z_value) # z.shape=[2,3,4,5] - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), - "y": np.random.randint(1, 5, size=[5]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[5], dtype='float32') - z = fluid.layers.elementwise_add(x, y, axis=3) - # z = x + y + def gen_data(): + return { + "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), + "y": np.random.randint(1, 5, size=[5]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[5], dtype='float32') + z = fluid.layers.elementwise_add(x, y, axis=3) + # z = x + y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) + print(z_value) # z.shape=[2,3,4,5] """ if _non_static_mode(): @@ -12215,7 +12921,8 @@ def gen_data(): axis=axis, act=act, op_name='elementwise_add', - use_mkldnn=_global_flags()["FLAGS_use_mkldnn"]) + use_mkldnn=_global_flags()["FLAGS_use_mkldnn"], + ) return _elementwise_op(LayerHelper('elementwise_add', **locals())) @@ -12224,90 +12931,88 @@ def gen_data(): def elementwise_div(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_div(x, y) - # z = x / y + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_div(x, y) + # z = x / y - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # [2., 0.6, 2.] + print(z_value) # [2., 0.6, 2.] - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_div(x, y, axis=1) - # z = x / y + def gen_data(): + return { + "x": np.ones((2, 3, 4, 5)).astype('float32'), + "y": np.zeros((3, 4)).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[3,4], dtype='float32') + z = fluid.layers.elementwise_div(x, y, axis=1) + # z = x / y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + print(z_value) # z.shape=[2,3,4,5] - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), - "y": np.random.randint(1, 5, size=[5]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[5], dtype='float32') - z = fluid.layers.elementwise_div(x, y, axis=3) - # z = x / y + def gen_data(): + return { + "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), + "y": np.random.randint(1, 5, size=[5]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[5], dtype='float32') + z = fluid.layers.elementwise_div(x, y, axis=3) + # z = x / y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) + print(z_value) # z.shape=[2,3,4,5] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_div') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_div' + ) return _elementwise_op(LayerHelper('elementwise_div', **locals())) @@ -12315,90 +13020,88 @@ def gen_data(): def elementwise_sub(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_sub(x, y) - # z = x - y + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_sub(x, y) + # z = x - y - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # [1., -2., 2.] + print(z_value) # [1., -2., 2.] - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_sub(x, y, axis=1) - # z = x - y + def gen_data(): + return { + "x": np.ones((2, 3, 4, 5)).astype('float32'), + "y": np.zeros((3, 4)).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[3,4], dtype='float32') + z = fluid.layers.elementwise_sub(x, y, axis=1) + # z = x - y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + print(z_value) # z.shape=[2,3,4,5] - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), - "y": np.random.randint(1, 5, size=[5]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[5], dtype='float32') - z = fluid.layers.elementwise_sub(x, y, axis=3) - # z = x - y + def gen_data(): + return { + "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), + "y": np.random.randint(1, 5, size=[5]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[5], dtype='float32') + z = fluid.layers.elementwise_sub(x, y, axis=3) + # z = x - y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) + print(z_value) # z.shape=[2,3,4,5] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_sub') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_sub' + ) return _elementwise_op(LayerHelper('elementwise_sub', **locals())) @@ -12407,222 +13110,216 @@ def gen_data(): def elementwise_mul(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_mul(x, y) - # z = x * y + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_mul(x, y) + # z = x * y - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # [2., 15., 8.] + print(z_value) # [2., 15., 8.] - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_mul(x, y, axis=1) - # z = x * y + def gen_data(): + return { + "x": np.ones((2, 3, 4, 5)).astype('float32'), + "y": np.zeros((3, 4)).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[3,4], dtype='float32') + z = fluid.layers.elementwise_mul(x, y, axis=1) + # z = x * y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + print(z_value) # z.shape=[2,3,4,5] - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), - "y": np.random.randint(1, 5, size=[5]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[5], dtype='float32') - z = fluid.layers.elementwise_mul(x, y, axis=3) - # z = x * y + def gen_data(): + return { + "x": np.random.randint(1, 5, size=[2, 3, 4, 5]).astype('float32'), + "y": np.random.randint(1, 5, size=[5]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[5], dtype='float32') + z = fluid.layers.elementwise_mul(x, y, axis=3) + # z = x * y - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) - print(z_value) # z.shape=[2,3,4,5] + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) + print(z_value) # z.shape=[2,3,4,5] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_mul') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_mul' + ) return _elementwise_op(LayerHelper('elementwise_mul', **locals())) def elementwise_max(x, y, axis=-1, act=None, name=None): """ - :alias_main: paddle.elementwise_max - :alias: paddle.elementwise_max,paddle.tensor.elementwise_max,paddle.tensor.math.elementwise_max - :old_api: paddle.fluid.layers.elementwise_max + :alias_main: paddle.elementwise_max + :alias: paddle.elementwise_max,paddle.tensor.elementwise_max,paddle.tensor.math.elementwise_max + :old_api: paddle.fluid.layers.elementwise_max -Examples: + Examples: - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_max(x, y) + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_max(x, y) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) #[2, 5, 4] + print(z_value) #[2, 5, 4] - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_max(x, y, axis=1) + def gen_data(): + return { + "x": np.ones((2, 3, 4, 5)).astype('float32'), + "y": np.zeros((3, 4)).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[3,4], dtype='float32') + z = fluid.layers.elementwise_max(x, y, axis=1) - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value)#[[[[1., 1., 1., 1., 1.] .... [1., 1., 1., 1., 1.]]]] + print(z_value)#[[[[1., 1., 1., 1., 1.] .... [1., 1., 1., 1., 1.]]]] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_max') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_max' + ) return _elementwise_op(LayerHelper('elementwise_max', **locals())) def elementwise_min(x, y, axis=-1, act=None, name=None): """ - :alias_main: paddle.elementwise_min - :alias: paddle.elementwise_min,paddle.tensor.elementwise_min,paddle.tensor.math.elementwise_min - :old_api: paddle.fluid.layers.elementwise_min + :alias_main: paddle.elementwise_min + :alias: paddle.elementwise_min,paddle.tensor.elementwise_min,paddle.tensor.math.elementwise_min + :old_api: paddle.fluid.layers.elementwise_min -Examples: + Examples: - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_min(x, y) + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_min(x, y) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) #[1, 3, 2] + print(z_value) #[1, 3, 2] - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.ones((2, 3, 4, 5)).astype('float32'), - "y": np.zeros((3, 4)).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') - y = fluid.data(name="y", shape=[3,4], dtype='float32') - z = fluid.layers.elementwise_min(x, y, axis=1) + def gen_data(): + return { + "x": np.ones((2, 3, 4, 5)).astype('float32'), + "y": np.zeros((3, 4)).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[2,3,4,5], dtype='float32') + y = fluid.data(name="y", shape=[3,4], dtype='float32') + z = fluid.layers.elementwise_min(x, y, axis=1) - place = fluid.CPUPlace() - exe = fluid.Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value)#[[[[0., 0., 0., 0., 0.] .... [0., 0., 0., 0., 0.]]]] + print(z_value)#[[[[0., 0., 0., 0., 0.] .... [0., 0., 0., 0., 0.]]]] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_min') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_min' + ) return _elementwise_op(LayerHelper('elementwise_min', **locals())) @@ -12630,37 +13327,35 @@ def gen_data(): def elementwise_pow(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([2, 3, 4]).astype('float32'), - "y": np.array([1, 5, 2]).astype('float32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='float32') - y = fluid.data(name="y", shape=[3], dtype='float32') - z = fluid.layers.elementwise_pow(x, y) + def gen_data(): + return { + "x": np.array([2, 3, 4]).astype('float32'), + "y": np.array([1, 5, 2]).astype('float32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='float32') + y = fluid.data(name="y", shape=[3], dtype='float32') + z = fluid.layers.elementwise_pow(x, y) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) #[2, 243, 16] + print(z_value) #[2, 243, 16] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_pow') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_pow' + ) return _elementwise_op(LayerHelper('elementwise_pow', **locals())) @@ -12668,37 +13363,35 @@ def gen_data(): def elementwise_mod(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([10, 15, 8]).astype('int32'), - "y": np.array([3, 6, 5]).astype('int32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='int32') - y = fluid.data(name="y", shape=[3], dtype='int32') - z = fluid.layers.elementwise_mod(x, y) + def gen_data(): + return { + "x": np.array([10, 15, 8]).astype('int32'), + "y": np.array([3, 6, 5]).astype('int32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='int32') + y = fluid.data(name="y", shape=[3], dtype='int32') + z = fluid.layers.elementwise_mod(x, y) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) #[1, 3, 3] + print(z_value) #[1, 3, 3] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_mod') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_mod' + ) return _elementwise_op(LayerHelper('elementwise_mod', **locals())) @@ -12707,78 +13400,89 @@ def gen_data(): def elementwise_floordiv(x, y, axis=-1, act=None, name=None): """ -Examples: + Examples: - .. code-block:: python + .. 
code-block:: python - import paddle.fluid as fluid - import numpy as np - import paddle + import paddle.fluid as fluid + import numpy as np + import paddle - def gen_data(): - return { - "x": np.array([10, 15, 8]).astype('int32'), - "y": np.array([3, 7, 5]).astype('int32') - } - paddle.enable_static() - x = fluid.data(name="x", shape=[3], dtype='int32') - y = fluid.data(name="y", shape=[3], dtype='int32') - z = fluid.layers.elementwise_floordiv(x, y) + def gen_data(): + return { + "x": np.array([10, 15, 8]).astype('int32'), + "y": np.array([3, 7, 5]).astype('int32') + } + paddle.enable_static() + x = fluid.data(name="x", shape=[3], dtype='int32') + y = fluid.data(name="y", shape=[3], dtype='int32') + z = fluid.layers.elementwise_floordiv(x, y) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - z_value = exe.run(feed=gen_data(), - fetch_list=[z.name]) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + z_value = exe.run(feed=gen_data(), + fetch_list=[z.name]) - print(z_value) #[3, 2, 1] + print(z_value) #[3, 2, 1] """ if _non_static_mode(): - return _elementwise_op_in_dygraph(x, - y, - axis=axis, - act=act, - op_name='elementwise_floordiv') + return _elementwise_op_in_dygraph( + x, y, axis=axis, act=act, op_name='elementwise_floordiv' + ) return _elementwise_op(LayerHelper('elementwise_floordiv', **locals())) for func in [ - elementwise_add, - elementwise_div, - elementwise_sub, - elementwise_mul, - elementwise_max, - elementwise_pow, - elementwise_min, - elementwise_mod, - elementwise_floordiv, + elementwise_add, + elementwise_div, + elementwise_sub, + elementwise_mul, + elementwise_max, + elementwise_pow, + elementwise_min, + elementwise_mod, + elementwise_floordiv, ]: op_proto = OpProtoHolder.instance().get_op_proto(func.__name__) # insert the c++ doc string on top of python doc string - func.__doc__ = _generate_doc_string_( - op_proto, - additional_args_lines=[ - "axis (int32, optional): If X.dimension != Y.dimension, \ + func.__doc__ = ( + _generate_doc_string_( + op_proto, + additional_args_lines=[ + "axis (int32, optional): If X.dimension != Y.dimension, \ Y.dimension must be a subsequence of x.dimension. \ And axis is the start dimension index for broadcasting Y onto X. ", - "act (string, optional): Activation applied to the output. \ + "act (string, optional): Activation applied to the output. \ Default is None. Details: :ref:`api_guide_activations_en` ", - "name (string, optional): Name of the output. \ + "name (string, optional): Name of the output. \ Default is None. It's used to print debug info for developers. Details: \ - :ref:`api_guide_Name` " - ], - skip_attrs_set={ - "x_data_format", "y_data_format", "axis", "use_quantizer", - "mkldnn_data_type", "Scale_x", "Scale_y", "Scale_out" - }) + """\n""" + str(func.__doc__) + :ref:`api_guide_Name` ", + ], + skip_attrs_set={ + "x_data_format", + "y_data_format", + "axis", + "use_quantizer", + "mkldnn_data_type", + "Scale_x", + "Scale_y", + "Scale_out", + }, + ) + + """\n""" + + str(func.__doc__) + ) doc_list = func.__doc__.splitlines() for idx, val in enumerate(doc_list): - if val.startswith("Warning: ") and val.endswith( - " instead." - ) and "and will be removed in future versions." in val: + if ( + val.startswith("Warning: ") + and val.endswith(" instead.") + and "and will be removed in future versions." 
in val + ): doc_list.insert(0, doc_list.pop(idx)) func.__doc__ = "\n" + "\n".join(i for i in doc_list) break @@ -12789,9 +13493,12 @@ def gen_data(): op_proto, additional_args_lines=[ "act (basestring|None): Activation applied to the output.", - "name (basestring|None): Name of the output." - ]) - func.__doc__ = func.__doc__ + """ + "name (basestring|None): Name of the output.", + ], + ) + func.__doc__ = ( + func.__doc__ + + """ Examples: .. code-block:: python @@ -12826,8 +13533,16 @@ def gen_data(): x5 = fluid.layers.data(name="x5", shape=[2, 3, 4, 5], dtype='float32') y5 = fluid.layers.data(name="y5", shape=[2], dtype='float32') z5 = fluid.layers.%s(x5, y5, axis=0) - """ % (func.__name__, func.__name__, func.__name__, func.__name__, - func.__name__, func.__name__) + """ + % ( + func.__name__, + func.__name__, + func.__name__, + func.__name__, + func.__name__, + func.__name__, + ) + ) def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): @@ -12838,14 +13553,18 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): else: return op(x) check_variable_and_dtype( - x, "x", + x, + "x", ["bool", "int8", "int16", "int32", "int64", "float32", "float64"], - op_name) + op_name, + ) if y is not None: check_variable_and_dtype( - y, "y", + y, + "y", ["bool", "int8", "int16", "int32", "int64", "float32", "float64"], - op_name) + op_name, + ) if out is not None: check_type(out, "out", Variable, op_name) @@ -12854,18 +13573,16 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): if binary_op and x.dtype != y.dtype: raise ValueError( "(InvalidArgument) The DataType of %s Op's Variable must be consistent, but received %s and %s." - % (op_name, x.dtype, y.dtype)) + % (op_name, x.dtype, y.dtype) + ) if out is None: out = helper.create_variable_for_type_inference(dtype=x.dtype) if binary_op: - helper.append_op(type=op_name, - inputs={ - "X": x, - "Y": y - }, - outputs={"Out": out}) + helper.append_op( + type=op_name, inputs={"X": x, "Y": y}, outputs={"Out": out} + ) else: helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out}) @@ -12907,12 +13624,9 @@ def logical_and(x, y, out=None, name=None): if in_dygraph_mode(): return _C_ops.logical_and(x, y) - return _logical_op(op_name="logical_and", - x=x, - y=y, - name=name, - out=out, - binary_op=True) + return _logical_op( + op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True + ) def logical_or(x, y, out=None, name=None): @@ -12927,7 +13641,7 @@ def logical_or(x, y, out=None, name=None): .. note:: ``paddle.logical_or`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. - + Args: x (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. y (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. 
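The logical-op wrappers in these hunks (`logical_and`, `logical_or`, `logical_xor`, `logical_not`) are only being re-wrapped around the shared `_logical_op` helper; their behaviour is unchanged. A minimal usage sketch of the equivalent public 2.x calls (the input values below are made up for illustration):

    import paddle

    x = paddle.to_tensor([True, False, True])
    y = paddle.to_tensor([True, True, False])
    print(paddle.logical_and(x, y).numpy())  # [ True False False]
    print(paddle.logical_or(x, y).numpy())   # [ True  True  True]
    print(paddle.logical_xor(x, y).numpy())  # [False  True  True]
    print(paddle.logical_not(x).numpy())     # [False  True False]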
@@ -12952,12 +13666,9 @@ def logical_or(x, y, out=None, name=None): """ if in_dygraph_mode(): return _C_ops.logical_or(x, y) - return _logical_op(op_name="logical_or", - x=x, - y=y, - name=name, - out=out, - binary_op=True) + return _logical_op( + op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True + ) def logical_xor(x, y, out=None, name=None): @@ -12998,12 +13709,9 @@ def logical_xor(x, y, out=None, name=None): if in_dygraph_mode(): return _C_ops.logical_xor(x, y) - return _logical_op(op_name="logical_xor", - x=x, - y=y, - name=name, - out=out, - binary_op=True) + return _logical_op( + op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True + ) @templatedoc() @@ -13036,18 +13744,15 @@ def logical_not(x, out=None, name=None): """ if in_dygraph_mode(): return _C_ops.logical_not(x) - return _logical_op(op_name="logical_not", - x=x, - y=None, - name=name, - out=out, - binary_op=False) + return _logical_op( + op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False + ) @templatedoc() def clip(x, min, max, name=None): """ - :old_api: paddle.fluid.layers.clip + :old_api: paddle.fluid.layers.clip ${comment} @@ -13078,21 +13783,20 @@ def clip(x, min, max, name=None): check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'clip') if name is None: - name = unique_name.generate_with_ignorable_key(".".join( - [helper.name, 'tmp'])) - - out = helper.create_variable(type=x.type, - name=name, - dtype=x.dtype, - persistable=False) - - helper.append_op(type="clip", - inputs={"X": x}, - attrs={ - "min": min, - "max": max - }, - outputs={"Out": out}) + name = unique_name.generate_with_ignorable_key( + ".".join([helper.name, 'tmp']) + ) + + out = helper.create_variable( + type=x.type, name=name, dtype=x.dtype, persistable=False + ) + + helper.append_op( + type="clip", + inputs={"X": x}, + attrs={"min": min, "max": max}, + outputs={"Out": out}, + ) return out @@ -13136,18 +13840,20 @@ def clip_by_norm(x, max_norm, name=None): check_type(max_norm, 'max_norm', (float), 'clip_by_norm') if name is None: - name = unique_name.generate_with_ignorable_key(".".join( - [helper.name, 'tmp'])) + name = unique_name.generate_with_ignorable_key( + ".".join([helper.name, 'tmp']) + ) - out = helper.create_variable(type=x.type, - name=name, - dtype=x.dtype, - persistable=False) + out = helper.create_variable( + type=x.type, name=name, dtype=x.dtype, persistable=False + ) - helper.append_op(type="clip_by_norm", - inputs={"X": x}, - attrs={"max_norm": max_norm}, - outputs={"Out": out}) + helper.append_op( + type="clip_by_norm", + inputs={"X": x}, + attrs={"max_norm": max_norm}, + outputs={"Out": out}, + ) return out @@ -13186,10 +13892,9 @@ def mean(x, name=None): check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'mean') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="mean", - inputs={"X": x}, - attrs={}, - outputs={"Out": out}) + helper.append_op( + type="mean", inputs={"X": x}, attrs={}, outputs={"Out": out} + ) return out @@ -13221,10 +13926,12 @@ def merge_selected_rows(x, name=None): helper = LayerHelper("merge_selected_rows", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="merge_selected_rows", - inputs={"X": x}, - attrs={}, - outputs={"Out": out}) + helper.append_op( + type="merge_selected_rows", + inputs={"X": x}, + attrs={}, + outputs={"Out": out}, + ) return out @@ -13264,8 +13971,14 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): """ if 
_non_static_mode(): - return _legacy_C_ops.mul(x, y, 'x_num_col_dims', x_num_col_dims, - 'y_num_col_dims', y_num_col_dims) + return _legacy_C_ops.mul( + x, + y, + 'x_num_col_dims', + x_num_col_dims, + 'y_num_col_dims', + y_num_col_dims, + ) inputs = {"X": [x], "Y": [y]} attrs = {"x_num_col_dims": x_num_col_dims, "y_num_col_dims": y_num_col_dims} @@ -13274,13 +13987,9 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): check_variable_and_dtype(y, 'y', ['float16', 'float32', 'float64'], 'mul') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="mul", - inputs={ - "X": x, - "Y": y - }, - attrs=attrs, - outputs={"Out": out}) + helper.append_op( + type="mul", inputs={"X": x, "Y": y}, attrs=attrs, outputs={"Out": out} + ) return out @@ -13410,24 +14119,27 @@ def space_to_depth(x, blocksize, name=None): if not (isinstance(blocksize, int)): raise ValueError("blocksize must be a python Int") - check_variable_and_dtype(x, 'x', \ - ['float16', 'float32', 'float64', 'int32', 'int64'], 'space_to_depth') + check_variable_and_dtype( + x, + 'x', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'space_to_depth', + ) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="space_to_depth", - inputs={"X": x}, - attrs={"blocksize": blocksize}, - outputs={"Out": out}) + helper.append_op( + type="space_to_depth", + inputs={"X": x}, + attrs={"blocksize": blocksize}, + outputs={"Out": out}, + ) return out -def affine_channel(x, - scale=None, - bias=None, - data_layout='NCHW', - name=None, - act=None): +def affine_channel( + x, scale=None, bias=None, data_layout='NCHW', name=None, act=None +): """ Applies a separate affine transformation to each channel of the input. @@ -13494,14 +14206,12 @@ def affine_channel(x, check_type(bias, 'bias', (Variable, type(None)), 'affine_channel') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="affine_channel", - inputs={ - "X": x, - 'Scale': scale, - 'Bias': bias - }, - attrs={"data_layout": data_layout}, - outputs={"Out": out}) + helper.append_op( + type="affine_channel", + inputs={"X": x, 'Scale': scale, 'Bias': bias}, + attrs={"data_layout": data_layout}, + outputs={"Out": out}, + ) return helper.append_activation(out) @@ -13600,8 +14310,9 @@ def similarity_focus(input, axis, indexes, name=None): """ helper = LayerHelper('similarity_focus', **locals()) # check attrs - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - "similarity_focus") + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], "similarity_focus" + ) check_type(axis, 'axis', int, "similarity_focus") check_type(indexes, 'indexes', list, "similarity_focus") if axis != 1 and axis != 2 and axis != 3: @@ -13610,13 +14321,12 @@ def similarity_focus(input, axis, indexes, name=None): raise ValueError("indexes can not be empty.") out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type='similarity_focus', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - "axis": axis, - "indexes": indexes - }) + helper.append_op( + type='similarity_focus', + inputs={'X': input}, + outputs={'Out': out}, + attrs={"axis": axis, "indexes": indexes}, + ) return out @@ -13671,15 +14381,15 @@ def hash(input, hash_size, num_hash=1, name=None): check_type(hash_size, 'hash_size', int, 'hash') check_type(num_hash, 'num_hash', int, 'hash') helper = LayerHelper('hash', **locals()) - out = helper.create_variable_for_type_inference(helper.input_dtype(), - 
stop_gradient=True) - helper.append_op(type='hash', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'num_hash': num_hash, - 'mod_by': hash_size - }) + out = helper.create_variable_for_type_inference( + helper.input_dtype(), stop_gradient=True + ) + helper.append_op( + type='hash', + inputs={'X': input}, + outputs={'Out': out}, + attrs={'num_hash': num_hash, 'mod_by': hash_size}, + ) return out @@ -13773,8 +14483,9 @@ def grid_sampler(x, grid, name=None): helper = LayerHelper("grid_sampler", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sampler') - check_variable_and_dtype(grid, 'grid', ['float32', 'float64'], - 'grid_sampler') + check_variable_and_dtype( + grid, 'grid', ['float32', 'float64'], 'grid_sampler' + ) if not isinstance(x, Variable): return ValueError("The x should be a Variable") @@ -13786,10 +14497,9 @@ def grid_sampler(x, grid, name=None): attrs = {'use_cudnn': False} if core.is_compiled_with_rocm() else {} - helper.append_op(type='grid_sampler', - inputs=ipts, - outputs={'Output': out}, - attrs=attrs) + helper.append_op( + type='grid_sampler', inputs=ipts, outputs={'Output': out}, attrs=attrs + ) return out @@ -13882,33 +14592,30 @@ def add_position_encoding(input, alpha, beta, name=None): """ if _non_static_mode(): - return _legacy_C_ops.add_position_encoding(input, "alpha", alpha, - "beta", beta) + return _legacy_C_ops.add_position_encoding( + input, "alpha", alpha, "beta", beta + ) helper = LayerHelper('add_position_encoding', **locals()) - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - "add_position_encoding") + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], "add_position_encoding" + ) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type="add_position_encoding", - inputs={"X": input}, - outputs={"Out": out}, - attrs={ - "alpha": alpha, - "beta": beta - }) + helper.append_op( + type="add_position_encoding", + inputs={"X": input}, + outputs={"Out": out}, + attrs={"alpha": alpha, "beta": beta}, + ) return out -def bilinear_tensor_product(x, - y, - size, - act=None, - name=None, - param_attr=None, - bias_attr=None): +def bilinear_tensor_product( + x, y, size, act=None, name=None, param_attr=None, bias_attr=None +): r""" :api_attr: Static Graph @@ -13959,23 +14666,21 @@ def bilinear_tensor_product(x, param_shape = [size, x.shape[1], y.shape[1]] - w = helper.create_parameter(attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - is_bias=False) + w = helper.create_parameter( + attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=False + ) out = helper.create_variable_for_type_inference(dtype=dtype) inputs = {"X": x, "Y": y, "Weight": w} if helper.bias_attr: bias_size = [1, size] - bias = helper.create_parameter(attr=helper.bias_attr, - shape=bias_size, - dtype=dtype, - is_bias=True) + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True + ) inputs["Bias"] = bias - helper.append_op(type="bilinear_tensor_product", - inputs=inputs, - outputs={"Out": out}) + helper.append_op( + type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out} + ) # add activation return helper.append_activation(out) @@ -14025,10 +14730,12 @@ def get_tensor_from_selected_rows(x, name=None): ) helper = LayerHelper('get_tensor_from_selected_rows', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='get_tensor_from_selected_rows', - inputs={'X': x}, - 
outputs={'Out': out}, - attrs={}) + helper.append_op( + type='get_tensor_from_selected_rows', + inputs={'X': x}, + outputs={'Out': out}, + attrs={}, + ) return out @@ -14098,10 +14805,12 @@ def shuffle_channel(x, group, name=None): if not isinstance(group, int): raise TypeError("group must be int type") - helper.append_op(type="shuffle_channel", - inputs={"X": x}, - outputs={"Out": out}, - attrs={"group": group}) + helper.append_op( + type="shuffle_channel", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"group": group}, + ) return out @@ -14139,8 +14848,9 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): input = paddle.randn([6, 4, 2, 2]) out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) """ - return paddle.nn.functional.temporal_shift(x, seg_num, shift_ratio, name, - data_format) + return paddle.nn.functional.temporal_shift( + x, seg_num, shift_ratio, name, data_format + ) class PyFuncRegistry(object): @@ -14199,7 +14909,7 @@ def __call__(self, *args): func_ret = self._func(*args[idx:], **kwargs) if not isinstance(func_ret, (list, tuple)): - func_ret = (func_ret, ) + func_ret = (func_ret,) ret = [] for each_ret in func_ret: @@ -14415,11 +15125,13 @@ def py_func_demo(): out_list = out else: raise TypeError( - 'Output must be Variable/list(Variable)/tuple(Variable)') + 'Output must be Variable/list(Variable)/tuple(Variable)' + ) fwd_func_id = PyFuncRegistry(func).id - bwd_func_id = PyFuncRegistry( - backward_func).id if backward_func is not None else -1 + bwd_func_id = ( + PyFuncRegistry(backward_func).id if backward_func is not None else -1 + ) for each_out in out_list: if len(each_out.shape) == 0: @@ -14439,18 +15151,22 @@ def py_func_demo(): for v in skip_vars_in_backward_input: if not v.name in fwd_in_out: raise ValueError( - 'Variable {} is not found in forward inputs and outputs'. 
- format(v.name)) + 'Variable {} is not found in forward inputs and outputs'.format( + v.name + ) + ) backward_skip_vars.add(v.name) - helper.append_op(type='py_func', - inputs={'X': x}, - outputs={'Out': out_list}, - attrs={ - 'forward_callable_id': fwd_func_id, - 'backward_callable_id': bwd_func_id, - 'backward_skip_vars': list(backward_skip_vars) - }) + helper.append_op( + type='py_func', + inputs={'X': x}, + outputs={'Out': out_list}, + attrs={ + 'forward_callable_id': fwd_func_id, + 'backward_callable_id': bwd_func_id, + 'backward_skip_vars': list(backward_skip_vars), + }, + ) return out @@ -14460,13 +15176,15 @@ def py_func_demo(): @templatedoc() -def psroi_pool(input, - rois, - output_channels, - spatial_scale, - pooled_height, - pooled_width, - name=None): +def psroi_pool( + input, + rois, + output_channels, + spatial_scale, + pooled_height, + pooled_width, + name=None, +): """ ${comment} @@ -14514,29 +15232,30 @@ def psroi_pool(input, raise TypeError("pooled_width must be int type") dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='psroi_pool', - inputs={ - 'X': input, - 'ROIs': rois - }, - outputs={'Out': out}, - attrs={ - 'output_channels': output_channels, - 'spatial_scale': spatial_scale, - 'pooled_height': pooled_height, - 'pooled_width': pooled_width - }) + helper.append_op( + type='psroi_pool', + inputs={'X': input, 'ROIs': rois}, + outputs={'Out': out}, + attrs={ + 'output_channels': output_channels, + 'spatial_scale': spatial_scale, + 'pooled_height': pooled_height, + 'pooled_width': pooled_width, + }, + ) return out @templatedoc() -def prroi_pool(input, - rois, - spatial_scale=1.0, - pooled_height=1, - pooled_width=1, - batch_roi_nums=None, - name=None): +def prroi_pool( + input, + rois, + spatial_scale=1.0, + pooled_height=1, + pooled_width=1, + batch_roi_nums=None, + name=None, +): """ The precise roi pooling implementation for paddle. Reference: https://arxiv.org/pdf/1807.11590.pdf @@ -14599,14 +15318,16 @@ def prroi_pool(input, inputs_op = {'X': input, 'ROIs': rois} if batch_roi_nums is not None: inputs_op['BatchRoINums'] = batch_roi_nums - helper.append_op(type='prroi_pool', - inputs=inputs_op, - outputs={'Out': out}, - attrs={ - 'spatial_scale': spatial_scale, - 'pooled_height': pooled_height, - 'pooled_width': pooled_width - }) + helper.append_op( + type='prroi_pool', + inputs=inputs_op, + outputs={'Out': out}, + attrs={ + 'spatial_scale': spatial_scale, + 'pooled_height': pooled_height, + 'pooled_width': pooled_width, + }, + ) return out @@ -14635,23 +15356,23 @@ def pixel_shuffle(x, upscale_factor): Examples: .. 
code-block:: python - # declarative mode - import paddle.fluid as fluid - import numpy as np - input = fluid.data(name="input", shape=[2,9,4,4]) - output = fluid.layers.pixel_shuffle(x=input, upscale_factor=3) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) + # declarative mode + import paddle.fluid as fluid + import numpy as np + input = fluid.data(name="input", shape=[2,9,4,4]) + output = fluid.layers.pixel_shuffle(x=input, upscale_factor=3) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) - input_data = np.random.rand(2,9,4,4).astype("float32") - output_data = exe.run(fluid.default_main_program(), + input_data = np.random.rand(2,9,4,4).astype("float32") + output_data = exe.run(fluid.default_main_program(), feed={"input":input_data}, fetch_list=[output], return_numpy=True) - # print(output.shape) - # (2L, 1L, 12L, 12L) + # print(output.shape) + # (2L, 1L, 12L, 12L) """ @@ -14663,10 +15384,12 @@ def pixel_shuffle(x, upscale_factor): if not isinstance(upscale_factor, int): raise TypeError("upscale factor must be int type") - helper.append_op(type="pixel_shuffle", - inputs={"X": x}, - outputs={"Out": out}, - attrs={"upscale_factor": upscale_factor}) + helper.append_op( + type="pixel_shuffle", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"upscale_factor": upscale_factor}, + ) return out @@ -14716,8 +15439,9 @@ def fsp_matrix(x, y): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'fsp_matrix') check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'fsp_matrix') helper = LayerHelper('fsp_matrix', **locals()) - out = helper.create_variable_for_type_inference(dtype=helper.input_dtype( - input_param_name='x')) + out = helper.create_variable_for_type_inference( + dtype=helper.input_dtype(input_param_name='x') + ) helper.append_op(type='fsp', inputs={'X': x, 'Y': y}, outputs={'Out': out}) return out @@ -14767,15 +15491,15 @@ def continuous_value_model(input, cvm, use_cvm=True): """ helper = LayerHelper('cvm', **locals()) out = helper.create_variable(dtype=input.dtype) - check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], - 'cvm') - helper.append_op(type='cvm', - inputs={ - 'X': [input], - 'CVM': [cvm] - }, - outputs={'Y': [out]}, - attrs={"use_cvm": use_cvm}) + check_variable_and_dtype( + input, 'input', ['float16', 'float32', 'float64'], 'cvm' + ) + helper.append_op( + type='cvm', + inputs={'X': [input], 'CVM': [cvm]}, + outputs={'Y': [out]}, + attrs={"use_cvm": use_cvm}, + ) return out @@ -14821,11 +15545,14 @@ def where(condition): helper = LayerHelper("where_index", **locals()) out = helper.create_variable_for_type_inference( - dtype=core.VarDesc.VarType.INT64) - - helper.append_op(type='where_index', - inputs={'Condition': condition}, - outputs={'Out': [out]}) + dtype=core.VarDesc.VarType.INT64 + ) + + helper.append_op( + type='where_index', + inputs={'Condition': condition}, + outputs={'Out': [out]}, + ) return out @@ -14884,21 +15611,21 @@ def unique(x, dtype='int32'): out, index = fluid.layers.unique(x) # out is [2, 3, 1, 5]; index is [0, 1, 1, 2, 3, 1] """ - check_variable_and_dtype(x, "x", ['float32', 'float64', 'int32', 'int64'], - "unique") + check_variable_and_dtype( + x, "x", ['float32', 'float64', 'int32', 'int64'], "unique" + ) helper = LayerHelper("unique", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) index = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='unique', - inputs={'X': x}, 
- attrs={'dtype': convert_np_dtype_to_dtype_(dtype)}, - outputs={ - 'Out': [out], - 'Index': [index] - }) + helper.append_op( + type='unique', + inputs={'X': x}, + attrs={'dtype': convert_np_dtype_to_dtype_(dtype)}, + outputs={'Out': [out], 'Index': [index]}, + ) return out, index @@ -14912,7 +15639,7 @@ def unique_with_counts(x, dtype='int32'): Args: x(Variable): A 1-D input tensor with input shape of :math:`[N]` , the input data type is float32, float64, int32, int64. - dtype(np.dtype|core.VarDesc.VarType|str): The type of count and index tensor, it could be int32, int64. Defalut value is int32. + dtype(np.dtype|core.VarDesc.VarType|str): The type of count and index tensor, it could be int32, int64. Default value is int32. Returns: tuple, the variable type in tuple is Tensor, the output :attr:`out` data type is the same as input :attr:`x`, \ @@ -14931,11 +15658,13 @@ def unique_with_counts(x, dtype='int32'): # count is [1, 3, 1, 1] # x.shape=(6,) out.shape=(4,), index.shape=(6,), count.shape=(4,) """ - check_variable_and_dtype(x, "x", ['float32', 'float64', 'int32', 'int64'], - "unique_with_counts") + check_variable_and_dtype( + x, "x", ['float32', 'float64', 'int32', 'int64'], "unique_with_counts" + ) if not (dtype == 'int32' or dtype == 'int64'): raise TypeError( - "Op unique_with_counts, index dtype must be int32 or int64") + "Op unique_with_counts, index dtype must be int32 or int64" + ) if x is None or len(x.shape) != 1: raise ValueError( @@ -14950,33 +15679,33 @@ def unique_with_counts(x, dtype='int32'): count = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='unique_with_counts', - inputs={'X': x}, - attrs={'dtype': convert_np_dtype_to_dtype_(dtype)}, - outputs={ - 'Out': [out], - 'Index': [index], - 'Count': [count] - }) + helper.append_op( + type='unique_with_counts', + inputs={'X': x}, + attrs={'dtype': convert_np_dtype_to_dtype_(dtype)}, + outputs={'Out': [out], 'Index': [index], 'Count': [count]}, + ) return out, index, count -def deformable_conv(input, - offset, - mask, - num_filters, - filter_size, - stride=1, - padding=0, - dilation=1, - groups=None, - deformable_groups=None, - im2col_step=None, - param_attr=None, - bias_attr=None, - modulated=True, - name=None): +def deformable_conv( + input, + offset, + mask, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=None, + deformable_groups=None, + im2col_step=None, + param_attr=None, + bias_attr=None, + modulated=True, + name=None, +): r""" :api_attr: Static Graph @@ -15107,10 +15836,12 @@ def deformable_conv(input, num_filters=2, filter_size=filter_size, padding=1, modulated=False) """ - check_variable_and_dtype(input, "input", ['float32', 'float64'], - 'deformable_conv') - check_variable_and_dtype(offset, "offset", ['float32', 'float64'], - 'deformable_conv') + check_variable_and_dtype( + input, "input", ['float32', 'float64'], 'deformable_conv' + ) + check_variable_and_dtype( + offset, "offset", ['float32', 'float64'], 'deformable_conv' + ) check_type(mask, 'mask', (Variable, type(None)), 'deformable_conv') num_channels = input.shape[1] @@ -15145,52 +15876,58 @@ def _get_default_param_initializer(): raise ValueError( "Invalid filter number, excepted number is larger than 0, but" " received {}, please check the input shape and " - "filter size.".format(filter_elem_num)) - std = (2.0 / filter_elem_num)**0.5 + "filter size.".format(filter_elem_num) + ) + std = (2.0 / filter_elem_num) ** 0.5 return Normal(0.0, std, 0) filter_param = helper.create_parameter( 
attr=helper.param_attr, shape=filter_shape, dtype=dtype, - default_initializer=_get_default_param_initializer()) + default_initializer=_get_default_param_initializer(), + ) pre_bias = helper.create_variable_for_type_inference(dtype) if modulated: - helper.append_op(type='deformable_conv', - inputs={ - 'Input': input, - 'Filter': filter_param, - 'Offset': offset, - 'Mask': mask, - }, - outputs={"Output": pre_bias}, - attrs={ - 'strides': stride, - 'paddings': padding, - 'dilations': dilation, - 'groups': groups, - 'deformable_groups': deformable_groups, - 'im2col_step': im2col_step, - }) + helper.append_op( + type='deformable_conv', + inputs={ + 'Input': input, + 'Filter': filter_param, + 'Offset': offset, + 'Mask': mask, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'deformable_groups': deformable_groups, + 'im2col_step': im2col_step, + }, + ) else: - helper.append_op(type='deformable_conv_v1', - inputs={ - 'Input': input, - 'Filter': filter_param, - 'Offset': offset, - }, - outputs={"Output": pre_bias}, - attrs={ - 'strides': stride, - 'paddings': padding, - 'dilations': dilation, - 'groups': groups, - 'deformable_groups': deformable_groups, - 'im2col_step': im2col_step, - }) + helper.append_op( + type='deformable_conv_v1', + inputs={ + 'Input': input, + 'Filter': filter_param, + 'Offset': offset, + }, + outputs={"Output": pre_bias}, + attrs={ + 'strides': stride, + 'paddings': padding, + 'dilations': dilation, + 'groups': groups, + 'deformable_groups': deformable_groups, + 'im2col_step': im2col_step, + }, + ) output = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) return output @@ -15266,23 +16003,26 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): y = F.unfold(x, [3, 3], 1, 1, 1) """ - return paddle.nn.functional.unfold(x, kernel_sizes, strides, paddings, - dilations, name) - - -def deformable_roi_pooling(input, - rois, - trans, - no_trans=False, - spatial_scale=1.0, - group_size=[1, 1], - pooled_height=1, - pooled_width=1, - part_size=None, - sample_per_part=1, - trans_std=0.1, - position_sensitive=False, - name=None): + return paddle.nn.functional.unfold( + x, kernel_sizes, strides, paddings, dilations, name + ) + + +def deformable_roi_pooling( + input, + rois, + trans, + no_trans=False, + spatial_scale=1.0, + group_size=[1, 1], + pooled_height=1, + pooled_width=1, + part_size=None, + sample_per_part=1, + trans_std=0.1, + position_sensitive=False, + name=None, +): r""" Deformable ROI Pooling Layer @@ -15387,17 +16127,22 @@ def deformable_roi_pooling(input, position_sensitive=False) """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'deformable_roi_pooling') - check_variable_and_dtype(rois, 'rois', ['float32', 'float64'], - 'deformable_roi_pooling') - check_variable_and_dtype(trans, 'trans', ['float32', 'float64'], - 'deformable_roi_pooling') - check_type(group_size, 'group_size', (list, tuple), - 'deformable_roi_pooling') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'deformable_roi_pooling' + ) + check_variable_and_dtype( + rois, 'rois', ['float32', 'float64'], 'deformable_roi_pooling' + ) + check_variable_and_dtype( + trans, 'trans', ['float32', 'float64'], 'deformable_roi_pooling' + ) + check_type( + group_size, 'group_size', (list, tuple), 'deformable_roi_pooling' + ) if part_size is not None: - check_type(part_size, 'part_size', (list, tuple), - 'deformable_roi_pooling') + check_type( + part_size, 
'part_size', (list, tuple), 'deformable_roi_pooling' + ) input_channels = input.shape[1] if position_sensitive == False: @@ -15415,27 +16160,22 @@ def deformable_roi_pooling(input, dtype = helper.input_dtype() output = helper.create_variable_for_type_inference(dtype) top_count = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op(type="deformable_psroi_pooling", - inputs={ - "Input": input, - "ROIs": rois, - "Trans": trans - }, - outputs={ - "Output": output, - "TopCount": top_count - }, - attrs={ - "no_trans": no_trans, - "spatial_scale": spatial_scale, - "output_dim": output_channels, - "group_size": group_size, - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "part_size": part_size, - "sample_per_part": sample_per_part, - "trans_std": trans_std - }) + helper.append_op( + type="deformable_psroi_pooling", + inputs={"Input": input, "ROIs": rois, "Trans": trans}, + outputs={"Output": output, "TopCount": top_count}, + attrs={ + "no_trans": no_trans, + "spatial_scale": spatial_scale, + "output_dim": output_channels, + "group_size": group_size, + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "part_size": part_size, + "sample_per_part": sample_per_part, + "trans_std": trans_std, + }, + ) return output @@ -15458,7 +16198,7 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): For each value `v` in `input`, we reset it to a new value according to the following formula: :: - + v = v - shard_id * shard_size if shard_id * shard_size <= v < (shard_id+1) * shard_size else ignore_value That is, the value `v` is set to the new offset within the range represented by the shard `shard_id` @@ -15487,27 +16227,31 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): # [[-1], [1]] """ if in_dygraph_mode(): - return _C_ops.shard_index(input, index_num, nshards, shard_id, - ignore_value) + return _C_ops.shard_index( + input, index_num, nshards, shard_id, ignore_value + ) check_variable_and_dtype(input, 'input', ['int64', 'int32'], 'shard_index') op_type = 'shard_index' helper = LayerHelper(op_type, **locals()) if shard_id < 0 or shard_id >= nshards: - raise ValueError('The shard_id(%d) should be in [0, %d)' % - (shard_id, nshards)) + raise ValueError( + 'The shard_id(%d) should be in [0, %d)' % (shard_id, nshards) + ) out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type=op_type, - inputs={'X': [input]}, - outputs={'Out': out}, - attrs={ - 'index_num': index_num, - 'nshards': nshards, - 'shard_id': shard_id, - 'ignore_value': ignore_value - }, - stop_gradient=True) + helper.append_op( + type=op_type, + inputs={'X': [input]}, + outputs={'Out': out}, + attrs={ + 'index_num': index_num, + 'nshards': nshards, + 'shard_id': shard_id, + 'ignore_value': ignore_value, + }, + stop_gradient=True, + ) return out @@ -15562,22 +16306,22 @@ def hard_swish(x, threshold=6.0, scale=6.0, offset=3.0, name=None): print(out) # [[0.66666667, 1.66666667,3., 4.]] """ if _non_static_mode(): - return _legacy_C_ops.hard_swish(x, 'threshold', threshold, 'scale', - scale, 'offset', offset) + return _legacy_C_ops.hard_swish( + x, 'threshold', threshold, 'scale', scale, 'offset', offset + ) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'hard_swish') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'hard_swish' + ) helper = LayerHelper('hard_swish', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - 
helper.append_op(type='hard_swish', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'threshold': threshold, - 'scale': scale, - 'offset': offset - }) + helper.append_op( + type='hard_swish', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold, 'scale': scale, 'offset': offset}, + ) return out @@ -15648,15 +16392,20 @@ def mish(x, threshold=20, name=None): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'mish') check_type(threshold, 'threshold', (float, int), 'mish') - assert threshold > 0, "threshold of mish should be greater than 0, " \ - "but got {}".format(threshold) + assert ( + threshold > 0 + ), "threshold of mish should be greater than 0, " "but got {}".format( + threshold + ) helper = LayerHelper('mish', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='mish', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'threshold': threshold}) + helper.append_op( + type='mish', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold}, + ) return out @@ -15726,12 +16475,9 @@ def gather_tree(ids, parents): @deprecated(since="2.0.0", update_to="paddle.uniform") @templatedoc() -def uniform_random(shape, - dtype='float32', - min=-1.0, - max=1.0, - seed=0, - name=None): +def uniform_random( + shape, dtype='float32', min=-1.0, max=1.0, seed=0, name=None +): """ This OP returns a Tensor filled with random values sampled from a uniform distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. @@ -15811,34 +16557,47 @@ def uniform_random(shape, if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) - return _C_ops.uniform_random(shape, dtype, float(min), float(max), seed, - _current_expected_place()) + return _C_ops.uniform_random( + shape, + dtype, + float(min), + float(max), + seed, + _current_expected_place(), + ) elif _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) - return _legacy_C_ops.uniform_random('shape', - shape, 'min', float(min), 'max', - float(max), 'seed', seed, 'dtype', - dtype) + return _legacy_C_ops.uniform_random( + 'shape', + shape, + 'min', + float(min), + 'max', + float(max), + 'seed', + seed, + 'dtype', + dtype, + ) check_type(shape, 'shape', (list, tuple, Variable), 'uniform_random/rand') - check_dtype(dtype, 'dtype', ('float32', 'float64', 'uint16'), - 'uniform_random/rand') + check_dtype( + dtype, 'dtype', ('float32', 'float64', 'uint16'), 'uniform_random/rand' + ) check_type(min, 'min', (float, int, Variable), 'uniform_random/rand') check_type(max, 'max', (float, int, Variable), 'uniform_random/rand') inputs = dict() attrs = {'seed': seed, 'min': min, 'max': max, 'dtype': dtype} - utils.get_shape_tensor_inputs(inputs=inputs, - attrs=attrs, - shape=shape, - op_type='uniform_random/rand') + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='uniform_random/rand' + ) helper = LayerHelper("uniform_random", **locals()) out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="uniform_random", - inputs=inputs, - attrs=attrs, - outputs={"Out": out}) + helper.append_op( + type="uniform_random", inputs=inputs, attrs=attrs, outputs={"Out": out} + ) utils.try_set_static_shape_tensor(out, shape) return out @@ -15848,7 +16607,7 @@ def unbind(input, axis=0): Removes a tensor dimension, then split the input tensor into multiple sub-Tensors. Args: input (Variable): The input variable which is an N-D Tensor, data type being float32, float64, int32 or int64. 
- + axis (int32|int64, optional): A scalar with type ``int32|int64`` shape [1]. The dimension along which to unbind. If :math:`axis < 0`, the dimension to unbind along is :math:`rank(input) + axis`. Default is 0. Returns: @@ -15874,11 +16633,13 @@ def unbind(input, axis=0): helper = LayerHelper("unbind", **locals()) check_type(input, 'input', (Variable), 'unbind') dtype = helper.input_dtype() - check_dtype(dtype, 'unbind', ['float32', 'float64', 'int32', 'int64'], - 'unbind') + check_dtype( + dtype, 'unbind', ['float32', 'float64', 'int32', 'int64'], 'unbind' + ) if not isinstance(axis, (int)): - raise TypeError("The type of 'axis' must be int, but received %s." % - (type(axis))) + raise TypeError( + "The type of 'axis' must be int, but received %s." % (type(axis)) + ) if isinstance(axis, np.generic): axis = np.asscalar(axis) input_shape = input.shape @@ -15889,8 +16650,10 @@ def unbind(input, axis=0): for i in range(num) ] - helper.append_op(type="unbind", - inputs={"X": input}, - outputs={"Out": outs}, - attrs={"axis": axis}) + helper.append_op( + type="unbind", + inputs={"X": input}, + outputs={"Out": outs}, + attrs={"axis": axis}, + ) return outs diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 2910f4187a73e3..3973a7187908bf 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -1606,7 +1606,8 @@ def linspace(start, stop, num, dtype=None, name=None): with device_guard("cpu"): tensor_num = fill_constant([1], 'int32', num) if in_dygraph_mode(): - return _C_ops.linspace(tensor_start, tensor_stop, tensor_num, dtype) + return _C_ops.linspace(tensor_start, tensor_stop, tensor_num, dtype, + _current_expected_place()) if _in_legacy_dygraph(): return _legacy_C_ops.linspace(tensor_start, tensor_stop, tensor_num, 'dtype', dtype) diff --git a/python/paddle/fluid/lazy_init.py b/python/paddle/fluid/lazy_init.py index 8d98b1287e3e06..1e6a457ab32baa 100644 --- a/python/paddle/fluid/lazy_init.py +++ b/python/paddle/fluid/lazy_init.py @@ -36,7 +36,7 @@ def enable(self): """ if self._state: return - assert framework.in_dygraph_mode( + assert framework._non_static_mode( ), "LazyInit.enable() is only available in dygraph mode." self._state = True diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py index e162daf2b87e14..a29852d722ce72 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_c_setup.py @@ -13,18 +13,19 @@ # limitations under the License. 
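The custom_kernel_dot setup scripts updated in this part of the diff move from distutils to setuptools and now link against a single ':libpaddle.so' instead of selecting between core_avx and core_noavx. A condensed sketch of that build recipe, assuming paddle is installed into site-packages; the module name and source file below are illustrative, not the exact files in the tree:

import os
import site

from setuptools import Extension, setup

# Headers shipped with the paddle wheel live under site-packages/paddle/include;
# libpaddle.so lives under site-packages/paddle/fluid.
site_dirs = site.getsitepackages()
include_dirs = [os.path.join(p, 'paddle', 'include') for p in site_dirs]
library_dirs = [os.path.join(p, 'paddle', 'fluid') for p in site_dirs]

custom_kernel_module = Extension(
    'custom_kernel_dot',
    sources=['custom_kernel_dot.cc'],
    include_dirs=include_dirs,
    library_dirs=library_dirs,
    libraries=[':libpaddle.so'],  # ':' tells the linker to match the exact file name
)

setup(
    name='custom_kernel_dot',
    version='1.0',
    ext_modules=[custom_kernel_module],
)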
import os -from paddle.fluid import core from distutils.sysconfig import get_python_lib -from distutils.core import setup, Extension + +from setuptools import Extension, setup from setuptools.command.build_ext import build_ext +from paddle.fluid import core + # refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes # Avoid a gcc warning below: # cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid # for C/ObjC but not for C++ class BuildExt(build_ext): - def build_extensions(self): if '-Wstrict-prototypes' in self.compiler.compiler_so: self.compiler.compiler_so.remove('-Wstrict-prototypes') @@ -48,8 +49,9 @@ def build_extensions(self): os.path.join(site_packages_path, 'paddle', 'include'), ] # include path third_party -compile_third_party_path = os.path.join(os.environ['PADDLE_BINARY_DIR'], - 'third_party') +compile_third_party_path = os.path.join( + os.environ['PADDLE_BINARY_DIR'], 'third_party' +) paddle_custom_kernel_include += [ os.path.join(compile_third_party_path, 'install/gflags/include'), # gflags os.path.join(compile_third_party_path, 'install/glog/include'), # glog @@ -61,9 +63,7 @@ def build_extensions(self): ] # libs -libs = [':core_avx.so'] -if not core.has_avx_core and core.has_noavx_core: - libs = [':core_noavx.so'] +libs = [':libpaddle.so'] custom_kernel_dot_module = Extension( 'custom_kernel_dot', @@ -71,10 +71,13 @@ def build_extensions(self): include_dirs=paddle_custom_kernel_include, library_dirs=paddle_custom_kernel_library_dir, libraries=libs, - extra_compile_args=paddle_extra_compile_args) + extra_compile_args=paddle_extra_compile_args, +) -setup(name='custom_kernel_dot_c', - version='1.0', - description='custom kernel fot compiling', - cmdclass={'build_ext': BuildExt}, - ext_modules=[custom_kernel_dot_module]) +setup( + name='custom_kernel_dot_c', + version='1.0', + description='custom kernel fot compiling', + cmdclass={'build_ext': BuildExt}, + ext_modules=[custom_kernel_dot_module], +) diff --git a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py index efe5368cdca56f..7fd37db98044af 100644 --- a/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py +++ b/python/paddle/fluid/tests/custom_kernel/custom_kernel_dot_setup.py @@ -14,18 +14,17 @@ import os import site -from paddle.fluid import core -from distutils.sysconfig import get_python_lib -from distutils.core import setup, Extension +from setuptools import Extension, setup from setuptools.command.build_ext import build_ext +from paddle.fluid import core + # refer: https://note.qidong.name/2018/03/setup-warning-strict-prototypes # Avoid a gcc warning below: # cc1plus: warning: command line option ‘-Wstrict-prototypes’ is valid # for C/ObjC but not for C++ class BuildExt(build_ext): - def build_extensions(self): if '-Wstrict-prototypes' in self.compiler.compiler_so: self.compiler.compiler_so.remove('-Wstrict-prototypes') @@ -46,12 +45,15 @@ def build_extensions(self): # include path site_packages_path = site.getsitepackages() paddle_custom_kernel_include = list( - map(lambda path: os.path.join(path, 'paddle', 'include'), - site_packages_path)) + map( + lambda path: os.path.join(path, 'paddle', 'include'), site_packages_path + ) +) # include path third_party -compile_third_party_path = os.path.join(os.environ['PADDLE_BINARY_DIR'], - 'third_party') +compile_third_party_path = os.path.join( + os.environ['PADDLE_BINARY_DIR'], 'third_party' +) paddle_custom_kernel_include += [ 
os.path.join(compile_third_party_path, 'install/gflags/include'), # gflags os.path.join(compile_third_party_path, 'install/glog/include'), # glog @@ -59,12 +61,11 @@ def build_extensions(self): # libs path paddle_custom_kernel_library_dir = list( - map(lambda path: os.path.join(path, 'paddle', 'fluid'), site_packages_path)) + map(lambda path: os.path.join(path, 'paddle', 'fluid'), site_packages_path) +) # libs -libs = [':core_avx.so'] -if not core.has_avx_core and core.has_noavx_core: - libs = [':core_noavx.so'] +libs = [':libpaddle.so'] custom_kernel_dot_module = Extension( 'custom_kernel_dot', @@ -72,10 +73,13 @@ def build_extensions(self): include_dirs=paddle_custom_kernel_include, library_dirs=paddle_custom_kernel_library_dir, libraries=libs, - extra_compile_args=paddle_extra_compile_args) + extra_compile_args=paddle_extra_compile_args, +) -setup(name='custom_kernel_dot', - version='1.0', - description='custom kernel fot compiling', - cmdclass={'build_ext': BuildExt}, - ext_modules=[custom_kernel_dot_module]) +setup( + name='custom_kernel_dot', + version='1.0', + description='custom kernel fot compiling', + cmdclass={'build_ext': BuildExt}, + ext_modules=[custom_kernel_dot_module], +) diff --git a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py index 4ca05909fb17ad..ff7ff3e04a88e5 100644 --- a/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py +++ b/python/paddle/fluid/tests/custom_kernel/test_custom_kernel_load.py @@ -48,7 +48,7 @@ def setUp(self): paddle_lib_path = lib_dir self.default_path = os.path.sep.join( [paddle_lib_path, '..', '..', 'paddle-plugins']) - # copy so to defalut path + # copy so to default path cmd = 'mkdir -p {} && cp ./*.so {}'.format(self.default_path, self.default_path) os.system(cmd) # wait diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu index e791ea8cb7600e..53b61b4bb6611b 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu @@ -44,7 +44,7 @@ __global__ void relu_cuda_double_backward_kernel(const data_t* out_data, data_t* ddout_data, int64_t num) { int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; - for (int64_t i = num; i < num; i += blockDim.x * gridDim.x) { + for (int64_t i = gid; i < num; i += blockDim.x * gridDim.x) { ddout_data[i] = ddx_data[i] * (out_data[i] > static_cast(0.) ? static_cast(1.) 
: static_cast(0.)); diff --git a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py index 0cc1b19e654bfa..25209795db5ba7 100644 --- a/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_custom_relu_op_setup.py @@ -21,6 +21,7 @@ import tempfile import subprocess import numpy as np +from paddle import fluid from paddle.vision.transforms import Compose, Normalize from paddle.utils.cpp_extension.extension_utils import run_cmd from paddle.fluid.framework import _test_eager_guard @@ -43,12 +44,9 @@ def custom_relu_dynamic(func, device, dtype, np_x, use_func=True): return out.numpy(), t.grad.numpy() -def custom_relu_static(func, - device, - dtype, - np_x, - use_func=True, - test_infer=False): +def custom_relu_static( + func, device, dtype, np_x, use_func=True, test_infer=False +): paddle.enable_static() paddle.set_device(device) @@ -62,9 +60,11 @@ def custom_relu_static(func, exe = static.Executor() exe.run(static.default_startup_program()) # in static mode, x data has been covered by out - out_v = exe.run(static.default_main_program(), - feed={'X': np_x}, - fetch_list=[out.name]) + out_v = exe.run( + static.default_main_program(), + feed={'X': np_x}, + fetch_list=[out.name], + ) paddle.disable_static() return out_v @@ -87,11 +87,11 @@ def custom_relu_static_pe(func, device, dtype, np_x, use_func=True): # in static mode, x data has been covered by out compiled_prog = static.CompiledProgram( - static.default_main_program()).with_data_parallel( - loss_name=out.name, places=places) - out_v = exe.run(compiled_prog, - feed={'X': np_x}, - fetch_list=[out.name]) + static.default_main_program() + ).with_data_parallel(loss_name=out.name, places=places) + out_v = exe.run( + compiled_prog, feed={'X': np_x}, fetch_list=[out.name] + ) paddle.disable_static() return out_v @@ -103,9 +103,9 @@ def custom_relu_static_inference(func, device, np_data, np_label, path_prefix): with static.scope_guard(static.Scope()): with static.program_guard(static.Program()): # simple module - data = static.data(name='data', - shape=[None, 1, 28, 28], - dtype='float32') + data = static.data( + name='data', shape=[None, 1, 28, 28], dtype='float32' + ) label = static.data(name='label', shape=[None, 1], dtype='int64') hidden = static.nn.fc(data, size=128) @@ -124,23 +124,21 @@ def custom_relu_static_inference(func, device, np_data, np_label, path_prefix): # train for i in range(4): - avg_loss_v = exe.run(static.default_main_program(), - feed={ - 'data': np_data, - 'label': np_label - }, - fetch_list=[avg_loss]) + avg_loss_v = exe.run( + static.default_main_program(), + feed={'data': np_data, 'label': np_label}, + fetch_list=[avg_loss], + ) # save inference model static.save_inference_model(path_prefix, [data], [predict], exe) # get train predict value - predict_v = exe.run(static.default_main_program(), - feed={ - 'data': np_data, - 'label': np_label - }, - fetch_list=[predict]) + predict_v = exe.run( + static.default_main_program(), + feed={'data': np_data, 'label': np_label}, + fetch_list=[predict], + ) return predict_v @@ -151,30 +149,37 @@ def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True): t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False) out = func(t) if use_func else paddle.nn.functional.relu(t) - out.stop_gradient = False - - dx = paddle.grad(outputs=[out], - inputs=[t], - create_graph=True, - retain_graph=True) + dx = paddle.grad( + outputs=out, + 
inputs=t, + grad_outputs=paddle.ones_like(t), + create_graph=True, + retain_graph=True, + ) - dx[0].backward() + ddout = paddle.grad( + outputs=dx[0], + inputs=out.grad, + grad_outputs=paddle.ones_like(t), + create_graph=False, + ) - assert dx[0].grad is not None - return dx[0].numpy(), dx[0].grad.numpy() + assert ddout[0].numpy() is not None + return dx[0].numpy(), ddout[0].numpy() class TestNewCustomOpSetUpInstall(unittest.TestCase): - def setUp(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) # compile, install the custom op egg into site-packages under background if os.name == 'nt': cmd = 'cd /d {} && python custom_relu_setup.py install'.format( - cur_dir) + cur_dir + ) else: cmd = 'cd {} && {} custom_relu_setup.py install'.format( - cur_dir, sys.executable) + cur_dir, sys.executable + ) run_cmd(cmd) # NOTE(Aurelius84): Normally, it's no need to add following codes for users. @@ -190,16 +195,18 @@ def setUp(self): custom_egg_path = [ x for x in os.listdir(site_dir) if 'custom_relu_module_setup' in x ] - assert len(custom_egg_path - ) == 1, "Matched egg number is %d." % len(custom_egg_path) + assert len(custom_egg_path) == 1, "Matched egg number is %d." % len( + custom_egg_path + ) sys.path.append(os.path.join(site_dir, custom_egg_path[0])) # usage: import the package directly import custom_relu_module_setup + # `custom_relu_dup` is same as `custom_relu_dup` self.custom_ops = [ custom_relu_module_setup.custom_relu, - custom_relu_module_setup.custom_relu_dup + custom_relu_module_setup.custom_relu_dup, ] self.dtypes = ['float32', 'float64'] @@ -222,13 +229,16 @@ def test_static(self): x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) for custom_op in self.custom_ops: out = custom_relu_static(custom_op, device, dtype, x) - pd_out = custom_relu_static(custom_op, device, dtype, x, - False) + pd_out = custom_relu_static( + custom_op, device, dtype, x, False + ) np.testing.assert_array_equal( out, pd_out, - err_msg='custom op out: {},\n paddle api out: {}'. - format(out, pd_out)) + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) def test_static_pe(self): for device in self.devices: @@ -238,13 +248,16 @@ def test_static_pe(self): x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) for custom_op in self.custom_ops: out = custom_relu_static_pe(custom_op, device, dtype, x) - pd_out = custom_relu_static_pe(custom_op, device, dtype, x, - False) + pd_out = custom_relu_static_pe( + custom_op, device, dtype, x, False + ) np.testing.assert_array_equal( out, pd_out, - err_msg='custom op out: {},\n paddle api out: {}'. - format(out, pd_out)) + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) def func_dynamic(self): for device in self.devices: @@ -253,20 +266,26 @@ def func_dynamic(self): continue x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) for custom_op in self.custom_ops: - out, x_grad = custom_relu_dynamic(custom_op, device, dtype, - x) + out, x_grad = custom_relu_dynamic( + custom_op, device, dtype, x + ) pd_out, pd_x_grad = custom_relu_dynamic( - custom_op, device, dtype, x, False) + custom_op, device, dtype, x, False + ) np.testing.assert_array_equal( out, pd_out, - err_msg='custom op out: {},\n paddle api out: {}'. - format(out, pd_out)) + err_msg='custom op out: {},\n paddle api out: {}'.format( + out, pd_out + ), + ) np.testing.assert_array_equal( x_grad, pd_x_grad, - err_msg='custom op x grad: {},\n paddle api x grad: {}'. 
- format(x_grad, pd_x_grad)) + err_msg='custom op x grad: {},\n paddle api x grad: {}'.format( + x_grad, pd_x_grad + ), + ) def test_dynamic(self): with _test_eager_guard(): @@ -279,22 +298,29 @@ def test_static_save_and_load_inference_model(self): np_label = np.random.random((1, 1)).astype("int64") path_prefix = "custom_op_inference/custom_relu" for device in self.devices: - predict = custom_relu_static_inference(self.custom_ops[0], device, - np_data, np_label, - path_prefix) + predict = custom_relu_static_inference( + self.custom_ops[0], device, np_data, np_label, path_prefix + ) # load inference model with static.scope_guard(static.Scope()): exe = static.Executor() - [inference_program, feed_target_names, - fetch_targets] = static.load_inference_model(path_prefix, exe) - predict_infer = exe.run(inference_program, - feed={feed_target_names[0]: np_data}, - fetch_list=fetch_targets) + [ + inference_program, + feed_target_names, + fetch_targets, + ] = static.load_inference_model(path_prefix, exe) + predict_infer = exe.run( + inference_program, + feed={feed_target_names[0]: np_data}, + fetch_list=fetch_targets, + ) np.testing.assert_array_equal( predict, predict_infer, - err_msg='custom op predict: {},\n custom op infer predict: {}' - .format(predict, predict_infer)) + err_msg='custom op predict: {},\n custom op infer predict: {}'.format( + predict, predict_infer + ), + ) paddle.disable_static() def test_static_save_and_run_inference_predictor(self): @@ -304,62 +330,80 @@ def test_static_save_and_run_inference_predictor(self): path_prefix = "custom_op_inference/custom_relu" from paddle.inference import Config from paddle.inference import create_predictor + for device in self.devices: - predict = custom_relu_static_inference(self.custom_ops[0], device, - np_data, np_label, - path_prefix) + predict = custom_relu_static_inference( + self.custom_ops[0], device, np_data, np_label, path_prefix + ) # load inference model - config = Config(path_prefix + ".pdmodel", - path_prefix + ".pdiparams") + config = Config( + path_prefix + ".pdmodel", path_prefix + ".pdiparams" + ) predictor = create_predictor(config) input_tensor = predictor.get_input_handle( - predictor.get_input_names()[0]) + predictor.get_input_names()[0] + ) input_tensor.reshape(np_data.shape) input_tensor.copy_from_cpu(np_data.copy()) predictor.run() output_tensor = predictor.get_output_handle( - predictor.get_output_names()[0]) + predictor.get_output_names()[0] + ) predict_infer = output_tensor.copy_to_cpu() self.assertTrue( np.isclose(predict, predict_infer, rtol=5e-5).any(), "custom op predict: {},\n custom op infer predict: {}".format( - predict, predict_infer)) + predict, predict_infer + ), + ) paddle.disable_static() - def test_func_double_grad_dynamic(self): + def test_double_grad_dynamic(self): + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) for device in self.devices: for dtype in self.dtypes: if device == 'cpu' and dtype == 'float16': continue x = np.random.uniform(-1, 1, [4, 8]).astype(dtype) out, dx_grad = custom_relu_double_grad_dynamic( - self.custom_ops[0], device, dtype, x) + self.custom_ops[0], device, dtype, x + ) pd_out, pd_dx_grad = custom_relu_double_grad_dynamic( - self.custom_ops[0], device, dtype, x, False) + self.custom_ops[0], device, dtype, x, False + ) np.testing.assert_array_equal( out, pd_out, err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out)) + out, pd_out + ), + ) np.testing.assert_array_equal( dx_grad, pd_dx_grad, - err_msg='custom op dx grad: {},\n paddle api dx grad: 
{}'. - format(dx_grad, pd_dx_grad)) + err_msg='custom op dx grad: {},\n paddle api dx grad: {}'.format( + dx_grad, pd_dx_grad + ), + ) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_with_dataloader(self): for device in self.devices: paddle.set_device(device) # data loader transform = Compose( - [Normalize(mean=[127.5], std=[127.5], data_format='CHW')]) - train_dataset = paddle.vision.datasets.MNIST(mode='train', - transform=transform) - train_loader = paddle.io.DataLoader(train_dataset, - batch_size=64, - shuffle=True, - drop_last=True, - num_workers=0) + [Normalize(mean=[127.5], std=[127.5], data_format='CHW')] + ) + train_dataset = paddle.vision.datasets.MNIST( + mode='train', transform=transform + ) + train_loader = paddle.io.DataLoader( + train_dataset, + batch_size=64, + shuffle=True, + drop_last=True, + num_workers=0, + ) for batch_id, (image, _) in enumerate(train_loader()): out = self.custom_ops[0](image) @@ -368,7 +412,9 @@ def test_with_dataloader(self): out, pd_out, err_msg='custom op out: {},\n paddle api out: {}'.format( - out, pd_out)) + out, pd_out + ), + ) if batch_id == 5: break diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt index 099b1ddc1c01e7..367d1e6399032f 100644 --- a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt @@ -1,6 +1,6 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU) set(PLUGIN_URL https://github.com/PaddlePaddle/PaddleCustomDevice.git) - set(PLUGIN_TAG d5e5ac1d8e9f7588d4c2998bb3b5ffc66f65af2e) + set(PLUGIN_TAG 0698428ddba21e6baecb690579f37c48896f7d56) file( GLOB TEST_OPS @@ -8,10 +8,10 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU) "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - list(REMOVE_ITEM TEST_OPS test_collective_process_group_xccl) foreach(TEST_OP ${TEST_OPS}) - py_test(${TEST_OP} SRCS ${TEST_OP}.py ENVS PLUGIN_URL=${PLUGIN_URL} - PLUGIN_TAG=${PLUGIN_TAG}) + py_test(${TEST_OP} + SRCS ${TEST_OP}.py ENVS FLAGS_allocator_strategy=naive_best_fit + PLUGIN_URL=${PLUGIN_URL} PLUGIN_TAG=${PLUGIN_TAG}) endforeach() bash_test_modules( @@ -19,6 +19,7 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU) START_BASH test_fleet_launch_custom_device.sh ENVS + FLAGS_allocator_strategy=naive_best_fit PADDLE_BINARY_DIR=${PADDLE_BINARY_DIR} PLUGIN_URL=${PLUGIN_URL} PLUGIN_TAG=${PLUGIN_TAG}) @@ -26,4 +27,5 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU) set_tests_properties(test_custom_cpu_plugin PROPERTIES TIMEOUT 120) set_tests_properties(test_custom_cpu_profiler_plugin PROPERTIES TIMEOUT 120) set_tests_properties(test_fleet_launch_custom_device PROPERTIES TIMEOUT 120) + set_tests_properties(test_custom_cpu_to_static PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/tests/custom_runtime/test_collective_process_group_xccl.py b/python/paddle/fluid/tests/custom_runtime/test_collective_process_group_xccl.py index db2510d2beb377..1127352d85d998 100644 --- a/python/paddle/fluid/tests/custom_runtime/test_collective_process_group_xccl.py +++ b/python/paddle/fluid/tests/custom_runtime/test_collective_process_group_xccl.py @@ -12,14 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. 
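The reworked custom_relu_double_grad_dynamic above computes the second-order gradient through two explicit paddle.grad calls instead of calling backward() on the first gradient. A minimal sketch of that double-grad pattern on a toy function, independent of the custom op:

import paddle

x = paddle.to_tensor([1.0, 2.0, 3.0], stop_gradient=False)
y = x ** 3

# First derivative: create_graph=True keeps the backward graph so the
# gradient itself can be differentiated again.
(dy_dx,) = paddle.grad(outputs=y, inputs=x, create_graph=True, retain_graph=True)

# Second derivative, obtained by differentiating the first gradient.
(d2y_dx2,) = paddle.grad(
    outputs=dy_dx, inputs=x, grad_outputs=paddle.ones_like(x)
)

print(dy_dx.numpy())    # [ 3. 12. 27.]
print(d2y_dx2.numpy())  # [ 6. 12. 18.]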
-from __future__ import print_function - import unittest import os -import sys import copy import subprocess import time +import tempfile def start_local_trainers(cluster, @@ -28,7 +26,7 @@ def start_local_trainers(cluster, training_script_args, eager_mode=True, log_dir=None): - from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc + from paddle.distributed.utils.launch_utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc # noqa: F401 current_env = copy.copy(os.environ.copy()) #paddle broadcast ncclUniqueId use socket, and @@ -84,7 +82,7 @@ def start_local_trainers(cluster, def get_cluster_from_args(selected_gpus): - from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc + from paddle.distributed.utils.launch_utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc # noqa: F401 cluster_node_ips = '127.0.0.1' node_ip = '127.0.0.1' @@ -108,7 +106,7 @@ def get_cluster_from_args(selected_gpus): class TestMultipleCustomCPU(unittest.TestCase): def run_mnist_2custom_cpu(self, target_file_name, eager_mode=True): - from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc + from paddle.distributed.utils.launch_utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc # noqa: F401 selected_devices = [0, 1] cluster = None @@ -136,21 +134,32 @@ class TestProcessGroup(TestMultipleCustomCPU): def setUp(self): # compile so and set to current path cur_dir = os.path.dirname(os.path.abspath(__file__)) - cmd = 'rm -rf PaddleCustomDevice \ + self.temp_dir = tempfile.TemporaryDirectory() + cmd = 'cd {} \ && git clone {} \ - && cd PaddleCustomDevice/backends/custom_cpu \ + && cd PaddleCustomDevice \ + && git fetch origin \ && git checkout {} -b dev \ + && cd backends/custom_cpu \ && mkdir build && cd build && cmake .. 
&& make -j8'.format( - os.getenv('PLUGIN_URL'), os.getenv('PLUGIN_TAG')) + self.temp_dir.name, os.getenv('PLUGIN_URL'), + os.getenv('PLUGIN_TAG')) os.system(cmd) # set environment for loading and registering compiled custom kernels # only valid in current process os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join( - cur_dir, 'PaddleCustomDevice/backends/custom_cpu/build') + cur_dir, '{}/PaddleCustomDevice/backends/custom_cpu/build'.format( + self.temp_dir.name)) + os.environ['FLAGS_selected_custom_cpus'] = '0,1' + os.environ['CUSTOM_CPU_VISIBLE_DEVICES'] = '0,1' + os.environ['PADDLE_XCCL_BACKEND'] = 'custom_cpu' + + def tearDown(self): + self.temp_dir.cleanup() def test_process_group_xccl(self): - from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc + from paddle.distributed.utils.launch_utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc # noqa: F401 self.run_mnist_2custom_cpu('process_group_xccl.py') diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py index 371f0018a0f8d0..79e3e506b906ed 100644 --- a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_plugin.py @@ -14,9 +14,9 @@ import os import sys -import site import unittest import numpy as np +import tempfile class TestCustomCPUPlugin(unittest.TestCase): @@ -24,18 +24,27 @@ class TestCustomCPUPlugin(unittest.TestCase): def setUp(self): # compile so and set to current path cur_dir = os.path.dirname(os.path.abspath(__file__)) - cmd = 'rm -rf PaddleCustomDevice \ + self.temp_dir = tempfile.TemporaryDirectory() + cmd = 'cd {} \ && git clone {} \ - && cd PaddleCustomDevice/backends/custom_cpu \ + && cd PaddleCustomDevice \ + && git fetch origin \ && git checkout {} -b dev \ + && cd backends/custom_cpu \ && mkdir build && cd build && cmake .. 
&& make -j8'.format( - os.getenv('PLUGIN_URL'), os.getenv('PLUGIN_TAG')) + self.temp_dir.name, os.getenv('PLUGIN_URL'), + os.getenv('PLUGIN_TAG')) os.system(cmd) # set environment for loading and registering compiled custom kernels # only valid in current process os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join( - cur_dir, 'PaddleCustomDevice/backends/custom_cpu/build') + cur_dir, '{}/PaddleCustomDevice/backends/custom_cpu/build'.format( + self.temp_dir.name)) + + def tearDown(self): + self.temp_dir.cleanup() + del os.environ['CUSTOM_DEVICE_ROOT'] def test_custom_device(self): import paddle @@ -183,9 +192,6 @@ def _test_scalar(self): k_t = paddle.to_tensor([3], dtype="int32") value_1, indices_1 = paddle.topk(data_1, k=k_t) - def tearDown(self): - del os.environ['CUSTOM_DEVICE_ROOT'] - if __name__ == '__main__': if os.name == 'nt' or sys.platform.startswith('darwin'): diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_profiler_plugin.py b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_profiler_plugin.py index 34bdb067c67c5d..2e307fbb826b53 100644 --- a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_profiler_plugin.py +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_profiler_plugin.py @@ -14,9 +14,8 @@ import os import sys -import site import unittest -import numpy as np +import tempfile class TestCustomCPUProfilerPlugin(unittest.TestCase): @@ -24,18 +23,27 @@ class TestCustomCPUProfilerPlugin(unittest.TestCase): def setUp(self): # compile so and set to current path cur_dir = os.path.dirname(os.path.abspath(__file__)) - cmd = 'rm -rf PaddleCustomDevice \ + self.temp_dir = tempfile.TemporaryDirectory() + cmd = 'cd {} \ && git clone {} \ - && cd PaddleCustomDevice/backends/custom_cpu \ + && cd PaddleCustomDevice \ + && git fetch origin \ && git checkout {} -b dev \ + && cd backends/custom_cpu \ && mkdir build && cd build && cmake .. && make -j8'.format( - os.getenv('PLUGIN_URL'), os.getenv('PLUGIN_TAG')) + self.temp_dir.name, os.getenv('PLUGIN_URL'), + os.getenv('PLUGIN_TAG')) os.system(cmd) # set environment for loading and registering compiled custom kernels # only valid in current process os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join( - cur_dir, 'PaddleCustomDevice/backends/custom_cpu/build') + cur_dir, '{}/PaddleCustomDevice/backends/custom_cpu/build'.format( + self.temp_dir.name)) + + def tearDown(self): + self.temp_dir.cleanup() + del os.environ['CUSTOM_DEVICE_ROOT'] def test_custom_device(self): import paddle @@ -59,9 +67,6 @@ def _test_custom_profiler(self): p.stop() p.summary() - def tearDown(self): - del os.environ['CUSTOM_DEVICE_ROOT'] - if __name__ == '__main__': if os.name == 'nt' or sys.platform.startswith('darwin'): diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_to_static.py b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_to_static.py new file mode 100644 index 00000000000000..e5e9638a0b5314 --- /dev/null +++ b/python/paddle/fluid/tests/custom_runtime/test_custom_cpu_to_static.py @@ -0,0 +1,252 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import time +import unittest +import numpy as np +import tempfile + +EPOCH_NUM = 1 +BATCH_SIZE = 1024 + + +def train_func_base(epoch_id, train_loader, model, cost, optimizer): + + total_step = len(train_loader) + epoch_start = time.time() + for batch_id, (images, labels) in enumerate(train_loader()): + # forward + outputs = model(images) + loss = cost(outputs, labels) + # backward and optimize + loss.backward() + optimizer.step() + optimizer.clear_grad() + print("Epoch [{}/{}], Step [{}/{}], Loss: {}".format( + epoch_id + 1, EPOCH_NUM, batch_id + 1, total_step, loss.numpy())) + epoch_end = time.time() + print( + f"Epoch ID: {epoch_id+1}, FP32 train epoch time: {(epoch_end - epoch_start) * 1000} ms" + ) + + +def train_func_ampo1(epoch_id, train_loader, model, cost, optimizer, scaler): + import paddle + + total_step = len(train_loader) + epoch_start = time.time() + for batch_id, (images, labels) in enumerate(train_loader()): + # forward + with paddle.amp.auto_cast( + custom_black_list={"flatten_contiguous_range", "greater_than"}, + level='O1'): + outputs = model(images) + loss = cost(outputs, labels) + # backward and optimize + scaled = scaler.scale(loss) + scaled.backward() + scaler.minimize(optimizer, scaled) + optimizer.clear_grad() + print("Epoch [{}/{}], Step [{}/{}], Loss: {}".format( + epoch_id + 1, EPOCH_NUM, batch_id + 1, total_step, loss.numpy())) + epoch_end = time.time() + print( + f"Epoch ID: {epoch_id+1}, AMPO1 train epoch time: {(epoch_end - epoch_start) * 1000} ms" + ) + + +def test_func(epoch_id, test_loader, model, cost): + import paddle + + # evaluation every epoch finish + model.eval() + avg_acc = [[], []] + for batch_id, (images, labels) in enumerate(test_loader()): + # forward + outputs = model(images) + loss = cost(outputs, labels) + # accuracy + acc_top1 = paddle.metric.accuracy(input=outputs, label=labels, k=1) + acc_top5 = paddle.metric.accuracy(input=outputs, label=labels, k=5) + avg_acc[0].append(acc_top1.numpy()) + avg_acc[1].append(acc_top5.numpy()) + model.train() + print( + f"Epoch ID: {epoch_id+1}, Top1 accurary: {np.array(avg_acc[0]).mean()}, Top5 accurary: {np.array(avg_acc[1]).mean()}" + ) + + +class TestCustomCPUPlugin(unittest.TestCase): + + def setUp(self): + # compile so and set to current path + cur_dir = os.path.dirname(os.path.abspath(__file__)) + self.temp_dir = tempfile.TemporaryDirectory() + cmd = 'cd {} \ + && git clone {} \ + && cd PaddleCustomDevice \ + && git fetch origin \ + && git checkout {} -b dev \ + && cd backends/custom_cpu \ + && mkdir build && cd build && cmake .. 
&& make -j8'.format( + self.temp_dir.name, os.getenv('PLUGIN_URL'), + os.getenv('PLUGIN_TAG')) + os.system(cmd) + + # set environment for loading and registering compiled custom kernels + # only valid in current process + os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join( + cur_dir, '{}/PaddleCustomDevice/backends/custom_cpu/build'.format( + self.temp_dir.name)) + + def tearDown(self): + self.temp_dir.cleanup() + + def test_custom_cpu_plugin(self): + self._test_to_static() + self._test_amp_o1() + + def _test_to_static(self): + import paddle + + class LeNet5(paddle.nn.Layer): + + def __init__(self): + super(LeNet5, self).__init__() + self.fc = paddle.nn.Linear(in_features=1024, out_features=10) + self.relu = paddle.nn.ReLU() + self.fc1 = paddle.nn.Linear(in_features=10, out_features=10) + + def forward(self, x): + out = paddle.flatten(x, 1) + out = self.fc(out) + out = self.relu(out) + out = self.fc1(out) + return out + + # set device + paddle.set_device('custom_cpu') + + # model + model = LeNet5() + + # cost and optimizer + cost = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam(learning_rate=0.001, + parameters=model.parameters()) + + # convert to static model + build_strategy = paddle.static.BuildStrategy() + mnist = paddle.jit.to_static(model, build_strategy=build_strategy) + + # data loader + transform = paddle.vision.transforms.Compose([ + paddle.vision.transforms.Resize((32, 32)), + paddle.vision.transforms.ToTensor(), + paddle.vision.transforms.Normalize(mean=(0.1307, ), std=(0.3081, )) + ]) + train_dataset = paddle.vision.datasets.MNIST(mode='train', + transform=transform, + download=True) + test_dataset = paddle.vision.datasets.MNIST(mode='test', + transform=transform, + download=True) + train_loader = paddle.io.DataLoader(train_dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + test_loader = paddle.io.DataLoader(test_dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + + # train and eval + for epoch_id in range(EPOCH_NUM): + train_func_base(epoch_id, train_loader, model, cost, optimizer) + test_func(epoch_id, test_loader, model, cost) + + def _test_amp_o1(self): + import paddle + + class LeNet5(paddle.nn.Layer): + + def __init__(self): + super(LeNet5, self).__init__() + self.fc = paddle.nn.Linear(in_features=1024, out_features=10) + self.relu = paddle.nn.ReLU() + self.fc1 = paddle.nn.Linear(in_features=10, out_features=10) + + def forward(self, x): + out = paddle.flatten(x, 1) + out = self.fc(out) + out = self.relu(out) + out = self.fc1(out) + return out + + # set device + paddle.set_device('custom_cpu') + + # model + model = LeNet5() + + # cost and optimizer + cost = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam(learning_rate=0.001, + parameters=model.parameters()) + + # convert to static model + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + model, optimizer = paddle.amp.decorate(models=model, + optimizers=optimizer, + level='O1') + + # data loader + transform = paddle.vision.transforms.Compose([ + paddle.vision.transforms.Resize((32, 32)), + paddle.vision.transforms.ToTensor(), + paddle.vision.transforms.Normalize(mean=(0.1307, ), std=(0.3081, )) + ]) + train_dataset = paddle.vision.datasets.MNIST(mode='train', + transform=transform, + download=True) + test_dataset = paddle.vision.datasets.MNIST(mode='test', + transform=transform, + download=True) + train_loader = paddle.io.DataLoader(train_dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + 
num_workers=2) + test_loader = paddle.io.DataLoader(test_dataset, + batch_size=BATCH_SIZE, + shuffle=True, + drop_last=True, + num_workers=2) + + # train and eval + for epoch_id in range(EPOCH_NUM): + train_func_ampo1(epoch_id, train_loader, model, cost, optimizer, + scaler) + test_func(epoch_id, test_loader, model, cost) + + +if __name__ == '__main__': + if os.name == 'nt' or sys.platform.startswith('darwin'): + # only support Linux now + exit() + unittest.main() diff --git a/python/paddle/fluid/tests/custom_runtime/test_fleet_launch_custom_device.sh b/python/paddle/fluid/tests/custom_runtime/test_fleet_launch_custom_device.sh index 5570c629dd9654..5269cd32120584 100644 --- a/python/paddle/fluid/tests/custom_runtime/test_fleet_launch_custom_device.sh +++ b/python/paddle/fluid/tests/custom_runtime/test_fleet_launch_custom_device.sh @@ -16,17 +16,20 @@ set -e -rm -rf PaddleCustomDevice && \ -git clone ${PLUGIN_URL} \ -&& pushd PaddleCustomDevice/backends/custom_cpu \ +temp_dir=$(mktemp --directory) +pushd ${temp_dir} \ +&& git clone ${PLUGIN_URL} \ +&& pushd PaddleCustomDevice/ \ +&& git fetch origin \ && git checkout ${PLUGIN_TAG} -b dev \ -&& mkdir build && pushd build && cmake .. && make -j8 && popd && popd +&& pushd backends/custom_cpu \ +&& mkdir build && pushd build && cmake .. && make -j8 && popd && popd && popd && popd echo "begin test use custom_cpu" export FLAGS_selected_custom_cpus=0,1 export CUSTOM_CPU_VISIBLE_DEVICES=0,1 -export CUSTOM_DEVICE_ROOT=PaddleCustomDevice/backends/custom_cpu/build +export CUSTOM_DEVICE_ROOT=${temp_dir}/PaddleCustomDevice/backends/custom_cpu/build distributed_args="--devices=0,1" python -m paddle.distributed.fleet.launch ${distributed_args} custom_device_multi_process_collective.py fleetlaunch_custom_cpu diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a76b9d1789b3cf..f69711107898c2 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -75,6 +75,7 @@ if(NOT WITH_GPU) list(REMOVE_ITEM TEST_OPS test_fused_attention_op) list(REMOVE_ITEM TEST_OPS test_fused_attention_op_api) list(REMOVE_ITEM TEST_OPS test_fused_multi_transformer_op) + list(REMOVE_ITEM TEST_OPS test_fused_multi_transformer_int8_op) list(REMOVE_ITEM TEST_OPS test_fused_transformer_encoder_layer) list(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op) list(REMOVE_ITEM TEST_OPS test_fused_bias_dropout_residual_layer_norm_op_api) @@ -100,6 +101,7 @@ if(((NOT WITH_ROCM) AND (NOT WITH_GPU)) OR WIN32) list(REMOVE_ITEM TEST_OPS test_fleet_executor_task_node) list(REMOVE_ITEM TEST_OPS test_fleet_exe_dist_model_run) list(REMOVE_ITEM TEST_OPS test_fleet_exe_dist_model_tensor) + list(REMOVE_ITEM TEST_OPS test_fleet_executor_cond_interceptor) endif() list(REMOVE_ITEM TEST_OPS test_deprecated_decorator) @@ -144,6 +146,7 @@ if(WIN32) list(REMOVE_ITEM TEST_OPS test_complex_matmul) list(REMOVE_ITEM TEST_OPS test_ops_nms) list(REMOVE_ITEM TEST_OPS test_trt_convert_preln_residual_bias) + list(REMOVE_ITEM TEST_OPS test_fused_multi_transformer_int8_op) endif() list(REMOVE_ITEM TEST_OPS test_checkpoint_saver) @@ -805,7 +808,7 @@ py_test_modules( # it is found that windows CI will run all the training unittests with the ON_INFER option turned on, # which will not appear in other CIs. The calculation behavior of some ops in inference mode is # inconsistent with that in non-inference mode. 
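The new test_custom_cpu_to_static.py above exercises AMP level O1 through paddle.amp.decorate and GradScaler. A stripped-down sketch of one such training step, with the device selection and MNIST data loading omitted and toy shapes substituted:

import paddle

model = paddle.nn.Linear(in_features=8, out_features=2)
optimizer = paddle.optimizer.Adam(
    learning_rate=0.001, parameters=model.parameters()
)
# Level O1 keeps parameters in float32 and auto-casts selected ops.
model, optimizer = paddle.amp.decorate(
    models=model, optimizers=optimizer, level='O1'
)
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)
cost = paddle.nn.CrossEntropyLoss()

images = paddle.randn([4, 8])
labels = paddle.randint(0, 2, [4])

with paddle.amp.auto_cast(level='O1'):
    outputs = model(images)
    loss = cost(outputs, labels)

scaled = scaler.scale(loss)         # scale the loss to avoid fp16 underflow
scaled.backward()
scaler.minimize(optimizer, scaled)  # unscales gradients, then applies the update
optimizer.clear_grad()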
-if(NOT ON_INFER) +if(WITH_PYTHON) py_test_modules(test_parallel_executor_seresnext_base_cpu MODULES test_parallel_executor_seresnext_base_cpu) py_test_modules(test_parallel_executor_seresnext_with_reduce_cpu MODULES @@ -1208,6 +1211,10 @@ endif() if(WITH_GPU OR WITH_ROCM) set_tests_properties(test_rank_attention_op PROPERTIES TIMEOUT 120) endif() +if(WITH_GPU AND NOT WIN32) + set_tests_properties(test_fused_multi_transformer_int8_op PROPERTIES TIMEOUT + 60) +endif() set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) set_tests_properties(test_eigvals_op PROPERTIES TIMEOUT 400) set_tests_properties( diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt index 27f86dc9f100a7..bd6ccfd3922c84 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/auto_parallel/CMakeLists.txt @@ -37,9 +37,41 @@ if(WITH_DISTRIBUTE AND WITH_GPU) ${dist_ENVS}) set_tests_properties(test_high_order_grad PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) - py_test_modules(test_grad_clip MODULES test_grad_clip ENVS ${dist_ENVS}) - set_tests_properties(test_grad_clip PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" - TIMEOUT 50) + py_test_modules(test_iterable_dataset MODULES test_iterable_dataset ENVS + ${dist_ENVS}) + set_tests_properties(test_iterable_dataset + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) + py_test_modules(test_pass_grad_clip MODULES test_pass_grad_clip ENVS + ${dist_ENVS}) + set_tests_properties(test_pass_grad_clip + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_pass_gradient_merge MODULES test_pass_gradient_merge + ENVS ${dist_ENVS}) + set_tests_properties(test_pass_gradient_merge + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_pass_recompute MODULES test_pass_recompute ENVS + ${dist_ENVS}) + set_tests_properties(test_pass_recompute + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_pass_sharding MODULES test_pass_sharding ENVS + ${dist_ENVS}) + set_tests_properties(test_pass_sharding + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_pass_amp MODULES test_pass_amp ENVS ${dist_ENVS}) + set_tests_properties(test_pass_amp PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" + TIMEOUT 50) + py_test_modules(test_engine_callbacks MODULES test_engine_callbacks) + set_tests_properties(test_engine_callbacks + PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 50) + py_test_modules(test_parallel_tuner MODULES test_parallel_tuner ENVS + ${dist_ENVS}) + set_tests_properties(test_parallel_tuner PROPERTIES TIMEOUT 120) + py_test_modules(test_parallel_tuner_full MODULES test_parallel_tuner_full + ENVS ${dist_ENVS}) + set_tests_properties(test_parallel_tuner_full PROPERTIES TIMEOUT 120) + py_test_modules(test_parallel_tuner_predict MODULES + test_parallel_tuner_predict ENVS ${dist_ENVS}) + set_tests_properties(test_parallel_tuner_predict PROPERTIES TIMEOUT 120) py_test_modules(test_while_op_completion MODULES test_while_op_completion ENVS ${dist_ENVS}) @@ -58,6 +90,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_dist_embedding MODULES test_dist_embedding ENVS ${dist_ENVS}) py_test_modules(test_dist_slice MODULES test_dist_slice ENVS ${dist_ENVS}) + py_test_modules(test_dist_split MODULES test_dist_split ENVS ${dist_ENVS}) py_test_modules(test_cluster MODULES test_cluster ENVS ${dist_ENVS}) py_test_modules(test_comm_cost MODULES test_comm_cost 
ENVS ${dist_ENVS}) py_test_modules(test_comp_cost MODULES test_comp_cost ENVS ${dist_ENVS}) @@ -66,15 +99,20 @@ if(WITH_DISTRIBUTE AND WITH_GPU) py_test_modules(test_prim_dist_op MODULES test_prim_dist_op ENVS ${dist_ENVS}) py_test_modules(test_to_static MODULES test_to_static ENVS ${dist_ENVS}) py_test_modules(test_dist_op_cost MODULES test_dist_op_cost ENVS ${dist_ENVS}) + py_test_modules(test_cluster_v2 MODULES test_cluster_v2) py_test_modules(test_process_mesh_v2 MODULES test_process_mesh_v2) py_test_modules(test_dist_attr_v2 MODULES test_dist_attr_v2) py_test_modules(test_lr_grad_clip MODULES test_lr_grad_clip) - py_test_modules(test_quantization MODULES test_quantization) py_test_modules(test_dist_matmul MODULES test_dist_matmul) + py_test_modules(test_process_mesh MODULES test_process_mesh) + py_test_modules(test_interface MODULES test_interface) + py_test_modules(test_strategy MODULES test_strategy) + py_test_modules(test_pass_quantization MODULES test_pass_quantization) + py_test_modules(test_dist_shape MODULES test_dist_shape) + py_test_modules(test_dist_assign MODULES test_dist_assign) + py_test_modules(test_conditional_block_reshard MODULES + test_conditional_block_reshard) + py_test_modules(test_engine_api_error MODULES test_engine_api_error) - py_test_modules(test_iterable_dataset MODULES test_iterable_dataset ENVS - ${dist_ENVS}) - set_tests_properties(test_iterable_dataset - PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE" TIMEOUT 80) endif() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py new file mode 100644 index 00000000000000..ea3bdd32082403 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/amp_pass_unittest.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import sys +import random +import numpy as np +import paddle + +from paddle.distributed.fleet import auto +from paddle.fluid.dygraph.parallel import ParallelEnv +from get_gpt_model import generate_model, create_data_holder, FakeDataset + + +def apply_pass(use_amp=False, level=None): + strategy = auto.Strategy() + strategy.auto_mode = "semi" + strategy.reinit = True + if use_amp: + amp = strategy.amp + amp.enable = True + amp.custom_white_list = ['softmax', 'layer_norm', 'gelu'] + amp.custom_black_list = [ + 'c_softmax_with_cross_entropy', 'elementwise_div', 'reduce_sum' + ] + amp.init_loss_scaling = 32768 + amp.use_fp16_guard = False + amp.use_pure_fp16 = level in ["o2", "o3"] + amp.use_optimizer_fp16 = level == "o3" + print("amp level: ", level) + return strategy + + +def reset_prog(): + paddle.fluid.framework.switch_main_program(paddle.static.Program()) + paddle.fluid.framework.switch_startup_program(paddle.static.Program()) + + +class TestAMPPass(unittest.TestCase): + + def setUp(self): + self.rtol = 1e-5 + self.atol = 1e-8 + self.batch_size = 1 + self.batch_num = 10 + self.clip_norm = 0.2 + self.dataset = FakeDataset(self.batch_size * self.batch_num) + + def init(self, engine): + paddle.seed(2021) + np.random.seed(2021) + random.seed(2021) + place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + engine._executor = paddle.static.Executor(place) + + def get_engine(self, use_amp=False, level=None): + reset_prog() + + strategy = apply_pass(use_amp, level) + clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) + opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) + model, loss = generate_model("mp") + + engine = auto.Engine(model, loss, opt, strategy=strategy) + self.init(engine) + return engine + + def check_results(self, ref_losses, check_losses, rtol=None, atol=None): + np.testing.assert_allclose( + ref_losses, + check_losses, + rtol=rtol or self.rtol, + atol=atol or self.atol, + err_msg='pass {} has wrong results!, \nu={}\nv={}\ndiff={}'.format( + __class__, ref_losses, check_losses, ref_losses - check_losses)) + + def test_amp_pass(self): + # mp2 training + mp_engine = self.get_engine() + history = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size) + mp_losses = np.array(history.history["loss"]) + + # mp2 amp-o1 training + amp_o1_engine = self.get_engine(True, "o1") + history = amp_o1_engine.fit(self.dataset, 3, batch_size=self.batch_size) + amp_o1_losses = np.array(history.history["loss"]) + amp_o1_engine.evaluate(self.dataset, 3, batch_size=self.batch_size) + # self.check_results(mp_losses, amp_o1_losses) + + # mp2 amp-o2 training + amp_o2_engine = self.get_engine(True, "o2") + history = amp_o2_engine.fit(self.dataset, 3, batch_size=self.batch_size) + amp_o2_losses = np.array(history.history["loss"]) + amp_o2_engine.evaluate(self.dataset, 3, batch_size=self.batch_size) + # self.check_results(mp_losses, amp_o2_losses) + + # mp2 amp-o3 training + amp_o3_engine = self.get_engine(True, "o3") + history = amp_o3_engine.fit(self.dataset, 3, batch_size=self.batch_size) + amp_o3_losses = np.array(history.history["loss"]) + amp_o3_engine.evaluate(self.dataset, 3, batch_size=self.batch_size) + # self.check_results(mp_losses, amp_o3_losses) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py index d459ffd6d680d5..197bc151636170 100644 --- 
a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_model.py @@ -28,11 +28,11 @@ from paddle.fluid import layers from paddle.io import IterableDataset, DataLoader from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto paddle.enable_static() _global_parallel_strategy = None -_global_process_mesh = None +_global_process_mesh = auto.ProcessMesh(mesh=[0, 1], dim_names=["x"]) batch_size = 4 hidden_size = 1024 sequence_len = 512 @@ -103,11 +103,7 @@ def mlp_pretrain_forward(train_program, start_program): shape=[batch_size, sequence_len, 1], dtype='float32') - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mappig": [-1, -1, -1] - }) + auto.shard_tensor(input, _global_process_mesh, [None, None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -126,9 +122,6 @@ def mlp_pretrain_forward(train_program, start_program): def train(): - global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) - dist_strategy = fleet.DistributedStrategy() dist_strategy.amp = False dist_strategy.pipeline = False diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py index 6bd48fb1963ede..ee8c79ff9b1ecb 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/auto_parallel_relaunch_with_gpt_planner.py @@ -19,7 +19,7 @@ import numpy as np -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from auto_parallel_relaunch_model import mlp_pretrain_forward from auto_parallel_relaunch_model import batch_generator_creator diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/clip_grad_by_global_norm.py b/python/paddle/fluid/tests/unittests/auto_parallel/clip_grad_by_global_norm.py index 60a915c53cddfc..1cbc8aed1202e8 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/clip_grad_by_global_norm.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/clip_grad_by_global_norm.py @@ -18,24 +18,21 @@ import numpy as np import paddle -import paddle.distributed.fleet as fleet -import paddle.distributed.auto_parallel as auto - -from paddle.distributed.auto_parallel.engine import Engine +from paddle.distributed.fleet import auto +from paddle.fluid.dygraph.parallel import ParallelEnv from get_gpt_model import generate_model, create_data_holder, FakeDataset paddle.enable_static() def apply_pass(use_sharding=False): - strategy = fleet.DistributedStrategy() - strategy.semi_auto = True + strategy = auto.Strategy() + strategy.auto_mode = "semi" + strategy.reinit = True if use_sharding: - strategy.sharding = True - strategy.sharding_configs = { - "sharding_degree": 2, - "stage": 2, - } + sharding = strategy.sharding + sharding.degree = 2 + sharding.stage = 2 return strategy @@ -76,34 +73,17 @@ def init(self, engine): paddle.seed(2022) np.random.seed(2022) random.seed(2022) - engine.mode = "train" - engine._executor.run(engine.startup_program) + place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + engine._executor = paddle.static.Executor(place) - def get_dp2_engine(self): + def get_engine(self, use_sharding=False): reset_prog() - strategy = apply_pass() + 
strategy = apply_pass(use_sharding) clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) model, loss = generate_model("dp") - inputs_spec, labels_spec = create_data_holder(self.batch_size) - - engine = Engine(model, inputs_spec, labels_spec, strategy=strategy) - engine.prepare(optimizer=opt, loss=loss) - self.init(engine) - return engine - - def get_dp2sharding2_engine(self): - reset_prog() - - strategy = apply_pass(True) - clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) - opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) - model, loss = generate_model("dp") - inputs_spec, labels_spec = create_data_holder(self.batch_size) - - engine = Engine(model, inputs_spec, labels_spec, strategy=strategy) - engine.prepare(optimizer=opt, loss=loss) + engine = auto.Engine(model, loss, opt, strategy=strategy) self.init(engine) return engine @@ -121,15 +101,13 @@ def check_result(self, dp_params, sharding_params): def test_grad_clip(self): # dp2 training - dp_engine = self.get_dp2_engine() - dp_engine.fit(self.dataset, batch_size=self.batch_size, use_cache=True) + dp_engine = self.get_engine() + dp_engine.fit(self.dataset, 3, batch_size=self.batch_size) dp_param_values = get_parameter_value(dp_engine.main_program) # dp2sharding2 training - sharding_engine = self.get_dp2sharding2_engine() - sharding_engine.fit(self.dataset, - batch_size=self.batch_size, - use_cache=True) + sharding_engine = self.get_engine(True) + sharding_engine.fit(self.dataset, 3, batch_size=self.batch_size) sharding_param_values = get_parameter_value( sharding_engine.main_program) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py index 104614e3e9d4ed..0a50c6d3000a0a 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api.py @@ -20,6 +20,8 @@ import numpy as np import subprocess import paddle +import paddle.static as static +import paddle.utils as utils import paddle.nn as nn import paddle.fluid as fluid import paddle.static as static @@ -27,18 +29,22 @@ import paddle.utils as utils from paddle.fluid import layers from paddle.io import Dataset, IterableDataset, DataLoader -from paddle.static import InputSpec -from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto -from paddle.distributed.auto_parallel.engine import Engine + +from paddle.distributed.fleet import auto +from paddle.distributed.auto_parallel.interface import ( + get_collection, + CollectionNames, +) from paddle.optimizer.lr import CosineAnnealingDecay from paddle.fluid.dataloader.collate import default_collate_fn paddle.enable_static() + global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) PP_MESH_0 = auto.ProcessMesh([0]) PP_MESH_1 = auto.ProcessMesh([1]) -batch_size = 1 +epoch_num = 1 +batch_size = 2 batch_num = 10 hidden_size = 1024 sequence_len = 512 @@ -47,9 +53,12 @@ paddle.seed(44) +is_fetch = True +is_feed = True +my_feed_vars = [] -class MyDataset(Dataset): +class MyDataset(Dataset): def __init__(self, num_samples): super(MyDataset, self).__init__() self.num_samples = num_samples @@ -63,99 +72,449 @@ def __len__(self): return self.num_samples -class MLPLayer(nn.Layer): +def get_random_inputs_and_labels(image_shape, label_shape): + input = np.random.random(size=image_shape).astype('float32') + label = np.random.random(size=label_shape).astype('int64') + return input, 
label + + +def batch_generator_creator(): + def __reader__(): + for _ in range(batch_num): + batch_input, batch_label = get_random_inputs_and_labels( + [batch_size, image_size], [batch_size, 1] + ) + yield batch_input, batch_label + + return __reader__ + - def __init__(self, - hidden_size=1024, - intermediate_size=4 * 1024, - dropout_ratio=0.1, - initializer_range=0.02): +class MLPLayer(nn.Layer): + def __init__( + self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02, + ): super(MLPLayer, self).__init__() d_model = hidden_size dim_feedforward = intermediate_size weight_attr = paddle.ParamAttr( - initializer=nn.initializer.Normal(mean=0.0, std=initializer_range)) + initializer=nn.initializer.Normal(mean=0.0, std=initializer_range) + ) bias_attr = None - self.linear0 = nn.Linear(d_model, - dim_feedforward, - weight_attr, - bias_attr=bias_attr) - self.linear1 = nn.Linear(dim_feedforward, - d_model, - weight_attr, - bias_attr=bias_attr) + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr + ) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr + ) self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) self.norm = nn.LayerNorm(d_model, epsilon=1e-5) self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") def forward(self, input): - out = auto.shard_op(self.norm, dist_attr={"process_mesh": - PP_MESH_0})(input) + out = auto.shard_op(self.norm, PP_MESH_0)(input) out = self.linear0(out) + if is_feed: + my_feed_vars.append((out, out.shape)) out = F.gelu(out, approximate=True) - out = auto.shard_op(self.linear1, dist_attr={"process_mesh": - PP_MESH_1})(out) + out = auto.shard_op(self.linear1, PP_MESH_1)(out) out = self.dropout(out) out = self.linear2(out) - self.out = out + if is_feed: + my_feed_vars.append((out, out.shape)) + if is_fetch: + auto.fetch(out, "my_fetch", logging=True) return out -def train(fetch): - mlp = MLPLayer(hidden_size=hidden_size, - intermediate_size=4 * hidden_size, - dropout_ratio=0.1, - initializer_range=0.02) +def train_high_level(fetch): + global is_fetch + is_fetch = fetch + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02, + ) loss = paddle.nn.CrossEntropyLoss() - optimizer = paddle.optimizer.Adam(learning_rate=0.00001, - beta1=0.9, - beta2=0.999, - epsilon=1e-08, - grad_clip=None) - - inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x') - labels_spec = InputSpec([batch_size], 'int64', 'label') - - dist_strategy = fleet.DistributedStrategy() - dist_strategy.semi_auto = True - fleet.init(is_collective=True, strategy=dist_strategy) - - # init engine - engine = Engine(mlp, - inputs_spec=inputs_spec, - labels_spec=labels_spec, - strategy=dist_strategy) - engine.prepare(optimizer, loss, metrics=paddle.metric.Accuracy()) - - # fetch - if fetch: - fetches = {'out': mlp.out} - else: - fetches = None + optimizer = paddle.optimizer.Adam( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None, + ) + metric = paddle.metric.Accuracy() + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine(mlp, loss, optimizer, metric, strategy=strategy) + + # train + train_dataset = MyDataset(batch_num * batch_size) + eval_dataset1 = MyDataset(5 * batch_size) + + history = engine.fit( + train_data=train_dataset, + epochs=2, + batch_size=batch_size, + valid_data=eval_dataset1, + log_freq=1, + ) + + # eval 
+ eval_dataset2 = MyDataset(batch_size) + engine.evaluate(eval_dataset2, batch_size=batch_size) + + # predict + test_dataset = MyDataset(batch_size) + outputs = engine.predict(test_dataset, batch_size=batch_size) + + # save + temp_dir = tempfile.TemporaryDirectory() + model_filename = os.path.join(temp_dir.name, 'mlp') + engine.save(model_filename, training=True) + engine.load(model_filename) + temp_dir.cleanup() + + +def train_low_level(): + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02, + ) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None, + ) + metric = paddle.metric.Accuracy() + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine(mlp, loss, optimizer, metrics=None, strategy=strategy) + + feed_dict = {} + for feed_var, shape in my_feed_vars: + feed_dict[feed_var.name] = np.zeros(shape, dtype="float32") + + # Build normal normal dataloader + # train + train_dataset = MyDataset(batch_num * batch_size) + train_dataloader = engine.dataloader( + train_dataset, batch_size=batch_size, mode="train" + ) + engine.prepare(mode="train") + for data in train_dataloader: + outs = engine.run(data, feed=feed_dict, mode="train") + + # eval + eval_dataset2 = MyDataset(batch_size) + eval_dataloader = engine.dataloader( + eval_dataset2, batch_size=batch_size, mode="eval" + ) + engine.prepare(mode="eval") + for data in eval_dataloader: + outs = engine.run(data, feed=feed_dict, mode="eval") + + # predict + engine.to_mode("predict") + test_dataset = MyDataset(batch_size) + predict_dataloader = engine.dataloader(test_dataset, batch_size=batch_size) + engine.prepare() + for data in predict_dataloader: + outs = engine.run(data, feed=feed_dict) + # save + temp_dir = tempfile.TemporaryDirectory() + model_filename = os.path.join(temp_dir.name, 'mlp') + engine.save(model_filename, training=True) + engine.load(model_filename) + temp_dir.cleanup() + + # Build dataloader from generator # train train_dataset = MyDataset(batch_num * batch_size) - engine.fit(train_dataset, - batch_size=batch_size, - steps_per_epoch=batch_num * batch_size, - fetches=fetches) + train_dataloader = engine.dataloader_from_generator( + train_dataset, batch_size=batch_size, mode="train" + ) + engine.prepare(mode="train") + for data in train_dataloader: + outs = engine.run(data, feed=feed_dict, mode="train") # eval - eval_dataset = MyDataset(batch_size) - engine.evaluate(eval_dataset, batch_size, fetches=fetches) + engine.to_mode("eval") + eval_dataset2 = MyDataset(batch_size) + eval_dataloader = engine.dataloader_from_generator( + eval_dataset2, batch_size=batch_size + ) + engine.prepare() + for data in eval_dataloader: + outs = engine.run(data, feed=feed_dict) # predict test_dataset = MyDataset(batch_size) - engine.predict(test_dataset, batch_size, fetches=fetches) + predict_dataloader = engine.dataloader_from_generator( + test_dataset, batch_size=batch_size, mode="predict" + ) + engine.prepare(mode="predict") + for data in predict_dataloader: + outs = engine.run(data, feed=feed_dict, mode="predict") # save temp_dir = tempfile.TemporaryDirectory() - model_filename = os.path.join(temp_dir.name, 'mlp_inf') - engine.save(model_filename, training=False, mode='predict') + model_filename = os.path.join(temp_dir.name, 'mlp') + engine.save(model_filename, training=True) + engine.load(model_filename) temp_dir.cleanup() +def 
train_builtin_data_vars(): + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02, + ) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None, + ) + metric = paddle.metric.Accuracy() + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine(mlp, loss, optimizer, metric, strategy=strategy) + + # train + engine.to_mode("train") + + input_spec = static.InputSpec([batch_size, image_size], 'float32', 'input') + label_spec = static.InputSpec([batch_size, 1], 'int64', 'label') + engine.prepare(inputs_spec=[input_spec], labels_spec=[label_spec]) + + with static.program_guard(engine.main_program, engine.startup_program): + feed_list = engine.inputs + engine.labels + print(feed_list) + loader = paddle.io.DataLoader.from_generator( + feed_list=feed_list, capacity=4 * batch_size, iterable=False + ) + + places = static.cuda_places() + loader.set_batch_generator(batch_generator_creator(), places=places) + + for _ in range(epoch_num): + loader.start() # call DataLoader.start() before each epoch starts + try: + while True: + engine.run() + except paddle.fluid.core.EOFException: + loader.reset() # call DataLoader.reset() after catching EOFException + + +def train_non_builtin_data_vars(): + main_program = static.Program() + startup_program = static.Program() + with static.program_guard( + main_program, startup_program + ), utils.unique_name.guard(): + input = static.data( + name="input", shape=[batch_size, image_size], dtype='float32' + ) + label = static.data(name="label", shape=[batch_size, 1], dtype='int64') + + loader = paddle.io.DataLoader.from_generator( + feed_list=[input, label], capacity=4 * batch_size, iterable=False + ) + places = static.cuda_places() + loader.set_batch_generator(batch_generator_creator(), places=places) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02, + ) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None, + ) + metric = paddle.metric.Accuracy() + predict = mlp(input) + loss_var = loss(predict, label) + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine( + loss=loss_var, optimizer=optimizer, metrics=metric, strategy=strategy + ) + + # train + engine.to_mode("train") + engine.prepare( + inputs=[input], + labels=[label], + main_program=main_program, + startup_program=startup_program, + ) + for _ in range(epoch_num): + loader.start() # call DataLoader.start() before each epoch starts + try: + while True: + engine.run() + except paddle.fluid.core.EOFException: + loader.reset() # call DataLoader.reset() after catching EOFException + + +def get_cost(): + main_program = static.Program() + startup_program = static.Program() + with static.program_guard( + main_program, startup_program + ), utils.unique_name.guard(): + input = static.data( + name="input", shape=[batch_size, image_size], dtype='float32' + ) + label = static.data(name="label", shape=[batch_size, 1], dtype='int64') + + loader = paddle.io.DataLoader.from_generator( + feed_list=[input, label], capacity=4 * batch_size, iterable=False + ) + places = static.cuda_places() + loader.set_batch_generator(batch_generator_creator(), places=places) + + mlp = MLPLayer( + hidden_size=hidden_size, + 
intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02, + ) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None, + ) + metric = paddle.metric.Accuracy() + predict = mlp(input) + loss_var = loss(predict, label) + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine( + loss=loss_var, optimizer=optimizer, metrics=metric, strategy=strategy + ) + engine.prepare( + main_program=main_program, + startup_program=startup_program, + inputs=[input], + labels=[label], + mode="train", + ) + engine.cost() + + +def get_cost_by_default_program(): + main_program = static.default_main_program() + startup_program = static.default_startup_program() + with static.program_guard( + main_program, startup_program + ), utils.unique_name.guard(): + input = static.data( + name="input", shape=[batch_size, image_size], dtype='float32' + ) + label = static.data(name="label", shape=[batch_size, 1], dtype='int64') + + loader = paddle.io.DataLoader.from_generator( + feed_list=[input, label], capacity=4 * batch_size, iterable=False + ) + places = static.cuda_places() + loader.set_batch_generator(batch_generator_creator(), places=places) + + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02, + ) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None, + ) + metric = paddle.metric.Accuracy() + predict = mlp(input) + loss_var = loss(predict, label) + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine( + loss=loss_var, optimizer=optimizer, metrics=metric, strategy=strategy + ) + engine.cost(mode="train") + + +def get_cost_by_spec(): + mlp = MLPLayer( + hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02, + ) + loss = paddle.nn.CrossEntropyLoss() + optimizer = paddle.optimizer.Adam( + learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None, + ) + metric = paddle.metric.Accuracy() + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine(mlp, loss, optimizer, metric, strategy=strategy) + + input_spec = static.InputSpec([batch_size, image_size], 'float32', 'input') + label_spec = static.InputSpec([batch_size, 1], 'int64', 'label') + engine.cost(mode="eval", inputs_spec=[input_spec], labels_spec=[label_spec]) + + if __name__ == "__main__": - train(fetch=True) - train(fetch=False) + train_high_level(fetch=True) + train_high_level(fetch=False) + train_low_level() + train_builtin_data_vars() + train_non_builtin_data_vars() + get_cost() + get_cost_by_default_program() + get_cost_by_spec() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api_dp.py b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api_dp.py index 6a4c8a9986cef2..17735594c5a0d8 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/engine_api_dp.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/engine_api_dp.py @@ -26,14 +26,12 @@ import paddle.nn.functional as F import paddle.utils as utils from paddle.fluid import layers -from paddle.io import Dataset, IterableDataset, DataLoader -from paddle.static import InputSpec -from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto -from 
paddle.distributed.auto_parallel.engine import Engine +from paddle.io import Dataset, DataLoader + +from paddle.distributed.fleet import auto paddle.enable_static() -batch_size = 1 +batch_size = 2 batch_num = 10 hidden_size = 1024 sequence_len = 512 @@ -91,6 +89,7 @@ def forward(self, input): out = self.linear1(out) out = self.dropout(out) out = self.linear2(out) + auto.fetch(out, "out") self.out = out return out @@ -107,49 +106,32 @@ def train(fetch): epsilon=1e-08, grad_clip=None) - inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x') - labels_spec = InputSpec([batch_size], 'int64', 'label') - - dist_strategy = fleet.DistributedStrategy() - dist_strategy.amp = False - dist_strategy.pipeline = False - dist_strategy.recompute = False - # init parallel optimizer - dist_strategy.semi_auto = True - fleet.init(is_collective=True, strategy=dist_strategy) + dist_strategy = auto.Strategy() + dist_strategy.auto_mode = "semi" # init engine - engine = Engine(mlp, - inputs_spec=inputs_spec, - labels_spec=labels_spec, - strategy=dist_strategy) - engine.prepare(optimizer, loss, metrics=paddle.metric.Accuracy()) - - # fetch - if fetch: - fetches = {'out': mlp.out} - else: - fetches = None + engine = auto.Engine(mlp, + loss, + optimizer, + paddle.metric.Accuracy(), + strategy=dist_strategy) # train train_dataset = MyDataset(batch_num * batch_size) - engine.fit(train_dataset, - batch_size=batch_size, - steps_per_epoch=batch_num * batch_size, - fetches=fetches) + engine.fit(train_dataset, batch_size=batch_size) # eval eval_dataset = MyDataset(batch_size) - engine.evaluate(eval_dataset, batch_size, fetches=fetches) + engine.evaluate(eval_dataset, batch_size=batch_size) # predict test_dataset = MyDataset(batch_size) - engine.predict(test_dataset, batch_size, fetches=fetches) + engine.predict(test_dataset, batch_size=batch_size) # save temp_dir = tempfile.TemporaryDirectory() model_filename = os.path.join(temp_dir.name, 'mlp_inf') - engine.save(model_filename, training=False, mode='predict') + engine.save(model_filename, training=False) temp_dir.cleanup() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/get_gpt_model.py b/python/paddle/fluid/tests/unittests/auto_parallel/get_gpt_model.py index 0e5c6b387f987e..318773c71e09eb 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/get_gpt_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/get_gpt_model.py @@ -14,8 +14,10 @@ import sys import numpy as np +import random import paddle +from paddle.distributed.fleet import auto sys.path.append("..") import auto_parallel_gpt_model as modeling @@ -25,7 +27,7 @@ vocab_size = 1000 -class FakeDataset: +class FakeDataset(paddle.io.Dataset): def __init__(self, num_samples): self.num_samples = num_samples @@ -33,6 +35,9 @@ def __init__(self, num_samples): self.vocab_size = vocab_size def __getitem__(self, idx): + paddle.seed(2021) + np.random.seed(2021) + random.seed(2021) tokens = np.random.randint(self.vocab_size, size=self.sequence_len) position_ids = np.arange(self.sequence_len) attention_mask = np.tril(np.ones(self.sequence_len)).reshape( @@ -67,15 +72,15 @@ def create_data_holder(batch_size): def generate_model(strategy): modeling.init_global() + ranks = list(range(paddle.distributed.get_world_size())) + modeling._global_process_mesh = auto.ProcessMesh(mesh=ranks, + dim_names=["x"]) if strategy == "serial": modeling._global_parallel_strategy = "serial" - modeling._global_process_mesh = [0] elif strategy == "mp": modeling._global_parallel_strategy = "mp" - 
modeling._global_process_mesh = [0, 1] elif strategy == "dp": modeling._global_parallel_strategy = "dp" - modeling._global_process_mesh = [0, 1] else: raise ValueError("Only support serial, mp2 and dp2.") diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py new file mode 100644 index 00000000000000..438e17d29f777b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/gradient_merge_pass_unittest.py @@ -0,0 +1,116 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import sys +import random +import numpy as np +import paddle + +from paddle.distributed.fleet import auto +from paddle.fluid.dygraph.parallel import ParallelEnv +from get_gpt_model import generate_model, create_data_holder, FakeDataset + +paddle.enable_static() + + +def apply_pass(use_gradient_merge=False): + strategy = auto.Strategy() + strategy.auto_mode = "semi" + strategy.reinit = True + if use_gradient_merge: + gradient_merge = strategy.gradient_merge + gradient_merge.enable = True + gradient_merge.k_steps = 4 + gradient_merge.avg = True + + return strategy + + +def reset_prog(): + paddle.fluid.framework.switch_main_program(paddle.static.Program()) + paddle.fluid.framework.switch_startup_program(paddle.static.Program()) + + +class TestGradientMergePass(unittest.TestCase): + + def setUp(self): + self.rtol = 1e-5 + self.atol = 1e-8 + self.batch_size = 8 + self.batch_num = 10 + self.clip_norm = 0.2 + self.dataset = FakeDataset(self.batch_size * self.batch_num) + + def init(self, engine): + paddle.seed(2021) + np.random.seed(2021) + random.seed(2021) + place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + engine._executor = paddle.static.Executor(place) + + def get_engine(self, use_gradient_merge=False): + reset_prog() + + strategy = apply_pass(use_gradient_merge) + clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) + opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) + model, loss = generate_model("dp") + + engine = auto.Engine(model, loss, opt, strategy=strategy) + self.init(engine) + return engine + + def check_results(self, ref_losses, check_losses): + np.testing.assert_allclose( + ref_losses, + check_losses, + rtol=self.rtol, + atol=self.atol, + err_msg='pass {} has wrong results!, \nu={}\nv={}\ndiff={}'.format( + __class__, ref_losses, check_losses, ref_losses - check_losses)) + + def test_gradient_merge_pass(self): + # dp2 training + dp_engine = self.get_engine() + history = dp_engine.fit(self.dataset, + 3, + batch_size=self.batch_size, + log_freq=1) + dp_losses = np.array(history.history["loss"]) + + # dp2 gradient merge training + gm_engine = self.get_engine(True) + history = gm_engine.fit(self.dataset, + 3, + batch_size=self.batch_size, + log_freq=1) + gm_losses = np.array(history.history["loss"]) + + # avg_loss = 0 + # pass_avg_ret_list = [] + # for i, pass_ret in 
enumerate(gm_losses): + # if (i + 1) % 4 == 0: + # avg_loss += pass_ret + # pass_avg_ret_list.append(avg_loss / 4) + # avg_loss = 0 + # else: + # avg_loss += pass_ret + + # NOTE: every sample data from dataset is all the same + self.check_results(dp_losses, gm_losses) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py index 9ab49b30d9d677..d69b0cf342f85a 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/high_order_grad.py @@ -16,12 +16,8 @@ import paddle import unittest import numpy as np -import paddle.distributed.auto_parallel as auto - -from paddle.static import InputSpec -from paddle.distributed import fleet +from paddle.distributed.fleet import auto from paddle.incubate.autograd import Hessian -from paddle.distributed.auto_parallel.engine import Engine np.random.seed(1234) paddle.seed(1234) @@ -87,7 +83,7 @@ def forward(self, inputs, bc_index): return eq_loss, bc_u -class LaplaceDataset: +class LaplaceDataset(paddle.io.Dataset): def __init__(self, num_sample): self.num_sample = num_sample @@ -129,23 +125,14 @@ def main(): # model laplace = LaplaceModel() - # spec - inputs_spec = [ - InputSpec([100, 2], 'float32', 'x'), - InputSpec([36], 'int64', 'bc_idx') - ] - labels_spec = InputSpec([36, 1], 'float32', 'bc_v') - - dist_strategy = fleet.DistributedStrategy() - dist_strategy.semi_auto = True - fleet.init(is_collective=True, strategy=dist_strategy) - - engine = Engine(laplace, - inputs_spec=inputs_spec, - labels_spec=labels_spec, - strategy=dist_strategy) - engine.prepare(optimizer=optimizer, loss=loss_func) - engine.fit(train_dataset, batch_size=None) + dist_strategy = auto.Strategy() + dist_strategy.auto_mode = "semi" + + engine = auto.Engine(laplace, + loss=loss_func, + optimizer=optimizer, + strategy=dist_strategy) + engine.fit(train_dataset, train_sample_split=2, batch_size=None) dist_context = engine.dist_context block = engine.main_program.global_block() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/iterable_dataset.py b/python/paddle/fluid/tests/unittests/auto_parallel/iterable_dataset.py index 4ca3d14f7165a2..e19023daa68207 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/iterable_dataset.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/iterable_dataset.py @@ -28,9 +28,8 @@ from paddle.fluid import layers from paddle.io import Dataset, IterableDataset, DataLoader from paddle.static import InputSpec -from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto -from paddle.distributed.auto_parallel.engine import Engine + +from paddle.distributed.fleet import auto from paddle.optimizer.lr import CosineAnnealingDecay from paddle.fluid.dataloader.collate import default_collate_fn @@ -48,10 +47,9 @@ paddle.seed(44) -class MyDataset(IterableDataset): +class MyDataset(paddle.io.IterableDataset): def __init__(self, num_samples): - super(MyDataset, self).__init__() self.num_samples = num_samples def __iter__(self): @@ -61,10 +59,9 @@ def __iter__(self): yield input, label -class MyDataset1(Dataset): +class MyDataset1(paddle.io.Dataset): def __init__(self, num_samples): - super(MyDataset1, self).__init__() self.num_samples = num_samples self.data = [] for i in range(self.num_samples): @@ -112,12 +109,10 @@ def __init__(self, self.dropout = nn.Dropout(dropout_ratio, 
mode="upscale_in_train") def forward(self, input): - out = auto.shard_op(self.norm, dist_attr={"process_mesh": - PP_MESH_0})(input) + out = auto.shard_op(self.norm, PP_MESH_0)(input) out = self.linear0(out) out = F.gelu(out, approximate=True) - out = auto.shard_op(self.linear1, dist_attr={"process_mesh": - PP_MESH_1})(out) + out = auto.shard_op(self.linear1, PP_MESH_1)(out) out = self.dropout(out) out = self.linear2(out) self.out = out @@ -136,54 +131,36 @@ def train(fetch): epsilon=1e-08, grad_clip=None) - inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x') - labels_spec = InputSpec([batch_size], 'int64', 'label') - - dist_strategy = fleet.DistributedStrategy() - dist_strategy.semi_auto = True + dist_strategy = auto.Strategy() + dist_strategy.auto_mode = "semi" dist_strategy.split_data = True - fleet.init(is_collective=True, strategy=dist_strategy) # init engine - engine = Engine(mlp, - inputs_spec=inputs_spec, - labels_spec=labels_spec, - strategy=dist_strategy) - engine.prepare(optimizer, loss, metrics=paddle.metric.Accuracy()) - - # fetch - if fetch: - fetches = {'out': mlp.out} - else: - fetches = None + engine = auto.Engine(mlp, + loss, + optimizer, + paddle.metric.Accuracy(), + strategy=dist_strategy) # train train_dataset = MyDataset(batch_num * batch_size) - train_dataset1 = MyDataset1(batch_num) - engine.fit(train_dataset, - epochs=2, - batch_size=batch_size, - steps_per_epoch=batch_num, - fetches=fetches) - - engine.fit(train_dataset1, - epochs=2, - batch_size=None, - steps_per_epoch=batch_num, - fetches=fetches) + engine.fit(train_dataset, epochs=2, batch_size=batch_size) + + train_dataset1 = MyDataset1(batch_size * batch_num) + engine.fit(train_dataset1, epochs=2, batch_size=None) # eval eval_dataset = MyDataset(batch_size) - engine.evaluate(eval_dataset, batch_size, fetches=fetches) + engine.evaluate(eval_dataset, batch_size=batch_size) # predict test_dataset = MyDataset(batch_size) - engine.predict(test_dataset, batch_size, fetches=fetches) + engine.predict(test_dataset, batch_size=batch_size) # save temp_dir = tempfile.TemporaryDirectory() model_filename = os.path.join(temp_dir.name, 'mlp_inf') - engine.save(model_filename, training=False, mode='predict') + engine.save(model_filename, training=False) temp_dir.cleanup() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/optimization_tuner_api.py b/python/paddle/fluid/tests/unittests/auto_parallel/optimization_tuner_api.py index 8e058d16b87b36..484c67f69c39b2 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/optimization_tuner_api.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/optimization_tuner_api.py @@ -27,10 +27,8 @@ import paddle.utils as utils from paddle.fluid import layers from paddle.io import Dataset, IterableDataset, DataLoader -from paddle.static import InputSpec -from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto -from paddle.distributed.auto_parallel.engine import Engine + +from paddle.distributed.fleet import auto from engine_api_dp import MyDataset paddle.enable_static() @@ -43,20 +41,6 @@ paddle.seed(44) -# class MyDataset(Dataset): - -# def __init__(self, num_samples): -# super(MyDataset, self).__init__() -# self.num_samples = num_samples - -# def __getitem__(self, index): -# input = np.random.uniform(size=image_size).astype("float32") -# label = np.random.randint(0, class_num - 1, dtype="int64") -# return input, label - -# def __len__(self): -# return self.num_samples - class MLPLayer(nn.Layer): @@ -107,50 +91,33 @@ def 
train(fetch): epsilon=1e-08, grad_clip=None) - inputs_spec = InputSpec([batch_size, hidden_size], 'float32', 'x') - labels_spec = InputSpec([batch_size], 'int64', 'label') - - dist_strategy = fleet.DistributedStrategy() - dist_strategy.amp = False - dist_strategy.pipeline = False - dist_strategy.recompute = False - # init parallel optimizer - dist_strategy.semi_auto = True - dist_strategy.sharding = True - dist_strategy.sharding_configs = { - "sharding_degree": 2, - "stage": 3, - "enable_tuning": True, - } - fleet.init(is_collective=True, strategy=dist_strategy) - - # init engine - import tempfile - tmp_dir = tempfile.TemporaryDirectory() - dataset = MyDataset(batch_num * batch_size) - + dist_strategy = auto.Strategy() + dist_strategy.auto_mode = "semi" + # sharding config + sharding = dist_strategy.sharding + sharding.enable = True + sharding.degree = 2 + sharding.stage = 3 + sharding.enable_tuning = True + sharding.tuning_range = [0, 1, 2, 3] # Tuning configuration - tuning_config = { - "batch_size": batch_size, - "dataset": dataset, - "profile_start_step": 1, - "profile_end_step": 5, - "run_after_tuning": True, - "sharding": { - "stage_range": [0, 1, 2, 3] - }, - "verbose": True, - } - engine = Engine(mlp, - inputs_spec=inputs_spec, - labels_spec=labels_spec, - strategy=dist_strategy, - user_tuning_config=tuning_config) - engine.prepare(optimizer, loss, metrics=paddle.metric.Accuracy()) + tuning = dist_strategy.tuning + tuning.enable = True + tuning.profile_start_step = 1 + tuning.profile_end_step = 5 + tuning.run_after_tuning = True + tuning.verbose = True + + dataset = MyDataset(batch_num * batch_size) + engine = auto.Engine(mlp, + loss, + optimizer, + paddle.metric.Accuracy(), + strategy=dist_strategy) + engine._tune(dataset, batch_size=batch_size) # check tuned - assert (engine._dist_contexts['train'].strategy.sharding_configs['stage'] != - 3) + assert (engine._dist_contexts['train'].strategy.sharding.stage != 3) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py new file mode 100644 index 00000000000000..1a444353d03991 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/recompute_pass_unittest.py @@ -0,0 +1,93 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import sys +import random +import numpy as np +import paddle + +from paddle.distributed.fleet import auto +from paddle.fluid.dygraph.parallel import ParallelEnv +from get_gpt_model import generate_model, create_data_holder, FakeDataset + + +def apply_pass(use_recompute=False): + strategy = auto.Strategy() + strategy.auto_mode = "semi" + strategy.reinit = True + if use_recompute: + recompute = strategy.recompute + recompute.enable = True + return strategy + + +def reset_prog(): + paddle.fluid.framework.switch_main_program(paddle.static.Program()) + paddle.fluid.framework.switch_startup_program(paddle.static.Program()) + + +class TestRecomputePass(unittest.TestCase): + + def setUp(self): + self.rtol = 1e-6 + self.atol = 1e-8 + self.batch_size = 1 + self.batch_num = 10 + self.clip_norm = 0.2 + self.dataset = FakeDataset(self.batch_size * self.batch_num) + + def init(self, engine): + paddle.seed(2022) + np.random.seed(2022) + random.seed(2022) + place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + engine._executor = paddle.static.Executor(place) + + def get_engine(self, use_recompute=False): + reset_prog() + + strategy = apply_pass(use_recompute) + clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) + opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) + model, loss = generate_model("mp") + + engine = auto.Engine(model, loss, opt, strategy=strategy) + self.init(engine) + return engine + + def check_results(self, ref_losses, check_losses): + np.testing.assert_allclose( + ref_losses, + check_losses, + rtol=self.rtol, + atol=self.atol, + err_msg='pass {} has wrong results!, \nu={}\nv={}\ndiff={}'.format( + __class__, ref_losses, check_losses, ref_losses - check_losses)) + + def test_recompute_pass(self): + # mp2 training + mp_engine = self.get_engine() + history = mp_engine.fit(self.dataset, 3, batch_size=self.batch_size) + mp_losses = np.array(history.history["loss"]) + + # mp2 recompute training + rc_engine = self.get_engine(True) + history = rc_engine.fit(self.dataset, 3, batch_size=self.batch_size) + rc_losses = np.array(history.history["loss"]) + self.check_results(mp_losses, rc_losses) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py b/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py new file mode 100644 index 00000000000000..356c8ec2e14a71 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/sharding_pass_unittest.py @@ -0,0 +1,116 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import sys +import random +import numpy as np +import paddle + +from paddle.distributed.fleet import auto +from paddle.fluid.dygraph.parallel import ParallelEnv +from get_gpt_model import generate_model, create_data_holder, FakeDataset + +paddle.enable_static() + + +def apply_pass(use_sharding=False, stage=None): + strategy = auto.Strategy() + strategy.auto_mode = "semi" + strategy.reinit = True + if use_sharding: + sharding = strategy.sharding + sharding.enable = True + sharding.degree = 2 + sharding.stage = 1 + + return strategy + + +def reset_prog(): + paddle.fluid.framework.switch_main_program(paddle.static.Program()) + paddle.fluid.framework.switch_startup_program(paddle.static.Program()) + + +class TestShardingPass(unittest.TestCase): + + def setUp(self): + self.rtol = 1e-6 + self.atol = 1e-8 + self.batch_size = 2 + self.batch_num = 10 + self.clip_norm = 0.2 + self.dataset = FakeDataset(self.batch_size * self.batch_num) + + def init(self, engine): + paddle.seed(2022) + np.random.seed(2022) + random.seed(2022) + place = paddle.fluid.CUDAPlace(ParallelEnv().dev_id) + engine._executor = paddle.static.Executor(place) + + def get_engine(self, use_sharding=False, stage=None): + reset_prog() + + strategy = apply_pass(use_sharding, stage) + clip = paddle.nn.ClipGradByGlobalNorm(self.clip_norm) + opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip) + model, loss = generate_model("dp") + + engine = auto.Engine(model, loss, opt, strategy=strategy) + self.init(engine) + return engine + + def check_results(self, ref_losses, check_losses): + np.testing.assert_allclose( + ref_losses, + check_losses, + rtol=self.rtol, + atol=self.atol, + err_msg='pass {} has wrong results!, \nu={}\nv={}\ndiff={}'.format( + __class__, ref_losses, check_losses, ref_losses - check_losses)) + + def test_sharding_pass(self): + # dp2 training + dp_engine = self.get_engine() + history = dp_engine.fit(self.dataset, 3, batch_size=self.batch_size) + dp_losses = np.array(history.history["loss"]) + + # sharding2 stage1 training + sharding1_engine = self.get_engine(True, 1) + history = sharding1_engine.fit(self.dataset, + 3, + batch_size=self.batch_size) + sharding1_losses = np.array(history.history["loss"]) + self.check_results(dp_losses, sharding1_losses) + + # sharding2 stage2 training + sharding2_engine = self.get_engine(True, 2) + history = sharding2_engine.fit(self.dataset, + 3, + batch_size=self.batch_size) + sharding2_losses = np.array(history.history["loss"]) + self.check_results(dp_losses, sharding2_losses) + + # sharding2 stage3 training + sharding3_engine = self.get_engine(True, 3) + history = sharding3_engine.fit(self.dataset, + 3, + batch_size=self.batch_size) + sharding3_losses = np.array(history.history["loss"]) + self.check_results(dp_losses, sharding3_losses) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py index 0fbe4f5bd3d095..700360452eba85 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_base_cost.py @@ -24,7 +24,7 @@ import paddle.static as static import paddle.nn.functional as F import paddle.utils as utils -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.dist_context import 
DistributedContext from paddle.distributed import fleet @@ -45,9 +45,10 @@ paddle.enable_static() _global_parallel_strategy = "dp_mp_pp" -_global_process_mesh = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]]) -PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]]) -PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]]) +_global_process_mesh = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]], + dim_names=["x", "y", "z"]) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], dim_names=["x", "y"]) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], dim_names=["x", "y"]) class MLPLayer(nn.Layer): @@ -74,16 +75,8 @@ def __init__(self, self.norm = nn.LayerNorm(d_model, epsilon=1e-5) def forward(self, input): - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, "y"]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, ["y", None]) out = self.norm(input) out = self.linear0(out) @@ -111,16 +104,8 @@ def mlp_forward(train_program, start_program): embedding = paddle.nn.Embedding(10, hidden_size, sparse=True) embedding_out = embedding(fill_constant_out) - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, PP_MESH_0, ["x", None]) + auto.shard_tensor(label, PP_MESH_1, ["x", None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py index 0a3a5993ffdb98..c0f7c8781928b4 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_comp_cost.py @@ -82,6 +82,9 @@ from paddle.distributed.auto_parallel.cost.comp_op_cost import Transpose2GradOpCost from paddle.distributed.auto_parallel.cost.comp_op_cost import Unsqueeze2OpCost from paddle.distributed.auto_parallel.cost.comp_op_cost import WriteToArrayOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import DropoutGradOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import FusedSoftmaxMaskUpperTriangleOpCost +from paddle.distributed.auto_parallel.cost.comp_op_cost import FusedSoftmaxMaskUpperTriangleGradOpCost from test_cluster import cluster_json @@ -417,6 +420,22 @@ def test_comp_cost(self): self.assertTrue(op_cost.flops >= 0) self.assertTrue(op_cost.time >= 0) self.assertTrue(op_cost.memory >= 0) + + op_cost = DropoutGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = FusedSoftmaxMaskUpperTriangleOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + + op_cost = FusedSoftmaxMaskUpperTriangleGradOpCost(cluster=cluster) + self.assertTrue(op_cost.flops >= 0) + self.assertTrue(op_cost.time >= 0) + self.assertTrue(op_cost.memory >= 0) + # Remove unnecessary files if os.path.exists(cluster_json_path): os.remove(cluster_json_path) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_conditional_block_reshard.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_conditional_block_reshard.py 
new file mode 100644 index 00000000000000..86371cbae64366 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_conditional_block_reshard.py @@ -0,0 +1,96 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F + +from paddle.static import InputSpec +from paddle.distributed.fleet import auto + + +class MLPLayer(nn.Layer): + + def __init__(self, + hidden_size=64, + intermediate_size=4 * 64, + initializer_range=0.02): + super(MLPLayer, self).__init__() + self.norm = nn.LayerNorm(hidden_size, epsilon=1e-5) + self.linear0 = nn.Linear( + hidden_size, + intermediate_size, + paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)), + bias_attr=None) + self.linear1 = nn.Linear( + intermediate_size, + hidden_size, + paddle.ParamAttr(initializer=nn.initializer.Normal( + mean=0.0, std=initializer_range)), + bias_attr=None) + + def forward(self, input): + out = self.norm(input) + + auto.shard_tensor(self.linear0.weight, auto.ProcessMesh([0, 1], "x"), + [None, "x"]) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + + auto.shard_tensor(self.linear1.weight, auto.ProcessMesh([0, 1], "x"), + ["x", None]) + out = self.linear1(out) + + if paddle.mean(out) < 2: + out = self.norm(out) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + else: + out = self.norm(out) + out = self.linear0(out) + out = self.linear1(out) + + return out + + +def loss_fn(predict, label): + error_cost = paddle.nn.functional.square_error_cost(predict, label) + loss = paddle.mean(error_cost) + return loss + + +class TestSubblock(unittest.TestCase): + + def test_subblock(self): + + mlp = MLPLayer() + + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine(model=mlp, loss=loss_fn, strategy=strategy) + + input_sepc = InputSpec([4, 64], 'float32', 'input') + label_spec = InputSpec([4, 1], 'float32', 'label') + engine.prepare(inputs_spec=[input_sepc], + labels_spec=[label_spec], + mode="predict") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_assign.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_assign.py new file mode 100644 index 00000000000000..b21dd606d8cb7a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_assign.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +from paddle.distributed.fleet import auto + +paddle.enable_static() + + +def make_program(): + main_program = paddle.fluid.Program() + start_program = paddle.fluid.Program() + with paddle.static.program_guard(main_program, start_program): + + x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32') + y = paddle.static.data(name='y', shape=[4, 4, 8], dtype='float32') + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["d"]), + [None, "d", None]) + + z = paddle.add(x, y) + paddle.assign(x, output=z) + + return main_program, start_program + + +def parallelizer(program_func, rank): + from paddle.distributed.auto_parallel.completion import Completer + from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.dist_context import DistributedContext + + main_program, start_program = program_func() + + dist_context = DistributedContext() + completer = Completer(dist_context) + completer.complete_forward_annotation(main_program) + dist_context.block_state.parse_forward_blocks(main_program) + + partitioner = Partitioner(dist_context, rank) + dist_main_prog, _, _ = partitioner.partition(main_program, start_program, + []) + + return dist_main_prog, dist_context + + +class TestDistAssign(unittest.TestCase): + + def test_dist_assign(self): + + dist_main_prog, dist_context = parallelizer(make_program, 0) + ops = dist_main_prog.global_block().ops + for op in ops: + if op.type == "assign": + dist_op = dist_context.get_dist_op_for_program(op) + assert dist_op.dist_attr.impl_type == "assign" + assert dist_op.dist_attr.impl_idx == 0 + + x_name = op.input_arg_names[0] + out_name = op.output_arg_names[0] + out_var = dist_main_prog.global_block().vars[out_name] + dist_out = dist_context.get_dist_tensor_for_program(out_var) + + x_dims_mapping = dist_op.dist_attr.get_input_dims_mapping( + x_name) + out_dims_mapping = dist_op.dist_attr.get_output_dims_mapping( + out_name) + + assert x_dims_mapping == out_dims_mapping + assert out_dims_mapping == dist_out.dist_attr.dims_mapping + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py index 62d87fcc191ad8..d2047332c9a226 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_context.py @@ -25,7 +25,7 @@ import paddle.nn.functional as F from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr @@ -34,7 +34,10 @@ batch_size = 4 hidden_size = 1024 sequence_len = 512 -_g_process_mesh = [[0, 1], [2, 3]] +_g_process_mesh = [ + auto.ProcessMesh([0, 1], dim_names=["x"]), + auto.ProcessMesh([2, 3], dim_names=["x"]) +] def get_random_inputs_and_labels(input_shape, label_shape): @@ -82,18 +85,10 @@ def __init__(self,
def forward(self, input): out = self.norm(input) - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _g_process_mesh[0], - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.linear0.weight, _g_process_mesh[0], [None, "x"]) out = self.linear0(out) out = F.gelu(out, approximate=True) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _g_process_mesh[1], - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.linear1.weight, _g_process_mesh[1], ["x", None]) out = self.linear1(out) return out @@ -123,16 +118,8 @@ def get_program(): dataloader.set_batch_generator(batch_generator_creator(), places=paddle.static.cuda_places()) # data dist_attr - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _g_process_mesh[0], - "dims_mapping": [0, -1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": _g_process_mesh[0], - "dims_mapping": [0, -1, -1] - }) + auto.shard_tensor(input, _g_process_mesh[0], ["x", None, None]) + auto.shard_tensor(label, _g_process_mesh[0], ["x", None, None]) mlp_start = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -212,7 +199,7 @@ def test_deepcopy(self): "_serial_ordered_nodes", "_serial_ordered_tensor_nodes", \ "_serial_ordered_op_nodes", "_original_serial_loss", \ "_original_serial_feed_vars", "_original_serial_fetch_vars", \ - "_serial_loss", "_serial_feed_vars", "_serial_fetch_vars", "_lr_optimizer", \ + "_serial_loss", "_serial_feed_vars", "_serial_fetch_vars", "_serial_optimizer", \ "_backup_serial_main_program_stack", "_backup_serial_startup_program_stack", \ "_pass_context"] diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_embedding.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_embedding.py index 0b81b5bd48ca5f..86eab79b58722c 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_embedding.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_embedding.py @@ -14,7 +14,7 @@ import unittest import paddle -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.fluid import program_guard from paddle.fluid.backward import append_backward @@ -42,19 +42,13 @@ def make_program_lookup_table_v1_mp_dp(): is_sparse=False) loss = paddle.fluid.layers.reduce_mean(emb_out) - auto.shard_tensor(src_ids, - dist_attr={ - "process_mesh": auto.ProcessMesh([[0, 1], [2, - 3]]), - "dims_mapping": [0, -1, -1] - }) + auto.shard_tensor( + src_ids, auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]), + ["x", None, None]) emb_weight = block.vars["emb_weight"] - auto.shard_tensor(emb_weight, - dist_attr={ - "process_mesh": auto.ProcessMesh([[0, 1], [2, - 3]]), - "dims_mapping": [1, -1] - }) + auto.shard_tensor( + emb_weight, auto.ProcessMesh([[0, 1], [2, 3]], + dim_names=["x", "y"]), ["y", None]) return main_program, start_program, loss diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_matmul.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_matmul.py index 8cf2b47660fe5c..6072a226f92480 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_matmul.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_matmul.py @@ -14,7 +14,7 @@ import unittest import paddle -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.fluid import program_guard from paddle.fluid.backward import append_backward @@ -22,82 +22,58 @@ paddle.enable_static() -mesh = [[0, 1], [2, 3]] +mesh = 
auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) def init_x_row(trans_x): if trans_x: x = paddle.static.data(name='x', shape=[10, 6, 8], dtype='float32') - auto.shard_tensor(x, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [0, 1, -1] - }) + auto.shard_tensor(x, mesh, ["x", "y", None]) + return x else: x = paddle.static.data(name='x', shape=[10, 8, 6], dtype='float32') - auto.shard_tensor(x, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [0, -1, 1] - }) + auto.shard_tensor(x, mesh, ["x", None, "y"]) + return x def init_x_col(trans_x): if trans_x: x = paddle.static.data(name='x', shape=[6, 8], dtype='float32') - auto.shard_tensor(x, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(x, mesh, [None, "x"]) + return x else: x = paddle.static.data(name='x', shape=[8, 6], dtype='float32') - auto.shard_tensor(x, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(x, mesh, ["x", None]) + return x def init_y_row(trans_y): if trans_y: y = paddle.static.data(name='y', shape=[4, 6], dtype='float32') - auto.shard_tensor(y, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(y, mesh, [None, "y"]) + return y else: y = paddle.static.data(name='y', shape=[6, 4], dtype='float32') - auto.shard_tensor(y, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(y, mesh, ["y", None]) + return y def init_y_col(trans_y): if trans_y: y = paddle.static.data(name='y', shape=[4, 6], dtype='float32') - auto.shard_tensor(y, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(y, mesh, ["y", None]) + return y else: y = paddle.static.data(name='y', shape=[6, 4], dtype='float32') - auto.shard_tensor(y, - dist_attr={ - "process_mesh": mesh, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(y, mesh, [None, "y"]) + return y diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py index 734bd7acf9dec1..c31991243b697c 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_op_cost.py @@ -16,7 +16,7 @@ import copy import paddle -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.cluster import Cluster from paddle.distributed.auto_parallel.operators.common import get_distributed_operator_impl_container, is_elementwise_op @@ -71,11 +71,8 @@ def make_program(): shape=[4, 1], dtype='float32') label.stop_gradient = True - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) tmp = paddle.fluid.layers.fill_constant_batch_size_like( input=x, shape=[2, 8], value=1, dtype='float32') weight_attr = paddle.ParamAttr() @@ -121,17 +118,12 @@ def make_program(): shape=[8, 1], dtype='float32') label.stop_gradient = True - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0] - }) + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x"]) auto.shard_tensor(label, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) # embedding tmp = paddle.fluid.layers.fill_constant_batch_size_like( 
input=x, shape=[4], value=1, dtype='int32') @@ -141,12 +133,9 @@ def make_program(): for op in main_program.global_block().ops: if op.type == "lookup_table_v2": W = main_program.global_block().vars[op.input("W")[0]] - auto.shard_tensor(W, - dist_attr={ - "process_mesh": - auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.shard_tensor( + W, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) out = paddle.fluid.layers.transpose(out, [1, 0]) # [8, 2] [-1, 0] @@ -154,26 +143,20 @@ def make_program(): param1 = paddle.fluid.layers.create_parameter( [4, 8], paddle.float32) # [2, 8] [0, -1] auto.shard_tensor(param1, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) param2 = paddle.fluid.layers.create_parameter( [8, 8], paddle.float32) # [8, 4] [-1, 0] auto.shard_tensor(param2, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [-1, 0] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + [None, "x"]) out1 = paddle.fluid.layers.matmul(out, param1) # [8, 8] [-1, -1] tmp_param = paddle.fluid.layers.create_parameter( [8, 8], paddle.float32) # [8, 8] [-1, -1] auto.shard_tensor(param2, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [-1, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + [None, None]) tmp_out = paddle.fluid.layers.matmul(out1, tmp_param) out2 = paddle.fluid.layers.matmul(tmp_out, param2) # [8, 4] [-1, 0] @@ -227,17 +210,12 @@ def make_program(): shape=[8, 1], dtype='float32') label.stop_gradient = True - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0] - }) + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x"]) auto.shard_tensor(label, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) # embedding tmp = paddle.fluid.layers.fill_constant_batch_size_like( input=x, shape=[4], value=1, dtype='int32') @@ -247,12 +225,9 @@ def make_program(): for op in main_program.global_block().ops: if op.type == "lookup_table_v2": W = main_program.global_block().vars[op.input("W")[0]] - auto.shard_tensor(W, - dist_attr={ - "process_mesh": - auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.shard_tensor( + W, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) out = paddle.fluid.layers.transpose(out, [1, 0]) # [8, 2] [-1, 0] @@ -260,25 +235,20 @@ def make_program(): param1 = paddle.fluid.layers.create_parameter( [4, 8], paddle.float32) # [2, 8] [0, -1] auto.shard_tensor(param1, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) param2 = paddle.fluid.layers.create_parameter( [8, 8], paddle.float32) # [8, 4] [-1, 0] auto.shard_tensor(param2, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [-1, 0] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + [None, "x"]) out1 = paddle.matmul(out, param1) # [8, 8] [-1, -1] tmp_param = paddle.fluid.layers.create_parameter( [8, 8], paddle.float32) # [8, 8] [-1, -1] auto.shard_tensor(param2, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [-1, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + [None, None]) + tmp_out = paddle.matmul(out1, tmp_param) out2 = paddle.matmul(tmp_out, param2) # [8, 4] [-1, 0] @@ -331,17 +301,11 @@ def make_program(): shape=[8, 1], 
dtype='float32') label.stop_gradient = True - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0] - }) - + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x"]) auto.shard_tensor(label, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) # embedding tmp = paddle.fluid.layers.fill_constant_batch_size_like( input=x, shape=[4], value=1, dtype='int32') @@ -351,12 +315,9 @@ def make_program(): for op in main_program.global_block().ops: if op.type == "lookup_table_v2": W = main_program.global_block().vars[op.input("W")[0]] - auto.shard_tensor(W, - dist_attr={ - "process_mesh": - auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.shard_tensor( + W, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) out = paddle.fluid.layers.transpose(out, [1, 0]) # [8, 2] [-1, 0] @@ -364,25 +325,21 @@ def make_program(): param1 = paddle.fluid.layers.create_parameter( [4, 8], paddle.float32) # [2, 8] [0, -1] auto.shard_tensor(param1, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None]) param2 = paddle.fluid.layers.create_parameter( [8, 8], paddle.float32) # [8, 4] [-1, 0] auto.shard_tensor(param2, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [-1, 0] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + [None, "x"]) + out1 = paddle.fluid.layers.mul(out, param1) # [8, 8] [-1, -1] tmp_param = paddle.fluid.layers.create_parameter( [8, 8], paddle.float32) # [8, 8] [-1, -1] auto.shard_tensor(param2, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [-1, -1] - }) + auto.ProcessMesh([0, 1], dim_names=["x"]), + [None, None]) + tmp_out = paddle.fluid.layers.mul(out1, tmp_param) out2 = paddle.fluid.layers.mul(tmp_out, param2) # [8, 4] [-1, 0] diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py index dfddba3dda1c96..514246388973d8 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_pnorm.py @@ -14,7 +14,7 @@ import unittest import paddle -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.fluid import program_guard from paddle.fluid.backward import append_backward @@ -29,11 +29,8 @@ def make_program_dp2(): with paddle.static.program_guard(main_program, start_program): x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32') x.stop_gradient = False - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1, -1] - }) + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None, None]) tmp_0 = paddle.norm(x, p=2) return main_program, start_program, tmp_0 @@ -44,11 +41,8 @@ def make_program_serial(): with paddle.static.program_guard(main_program, start_program): x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32') x.stop_gradient = False - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0]), - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(x, auto.ProcessMesh([0], dim_names=["x"]), + [None, None, None]) tmp_0 = paddle.norm(x, p=2) return main_program, start_program, tmp_0 diff --git 
a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py index 60b43ef9fe3bc2..bc4918de2e57d3 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_reshape.py @@ -14,7 +14,7 @@ import unittest import paddle -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.fluid import program_guard from paddle.fluid.backward import append_backward @@ -29,11 +29,9 @@ def make_program_dp2(): with paddle.static.program_guard(main_program, start_program): x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32') x.stop_gradient = False - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1, -1] - }) + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None, None]) + tmp_0 = paddle.reshape(x, shape=[0, 0, 4, 2]) tmp_1 = paddle.reshape(tmp_0, shape=[0, 0, 8]) tmp_2 = tmp_1.reshape((tmp_1.shape[0], tmp_1.shape[1], -1)) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_shape.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_shape.py new file mode 100644 index 00000000000000..5e18b7d90c519d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_shape.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +from paddle.distributed.fleet import auto + +paddle.enable_static() + + +def make_program(): + main_program = paddle.fluid.Program() + start_program = paddle.fluid.Program() + with paddle.static.program_guard(main_program, start_program): + x = paddle.static.data(name='x', shape=[4, 4, 8], dtype='float32') + x.stop_gradient = False + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None, None]) + shape = paddle.shape(x) + return main_program, start_program + + +def parallelizer(program_func, rank): + from paddle.distributed.auto_parallel.completion import Completer + from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.dist_context import DistributedContext + + main_program, start_program = program_func() + + dist_context = DistributedContext() + completer = Completer(dist_context) + completer.complete_forward_annotation(main_program) + dist_context.block_state.parse_forward_blocks(main_program) + + partitioner = Partitioner(dist_context, rank) + dist_main_prog, _, _ = partitioner.partition(main_program, start_program, + []) + + return dist_main_prog, dist_context + + +class TestDistShape(unittest.TestCase): + + def test_dist_shape(self): + + dist_main_prog, dist_context = parallelizer(make_program, 0) + ops = dist_main_prog.global_block().ops + shape_op = ops[0] + dist_op = dist_context.get_dist_op_for_program(shape_op) + assert dist_op.dist_attr.impl_type == "shape" + assert dist_op.dist_attr.impl_idx == 0 + + in_name = shape_op.input_arg_names[0] + out_name = shape_op.output_arg_names[0] + in_dims_mapping = dist_op.dist_attr.get_input_dims_mapping(in_name) + out_dims_mapping = dist_op.dist_attr.get_output_dims_mapping(out_name) + + assert in_dims_mapping == [0, -1, -1] + assert out_dims_mapping == [-1] + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py index e12fd0f922a5e8..678828f949bfd9 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_slice.py @@ -14,7 +14,7 @@ import unittest import paddle -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr paddle.enable_static() @@ -25,11 +25,9 @@ def make_program_dp2(): start_program = paddle.fluid.Program() with paddle.static.program_guard(main_program, start_program): x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32') - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0, 1]), - "dims_mapping": [0, -1, -1] - }) + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None, None]) + tmp_0 = x[0] tmp_1 = x[:, 0, :] tmp_2 = x[:, :, 1] @@ -42,11 +40,9 @@ def make_program_serial(): start_program = paddle.fluid.Program() with paddle.static.program_guard(main_program, start_program): x = paddle.static.data(name='x', shape=[4, 5, 6], dtype='float32') - auto.shard_tensor(x, - dist_attr={ - "process_mesh": auto.ProcessMesh([0]), - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(x, auto.ProcessMesh([0], dim_names=["x"]), + [None, None, None]) + tmp_0 = x[0] tmp_1 = x[:, 0, :] tmp_2 = x[:, :, 1] diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_split.py
b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_split.py new file mode 100644 index 00000000000000..566c57a140dc93 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_dist_split.py @@ -0,0 +1,69 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +from paddle.distributed.fleet import auto + +from paddle.fluid import program_guard +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + + +def make_program_dp2(): + main_program = paddle.fluid.Program() + start_program = paddle.fluid.Program() + with paddle.static.program_guard(main_program, start_program): + x = paddle.static.data(name='x', shape=[4, 12, 16], dtype='float32') + x.stop_gradient = False + auto.shard_tensor(x, auto.ProcessMesh([0, 1], dim_names=["x"]), + ["x", None, None]) + out0, out1, out2 = paddle.split(x, num_or_sections=3, axis=1) + return main_program, start_program + + +def parallelizer(program_func, rank): + from paddle.distributed.auto_parallel.completion import Completer + from paddle.distributed.auto_parallel.partitioner import Partitioner + from paddle.distributed.auto_parallel.dist_context import DistributedContext + + main_program, start_program = program_func() + + dist_context = DistributedContext() + completer = Completer(dist_context) + completer.complete_forward_annotation(main_program) + dist_context.block_state.parse_forward_blocks(main_program) + + partitioner = Partitioner(dist_context, rank) + dist_main_prog, _, _ = partitioner.partition(main_program, start_program, + []) + + return dist_main_prog, dist_context + + +class TestDistSplit(unittest.TestCase): + + def test_dist_split_dp2(self): + + for rank in range(2): + dist_main_prog, dist_context = parallelizer(make_program_dp2, rank) + ops = dist_main_prog.global_block().ops + op_dist_attr = dist_context.get_op_dist_attr_for_program(ops[0]) + assert op_dist_attr.impl_type == "split" + assert op_dist_attr.impl_idx == 0 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api_error.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api_error.py new file mode 100644 index 00000000000000..cd825524b8ae30 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_api_error.py @@ -0,0 +1,311 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +import paddle +import paddle.static as static +import paddle.nn as nn +import paddle.nn.functional as F +from paddle.io import Dataset + +from paddle.distributed.fleet import auto + +paddle.enable_static() + + +epoch_num = 1 +batch_size = 2 +batch_num = 10 +hidden_size = 1024 +sequence_len = 512 +image_size = hidden_size +class_num = 10 + +is_fetch = True +is_feed = True +my_feed_vars = [] + + +class TrainDataset(Dataset): + def __init__(self, num_samples): + super(TrainDataset, self).__init__() + self.num_samples = num_samples + + def __getitem__(self, index): + input = np.random.uniform(size=image_size).astype("float32") + label = np.random.randint(0, class_num - 1, dtype="int64") + return input, label + + def __len__(self): + return self.num_samples + + +class TestDataset(Dataset): + def __init__(self, num_samples): + super(TestDataset, self).__init__() + self.num_samples = num_samples + + def __getitem__(self, index): + input = np.random.uniform(size=image_size).astype("float32") + return input + + def __len__(self): + return self.num_samples + + +class MLPLayer(nn.Layer): + def __init__( + self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02, + ): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + weight_attr = paddle.ParamAttr( + initializer=nn.initializer.Normal(mean=0.0, std=initializer_range) + ) + bias_attr = None + + self.linear0 = nn.Linear( + d_model, dim_feedforward, weight_attr, bias_attr=bias_attr + ) + self.linear1 = nn.Linear( + dim_feedforward, d_model, weight_attr, bias_attr=bias_attr + ) + self.linear2 = nn.Linear(d_model, 1, weight_attr, bias_attr=bias_attr) + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") + + def forward(self, input): + out = self.norm(input) + out = self.linear0(out) + + if is_feed: + my_feed_vars.append((out, out.shape)) + + out = F.gelu(out, approximate=True) + out = self.linear1(out) + out = self.dropout(out) + out = self.linear2(out) + + if is_feed: + my_feed_vars.append((out, out.shape)) + if is_fetch: + auto.fetch(out, "my_fetch", logging=True) + return out + + +class TestEngineErrorRaise(unittest.TestCase): + def setUp(self): + class NoSupportData1: + def __getitem__(self, index): + input = np.random.uniform(size=image_size).astype("float32") + label = np.random.randint(0, class_num - 1, dtype="int64") + return input, label + + class NoSupportData2(TrainDataset): + def __getitem__(self, index): + input = [ + list(np.random.uniform(size=image_size).astype("float32")) + ] + label = [np.random.randint(0, class_num - 1, dtype="int64")] + return input, label + + class NoSupportData3: + def __getitem__(self, index): + input = np.random.uniform(size=image_size).astype("float32") + return input + + class NoSupportData4(TestDataset): + def __getitem__(self, index): + input = [ + list(np.random.uniform(size=image_size).astype("float32")) + ] + return input + + self.no_support_data_1 = NoSupportData1() + self.no_support_data_2 = NoSupportData2(10) + self.no_support_data_3 = NoSupportData3() + self.no_support_data_4 = NoSupportData4(10) + + def test_Engine(self): + with self.assertRaises(TypeError): + auto.Engine(model=paddle.static.Program()) + with self.assertRaises(TypeError): + auto.Engine(loss="CrossEntropyLoss") + with 
self.assertRaises(TypeError): + auto.Engine(optimizer="adam") + with self.assertRaises(TypeError): + auto.Engine(metrics=["acc"]) + with self.assertRaises(TypeError): + auto.Engine(cluster="cluster") + with self.assertRaises(TypeError): + auto.Engine(strategy="strategy") + + def test_fit(self): + + with self.assertRaises(TypeError): + + engine = auto.Engine( + model=MLPLayer(), + loss=paddle.nn.CrossEntropyLoss(), + optimizer=paddle.optimizer.AdamW(0.00001), + ) + engine.fit(train_data=self.no_support_data_1) + + with self.assertRaises(TypeError): + + engine = auto.Engine( + model=MLPLayer(), + loss=paddle.nn.CrossEntropyLoss(), + optimizer=paddle.optimizer.AdamW(0.00001), + ) + engine.fit(train_data=self.no_support_data_2) + + def test_evaluate(self): + with self.assertRaises(TypeError): + + engine = auto.Engine( + model=MLPLayer(), + loss=paddle.nn.CrossEntropyLoss(), + metrics=paddle.metric.Accuracy(), + ) + engine.evaluate(valid_data=self.no_support_data_3) + + with self.assertRaises(TypeError): + + engine = auto.Engine( + model=MLPLayer(), + loss=paddle.nn.CrossEntropyLoss(), + metrics=paddle.metric.Accuracy(), + ) + engine.evaluate( + valid_data=self.no_support_data_4, valid_sample_split=1 + ) + + def test_predict(self): + with self.assertRaises(TypeError): + + engine = auto.Engine(model=MLPLayer()) + engine.predict( + test_data=self.no_support_data_3, test_sample_split=1 + ) + + with self.assertRaises(TypeError): + + engine = auto.Engine(model=MLPLayer()) + engine.predict( + test_data=self.no_support_data_4, test_sample_split=1 + ) + + def build_program(self): + main_prog = static.Program() + startup_prog = static.Program() + with static.program_guard(main_prog, startup_prog): + input = static.data( + name="input", + shape=[batch_size // 2, image_size], + dtype='float32', + ) + label = static.data( + name="label", shape=[batch_size // 2, 1], dtype='int64' + ) + mlp = MLPLayer() + loss = paddle.nn.CrossEntropyLoss() + predict = mlp(input) + loss_var = loss(predict, label) + return main_prog, startup_prog, input, label, loss_var + + def test_prepare(self): + with self.assertRaises(ValueError): + engine = auto.Engine(model=MLPLayer()) + engine.prepare() + + with self.assertRaises(AssertionError): + engine = auto.Engine(model=MLPLayer()) + engine.prepare(mode="train") + + with self.assertRaises(TypeError): + input = static.data( + name="input", + shape=[batch_size / 2, image_size], + dtype='float32', + ) + label = static.data( + name="label", shape=[batch_size / 2, 1], dtype='int64' + ) + engine = auto.Engine(model=MLPLayer()) + engine.prepare(inputs_spec=input, labels_spec=label, mode="eval") + + input_spec = static.InputSpec( + shape=[batch_size, image_size], dtype="float32", name="input" + ) + label_spec = static.InputSpec( + shape=[batch_size, image_size], dtype="float32", name="input" + ) + ( + main_prog, + startup_prog, + input_var, + label_var, + loss_var, + ) = self.build_program() + + with self.assertRaises(TypeError): + engine = auto.Engine(loss=loss_var) + engine.prepare( + inputs=input_spec, + labels=label_spec, + main_program=main_prog, + startup_program=startup_prog, + mode="eval", + ) + + with self.assertRaises(AssertionError): + engine = auto.Engine(loss=loss_var) + engine.prepare( + inputs_spec=[input_spec, input_spec], + labels_spec=[label_spec, label_spec], + inputs=input_var, + labels=label_var, + main_program=main_prog, + startup_program=startup_prog, + mode="predict", + ) + + def test_cost(self): + with self.assertRaises(ValueError): + engine = 
auto.Engine(model=MLPLayer()) + engine.cost(mode="predict") + + +class TestEngineDynamicErrorRaise(unittest.TestCase): + def setUp(self): + paddle.disable_static() + + def tearDown(self): + paddle.enable_static() + + def test_cost(self): + with self.assertRaises(ValueError): + engine = auto.Engine(model=MLPLayer()) + engine.cost(mode="predict") + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_callbacks.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_callbacks.py new file mode 100644 index 00000000000000..9baaee353f7153 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_engine_callbacks.py @@ -0,0 +1,173 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import tempfile +import shutil +import time +import random + +import paddle +import paddle.vision.transforms as T + +from paddle.static import InputSpec +from paddle.distributed.fleet import auto +from paddle.distributed.auto_parallel.callbacks import config_callbacks +from paddle.vision.models import LeNet +from paddle.vision.datasets import MNIST + +paddle.enable_static() + + +class TestCallbacks(unittest.TestCase): + + def setUp(self): + self.save_dir = tempfile.mkdtemp() + + def tearDown(self): + shutil.rmtree(self.save_dir) + + def run_callback(self): + epochs = 2 + steps = 5 + freq = 2 + eval_steps = 2 + + inputs_spec = [InputSpec([None, 1, 28, 28], 'float32', 'image')] + strategy = auto.Strategy() + strategy.auto_mode = "semi" + + engine = auto.Engine(LeNet(), strategy=strategy) + engine.prepare(inputs_spec, mode="predict") + + cbks = config_callbacks(engine=engine, + batch_size=128, + epochs=epochs, + steps=steps, + log_freq=freq, + verbose=self.verbose, + metrics=['loss', 'acc'], + save_dir=self.save_dir) + cbks.on_begin('train') + + logs = {'loss': 50.341673, 'acc': 0.00256} + for epoch in range(epochs): + cbks.on_epoch_begin(epoch) + for step in range(steps): + cbks.on_batch_begin('train', step, logs) + logs['loss'] -= random.random() * 0.1 + logs['acc'] += random.random() * 0.1 + time.sleep(0.005) + cbks.on_batch_end('train', step, logs) + cbks.on_epoch_end(epoch, logs) + + eval_logs = {'eval_loss': 20.341673, 'eval_acc': 0.256} + params = { + 'steps': eval_steps, + 'metrics': ['eval_loss', 'eval_acc'], + } + cbks.on_begin('eval', params) + for step in range(eval_steps): + cbks.on_batch_begin('eval', step, eval_logs) + eval_logs['eval_loss'] -= random.random() * 0.1 + eval_logs['eval_acc'] += random.random() * 0.1 + eval_logs['batch_size'] = 2 + time.sleep(0.005) + cbks.on_batch_end('eval', step, eval_logs) + cbks.on_end('eval', eval_logs) + + test_logs = {} + params = {'steps': eval_steps} + cbks.on_begin('predict', params) + for step in range(eval_steps): + cbks.on_batch_begin('predict', step, test_logs) + test_logs['batch_size'] = 2 + time.sleep(0.005) + cbks.on_batch_end('predict', step, test_logs) + cbks.on_end('predict', 
test_logs) + + cbks.on_end('train') + + print(engine.history.history) + + def test_callback_verbose_0(self): + self.verbose = 0 + self.run_callback() + + def test_callback_verbose_1(self): + self.verbose = 1 + self.run_callback() + + def test_callback_verbose_2(self): + self.verbose = 2 + self.run_callback() + + def test_callback_verbose_3(self): + self.verbose = 3 + self.run_callback() + + +class TestCallbacksEngine(unittest.TestCase): + + def setUp(self): + self.save_dir = tempfile.mkdtemp() + transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) + self.train_dataset = MNIST(mode='train', transform=transform) + self.test_dataset = MNIST(mode='test', transform=transform) + self.prepare_engine() + + def tearDown(self): + shutil.rmtree(self.save_dir) + + def prepare_engine(self): + model = paddle.vision.models.LeNet() + loss = paddle.nn.CrossEntropyLoss() + base_lr = 1e-3 + boundaries = [5, 8] + values = [base_lr * (0.1**i) for i in range(len(boundaries) + 1)] + lr = paddle.optimizer.lr.PiecewiseDecay(boundaries=boundaries, + values=values, + verbose=False) + optimizer = paddle.optimizer.Adam(learning_rate=lr, + parameters=model.parameters()) + auto.fetch(model.parameters()[0], "param0", logging=True) + metrics = paddle.metric.Accuracy(topk=(1, 2)) + self.engine = auto.Engine(model, loss, optimizer, metrics) + + def test_fit_eval(self): + history = self.engine.fit(train_data=self.train_dataset, + valid_data=self.test_dataset, + batch_size=128, + steps_per_epoch=60, + valid_steps=40, + log_freq=20, + save_dir=self.save_dir, + save_freq=1) + print(history.history) + + def test_eval(self): + self.engine.evaluate(valid_data=self.test_dataset, + batch_size=128, + steps=40, + log_freq=10) + + def test_predict(self): + logger_cbks = paddle.callbacks.ProgBarLogger() + self.engine.predict(test_data=self.test_dataset, + batch_size=128, + callbacks=[logger_cbks]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_interface.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_interface.py new file mode 100644 index 00000000000000..5d2b6eacf4b9f8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_interface.py @@ -0,0 +1,224 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle +import paddle.fluid as fluid +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.static as static +import paddle.distributed as dist +from paddle.distributed.fleet import auto +from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 +process_mesh1 = ProcessMesh(mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], + dim_names=["x", "y"]) +process_mesh2 = ProcessMesh(mesh=[0, 1, 2, 3], dim_names=["x"]) + + +class MLPLayer(nn.Layer): + + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + param_initializer = nn.initializer.Normal(mean=0.0, + std=initializer_range) + + self.linear0 = nn.Linear( + d_model, + dim_feedforward, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + self.linear1 = nn.Linear( + dim_feedforward, + d_model, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + + def forward(self, input): + auto.shard_tensor(self.linear0.weight, process_mesh1[0], [None, "y"]) + linear0 = auto.shard_op(self.linear0, process_mesh1, + [["y", None, None]], [[None, "x", None]]) + linear0_out = linear0(input) + + gelu = auto.shard_op(F.gelu, process_mesh1, [["y", "x", None], None]) + gelu_out = gelu(linear0_out, approximate=True) + + auto.shard_tensor(self.linear1.weight, shard_spec=["y", None]) + linear1 = auto.shard_op(self.linear1, + process_mesh1[1], + out_shard_specs=[["y", None, None]]) + linear1_out = linear1(gelu_out) + + return self.linear0, self.linear1, linear0_out, gelu_out, linear1_out + + +class TestAutoParallelAPI(unittest.TestCase): + + def test_api(self): + # input + input = static.data(name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data(name="label", + shape=[batch_size, sequence_len, 1], + dtype='float32') + + auto.shard_tensor(input, process_mesh1, ["x", None, None]) + auto.shard_tensor(label, process_mesh1, ["y", None, None]) + + mlp = MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + + with ProcessMesh(process_mesh1.mesh, process_mesh1.dim_names): + linear0, linear1, linear0_out, gelu_out, linear1_out = mlp(input) + + default_program = paddle.fluid.default_main_program() + default_dist_context = get_default_distributed_context() + + self.assertEqual(len(default_program.blocks[0].ops), 5) + matmul0 = default_program.blocks[0].ops[0] + self.assertEqual(matmul0.type, "matmul_v2") + ewise_add0 = default_program.blocks[0].ops[1] + self.assertEqual(ewise_add0.type, "elementwise_add") + gelu = default_program.blocks[0].ops[2] + self.assertEqual(gelu.type, "gelu") + matmul1 = default_program.blocks[0].ops[3] + self.assertEqual(matmul1.type, "matmul_v2") + ewise_add1 = default_program.blocks[0].ops[4] + self.assertEqual(ewise_add1.type, "elementwise_add") + + dist_input = default_dist_context.get_dist_tensor_for_program(input) + self.assertEqual(dist_input.dist_attr.process_mesh, process_mesh1) + self.assertEqual(dist_input.dist_attr.dims_mapping, [0, -1, -1]) + 
self.assertTrue(dist_input.dist_attr.is_annotated("process_mesh")) + self.assertTrue(dist_input.dist_attr.is_annotated("dims_mapping")) + + dist_input = default_dist_context.get_dist_tensor_for_program(label) + self.assertEqual(dist_input.dist_attr.process_mesh, process_mesh1) + self.assertEqual(dist_input.dist_attr.dims_mapping, [1, -1, -1]) + self.assertTrue(dist_input.dist_attr.is_annotated("process_mesh")) + self.assertTrue(dist_input.dist_attr.is_annotated("dims_mapping")) + + dist_linear0_weight = default_dist_context.get_dist_tensor_for_program( + linear0.weight) + self.assertEqual(dist_linear0_weight.dist_attr.process_mesh, + process_mesh1[0]) + self.assertEqual(dist_linear0_weight.dist_attr.dims_mapping, [-1, 0]) + self.assertTrue( + dist_linear0_weight.dist_attr.is_annotated("process_mesh")) + self.assertTrue( + dist_linear0_weight.dist_attr.is_annotated("dims_mapping")) + + dist_linear1_weight = default_dist_context.get_dist_tensor_for_program( + linear1.weight) + self.assertEqual(dist_linear1_weight.dist_attr.process_mesh, + process_mesh1) + self.assertEqual(dist_linear1_weight.dist_attr.dims_mapping, [1, -1]) + self.assertTrue( + dist_linear1_weight.dist_attr.is_annotated("process_mesh")) + self.assertTrue( + dist_linear1_weight.dist_attr.is_annotated("dims_mapping")) + + dist_linear1_out = default_dist_context.get_dist_tensor_for_program( + linear1_out) + self.assertEqual(dist_linear1_out.dist_attr.process_mesh, process_mesh1) + self.assertEqual(dist_linear1_out.dist_attr.dims_mapping, [-1, -1, -1]) + self.assertTrue(dist_linear1_out.dist_attr.is_annotated("process_mesh")) + self.assertFalse( + dist_linear1_out.dist_attr.is_annotated("dims_mapping")) + + dist_op = default_dist_context.get_dist_op_for_program(matmul0) + self.assertEqual(dist_op.dist_attr.process_mesh, process_mesh1) + self.assertEqual(dist_op.dist_attr.impl_type, "default") + self.assertEqual(dist_op.dist_attr.impl_idx, 0) + self.assertTrue(dist_op.dist_attr.is_annotated("process_mesh")) + tensor_dist_attr = dist_op.dist_attr.get_input_dist_attr(input.name) + self.assertEqual(tensor_dist_attr.process_mesh, process_mesh1) + self.assertEqual(tensor_dist_attr.dims_mapping, [1, -1, -1]) + self.assertTrue(tensor_dist_attr.is_annotated("process_mesh")) + self.assertTrue(tensor_dist_attr.is_annotated("dims_mapping")) + + dist_op = default_dist_context.get_dist_op_for_program(ewise_add0) + self.assertEqual(dist_op.dist_attr.process_mesh, process_mesh1) + self.assertEqual(dist_op.dist_attr.impl_type, "default") + self.assertEqual(dist_op.dist_attr.impl_idx, 0) + tensor_dist_attr = dist_op.dist_attr.get_output_dist_attr( + linear0_out.name) + self.assertEqual(tensor_dist_attr.process_mesh, process_mesh1) + self.assertEqual(tensor_dist_attr.dims_mapping, [-1, 0, -1]) + self.assertTrue(tensor_dist_attr.is_annotated("process_mesh")) + self.assertTrue(tensor_dist_attr.is_annotated("dims_mapping")) + self.assertTrue(dist_op.dist_attr.is_annotated("process_mesh")) + + dist_op = default_dist_context.get_dist_op_for_program(gelu) + self.assertEqual(dist_op.dist_attr.process_mesh, process_mesh1) + self.assertEqual(dist_op.dist_attr.impl_type, "default") + self.assertEqual(dist_op.dist_attr.impl_idx, 0) + self.assertTrue(dist_op.dist_attr.is_annotated("process_mesh")) + tensor_dist_attr = dist_op.dist_attr.get_input_dist_attr( + linear0_out.name) + self.assertEqual(tensor_dist_attr.process_mesh, process_mesh1) + self.assertEqual(tensor_dist_attr.dims_mapping, [1, 0, -1]) + 
self.assertTrue(tensor_dist_attr.is_annotated("process_mesh")) + self.assertTrue(tensor_dist_attr.is_annotated("dims_mapping")) + tensor_dist_attr = dist_op.dist_attr.get_output_dist_attr(gelu_out.name) + self.assertEqual(tensor_dist_attr.process_mesh, process_mesh1) + self.assertEqual(tensor_dist_attr.dims_mapping, [-1, -1, -1]) + self.assertTrue(tensor_dist_attr.is_annotated("process_mesh")) + self.assertFalse(tensor_dist_attr.is_annotated("dims_mapping")) + + dist_op = default_dist_context.get_dist_op_for_program(matmul1) + self.assertEqual(dist_op.dist_attr.process_mesh, process_mesh1[1]) + self.assertEqual(dist_op.dist_attr.impl_type, "default") + self.assertEqual(dist_op.dist_attr.impl_idx, 0) + self.assertTrue(dist_op.dist_attr.is_annotated("process_mesh")) + tensor_dist_attr = dist_op.dist_attr.get_input_dist_attr(gelu_out.name) + self.assertEqual(tensor_dist_attr.process_mesh, process_mesh1[1]) + self.assertEqual(tensor_dist_attr.dims_mapping, [-1, -1, -1]) + self.assertTrue(tensor_dist_attr.is_annotated("process_mesh")) + self.assertFalse(tensor_dist_attr.is_annotated("dims_mapping")) + + dist_op = default_dist_context.get_dist_op_for_program(ewise_add1) + self.assertEqual(dist_op.dist_attr.process_mesh, process_mesh1[1]) + self.assertEqual(dist_op.dist_attr.impl_type, "default") + self.assertEqual(dist_op.dist_attr.impl_idx, 0) + self.assertTrue(dist_op.dist_attr.is_annotated("process_mesh")) + tensor_dist_attr = dist_op.dist_attr.get_output_dist_attr( + linear1_out.name) + self.assertEqual(tensor_dist_attr.process_mesh, process_mesh1[1]) + self.assertEqual(tensor_dist_attr.dims_mapping, [0, -1, -1]) + self.assertTrue(tensor_dist_attr.is_annotated("process_mesh")) + self.assertTrue(tensor_dist_attr.is_annotated("dims_mapping")) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_lr_grad_clip.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_lr_grad_clip.py index e7d73921eb34f8..3fed759424aaec 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_lr_grad_clip.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_lr_grad_clip.py @@ -20,14 +20,12 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto import paddle.distributed.fleet as fleet from paddle.io import Dataset from paddle.static import InputSpec from paddle.fluid.framework import _non_static_mode -from paddle.distributed.auto_parallel.engine import Engine -from paddle.distributed.auto_parallel.hepler import ProgramHelper from test_to_static import MLPLayer, MyDataset @@ -61,15 +59,13 @@ def init_dataset(self): self.dataset = MyDataset(self.batch_num * self.batch_size) def init_engine(self): - inputs = InputSpec([self.batch_size, self.hidden_size], 'float32', 'x') - labels = InputSpec([self.batch_size], 'int64', 'label') + # inputs = InputSpec([self.batch_size, self.hidden_size], 'float32', 'x') + # labels = InputSpec([self.batch_size], 'int64', 'label') - self.engine = Engine(model=self.mlp, - inputs_spec=inputs, - labels_spec=labels) - self.engine.prepare(optimizer=self.optimizer, - loss=self.loss, - metrics=paddle.metric.Accuracy()) + self.engine = auto.Engine(model=self.mlp, + loss=self.loss, + optimizer=self.optimizer, + metrics=paddle.metric.Accuracy()) class TestLRScheduler(TestEngineBase): @@ -81,9 +77,9 @@ def init_optimizer(self): def test_lr_scheduler(self): self.init_engine() + 
self.engine.fit(self.dataset, batch_size=self.batch_size) lr = self.engine._optimizer._learning_rate assert isinstance(lr, paddle.optimizer.lr.LRScheduler) - self.engine.fit(self.dataset, batch_size=self.batch_size) class TestGradClipByGlobalNorm(TestEngineBase): @@ -95,7 +91,6 @@ def init_optimizer(self): def test_grad_clip(self): - clip = self.engine._optimizer._grad_clip self.engine.fit(self.dataset, batch_size=self.batch_size) self.check_program() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner.py new file mode 100644 index 00000000000000..ab48e2838f9b99 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner.py @@ -0,0 +1,141 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +import paddle.static as static + +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.dist_context import DistributedContext, set_default_distributed_context +from paddle.distributed.auto_parallel.tuner.parallel_tuner import ParallelTuner +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +import sys + +sys.path.append("..") +import auto_parallel_gpt_model as modeling +from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = [ + ProcessMesh([0, 1], dim_names=["x"]), + ProcessMesh([2, 3], dim_names=["x"]) +] + + +def get_program_v3(): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + # fleet.init(is_collective=True, strategy=dist_strategy) + place = paddle.set_device("gpu") + gpus = [0, 1] + batch_size = 8 + sequence_len = 512 + vocab_size = 1000 + + train_program = static.Program() + start_program = static.Program() + modeling.init_global() + modeling._global_parallel_strategy = None + # modeling.DPMPPP_MESH_LIST = [ + # ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]), + # ProcessMesh([[4, 5], [6, 7]], dim_names=["x", "y"]) + # ] + with static.program_guard(train_program, start_program): + tokens = paddle.static.data(name="tokens", + shape=[batch_size, sequence_len], + dtype='int64') + position_ids = paddle.static.data(name="position_ids", + shape=[batch_size, sequence_len], + dtype='int64') + attention_mask = paddle.static.data( + name="attention_mask", + shape=[batch_size, 1, sequence_len, sequence_len], + dtype='float32') + labels = paddle.static.data(name="labels", + shape=[batch_size, sequence_len], + dtype='int64') + loss_mask = paddle.static.data(name="loss_mask", + shape=[batch_size, sequence_len], + dtype='float32') + data_holder = [tokens, position_ids, attention_mask, labels, loss_mask] + + gpt = GPTModel(vocab_size=1000, + hidden_size=1024, + num_hidden_layers=2, + 
num_attention_heads=16, + intermediate_size=4 * 1024, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=1024, + type_vocab_size=1, + initializer_range=0.02, + pad_token_id=0, + eos_token_id=7, + bos_token_id=0, + eol_token_id=3, + pp_degree=1) + + model = GPTForPretraining(gpt, + vocab_size=1000, + hidden_size=64, + initializer_range=0.02) + preds = model(tokens, position_ids, attention_mask) + criterion = GPTPretrainingCriterion() + loss = criterion(preds, labels, loss_mask) + + optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + feed_vars = { + "inputs": [tokens, position_ids, attention_mask, loss_mask], + "labels": [labels] + } + fetch_vars = {"loss": [loss]} + + return train_program, start_program, None, loss, optimizer, feed_vars, fetch_vars + + +class TestParallelTunerTrain(unittest.TestCase): + + def test_tune_with_train(self): + flag = False + set_default_distributed_context(DistributedContext()) + train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars = get_program_v3( + ) + cluster = Cluster() + cluster.gen_default_config_cluster(node_count=1, device_count=8) + dist_context = DistributedContext(train_program, start_program, + optimizer, loss, feed_vars, + fetch_vars, cluster) + dist_context.initialize() + parallel_tuner = ParallelTuner(dist_context, max_trials=3, mode="train") + parallel_tuner.tune() + parallel_tuner._store_best_parallel_strategy() + flag = True + self.assertTrue(flag) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner_full.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner_full.py new file mode 100644 index 00000000000000..27833a6a185009 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner_full.py @@ -0,0 +1,147 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest + +import paddle +import paddle.static as static + +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.dist_context import DistributedContext, set_default_distributed_context +from paddle.distributed.auto_parallel.tuner.parallel_tuner import ParallelTuner +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +from paddle.distributed.auto_parallel.planner_v2 import Planner +from paddle.distributed.auto_parallel.strategy import Strategy +import sys + +sys.path.append("..") +import auto_parallel_gpt_model as modeling +from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = [ + ProcessMesh([0, 1], dim_names=["x"]), + ProcessMesh([2, 3], dim_names=["x"]) +] + + +def get_program_v3(): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + # fleet.init(is_collective=True, strategy=dist_strategy) + place = paddle.set_device("gpu") + gpus = [0, 1] + batch_size = 8 + sequence_len = 512 + vocab_size = 1000 + + train_program = static.Program() + start_program = static.Program() + modeling.init_global() + modeling._global_parallel_strategy = "dp_mp_pp" + modeling.DPMPPP_MESH_LIST = [ + ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]), + ProcessMesh([[4, 5], [6, 7]], dim_names=["x", "y"]) + ] + with static.program_guard(train_program, start_program): + tokens = paddle.static.data(name="tokens", + shape=[batch_size, sequence_len], + dtype='int64') + position_ids = paddle.static.data(name="position_ids", + shape=[batch_size, sequence_len], + dtype='int64') + attention_mask = paddle.static.data( + name="attention_mask", + shape=[batch_size, 1, sequence_len, sequence_len], + dtype='float32') + labels = paddle.static.data(name="labels", + shape=[batch_size, sequence_len], + dtype='int64') + loss_mask = paddle.static.data(name="loss_mask", + shape=[batch_size, sequence_len], + dtype='float32') + data_holder = [tokens, position_ids, attention_mask, labels, loss_mask] + + gpt = GPTModel(vocab_size=1000, + hidden_size=1024, + num_hidden_layers=2, + num_attention_heads=16, + intermediate_size=4 * 1024, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=1024, + type_vocab_size=1, + initializer_range=0.02, + pad_token_id=0, + eos_token_id=7, + bos_token_id=0, + eol_token_id=3, + pp_degree=len(modeling.DPMPPP_MESH_LIST)) + + model = GPTForPretraining(gpt, + vocab_size=1000, + hidden_size=64, + initializer_range=0.02) + preds = model(tokens, position_ids, attention_mask) + criterion = GPTPretrainingCriterion() + loss = criterion(preds, labels, loss_mask) + + optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + feed_vars = { + "inputs": [tokens, position_ids, attention_mask, loss_mask], + "labels": [labels] + } + fetch_vars = {"loss": [loss]} + + return train_program, start_program, None, loss, optimizer, feed_vars, fetch_vars + + +class TestParallelTunerFull(unittest.TestCase): + + def test_tune_with_planner(self): + flag = False + set_default_distributed_context(DistributedContext()) + train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars = get_program_v3( + ) + cluster = Cluster() + cluster.gen_default_config_cluster(node_count=1, device_count=8) + 
strategy = Strategy() + strategy.auto_mode = "full" + dist_context = DistributedContext(train_program, start_program, + optimizer, loss, feed_vars, + fetch_vars, cluster, strategy) + dist_context.initialize() + planner = Planner("train", dist_context) + planner._parallel_tuner = ParallelTuner(planner._dist_context, + mode=planner._mode, + max_trials=3) + planner.plan() + flag = True + self.assertTrue(flag) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner_predict.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner_predict.py new file mode 100644 index 00000000000000..2d7a2c10579a7a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_parallel_tuner_predict.py @@ -0,0 +1,144 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import paddle +import paddle.static as static + +from paddle.distributed import fleet +from paddle.distributed.auto_parallel.cluster import Cluster +from paddle.distributed.auto_parallel.dist_context import DistributedContext, set_default_distributed_context +from paddle.distributed.auto_parallel.tuner.parallel_tuner import ParallelTuner +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +import sys + +sys.path.append("..") +import auto_parallel_gpt_model as modeling +from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 +_g_process_mesh = [ + ProcessMesh([0, 1], dim_names=["x"]), + ProcessMesh([2, 3], dim_names=["x"]) +] + + +def get_program_v3(): + dist_strategy = fleet.DistributedStrategy() + dist_strategy.semi_auto = True + # fleet.init(is_collective=True, strategy=dist_strategy) + place = paddle.set_device("gpu") + gpus = [0, 1] + batch_size = 8 + sequence_len = 512 + vocab_size = 1000 + + train_program = static.Program() + start_program = static.Program() + modeling.init_global() + modeling._global_parallel_strategy = "dp_mp_pp" + modeling.DPMPPP_MESH_LIST = [ + ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]), + ProcessMesh([[4, 5], [6, 7]], dim_names=["x", "y"]) + ] + with static.program_guard(train_program, start_program): + tokens = paddle.static.data(name="tokens", + shape=[batch_size, sequence_len], + dtype='int64') + position_ids = paddle.static.data(name="position_ids", + shape=[batch_size, sequence_len], + dtype='int64') + attention_mask = paddle.static.data( + name="attention_mask", + shape=[batch_size, 1, sequence_len, sequence_len], + dtype='float32') + labels = paddle.static.data(name="labels", + shape=[batch_size, sequence_len], + dtype='int64') + loss_mask = paddle.static.data(name="loss_mask", + shape=[batch_size, sequence_len], + dtype='float32') + data_holder = [tokens, position_ids, attention_mask, labels, loss_mask] + + gpt = GPTModel(vocab_size=1000, + hidden_size=1024, + 
num_hidden_layers=2, + num_attention_heads=16, + intermediate_size=4 * 1024, + hidden_act="gelu", + hidden_dropout_prob=0.0, + attention_probs_dropout_prob=0.0, + max_position_embeddings=1024, + type_vocab_size=1, + initializer_range=0.02, + pad_token_id=0, + eos_token_id=7, + bos_token_id=0, + eol_token_id=3, + pp_degree=len(modeling.DPMPPP_MESH_LIST)) + + model = GPTForPretraining(gpt, + vocab_size=1000, + hidden_size=64, + initializer_range=0.02) + preds = model(tokens, position_ids, attention_mask) + criterion = GPTPretrainingCriterion() + loss = criterion(preds, labels, loss_mask) + + optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=None) + + feed_vars = { + "inputs": [tokens, position_ids, attention_mask, loss_mask], + "labels": [labels] + } + fetch_vars = {"loss": [loss]} + + return train_program, start_program, None, loss, optimizer, feed_vars, fetch_vars + + +class TestParallelTunerPredict(unittest.TestCase): + + def test_tune_predict(self): + flag = False + set_default_distributed_context(DistributedContext()) + train_program, start_program, dataloader, loss, optimizer, feed_vars, fetch_vars = get_program_v3( + ) + cluster = Cluster() + cluster.gen_default_config_cluster(node_count=1, device_count=8) + dist_context = DistributedContext(train_program, start_program, + optimizer, loss, feed_vars, + fetch_vars, cluster) + dist_context.initialize() + + parallel_tuner = ParallelTuner(dist_context, + max_trials=3, + mode="predict") + parallel_tuner.tune() + flag = True + + self.assertTrue(flag) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_amp.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_amp.py new file mode 100644 index 00000000000000..ed2cf0328e85c5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_amp.py @@ -0,0 +1,49 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import tempfile +import unittest +import os +import sys +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage + + +class TestAMPPass(unittest.TestCase): + + def test_mp2(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, "amp_pass_unittest.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + tmp_dir = tempfile.TemporaryDirectory() + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "paddle.distributed.launch", "--devices", "0,1", "--log_dir", + tmp_dir.name, launch_model_path + ] + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + tmp_dir.cleanup() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_grad_clip.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_grad_clip.py similarity index 100% rename from python/paddle/fluid/tests/unittests/auto_parallel/test_grad_clip.py rename to python/paddle/fluid/tests/unittests/auto_parallel/test_pass_grad_clip.py diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_gradient_merge.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_gradient_merge.py new file mode 100644 index 00000000000000..e55ddbea583366 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_gradient_merge.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import unittest +import os +import sys +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage + + +class TestGradientMergePass(unittest.TestCase): + + def test_dp2(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, + "gradient_merge_pass_unittest.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + tmp_dir = tempfile.TemporaryDirectory() + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "paddle.distributed.launch", "--devices", "0,1", "--log_dir", + tmp_dir.name, launch_model_path + ] + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + tmp_dir.cleanup() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_quantization.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_quantization.py new file mode 100644 index 00000000000000..b1b888d2b0da95 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_quantization.py @@ -0,0 +1,98 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import sys +import random +import numpy as np +import paddle + +from paddle.distributed.fleet import auto +from get_gpt_model import generate_model, create_data_holder, FakeDataset + +paddle.enable_static() + + +def apply_pass(): + dist_strategy = auto.Strategy() + dist_strategy.auto_mode = "semi" + qat = dist_strategy.qat + qat.enable = True + qat.channel_wise_abs_max = True + qat.weight_bits = 8 + qat.activation_bits = 8 + qat.not_quant_pattern = ['skip_quant'] + return dist_strategy + + +class TestQuantizationPass(unittest.TestCase): + + def test_qat_pass(self): + + batch_size = 8 + batch_num = 10 + + strategy = apply_pass() + model, loss = generate_model("serial") + opt = paddle.optimizer.AdamW(learning_rate=0.00001) + engine = auto.Engine(model, loss, opt, strategy=strategy) + dataset = FakeDataset(batch_size * batch_num) + engine.fit(dataset, 3, batch_size=batch_size) + + self.check_program(engine.main_program) + + def check_program(self, program): + + quantizable_op_and_inputs = {'matmul_v2': ['X', 'Y']} + quantizable_grad_op_inputs = {'matmul_v2_grad': ['X', 'Y']} + + quantized_ops = set() + for block in program.blocks: + for op in block.ops: + is_quantized = False + if op.type in quantizable_op_and_inputs: + for arg_name in op.input_arg_names: + if ".quantized" in arg_name: + is_quantized = True + + if not is_quantized: + continue + + # check forward + if op.type in quantizable_op_and_inputs: + for arg_name in op.input_arg_names: + assert arg_name.endswith('.quantized.dequantized') + quantized_ops.add(arg_name) + + for op in block.ops: + is_quantized = False + if op.type in quantizable_grad_op_inputs: + for pname in quantizable_grad_op_inputs[op.type]: + arg_name = op.input(pname)[0] + if ".quantized" in arg_name: + is_quantized = True + + if not is_quantized: + continue + + # check backward + if op.type in quantizable_grad_op_inputs: + for pname in quantizable_grad_op_inputs[op.type]: + arg_name = op.input(pname)[0] + assert arg_name.endswith('.quantized.dequantized') + assert arg_name in quantized_ops + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_recompute.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_recompute.py new file mode 100644 index 00000000000000..e7eb7ddd2a604b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_recompute.py @@ -0,0 +1,49 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import unittest +import os +import sys +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage + + +class TestRecomputePass(unittest.TestCase): + + def test_mp2(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, "recompute_pass_unittest.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + tmp_dir = tempfile.TemporaryDirectory() + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "paddle.distributed.launch", "--devices", "0,1", "--log_dir", + tmp_dir.name, launch_model_path + ] + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + tmp_dir.cleanup() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_sharding.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_sharding.py new file mode 100644 index 00000000000000..77e969c83bf812 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_pass_sharding.py @@ -0,0 +1,49 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import tempfile +import unittest +import os +import sys +import shutil +import subprocess +from paddle.distributed.fleet.launch_utils import run_with_coverage + + +class TestShardingPass(unittest.TestCase): + + def test_dp2sharding2(self): + file_dir = os.path.dirname(os.path.abspath(__file__)) + launch_model_path = os.path.join(file_dir, "sharding_pass_unittest.py") + + if os.environ.get("WITH_COVERAGE", "OFF") == "ON": + coverage_args = ["-m", "coverage", "run", "--branch", "-p"] + else: + coverage_args = [] + + tmp_dir = tempfile.TemporaryDirectory() + cmd = [sys.executable, "-u"] + coverage_args + [ + "-m", "paddle.distributed.launch", "--devices", "0,1", "--log_dir", + tmp_dir.name, launch_model_path + ] + + process = subprocess.Popen(cmd) + process.wait() + self.assertEqual(process.returncode, 0) + + tmp_dir.cleanup() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_prim_dist_op.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_prim_dist_op.py index 67894f6dd93df9..b3dcd97cd20fdc 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_prim_dist_op.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_prim_dist_op.py @@ -14,13 +14,13 @@ import unittest import paddle -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.fluid import program_guard from paddle.incubate.autograd import prim2orig, enable_prim, prim_enabled from paddle.fluid.layer_helper import LayerHelper from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.partitioner import Partitioner from paddle.distributed.auto_parallel.utils import set_var_dist_attr @@ -78,7 +78,7 @@ def init_prog(self): outputs={'Z': self.w_grad}, attrs=self.attrs) - op = self.layer_help.append_op(type="reduce_p", + op = self.layer_help.append_op(type="reduce_sum_p", inputs={'X': self.tmp2}, outputs={'Y': self.batch_reduced}, attrs={"axis": [0]}) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh.py new file mode 100644 index 00000000000000..ce38780564b5be --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh.py @@ -0,0 +1,145 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License + +import unittest +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.nn as nn +import paddle.nn.functional as F +import paddle.static as static +from paddle.distributed.fleet import auto +from paddle.distributed.auto_parallel.process_mesh import ProcessMesh +from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context +from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr + +paddle.enable_static() + +batch_size = 4 +epoch_num = 10 +hidden_size = 1024 +sequence_len = 512 + + +class MLPLayer(nn.Layer): + + def __init__(self, + hidden_size=1024, + intermediate_size=4 * 1024, + dropout_ratio=0.1, + initializer_range=0.02): + super(MLPLayer, self).__init__() + d_model = hidden_size + dim_feedforward = intermediate_size + param_initializer = nn.initializer.Normal(mean=0.0, + std=initializer_range) + + self.norm = nn.LayerNorm(d_model, epsilon=1e-5) + self.linear0 = nn.Linear( + d_model, + dim_feedforward, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + self.linear1 = nn.Linear( + dim_feedforward, + d_model, + weight_attr=paddle.ParamAttr(initializer=param_initializer), + bias_attr=None) + + def forward(self, input): + out = self.norm(input) + out = self.linear0(out) + out = F.gelu(out, approximate=True) + out = self.linear1(out) + return out + + +class TestProcessMesh(unittest.TestCase): + + def test_construction(self): + mesh = [[0, 1, 2], [3, 4, 5]] + process_mesh = ProcessMesh(mesh, dim_names=["x", "y"]) + self.assertEqual(process_mesh.shape, [2, 3]) + self.assertEqual(process_mesh.process_ids, [0, 1, 2, 3, 4, 5]) + self.assertEqual(process_mesh.dim_names, ["x", "y"]) + self.assertEqual(process_mesh.ndim, 2) + self.assertEqual(process_mesh, process_mesh) + self.assertEqual(str(process_mesh), str(process_mesh)) + + sub_process_mesh1 = process_mesh[0] + self.assertEqual(sub_process_mesh1.shape, [3]) + self.assertEqual(sub_process_mesh1.process_ids, [0, 1, 2]) + self.assertEqual(sub_process_mesh1.dim_names, ["y"]) + self.assertEqual(sub_process_mesh1.ndim, 1) + + sub_process_mesh2 = process_mesh[:, 1] + self.assertEqual(sub_process_mesh2.shape, [2]) + self.assertEqual(sub_process_mesh2.process_ids, [1, 4]) + self.assertEqual(sub_process_mesh2.dim_names, ["x"]) + self.assertEqual(sub_process_mesh2.ndim, 1) + + sub_process_mesh3 = sub_process_mesh2[:] + self.assertEqual(sub_process_mesh3.shape, [2]) + self.assertEqual(sub_process_mesh3.process_ids, [1, 4]) + self.assertEqual(sub_process_mesh3.dim_names, ["x"]) + self.assertEqual(sub_process_mesh3.ndim, 1) + + sub_process_mesh4 = process_mesh[1, 1] + self.assertEqual(sub_process_mesh4.shape, [1]) + self.assertEqual(sub_process_mesh4.process_ids, [4]) + self.assertEqual(sub_process_mesh4.dim_names, ["d0"]) + self.assertEqual(sub_process_mesh4.ndim, 1) + + sub_process_mesh5 = sub_process_mesh3[0] + self.assertEqual(sub_process_mesh5.shape, [1]) + self.assertEqual(sub_process_mesh5.process_ids, [1]) + self.assertEqual(sub_process_mesh5.dim_names, ["d0"]) + self.assertEqual(sub_process_mesh5.ndim, 1) + + def test_context_manager(self): + mesh = np.array([1, 2, 3, 4]) + input = static.data(name="input", + shape=[batch_size, sequence_len, hidden_size], + dtype='float32') + label = static.data(name="label", + shape=[batch_size, sequence_len, 1], + dtype='float32') + + mlp = MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * 
hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + + with ProcessMesh(mesh, "d"): + out = mlp(input) + + default_program = paddle.fluid.default_main_program() + default_dist_context = get_default_distributed_context() + + for block in default_program.blocks: + for tensor in block.vars.values(): + dist_tensor = default_dist_context.get_dist_tensor_for_program( + tensor) + if dist_tensor is not None: + self.assertEqual(dist_tensor.dist_attr.process_mesh, + ProcessMesh(mesh)) + for op in block.ops: + dist_op = default_dist_context.get_dist_op_for_program(op) + if dist_op is not None: + self.assertEqual(dist_op.dist_attr.process_mesh, + ProcessMesh(mesh)) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh_v2.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh_v2.py index fcfafcb3e6d6d1..3c58f9e8cd393a 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh_v2.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_process_mesh_v2.py @@ -13,7 +13,8 @@ # limitations under the License import unittest -from paddle.distributed.auto_parallel.process_mesh_v2 import ProcessMesh +from paddle.distributed.auto_parallel.process_mesh_v2 import ( + ProcessMesh, compute_compatible_process_mesh, merge_process_mesh) class TestProcessMesh(unittest.TestCase): @@ -39,6 +40,54 @@ def test_process_mesh(self): self.assertNotEqual(process_mesh, process_mesh2) self.assertEqual(str(process_mesh), str(process_mesh)) + def test_compute_compatible_process_mesh(self): + process_mesh1 = ProcessMesh([[0, 1, 2], [3, 4, 5]], + dim_names=["x", "y"]) + compatible_process_mesh = compute_compatible_process_mesh( + [process_mesh1, None]) + self.assertEqual(compatible_process_mesh, process_mesh1) + compatible_process_mesh = compute_compatible_process_mesh( + [None, process_mesh1]) + self.assertEqual(compatible_process_mesh, process_mesh1) + + process_mesh2 = ProcessMesh([[0, 1, 2], [3, 4, 5]]) + compatible_process_mesh = compute_compatible_process_mesh( + [process_mesh1, process_mesh2]) + self.assertEqual(compatible_process_mesh, process_mesh1) + self.assertEqual(compatible_process_mesh, process_mesh2) + + process_mesh2 = ProcessMesh([[0, 1, 2, 3, 4, 5]]) + compatible_process_mesh = compute_compatible_process_mesh( + [process_mesh1, process_mesh2]) + self.assertEqual(compatible_process_mesh, process_mesh1) + + process_mesh2 = ProcessMesh([[0, 1, 2]]) + compatible_process_mesh = compute_compatible_process_mesh( + [process_mesh1, process_mesh2]) + self.assertEqual(compatible_process_mesh, process_mesh1) + + def test_merge_process_mesh(self): + process_mesh1 = ProcessMesh([[0, 1, 2], [3, 4, 5]], + dim_names=["x", "y"]) + merged_process_mesh = merge_process_mesh([process_mesh1, None]) + print(merged_process_mesh) + self.assertEqual(merged_process_mesh, ProcessMesh([0, 1, 2, 3, 4, 5])) + merged_process_mesh = merge_process_mesh([None, process_mesh1]) + self.assertEqual(merged_process_mesh, ProcessMesh([0, 1, 2, 3, 4, 5])) + + process_mesh2 = ProcessMesh([[0, 1, 2], [3, 4, 5]]) + merged_process_mesh = merge_process_mesh([process_mesh1, process_mesh2]) + self.assertEqual(merged_process_mesh, ProcessMesh([0, 1, 2, 3, 4, 5])) + + process_mesh2 = ProcessMesh([[0, 1, 2]]) + merged_process_mesh = merge_process_mesh([process_mesh1, process_mesh2]) + self.assertEqual(merged_process_mesh, ProcessMesh([0, 1, 2, 3, 4, 5])) + + process_mesh2 = ProcessMesh([[6, 7]]) + merged_process_mesh = 
merge_process_mesh([process_mesh1, process_mesh2]) + self.assertEqual(merged_process_mesh, + ProcessMesh([0, 1, 2, 3, 4, 5, 6, 7])) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_quantization.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_quantization.py deleted file mode 100644 index f84ee03e0c9401..00000000000000 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_quantization.py +++ /dev/null @@ -1,180 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import unittest -import sys -import numpy as np -import paddle - -import paddle.distributed.fleet as fleet -import paddle.distributed.auto_parallel as auto - -from paddle.distributed.auto_parallel.engine import Engine -from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr - -sys.path.append("..") -import auto_parallel_gpt_model as modeling -from auto_parallel_gpt_model import GPTModel, GPTForPretraining, GPTPretrainingCriterion - -paddle.enable_static() - - -class FakeDataset: - - def __init__(self, num_samples, sequence_len, vocab_size): - self.num_samples = num_samples - self.sequence_len = sequence_len - self.vocab_size = vocab_size - - def __getitem__(self, idx): - tokens = np.random.randint(self.vocab_size, size=self.sequence_len) - position_ids = np.arange(self.sequence_len) - attention_mask = np.tril(np.ones(self.sequence_len)).reshape( - (1, self.sequence_len, self.sequence_len)).astype(np.float32) - labels = np.random.randint(self.vocab_size, size=self.sequence_len) - loss_mask = np.ones(self.sequence_len).astype(np.float32) - return tokens, position_ids, attention_mask, labels, loss_mask - - def __len__(self): - return self.num_samples - - -def apply_pass(): - dist_strategy = fleet.DistributedStrategy() - dist_strategy.semi_auto = True - dist_strategy.qat = True - dist_strategy.qat_configs = { - 'channel_wise_abs_max': True, - 'weight_bits': 8, - 'activation_bits': 8, - 'not_quant_pattern': ['skip_quant'], - } - return dist_strategy - - -def create_data_holder(batch_size, sequence_len): - tokens = paddle.static.InputSpec(name="tokens", - shape=[batch_size, sequence_len], - dtype='int64') - position_ids = paddle.static.InputSpec(name="position_ids", - shape=[batch_size, sequence_len], - dtype='int64') - attention_mask = paddle.static.InputSpec( - name="attention_mask", - shape=[batch_size, 1, sequence_len, sequence_len], - dtype='float32') - labels = paddle.static.InputSpec(name="labels", - shape=[batch_size, sequence_len], - dtype='int64') - loss_mask = paddle.static.InputSpec(name="loss_mask", - shape=[batch_size, sequence_len], - dtype='float32') - return [tokens, position_ids, attention_mask], [labels, loss_mask] - - -def get_gpt_model(): - modeling.init_global() - modeling._global_parallel_strategy = "serial" - modeling._global_process_mesh = auto.ProcessMesh(mesh=[0]) - - gpt = GPTModel(vocab_size=1000, - hidden_size=64, - num_hidden_layers=2, 
- num_attention_heads=8, - intermediate_size=256, - hidden_act="gelu", - hidden_dropout_prob=0.0, - attention_probs_dropout_prob=0.0, - max_position_embeddings=1024, - type_vocab_size=1, - initializer_range=0.02, - pad_token_id=0, - eos_token_id=7, - bos_token_id=0, - eol_token_id=3) - model = GPTForPretraining(gpt, - vocab_size=1000, - hidden_size=64, - initializer_range=0.02) - criterion = GPTPretrainingCriterion() - return model, criterion - - -class TestQuantizationPass(unittest.TestCase): - - def test_qat_pass(self): - - batch_size = 8 - batch_num = 10 - sequence_len = 512 - vocab_size = 1000 - - strategy = apply_pass() - model, loss = get_gpt_model() - opt = paddle.optimizer.AdamW(learning_rate=0.00001) - inputs_spec, labels_spec = create_data_holder(batch_size=batch_size, - sequence_len=sequence_len) - - engine = Engine(model, inputs_spec, labels_spec, strategy=strategy) - engine.prepare(optimizer=opt, loss=loss) - - dataset = FakeDataset(batch_size * batch_num, sequence_len, vocab_size) - engine.fit(train_data=dataset, batch_size=batch_size) - - self.check_program(engine.main_program) - - def check_program(self, program): - - quantizable_op_and_inputs = {'matmul_v2': ['X', 'Y']} - quantizable_grad_op_inputs = {'matmul_v2_grad': ['X', 'Y']} - - quantized_ops = set() - for block in program.blocks: - for op in block.ops: - is_quntized = False - if op.type in quantizable_op_and_inputs: - for arg_name in op.input_arg_names: - if ".quantized" in arg_name: - is_quntized = True - - if not is_quntized: - continue - - # check forward - if op.type in quantizable_op_and_inputs: - for arg_name in op.input_arg_names: - assert arg_name.endswith('.quantized.dequantized') - quantized_ops.add(arg_name) - - for op in block.ops: - is_quntized = False - if op.type in quantizable_grad_op_inputs: - for pname in quantizable_grad_op_inputs[op.type]: - arg_name = op.input(pname)[0] - if ".quantized" in arg_name: - is_quntized = True - - if not is_quntized: - continue - - # check backward - if op.type in quantizable_grad_op_inputs: - for pname in quantizable_grad_op_inputs[op.type]: - arg_name = op.input(pname)[0] - assert arg_name.endswith('.quantized.dequantized') - assert arg_name in quantized_ops - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py new file mode 100644 index 00000000000000..4d17ea10dcb2c3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_strategy.py @@ -0,0 +1,206 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +# import yaml +import unittest +from paddle.distributed.fleet import auto + + +class TestStrategy(unittest.TestCase): + + def test_default_config(self): + strategy = auto.Strategy() + + recompute = strategy.recompute + self.assertEqual(recompute.enable, False) + self.assertEqual(recompute.checkpoints, None) + + amp = strategy.amp + self.assertEqual(amp.enable, False) + self.assertAlmostEqual(amp.init_loss_scaling, 32768.0) + self.assertEqual(amp.incr_every_n_steps, 1000) + self.assertEqual(amp.decr_every_n_nan_or_inf, 2) + self.assertAlmostEqual(amp.incr_ratio, 2.0) + self.assertAlmostEqual(amp.decr_ratio, 0.8) + self.assertEqual(amp.use_dynamic_loss_scaling, True) + self.assertEqual(amp.custom_black_list, []) + self.assertEqual(amp.custom_white_list, []) + self.assertEqual(amp.custom_black_varnames, []) + self.assertEqual(amp.use_pure_fp16, False) + self.assertEqual(amp.use_fp16_guard, True) + self.assertEqual(amp.use_optimizer_fp16, False) + + sharding = strategy.sharding + self.assertEqual(sharding.enable, False) + self.assertEqual(sharding.stage, 1) + self.assertEqual(sharding.degree, 8) + self.assertAlmostEqual(sharding.segment_broadcast_MB, 32.0) + self.assertEqual(sharding.enable_tuning, False) + self.assertEqual(sharding.tuning_range, []) + + gradient_merge = strategy.gradient_merge + self.assertEqual(gradient_merge.enable, False) + self.assertEqual(gradient_merge.k_steps, 1) + self.assertEqual(gradient_merge.avg, True) + + qat = strategy.qat + self.assertEqual(qat.enable, False) + self.assertEqual(qat.channel_wise_abs_max, True) + self.assertEqual(qat.weight_bits, 8) + self.assertEqual(qat.activation_bits, 8) + self.assertEqual(qat.not_quant_pattern, ['skip_quant']) + self.assertEqual(qat.algo, None) + + tuning = strategy.tuning + self.assertEqual(tuning.enable, False) + self.assertEqual(tuning.batch_size, 1) + self.assertEqual(tuning.dataset, None) + self.assertEqual(tuning.profile_start_step, 1) + self.assertEqual(tuning.profile_end_step, 1) + self.assertEqual(tuning.run_after_tuning, True) + self.assertEqual(tuning.verbose, True) + + def test_modify_config(self): + strategy = auto.Strategy() + + recompute = strategy.recompute + recompute.enable = True + recompute.checkpoints = ["x"] + self.assertEqual(recompute.enable, True) + self.assertEqual(recompute.checkpoints, ["x"]) + + amp = strategy.amp + amp.enable = True + amp.init_loss_scaling = 16384.0 + amp.incr_every_n_steps = 2000 + amp.decr_every_n_nan_or_inf = 4 + amp.incr_ratio = 4.0 + amp.decr_ratio = 0.4 + amp.use_dynamic_loss_scaling = False + amp.custom_white_list = ["x"] + amp.custom_black_list = ["y"] + amp.custom_black_varnames = ["z"] + amp.use_pure_fp16 = True + amp.use_fp16_guard = False + amp.use_optimizer_fp16 = True + self.assertEqual(amp.enable, True) + self.assertAlmostEqual(amp.init_loss_scaling, 16384.0) + self.assertEqual(amp.incr_every_n_steps, 2000) + self.assertEqual(amp.decr_every_n_nan_or_inf, 4) + self.assertAlmostEqual(amp.incr_ratio, 4.0) + self.assertAlmostEqual(amp.decr_ratio, 0.4) + self.assertEqual(amp.use_dynamic_loss_scaling, False) + self.assertEqual(amp.custom_white_list, ["x"]) + self.assertEqual(amp.custom_black_list, ["y"]) + self.assertEqual(amp.custom_black_varnames, ["z"]) + self.assertEqual(amp.use_pure_fp16, True) + self.assertEqual(amp.use_fp16_guard, False) + self.assertEqual(amp.use_optimizer_fp16, True) + + sharding = strategy.sharding + sharding.enable = True + sharding.stage = 2 + sharding.degree = 2 + sharding.segment_broadcast_MB = 64.0 + sharding.enable_tuning = 
True + sharding.tuning_range = [1, 2, 3] + self.assertEqual(sharding.enable, True) + self.assertEqual(sharding.stage, 2) + self.assertEqual(sharding.degree, 2) + self.assertAlmostEqual(sharding.segment_broadcast_MB, 64.0) + self.assertEqual(sharding.enable_tuning, True) + self.assertEqual(sharding.tuning_range, [1, 2, 3]) + + gradient_merge = strategy.gradient_merge + gradient_merge.enable = True + gradient_merge.k_steps = 4 + gradient_merge.avg = False + self.assertEqual(gradient_merge.enable, True) + self.assertEqual(gradient_merge.k_steps, 4) + self.assertEqual(gradient_merge.avg, False) + + # def test_file_config(self): + # yaml_data = """ + # all_ranks: false + # amp: + # custom_black_list: + # - y + # custom_black_varnames: + # - z + # custom_white_list: + # - x + # decr_every_n_nan_or_inf: 4 + # decr_ratio: 0.4 + # enable: false + # incr_every_n_steps: 2000 + # incr_ratio: 4.0 + # init_loss_scaling: 16384.0 + # use_dynamic_loss_scaling: false + # use_fp16_guard: false + # use_optimizer_fp16: true + # use_pure_fp16: true + # auto_mode: semi + # gradient_merge: + # avg: false + # enable: false + # k_steps: 4 + # gradient_scale: true + # qat: + # activation_bits: 8 + # algo: null + # channel_wise_abs_max: true + # enable: false + # not_quant_pattern: + # - skip_quant + # weight_bits: 8 + # recompute: + # checkpoints: null + # enable: false + # enable_tuning: false + # return_numpy: true + # seed: null + # sharding: + # enable: false + # enable_tuning: true + # segment_broadcast_MB: 64.0 + # degree: 8 + # stage: 2 + # tuning_range: None + # split_data: false + # tuning: + # batch_size: 1 + # dataset: null + # enable: false + # profile_end_step: 1 + # profile_start_step: 1 + # run_after_tuning: true + # verbose: true + # use_cache: true + # """ + # yaml_path = "./strategy.yml" + # yaml_dict = yaml.load(yaml_data, Loader=yaml.Loader) + # with open(yaml_path, 'w') as outfile: + # yaml.dump(yaml_dict, outfile, default_flow_style=False) + + # strategy = auto.Strategy(yaml_path) + # self.assertEqual(yaml_dict, strategy.to_dict()) + + # # Remove the created file + # if os.path.exists(yaml_path): + # os.remove(yaml_path) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py index a3ab87160da682..94d88a69bea357 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_to_static.py @@ -20,14 +20,14 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto import paddle.distributed.fleet as fleet +from paddle import LazyGuard from paddle.io import Dataset from paddle.static import InputSpec from paddle.fluid.framework import _non_static_mode -from paddle.distributed.auto_parallel.engine import Engine -from paddle.distributed.auto_parallel.hepler import ProgramHelper +from paddle.distributed.auto_parallel.helper import ProgramHelper batch_size = 4 batch_num = 30 @@ -110,7 +110,7 @@ def test_apply_optimzier(self): program_helper.to('train') forward_ops = program_helper.main_program.block(0).ops - self.assertEqual(len(forward_ops), 21) + self.assertEqual(len(forward_ops), 17) # step 2: apply optimzer to generate whole program optimize_ops, _ = program_helper.apply_optimizer(optimizer) @@ -119,7 +119,7 @@ def test_apply_optimzier(self): op for op in 
program_helper.main_program.block(0).ops if op.type == 'sgd' ] - self.assertEqual(len(all_ops), 41) + self.assertEqual(len(all_ops), 37) self.assertEqual(len(optimize_ops), len(sgd_ops)) program_helper.reset() @@ -139,23 +139,43 @@ def test_to_static(self): dataset = MyDataset(batch_num * batch_size) - inputs = InputSpec([batch_size, hidden_size], 'float32', 'x') - labels = InputSpec([batch_size], 'int64', 'label') + # inputs = InputSpec([batch_size, hidden_size], 'float32', 'x') + # labels = InputSpec([batch_size], 'int64', 'label') - engine = Engine(model=mlp, - inputs_spec=inputs, - labels_spec=labels, - strategy=None) assert _non_static_mode() == True - - engine.prepare(optimizer=optimizer, - loss=loss, - metrics=paddle.metric.Accuracy()) - - assert _non_static_mode() == False + engine = auto.Engine(model=mlp, + loss=loss, + optimizer=optimizer, + metrics=paddle.metric.Accuracy(), + strategy=None) engine.fit(dataset, batch_size=batch_size) engine.evaluate(dataset, batch_size=batch_size) engine.predict(dataset, batch_size=batch_size) + assert _non_static_mode() == False + + +class TestLazyInit(unittest.TestCase): + + def test_lazy_init(self): + + with LazyGuard(): + mlp = MLPLayer(hidden_size=hidden_size, + intermediate_size=4 * hidden_size, + dropout_ratio=0.1, + initializer_range=0.02) + loss = paddle.nn.CrossEntropyLoss() + + metrics = paddle.metric.Accuracy() + loss = paddle.nn.CrossEntropyLoss() + inputs = InputSpec([batch_size, hidden_size], 'float32', 'x') + labels = InputSpec([batch_size], 'int64', 'label') + + program_helper = ProgramHelper(mlp, loss, [metrics], [inputs], [labels]) + program_helper.build_program(mode='train') + ops = program_helper.startup_program.block(0).ops + vars = program_helper.startup_program.block(0).vars + assert len(vars.keys()) == len(ops) + program_helper.reset() if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py index f0c6a0b7cdf799..58ff36aba09dba 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_tunable_space.py @@ -136,6 +136,16 @@ def test_state(self): self.assertEqual(new_space.variables["int_range"].step, 1) self.assertEqual(new_space.variables["int_range"].endpoint, False) + def test_exception(self): + space = ts.TunableSpace() + flag = True + try: + val = space.get_value("test") + flag = False + except: + pass + self.assertTrue(flag) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py index 3dabe38ff6e1d7..f0edf8d6e2d835 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_completion.py @@ -19,7 +19,7 @@ import paddle.utils as utils import paddle.static as static import paddle.nn.functional as F -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed import fleet from paddle.distributed.auto_parallel.completion import Completer @@ -36,7 +36,7 @@ epoch_num = 10 hidden_size = 1024 sequence_len = 512 -_g_process_mesh = [[0, 1], [2, 3]] +_g_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=['x', 'y']) def get_random_inputs_and_labels(input_shape, label_shape): @@ -84,18 +84,12 @@
def __init__(self, def forward(self, input): out = self.norm(input) - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _g_process_mesh[0], - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.linear0.weight, _g_process_mesh[:, 0], + [None, 'x']) out = self.linear0(out) out = F.gelu(out, approximate=True) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _g_process_mesh[1], - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.linear1.weight, _g_process_mesh[:, 1], + ['x', None]) out = self.linear1(out) return out @@ -155,16 +149,8 @@ def get_program(): dataloader.set_batch_generator(batch_generator_creator(), places=paddle.static.cuda_places()) # data dist_attr - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _g_process_mesh[0], - "dims_mapping": [-1, -1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": _g_process_mesh[0], - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(input, _g_process_mesh[:, 0], [None, None, None]) + auto.shard_tensor(label, _g_process_mesh[:, 0], [None, None, None]) mlp_start = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -201,6 +187,14 @@ def test_completer(self): train_program) # print_program_with_dist_attr(complete_train_program, dist_context) + def test_completer_by_dist_op(self): + train_program, start_program, dataloader, i, loss = get_program() + dist_context = DistributedContext() + completer = Completer(dist_context) + complete_train_program = completer.complete_forward_annotation( + train_program) + complete_train_program = completer._complete_tensor_dist_attr_by_op() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py index 3c6e086ae7face..ee91842cb70f99 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel/test_while_op_partition.py @@ -20,7 +20,7 @@ import paddle.fluid as fluid import paddle.static as static import paddle.nn.functional as F -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed import fleet from paddle.distributed.auto_parallel.completion import Completer @@ -37,7 +37,7 @@ epoch_num = 10 hidden_size = 1024 sequence_len = 512 -_g_process_mesh = auto.ProcessMesh([0, 1]) +_g_process_mesh = auto.ProcessMesh([0, 1], dim_names=['x']) def get_random_inputs_and_labels(input_shape, label_shape): @@ -85,61 +85,21 @@ def __init__(self, def forward(self, input): - auto.shard_tensor(self.norm.weight, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) - auto.shard_tensor(self.norm.bias, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.linear0.bias, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [0] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(self.linear1.bias, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) + auto.shard_tensor(self.norm.weight, _g_process_mesh, [None]) + auto.shard_tensor(self.norm.bias, _g_process_mesh, [None]) + auto.shard_tensor(self.linear0.weight, _g_process_mesh, 
[None, 'x']) + auto.shard_tensor(self.linear0.bias, _g_process_mesh, ['x']) + auto.shard_tensor(self.linear1.weight, _g_process_mesh, ['x', None]) + auto.shard_tensor(self.linear1.bias, _g_process_mesh, [None]) out = self.norm(input) - auto.shard_tensor(out, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(out, _g_process_mesh, [None, None, None]) out = self.linear0(out) - auto.shard_tensor(out, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, 0] - }) + auto.shard_tensor(out, _g_process_mesh, [None, None, 'x']) out = F.gelu(out, approximate=True) - auto.shard_tensor(out, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, 0] - }) + auto.shard_tensor(out, _g_process_mesh, [None, None, 'x']) out = self.linear1(out) - auto.shard_tensor(out, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(out, _g_process_mesh, [None, None, None]) return out @@ -155,21 +115,13 @@ def get_program(): # 循环计数器 i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) - auto.shard_tensor(i, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) + auto.shard_tensor(i, _g_process_mesh, [None]) # 循环次数 loop_len = fluid.layers.fill_constant(shape=[1], dtype='int64', value=epoch_num) - auto.shard_tensor(loop_len, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) + auto.shard_tensor(loop_len, _g_process_mesh, [None]) # input input = static.data(name="input", @@ -188,25 +140,13 @@ def get_program(): dataloader.set_batch_generator(batch_generator_creator(), places=paddle.static.cuda_places()) # data dist_attr - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(input, _g_process_mesh, [None, None, None]) + auto.shard_tensor(label, _g_process_mesh, [None, None, None]) # fill constant bsz like tmp = paddle.fluid.layers.fill_constant_batch_size_like( input=input, shape=[-1, 16, 0, 48], dtype='float32', value=0) - auto.shard_tensor(tmp, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, 0, -1, -1] - }) + auto.shard_tensor(tmp, _g_process_mesh, [None, 'x', None, None]) # model mlp_start = MLPLayer(hidden_size=hidden_size, @@ -216,28 +156,21 @@ def get_program(): pred = mlp_start(input) input_array = fluid.layers.array_write(pred, i) - auto.shard_tensor(input_array, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + # TODO: check whether this annotation is needed + # auto.shard_tensor(input_array, + # dist_attr={ + # "process_mesh": _g_process_mesh, + # "dims_mapping": [-1, -1, -1] + # }) cond = fluid.layers.less_than(x=i, y=loop_len) - auto.shard_tensor(cond, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) + auto.shard_tensor(cond, _g_process_mesh, [None]) while_op = fluid.layers.While(cond=cond) with while_op.block(): pre_input = fluid.layers.array_read(array=input_array, i=i) - auto.shard_tensor(pre_input, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(pre_input, _g_process_mesh, [None, None, None]) mlp_while = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -251,11 +184,7 @@ def get_program(): fluid.layers.less_than(x=i, y=loop_len, cond=cond) end_pred = 
fluid.layers.array_read(array=input_array, i=i) - auto.shard_tensor(end_pred, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(end_pred, _g_process_mesh, [None, None, None]) mlp_end = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -264,18 +193,10 @@ def get_program(): pred = mlp_end(end_pred) error_cost = paddle.nn.functional.square_error_cost(pred, label) - auto.shard_tensor(error_cost, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1, -1, -1] - }) + auto.shard_tensor(error_cost, _g_process_mesh, [None, None, None]) loss = paddle.mean(error_cost) - auto.shard_tensor(loss, - dist_attr={ - "process_mesh": _g_process_mesh, - "dims_mapping": [-1] - }) + auto.shard_tensor(loss, _g_process_mesh, [None]) return train_program, start_program, dataloader, i, loss diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py index c3f64e30fc5967..5aa852ccac134f 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_autoconvert.py @@ -25,7 +25,7 @@ import paddle.utils as utils import paddle.static as static import paddle.nn.functional as F -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed import fleet from paddle.fluid.initializer import NumpyArrayInitializer @@ -67,38 +67,18 @@ def __init__(self, def forward(self, input): if _global_parallel_strategy == "pp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, None]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, [None, None]) elif _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh, + [None, "x"]) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + ["x", None]) elif _global_parallel_strategy == "dp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh, + [None, None]) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + [None, None]) out = self.norm(input) out = self.linear0(out) @@ -120,28 +100,12 @@ def mlp_forward(train_program, start_program): dtype='float32') if _global_parallel_strategy == "pp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, PP_MESH_0, [None, None]) + auto.shard_tensor(label, PP_MESH_1, [None, None]) elif _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, _global_process_mesh, ["x", 
None]) elif _global_parallel_strategy == "mp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, _global_process_mesh, [None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -186,7 +150,7 @@ def test_mlp_mp2pp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) input = np.random.random(size=(80, 64)).astype('float32') label = np.random.random(size=(80, 1)).astype('float32') @@ -212,11 +176,11 @@ def test_mlp_mp2pp(self): set_default_distributed_context(None) _global_parallel_strategy = "pp" - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) global PP_MESH_0 - PP_MESH_0 = auto.ProcessMesh(mesh=[0]) + PP_MESH_0 = auto.ProcessMesh(mesh=[0], dim_names=["pp0"]) global PP_MESH_1 - PP_MESH_1 = auto.ProcessMesh(mesh=[1]) + PP_MESH_1 = auto.ProcessMesh(mesh=[1], dim_names=["pp1"]) dist_main_prog_load, dist_start_prog_load, loss_load = get_distributed_program( ) @@ -268,7 +232,7 @@ def test_mlp_pp2mp(self): global _global_parallel_strategy _global_parallel_strategy = "pp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) global PP_MESH_0 PP_MESH_0 = auto.ProcessMesh(mesh=[0]) global PP_MESH_1 @@ -303,7 +267,7 @@ def test_mlp_pp2mp(self): set_default_distributed_context(None) _global_parallel_strategy = "mp" - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) dist_main_prog_load, dist_start_prog_load, loss_load = get_distributed_program( ) @@ -350,7 +314,7 @@ def test_input_invalid(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) dist_main_prog, _, _ = get_distributed_program() with self.assertRaises(TypeError): save_distributed_checkpoint(dist_main_prog, [""], [""], diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py b/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py index 9d2b2739401214..4b391de0f570ca 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_data_unshard.py @@ -23,7 +23,7 @@ import paddle import paddle.nn as nn import paddle.fluid.core as core -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto import paddle.nn.functional as F from paddle.distributed import fleet @@ -38,7 +38,7 @@ def test_dp2pp1mp1(self): def create_model(train_program, start_program): with paddle.static.program_guard(train_program, start_program): - MESH_0 = auto.ProcessMesh([0, 1]) + MESH_0 = auto.ProcessMesh([0, 1], dim_names=["x"]) input = paddle.static.data(name='input', shape=[2, 8]) label = paddle.static.data(name='label', shape=[2, 8]) @@ -47,26 +47,10 @@ def create_model(train_program, start_program): linear0 = nn.Linear(8, 8, weight_attr) linear1 = nn.Linear(8, 8, weight_attr) - auto.shard_tensor(input, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [0, -1] - }) - 
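# A minimal sketch of the annotation style these tests migrate to: shard_tensor now
# takes the process mesh and a per-axis placement (a mesh dim name or None)
# positionally, instead of the old dict-style dist_attr with integer dims_mapping,
# and process meshes carry explicit dim_names. The shapes and the "x" axis name
# below are illustrative assumptions, not taken from any single test.
import paddle
import paddle.nn as nn
from paddle.distributed.fleet import auto

paddle.enable_static()
mesh = auto.ProcessMesh([0, 1], dim_names=["x"])        # one mesh axis named "x"
with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
    x = paddle.static.data(name="x", shape=[8, 64], dtype="float32")
    linear = nn.Linear(64, 64)
    auto.shard_tensor(x, mesh, ["x", None])             # shard dim 0 of x over mesh axis "x"
    auto.shard_tensor(linear.weight, mesh, [None, "x"]) # shard dim 1 of the weight over "x"
    out = linear(x)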
auto.shard_tensor(linear0.weight, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(linear1.weight, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, MESH_0, ["x", None]) + auto.shard_tensor(label, MESH_0, ["x", None]) + auto.shard_tensor(linear0.weight, MESH_0, [None, None]) + auto.shard_tensor(linear1.weight, MESH_0, [None, None]) linear0_out = linear0(input) gelu_out = F.gelu(linear0_out) @@ -124,7 +108,7 @@ def dp1pp1mp2(self): def create_model(train_program, start_program): with paddle.static.program_guard(train_program, start_program): - MESH_0 = auto.ProcessMesh([0, 1]) + MESH_0 = auto.ProcessMesh([0, 1], dim_names=["x"]) input = paddle.static.data(name='input', shape=[8, 8]) label = paddle.static.data(name='label', shape=[8, 8]) @@ -133,27 +117,10 @@ def create_model(train_program, start_program): linear0 = nn.Linear(8, 8, weight_attr) linear1 = nn.Linear(8, 8, weight_attr) - auto.shard_tensor(input, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [-1, -1] - }) - - auto.shard_tensor(linear0.weight, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(linear1.weight, - dist_attr={ - "process_mesh": MESH_0, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, MESH_0, [None, None]) + auto.shard_tensor(label, MESH_0, [None, None]) + auto.shard_tensor(linear0.weight, MESH_0, [None, "x"]) + auto.shard_tensor(linear1.weight, MESH_0, ["x", None]) linear0_out = linear0(input) gelu_out = F.gelu(linear0_out) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py index 87c746ab5d3b50..e065e2077f839c 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_gpt_model.py @@ -22,7 +22,7 @@ import paddle.nn as nn import paddle.nn.functional as F import paddle.tensor as tensor -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle import fluid from paddle.fluid import layers from paddle.distributed import fleet @@ -114,30 +114,18 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): """ q = self.q_proj(query) if _global_parallel_strategy == "mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.q_proj.weight, _global_process_mesh, + [None, "x"]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(self.q_proj.weight, _global_process_mesh, + [None, "y"]) elif _global_parallel_strategy == "mp_pp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.q_proj.weight, MPPP_MESH_LIST[self.mesh_idx], + [None, "x"]) elif _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 1] - }) + DPMPPP_MESH_LIST[self.mesh_idx], [None, "y"]) + q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) if isinstance(cache, self.StaticCache): @@ -165,56 
+153,30 @@ def compute_kv(self, key, value): """ k = self.k_proj(key) if _global_parallel_strategy == "mp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.k_proj.weight, _global_process_mesh, + [None, "x"]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(self.k_proj.weight, _global_process_mesh, + [None, "y"]) elif _global_parallel_strategy == "mp_pp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.k_proj.weight, MPPP_MESH_LIST[self.mesh_idx], + [None, "x"]) elif _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 1] - }) + DPMPPP_MESH_LIST[self.mesh_idx], [None, "y"]) v = self.v_proj(value) if _global_parallel_strategy == "mp": - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.v_proj.weight, _global_process_mesh, + [None, "x"]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(self.v_proj.weight, _global_process_mesh, + [None, "y"]) elif _global_parallel_strategy == "mp_pp": - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.v_proj.weight, MPPP_MESH_LIST[self.mesh_idx], + [None, "x"]) elif _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 1] - }) + DPMPPP_MESH_LIST[self.mesh_idx], [None, "y"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) @@ -287,30 +249,18 @@ def forward(self, # project to output out = self.out_proj(out) if _global_parallel_strategy == "mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.out_proj.weight, _global_process_mesh, + ["x", None]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.out_proj.weight, _global_process_mesh, + ["y", None]) elif _global_parallel_strategy == "mp_pp": auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [0, -1] - }) + MPPP_MESH_LIST[self.mesh_idx], ["x", None]) elif _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [1, -1] - }) + DPMPPP_MESH_LIST[self.mesh_idx], ["y", None]) + outs = [out] if self.need_weights: outs.append(weights) @@ -352,96 +302,53 @@ def forward(self, new_caches = [] self.checkpoints = [] if _global_parallel_strategy == "pp": - auto.shard_tensor(output, - dist_attr={ - "process_mesh": - PP_MESH_LIST[0], - "dims_mapping": - [-1 for i in 
range(len(output.shape))] - }) + auto.shard_tensor(output, PP_MESH_LIST[0], + [None for i in range(len(output.shape))]) if _global_parallel_strategy == "dp_pp": - auto.shard_tensor(output, - dist_attr={ - "process_mesh": - DPPP_MESH_LIST[0], - "dims_mapping": [0] + - [-1 for i in range(len(output.shape) - 1)] - }) + auto.shard_tensor(output, DPPP_MESH_LIST[0], ["x"] + + [None for i in range(len(output.shape) - 1)]) if _global_parallel_strategy == "mp_pp": - auto.shard_tensor(output, - dist_attr={ - "process_mesh": - MPPP_MESH_LIST[0], - "dims_mapping": [-1] + - [-1 for i in range(len(output.shape) - 1)] - }) + auto.shard_tensor(output, MPPP_MESH_LIST[0], + [None for i in range(len(output.shape))]) if _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor(output, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[0], - "dims_mapping": [0] + - [-1 for i in range(len(output.shape) - 1)] - }) + auto.shard_tensor(output, DPMPPP_MESH_LIST[0], ["x"] + + [None for i in range(len(output.shape) - 1)]) for i, mod in enumerate(self.layers): if cache is None: if use_cache: if _global_parallel_strategy == "pp": output, new_cache = auto.shard_op( - mod, - dist_attr={ - "process_mesh": PP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) + mod, PP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - PP_MESH_LIST[mod.mesh_idx], - "dims_mapping": - [-1 for i in range(len(output.shape))] - }) + output, PP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) elif _global_parallel_strategy == "dp_pp": output, new_cache = auto.shard_op( - mod, - dist_attr={ - "process_mesh": DPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) + mod, DPPP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - DPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": [0] + - [-1 for i in range(len(output.shape) - 1)] - }) + output, DPPP_MESH_LIST[mod.mesh_idx], ["x"] + + [None for i in range(len(output.shape) - 1)]) elif _global_parallel_strategy == "mp_pp": output, new_cache = auto.shard_op( - mod, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) + mod, MPPP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - MPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": [-1] + - [-1 for i in range(len(output.shape) - 1)] - }) + output, MPPP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) elif _global_parallel_strategy == "dp_mp_pp": output, new_cache = auto.shard_op( mod, - dist_attr={ - "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) + DPMPPP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, use_cache, + cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": [0] + - [-1 for i in range(len(output.shape) - 1)] - }) + output, DPMPPP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) else: output, new_cache = mod(output, memory, @@ -451,64 +358,35 @@ def forward(self, new_caches.append(new_cache) else: if _global_parallel_strategy == "pp": - output = auto.shard_op(mod, - dist_attr={ - "process_mesh": - PP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, - use_cache, cache) + output = auto.shard_op(mod, 
PP_MESH_LIST[mod.mesh_idx])( + output, memory, tgt_mask, use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - PP_MESH_LIST[mod.mesh_idx], - "dims_mapping": - [-1 for i in range(len(output.shape))] - }) + output, PP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) elif _global_parallel_strategy == "dp_pp": - output = auto.shard_op(mod, - dist_attr={ - "process_mesh": - DPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, - use_cache, cache) + output = auto.shard_op( + mod, DPPP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - DPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": [0] + - [-1 for i in range(len(output.shape) - 1)] - }) + output, DPPP_MESH_LIST[mod.mesh_idx], ["x"] + + [None for i in range(len(output.shape) - 1)]) elif _global_parallel_strategy == "mp_pp": - output = auto.shard_op(mod, - dist_attr={ - "process_mesh": - MPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, - use_cache, cache) + output = auto.shard_op( + mod, MPPP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - MPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": [-1] + - [-1 for i in range(len(output.shape) - 1)] - }) + output, MPPP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) elif _global_parallel_strategy == "dp_mp_pp": - output = auto.shard_op( - mod, - dist_attr={ - "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) + output = auto.shard_op(mod, + DPMPPP_MESH_LIST[mod.mesh_idx])( + output, memory, tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": [0] + - [-1 for i in range(len(output.shape) - 1)] - }) + output, DPMPPP_MESH_LIST[mod.mesh_idx], ["x"] + + [None for i in range(len(output.shape) - 1)]) else: output = mod(output, memory, @@ -519,58 +397,33 @@ def forward(self, if _global_parallel_strategy == "pp": output, new_cache = auto.shard_op( mod, - dist_attr={"process_mesh": PP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, - cache) - auto.shard_tensor( - output, - dist_attr={ - "process_mesh": PP_MESH_LIST[mod.mesh_idx], - "dims_mapping": - [-1 for i in range(len(output.shape))] - }) + PP_MESH_LIST[mod.mesh_idx])(output, memory, tgt_mask, + use_cache, cache) + auto.shard_tensor(output, PP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) elif _global_parallel_strategy == "dp_pp": output, new_cache = auto.shard_op( mod, - dist_attr={ - "process_mesh": DPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) + DPPP_MESH_LIST[mod.mesh_idx])(output, memory, tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - DPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": - [0] + [-1 for i in range(len(output.shape) - 1)] - }) + output, DPPP_MESH_LIST[mod.mesh_idx], + ["x"] + [None for i in range(len(output.shape) - 1)]) elif _global_parallel_strategy == "mp_pp": output, new_cache = auto.shard_op( mod, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) - auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - MPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": - [-1] + [-1 for i in range(len(output.shape) - 1)] - }) + MPPP_MESH_LIST[mod.mesh_idx])(output, memory, tgt_mask, + use_cache, 
cache) + auto.shard_tensor(output, MPPP_MESH_LIST[mod.mesh_idx], + [None for i in range(len(output.shape))]) elif _global_parallel_strategy == "dp_mp_pp": output, new_cache = auto.shard_op( - mod, - dist_attr={ - "process_mesh": DPMPPP_MESH_LIST[mod.mesh_idx] - })(output, memory, tgt_mask, use_cache, cache) + mod, DPMPPP_MESH_LIST[mod.mesh_idx])(output, memory, + tgt_mask, + use_cache, cache) auto.shard_tensor( - output, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[mod.mesh_idx], - "dims_mapping": - [0] + [-1 for i in range(len(output.shape) - 1)] - }) + output, DPMPPP_MESH_LIST[mod.mesh_idx], + ["x"] + [None for i in range(len(output.shape) - 1)]) else: output, new_cache = mod(output, memory, @@ -661,55 +514,30 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): if self.normalize_before: tgt = self.norm2(tgt) if _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + [None, "x"]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + [None, "y"]) elif _global_parallel_strategy == "mp_pp": auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 0] - }) + MPPP_MESH_LIST[self.mesh_idx], [None, "x"]) if _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [-1, 1] - }) + DPMPPP_MESH_LIST[self.mesh_idx], [None, "y"]) + if _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.linear2.weight, _global_process_mesh, + ["x", None]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.linear2.weight, _global_process_mesh, + ["y", None]) elif _global_parallel_strategy == "mp_pp": auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [0, -1] - }) + MPPP_MESH_LIST[self.mesh_idx], ["x", None]) elif _global_parallel_strategy == "dp_mp_pp": auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[self.mesh_idx], - "dims_mapping": [1, -1] - }) + DPMPPP_MESH_LIST[self.mesh_idx], ["y", None]) tgt = self.dropout2( self.linear2(F.gelu(self.linear1(tgt), approximate=True))) tgt = residual + tgt @@ -757,29 +585,18 @@ def forward(self, input_ids, position_ids=None): position_ids = seq_length - ones input_embedings = self.word_embeddings(input_ids) if _global_parallel_strategy == "mp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.word_embeddings.weight, _global_process_mesh, + ["x", None]) elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.word_embeddings.weight, _global_process_mesh, + ["y", None]) elif _global_parallel_strategy == 
"mp_pp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": MPPP_MESH_LIST[0], - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.word_embeddings.weight, MPPP_MESH_LIST[0], + ["x", None]) elif _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": DPMPPP_MESH_LIST[0], - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.word_embeddings.weight, DPMPPP_MESH_LIST[0], + ["y", None]) + position_embeddings = self.position_embeddings(position_ids) embeddings = input_embedings + position_embeddings embeddings = self.dropout(embeddings) @@ -868,29 +685,14 @@ def forward(self, embedding_output = self.embeddings(input_ids=input_ids, position_ids=position_ids) if _global_parallel_strategy == "pp": - auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": - PP_MESH_LIST[0], - "dims_mapping": - [-1 for i in range(len(input_ids.shape))] - }) + auto.shard_tensor(input_ids, PP_MESH_LIST[0], + [None for i in range(len(input_ids.shape))]) if _global_parallel_strategy == "dp_pp": - auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": - DPPP_MESH_LIST[0], - "dims_mapping": [0] + - [-1 for i in range(len(input_ids.shape) - 1)] - }) + auto.shard_tensor(input_ids, DPPP_MESH_LIST[0], ["x"] + + [None for i in range(len(input_ids.shape) - 1)]) if _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": - DPMPPP_MESH_LIST[0], - "dims_mapping": [0] + - [-1 for i in range(len(input_ids.shape) - 1)] - }) + auto.shard_tensor(input_ids, DPMPPP_MESH_LIST[0], ["x"] + + [None for i in range(len(input_ids.shape) - 1)]) encoder_outputs = self.decoder(embedding_output, memory=None, tgt_mask=attention_mask, @@ -914,12 +716,6 @@ def __init__( initializer_range=0.02, ): super(GPTForPretraining, self).__init__() - self.output_embeddings = nn.Embedding( - vocab_size, - hidden_size, - weight_attr=paddle.ParamAttr(name="output_embeddings", - initializer=nn.initializer.Normal( - mean=0.0, std=initializer_range))) self.gpt = gpt def forward(self, @@ -929,6 +725,10 @@ def forward(self, masked_positions=None, use_cache=False, cache=None): + input_ids.stop_gradient = True + position_ids.stop_gradient = True + attention_mask.stop_gradient = True + outputs = self.gpt(input_ids, position_ids=position_ids, attention_mask=attention_mask, @@ -938,9 +738,47 @@ def forward(self, encoder_outputs, cached_kvs = outputs[:2] else: encoder_outputs = outputs - logits = paddle.matmul(encoder_outputs, - self.output_embeddings.weight, - transpose_y=True) + + x = encoder_outputs + w = self.gpt.embeddings.word_embeddings.weight + + mesh = None + if _global_parallel_strategy == "pp": + mesh = PP_MESH_LIST[-1] + x_dims_mapping = [None for i in range(len(x.shape))] + w_dims_mapping = [None for i in range(len(w.shape))] + elif _global_parallel_strategy == "dp": + mesh = _global_process_mesh + x_dims_mapping = ["x"] + [None for i in range(len(x.shape) - 1)] + w_dims_mapping = [None for i in range(len(w.shape))] + elif _global_parallel_strategy == "mp": + mesh = _global_process_mesh + x_dims_mapping = [None for i in range(len(x.shape))] + w_dims_mapping = ["x"] + [None for i in range(len(w.shape) - 1)] + elif _global_parallel_strategy == "dp_mp": + mesh = _global_process_mesh + x_dims_mapping = ["x"] + [None for i in range(len(x.shape) - 1)] + w_dims_mapping = ["y"] + [None for i in range(len(w.shape) - 1)] + elif _global_parallel_strategy == "dp_pp": + mesh = DPPP_MESH_LIST[-1] + x_dims_mapping = ["x"] 
+ [None for i in range(len(x.shape) - 1)] + w_dims_mapping = [None for i in range(len(w.shape))] + elif _global_parallel_strategy == "mp_pp": + mesh = MPPP_MESH_LIST[-1] + x_dims_mapping = [None for i in range(len(x.shape))] + w_dims_mapping = ["x"] + [-1 for i in range(len(w.shape) - 1)] + elif _global_parallel_strategy == "dp_mp_pp": + mesh = DPMPPP_MESH_LIST[-1] + x_dims_mapping = ["x"] + [None for i in range(len(x.shape) - 1)] + w_dims_mapping = ["y"] + [None for i in range(len(w.shape) - 1)] + + if mesh: + matmul = auto.shard_op(paddle.matmul, mesh, + [x_dims_mapping, w_dims_mapping, None]) + logits = matmul(x, w, transpose_y=True) + else: + logits = paddle.matmul(x, w, transpose_y=True) + if use_cache: return logits, cached_kvs else: @@ -958,6 +796,30 @@ def __init__(self): self.loss_func = paddle.nn.CrossEntropyLoss(reduction="none") def forward(self, prediction_scores, masked_lm_labels, loss_mask): + masked_lm_labels.stop_gradient = True + loss_mask.stop_gradient = True + + mesh = None + if _global_parallel_strategy == "dp": + mesh = _global_process_mesh + dims_mapping = ["x" + ] + [None for i in range(len(loss_mask.shape) - 1)] + elif _global_parallel_strategy == "dp_mp": + mesh = _global_process_mesh + dims_mapping = ["x" + ] + [None for i in range(len(loss_mask.shape) - 1)] + elif _global_parallel_strategy == "dp_pp": + mesh = DPPP_MESH_LIST[-1] + dims_mapping = ["x" + ] + [None for i in range(len(loss_mask.shape) - 1)] + elif _global_parallel_strategy == "dp_mp_pp": + mesh = DPMPPP_MESH_LIST[-1] + dims_mapping = ["x" + ] + [None for i in range(len(loss_mask.shape) - 1)] + + if mesh: + auto.shard_tensor(loss_mask, mesh, dims_mapping) + masked_lm_loss = self.loss_func(prediction_scores, masked_lm_labels.unsqueeze(2)) loss_mask = loss_mask.reshape([-1]) diff --git a/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py index 12f4cc08b0874e..f1671eaf70d332 100644 --- a/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py +++ b/python/paddle/fluid/tests/unittests/auto_parallel_save_load.py @@ -25,7 +25,7 @@ import paddle.utils as utils import paddle.static as static import paddle.nn.functional as F -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed import fleet from paddle.fluid.initializer import NumpyArrayInitializer @@ -64,38 +64,18 @@ def __init__(self, def forward(self, input): if _global_parallel_strategy == "pp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, None]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, [None, None]) elif _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh, + [None, "x"]) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + ["x", None]) elif _global_parallel_strategy == "dp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - 
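# A hedged sketch of the shard_op pattern used above: the callable is wrapped
# together with a mesh and a list of per-input dims mappings, and the wrapper is
# then invoked with the ordinary arguments. The 2x2 mesh, the tensor shapes and
# the trailing None entry (mirroring the logits matmul above) are assumptions
# made only for illustration.
import paddle
from paddle.distributed.fleet import auto

paddle.enable_static()
mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"])  # e.g. dp axis "x", mp axis "y"
with paddle.static.program_guard(paddle.static.Program(), paddle.static.Program()):
    hidden = paddle.static.data(name="hidden", shape=[4, 16, 32], dtype="float32")
    weight = paddle.static.data(name="weight", shape=[100, 32], dtype="float32")
    sharded_matmul = auto.shard_op(
        paddle.matmul, mesh,
        [["x", None, None], ["y", None], None])  # one dims mapping per input, as above
    logits = sharded_matmul(hidden, weight, transpose_y=True)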
"process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh, + [None, None]) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + [None, None]) out = self.norm(input) out = self.linear0(out) @@ -119,28 +99,12 @@ def mlp_forward(train_program, start_program): dtype='float32') if _global_parallel_strategy == "pp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, PP_MESH_0, [None, None]) + auto.shard_tensor(label, PP_MESH_1, [None, None]) elif _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, _global_process_mesh, ["x", None]) elif _global_parallel_strategy == "mp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, _global_process_mesh, [None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -183,7 +147,7 @@ def test_mlp_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) dist_main_prog, dist_start_prog, loss = get_distributed_program() place = paddle.set_device("gpu") @@ -230,7 +194,7 @@ def test_mlp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) dist_main_prog, dist_start_prog, loss = get_distributed_program() @@ -278,11 +242,11 @@ def test_mlp_pp(self): global _global_parallel_strategy _global_parallel_strategy = "pp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh([0, 1]) + _global_process_mesh = auto.ProcessMesh([0, 1], dim_names=["x"]) global PP_MESH_0 - PP_MESH_0 = auto.ProcessMesh(mesh=[0]) + PP_MESH_0 = auto.ProcessMesh(mesh=[0], dim_names=["x"]) global PP_MESH_1 - PP_MESH_1 = auto.ProcessMesh(mesh=[1]) + PP_MESH_1 = auto.ProcessMesh(mesh=[1], dim_names=["x"]) dist_main_prog, dist_start_prog, loss = get_distributed_program() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py b/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py index 51104223f954db..c8e1b3965228d7 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_jvp_and_transpose.py @@ -400,6 +400,75 @@ def init_data(self): ] +class TestAbsPJVPAndTranspose(TestAddPJVPAndTranspose): + + def init_data(self): + # Set prim op + self.op_type = 'abs_p' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + self.prim_input = { + 'X': X, + } + self.prim_output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[5, 6], dtype='int64') + self.jvp_args = (X_DOT, ) + self.jvp_out_shape_map = {0: self.prim_output['Y']} + + self.all_ops = [ + # prim op: + 'abs_p', + # jvp op: + 'select_p', + 'ge_p', + 'fill_constant_p', + 'fill_constant_p', + 'sub_p', + # transpose op: + ] + + +class 
TestCastPJVPAndTranspose(TestAddPJVPAndTranspose): + + def init_data(self): + # Set prim op + self.op_type = 'cast_p' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + self.prim_input = { + 'X': X, + } + self.prim_output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {'dtype': paddle.float64} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[5, 6], dtype='int64') + self.jvp_args = (X_DOT, ) + self.jvp_out_shape_map = {0: self.prim_output['Y']} + + # Set transpose + check_dot = lambda v: True + Y_BAR = paddle.static.data(name='Y_BAR', shape=[5, 6], dtype='float') + self.transpose_args = (check_dot, Y_BAR) + self.transpose_out_shape_map = {0: X} + + self.all_ops = [ + # prim op: + 'cast_p', + # jvp op: + 'cast_p', + # transpose op: + 'cast_p' + ] + + class TestLogPJVPAndTranspose(TestAddPJVPAndTranspose): def init_data(self): @@ -503,7 +572,7 @@ def init_data(self): # jvp op: 'broadcast_p', # transpose op: - 'reduce_p', + 'reduce_sum_p', 'reshape_p' ] @@ -650,11 +719,11 @@ def init_data(self): ] -class TestReducePJVPAndTranspose(TestAddPJVPAndTranspose): +class TestReduceSumPJVPAndTranspose(TestAddPJVPAndTranspose): def init_data(self): # Set prim op - self.op_type = 'reduce_p' + self.op_type = 'reduce_sum_p' X = paddle.static.data(name='X', shape=[2, 3, 4, 5], dtype='float64') self.prim_input = {'X': X} self.prim_output = { @@ -682,9 +751,9 @@ def init_data(self): self.all_ops = [ # prim op: - 'reduce_p', + 'reduce_sum_p', # jvp op: - 'reduce_p', + 'reduce_sum_p', # transpose op: 'reshape_p', 'broadcast_p', @@ -978,6 +1047,96 @@ def init_data(self): ] +class TestGtPJVPAndTranspose(TestAddPJVPAndTranspose): + + def init_data(self): + # Set prim op + self.op_type = 'gt_p' + X = paddle.static.data(name='X', shape=[4, 5], dtype='float64') + Y = paddle.static.data(name='Y', shape=[4, 5], dtype='float64') + + self.prim_input = {'X': X, 'Y': Y} + self.prim_output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[4, 5], dtype='float64') + Y_DOT = paddle.static.data(name='Y_DOT', shape=[4, 5], dtype='float64') + self.jvp_args = (X_DOT, Y_DOT) + self.jvp_out_shape_map = {0: self.prim_output['Z']} + + self.all_ops = [ + # prim op: + 'gt_p', + # jvp op: + 'fill_constant_p', + # transpose op: + ] + + +class TestGePJVPAndTranspose(TestAddPJVPAndTranspose): + + def init_data(self): + # Set prim op + self.op_type = 'ge_p' + X = paddle.static.data(name='X', shape=[4, 5], dtype='float64') + Y = paddle.static.data(name='Y', shape=[4, 5], dtype='float64') + + self.prim_input = {'X': X, 'Y': Y} + self.prim_output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[4, 5], dtype='float64') + Y_DOT = paddle.static.data(name='Y_DOT', shape=[4, 5], dtype='float64') + self.jvp_args = (X_DOT, Y_DOT) + self.jvp_out_shape_map = {0: self.prim_output['Z']} + + self.all_ops = [ + # prim op: + 'ge_p', + # jvp op: + 'fill_constant_p', + # transpose op: + ] + + +class TestNePJVPAndTranspose(TestAddPJVPAndTranspose): + + def init_data(self): + # Set prim op + self.op_type = 'ne_p' + X = paddle.static.data(name='X', shape=[4, 5], dtype='float64') + Y = paddle.static.data(name='Y', shape=[4, 5], dtype='float64') + + self.prim_input = {'X': X, 'Y': Y} + self.prim_output = { + 'Z': + 
self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.prim_attrs = {} + + # Set JVP + X_DOT = paddle.static.data(name='X_DOT', shape=[4, 5], dtype='float64') + Y_DOT = paddle.static.data(name='Y_DOT', shape=[4, 5], dtype='float64') + self.jvp_args = (X_DOT, Y_DOT) + self.jvp_out_shape_map = {0: self.prim_output['Z']} + + self.all_ops = [ + # prim op: + 'ne_p', + # jvp op: + 'fill_constant_p', + # transpose op: + ] + + class TestPowPJVPAndTranspose(TestAddPJVPAndTranspose): def init_data(self): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py b/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py index c9f1aa6c41a957..e1d5ee11a13ace 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_orig2prim.py @@ -18,6 +18,7 @@ from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.layers.utils import flatten from paddle.incubate.autograd.primrules import _orig2prim, _prim2orig, _jvp, _transpose +import paddle.fluid.core as core paddle.enable_static() @@ -109,6 +110,26 @@ def init_data(self): self.out_map = {0: self.output['Out']} +class TestElementWiseDivOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'elementwise_div' + X = paddle.static.data(name='X', shape=[8, 8], dtype='float') + Y = paddle.static.data(name='Y', shape=[8, 8], dtype='float') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.orig2prim_args = (X, Y) + self.all_ops = ['elementwise_div', 'div_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + class TestMatmulV2Orig2Prim(TestElementWiseAddOrig2Prim): def init_data(self): @@ -228,6 +249,26 @@ def init_data(self): self.out_map = {0: self.output['Out']} +class TestAbsOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'abs' + X = paddle.static.data(name='X', shape=[3, 4], dtype='float') + + self.input = { + 'X': X, + } + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.orig2prim_args = (X, ) + self.all_ops = ['abs', 'abs_p'] + self.out_map = {0: self.output['Out']} + + class TestLogOrig2Prim(TestElementWiseAddOrig2Prim): def init_data(self): @@ -343,6 +384,46 @@ def init_data(self): self.out_map = {0: self.output['Out']} +class TestFillAnyLikeOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'fill_any_like' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + + self.input = { + 'X': X, + } + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.orig2prim_args = (X, ) + self.all_ops = ['fill_any_like', 'fill_constant_p'] + self.out_map = {0: self.output['Out']} + + +class TestFillAnyLikeOrig2Prim2(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'fill_any_like' + X = paddle.static.data(name='X', shape=[5, 6], dtype='int64') + + self.input = { + 'X': X, + } + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'dtype': paddle.float32, 'value': 5} + + self.orig2prim_args = (X, ) + self.all_ops = ['fill_any_like', 'fill_constant_p'] + self.out_map = {0: self.output['Out']} + + class TestSumOrig2Prim(TestElementWiseAddOrig2Prim): def init_data(self): @@ -381,7 
+462,9 @@ def init_data(self): } self.orig2prim_args = (X, ) - self.all_ops = ['p_norm', 'reshape_p', 'sqrt_p', 'reduce_p', 'mul_p'] + self.all_ops = [ + 'p_norm', 'reshape_p', 'sqrt_p', 'reduce_sum_p', 'mul_p' + ] self.out_map = {0: self.output['Out']} @@ -404,7 +487,9 @@ def init_data(self): } self.orig2prim_args = (X, ) - self.all_ops = ['p_norm', 'reshape_p', 'sqrt_p', 'reduce_p', 'mul_p'] + self.all_ops = [ + 'p_norm', 'reshape_p', 'sqrt_p', 'reduce_sum_p', 'mul_p' + ] self.out_map = {0: self.output['Out']} @@ -539,6 +624,63 @@ def init_data(self): self.out_map = {0: self.output['Out']} +class TestNeOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'not_equal' + X = paddle.static.data(name='X', shape=[5, 8], dtype='float') + Y = paddle.static.data(name='Y', shape=[5, 8], dtype='float') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype='bool') + } + self.attrs = {} + self.orig2prim_args = (X, Y) + self.all_ops = ['not_equal', 'ne_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + +class TestGtOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'greater_than' + X = paddle.static.data(name='X', shape=[5, 8], dtype='float') + Y = paddle.static.data(name='Y', shape=[5, 8], dtype='float') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype='bool') + } + self.attrs = {} + self.orig2prim_args = (X, Y) + self.all_ops = ['greater_than', 'gt_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + +class TestGeOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'greater_equal' + X = paddle.static.data(name='X', shape=[5, 8], dtype='float') + Y = paddle.static.data(name='Y', shape=[5, 8], dtype='float') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype='bool') + } + self.attrs = {} + self.orig2prim_args = (X, Y) + self.all_ops = ['greater_equal', 'ge_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + class TestPowOrig2Prim(TestElementWiseAddOrig2Prim): def init_data(self): @@ -624,5 +766,118 @@ def init_data(self): self.out_map = {0: self.output['Out']} +class TestReduceSumOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'reduce_sum' + X = paddle.static.data(name='X', shape=[5, 8], dtype='float') + + self.input = {'X': X} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'axis': [0, 1], 'keep_dim': False} + + self.orig2prim_args = (X, ) + self.all_ops = ['reduce_sum', 'reduce_sum_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + +class TestReduceMeanOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'reduce_mean' + X = paddle.static.data(name='X', shape=[5, 8], dtype='float') + + self.input = {'X': X} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'axis': [0, 1], 'keep_dim': False} + + self.orig2prim_args = (X, ) + self.all_ops = [ + 'reduce_mean', 'reduce_sum_p', 'fill_constant_p', 'div_p' + ] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + +class 
TestSizeOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'size' + X = paddle.static.data(name='X', shape=[5, 8], dtype='float') + + self.input = {'Input': X} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference( + dtype=paddle.int64) + } + self.attrs = {} + self.orig2prim_args = (X, ) + self.all_ops = ['size', 'fill_constant_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + +class TestCastOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'cast' + X = paddle.static.data(name='X', shape=[5, 8], dtype='float') + + self.input = {'X': X} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'in_dtype': X.dtype, 'out_dtype': paddle.float64} + self.orig2prim_args = (X, ) + self.all_ops = ['cast', 'cast_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + +class TestPowScalarOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'pow' + X = paddle.static.data(name='X', shape=[5, 8], dtype='float') + + self.input = {'X': X} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'factor': 2.} + self.orig2prim_args = (None, X) + self.all_ops = ['pow', 'pow_p', 'fill_constant_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + +class TestSquareOrig2Prim(TestElementWiseAddOrig2Prim): + + def init_data(self): + self.op_type = 'square' + X = paddle.static.data(name='X', shape=[5, 8], dtype='float') + + self.input = {'X': X} + self.output = { + 'Out': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + self.orig2prim_args = (X, ) + self.all_ops = ['square', 'pow_p', 'fill_constant_p'] + # { prim_op_output_index: orig_op_output_var } + self.out_map = {0: self.output['Out']} + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py b/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py index 4d0f150073604f..a89b91bdd2b64a 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_prim2orig.py @@ -244,6 +244,26 @@ def init_data(self): self.out_map = {self.output['Y']: 0} +class TestAbsPPrim2Orig(TestAddPPrim2Orig): + + def init_data(self): + self.op_type = 'abs_p' + X = paddle.static.data(name='X', shape=[7, 8], dtype='float64') + + self.input = { + 'X': X, + } + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {} + + self.prim2orig_args = (X, ) + self.all_ops = ['abs_p', 'abs'] + self.out_map = {self.output['Y']: 0} + + class TestLogPPrim2Orig(TestAddPPrim2Orig): def init_data(self): @@ -375,7 +395,7 @@ def init_data(self): class TestReducePPrim2Orig(TestAddPPrim2Orig): def init_data(self): - self.op_type = 'reduce_p' + self.op_type = 'reduce_sum_p' X = paddle.static.data(name='X', shape=[3, 9, 5], dtype='float64') self.input = {'X': X} @@ -386,7 +406,7 @@ def init_data(self): self.attrs = {'axis': [1], 'keepdim': True} self.prim2orig_args = (X, ) - self.all_ops = ['reduce_p', 'reduce_sum'] + self.all_ops = ['reduce_sum_p', 'reduce_sum'] self.out_map = {self.output['Y']: 0} @@ -555,6 +575,63 @@ def init_data(self): self.out_map = {self.output['Z']: 0} +class 
TestNePPrim2Orig(TestAddPPrim2Orig): + + def init_data(self): + self.op_type = 'ne_p' + X = paddle.static.data(name='X', shape=[7, 8], dtype='float64') + Y = paddle.static.data(name='Y', shape=[7, 8], dtype='float64') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype='bool') + } + self.attrs = {} + + self.prim2orig_args = (X, Y) + self.all_ops = ['ne_p', 'not_equal'] + self.out_map = {self.output['Z']: 0} + + +class TestGtPPrim2Orig(TestAddPPrim2Orig): + + def init_data(self): + self.op_type = 'gt_p' + X = paddle.static.data(name='X', shape=[7, 8], dtype='float64') + Y = paddle.static.data(name='Y', shape=[7, 8], dtype='float64') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype='bool') + } + self.attrs = {} + + self.prim2orig_args = (X, Y) + self.all_ops = ['gt_p', 'greater_than'] + self.out_map = {self.output['Z']: 0} + + +class TestGePPrim2Orig(TestAddPPrim2Orig): + + def init_data(self): + self.op_type = 'ge_p' + X = paddle.static.data(name='X', shape=[7, 8], dtype='float64') + Y = paddle.static.data(name='Y', shape=[7, 8], dtype='float64') + + self.input = {'X': X, 'Y': Y} + self.output = { + 'Z': + self.layer_help.create_variable_for_type_inference(dtype='bool') + } + self.attrs = {} + + self.prim2orig_args = (X, Y) + self.all_ops = ['ge_p', 'greater_equal'] + self.out_map = {self.output['Z']: 0} + + class TestPowPPrim2Orig(TestAddPPrim2Orig): def init_data(self): @@ -593,5 +670,25 @@ def init_data(self): self.out_map = {self.output['Z']: 0} +class TestCastPPrim2Orig(TestAddPPrim2Orig): + + def init_data(self): + self.op_type = 'cast_p' + X = paddle.static.data(name='X', shape=[7, 8], dtype='float64') + + self.input = { + 'X': X, + } + self.output = { + 'Y': + self.layer_help.create_variable_for_type_inference(dtype=X.dtype) + } + self.attrs = {'dtype': paddle.int64} + + self.prim2orig_args = (X, ) + self.all_ops = ['cast_p', 'cast'] + self.out_map = {self.output['Y']: 0} + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py index 7edbb6ef77f26e..bdc54563fc8d2a 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primapi.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primapi.py @@ -150,6 +150,8 @@ def without_program_guard(): (np.random.rand(3, 3), np.random.rand(3, 3)), (np.random.rand(3, 3), np.random.rand(3, 3)), 'float64'), ('log', paddle.log, (np.random.rand(3, 4), ), None, 'float32'), + ('abs', paddle.abs, (np.random.uniform(-10, 10, + (10, 10)), ), None, 'float32'), )) # paddle.where, paddle.pow, paddle.maximum has no double grad definition, # can not compute forward grad use double trick @@ -255,6 +257,8 @@ def test_illegal_param(self): (np.random.rand(2, 3), np.random.rand(3, 2)), None, 'float32'), ('multiply', paddle.multiply, (np.random.rand(2, 3), np.random.rand(2, 3)), None, 'float64'), + ('div', paddle.divide, + (np.random.rand(2, 3), np.random.rand(2, 3)), None, 'float64'), ('add', paddle.add, (np.random.rand(2, 3), np.random.rand(2, 3)), None, 'float32'), ('input_not_sequence', paddle.tanh, @@ -283,7 +287,36 @@ def test_illegal_param(self): (np.random.rand(200, 189), ), None, 'float32'), ('gelu_approximate', lambda x: paddle.nn.functional.gelu(x, True), (np.random.rand(200, 189), ), None, 'float32'), - )) + ('sum', paddle.sum, (np.random.rand(200, 345), ), None, 'float32'), + 
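# A rough sketch, assuming the paddle.incubate.autograd entry points these tests
# exercise (enable_prim / grad): with prim ops enabled, a newly covered op such as
# paddle.var, paddle.mean or paddle.cast is differentiated by lowering to the *_p
# primitives (reduce_sum_p, div_p, cast_p, ...) rather than its original backward
# kernel. Treat the exact function names here as assumptions; only the op coverage
# itself comes from the test parameter lists above.
import paddle
from paddle.incubate import autograd

paddle.enable_static()
autograd.enable_prim()
main, startup = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main, startup):
    x = paddle.static.data(name="x", shape=[10, 20, 30], dtype="float32")
    x.stop_gradient = False
    y = paddle.var(x, axis=1, keepdim=True)
    x_grad = autograd.grad(y, x)   # builds the reverse graph out of *_p ops
autograd.disable_prim()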
('sum_with_axis', lambda x: paddle.sum(x, axis=1), + (np.random.rand(200, 345), ), None, 'float32'), + ('sum_with_keepdim', lambda x: paddle.sum(x, keepdim=True), + (np.random.rand(200, 345), ), None, 'float32'), + ('mean', paddle.mean, (np.random.rand(200, 345), ), None, 'float32'), + ('mean_with_axis', lambda x: paddle.mean(x, axis=1), + (np.random.rand(200, 345), ), None, 'float32'), + ('mean_with_keepdim', lambda x: paddle.mean(x, keepdim=True), + (np.random.rand(200, 345), ), None, 'float32'), + ('mean_with_axis_keepdim', + lambda x: paddle.mean(x, axis=0, keepdim=True), + (np.random.rand(200, 345), ), None, 'float32'), + ('abs', paddle.abs, (np.random.uniform(-10, 10, + (200, 345)), ), None, 'float32'), + ('cast_float', lambda x: paddle.cast(x, paddle.float64), + (np.random.rand(10, 20), ), None, 'float32'), + ('cast_int', lambda x: paddle.cast(x, paddle.int32), + (np.random.rand(10, 20), ), None, 'float32'), + ('square', paddle.square, (np.random.rand(100), ), None, 'float32'), + ('pow_scalar', lambda x: paddle.pow(x, 2), + (np.random.rand(20, 30), ), None, 'float32'), + ('var', paddle.var, (np.random.rand(200, 324), ), None, 'float32'), + ('var_with_axis', lambda x: paddle.var(x, axis=1), + (np.random.rand(10, 20, 30), ), None, 'float32'), + ('var_without_unbiased', + lambda x: paddle.var(x, axis=1, unbiased=False), + (np.random.rand(10, 20, 30), ), None, 'float32'), + ('var_with_keepdim', lambda x: paddle.var(x, axis=1, keepdim=True), + (np.random.rand(10, 20, 30), ), None, 'float32'))) class TestGrad(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/autograd/test_primops.py b/python/paddle/fluid/tests/unittests/autograd/test_primops.py index 25a3d9bce235a0..35291432f6e8fb 100644 --- a/python/paddle/fluid/tests/unittests/autograd/test_primops.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_primops.py @@ -42,7 +42,11 @@ ('cos', primops.cos, randn(2, 3), {}, (2, 3), 'float64'), ('exp', primops.exp, randn(2, 3), {}, (2, 3), 'float64'), ('erf', primops.erf, randn(2, 3), {}, (2, 3), 'float64'), + ('abs', primops.abs, randn(2, 3), {}, (2, 3), 'float64'), ('log', primops.log, randn(2, 3), {}, (2, 3), 'float64'), + ('cast', primops.cast, randn(2, 3), { + 'dtype': paddle.int64 + }, (2, 3), 'int64'), ('reshape', primops.reshape, randn(2, 3), { 'shape': (3, 2) }, (3, 2), 'float64'), @@ -58,10 +62,10 @@ ('concat_axis1', primops.concat, ((randn(2, 3), randn(2, 3)), ), { 'axis': 1 }, (2, 6), 'float64'), - ('reduce_axis1', primops.reduce, randn(2, 3), { + ('reduce_axis1', primops.reduce_sum, randn(2, 3), { 'axis': (1, ) }, (2, ), 'float64'), - ('reduce_axis01', primops.reduce, randn(2, 3), { + ('reduce_axis01', primops.reduce_sum, randn(2, 3), { 'axis': (0, 1) }, (1, ), 'float64'), ('split', primops.split, randn(2, 3), { @@ -99,6 +103,9 @@ ('select', primops.select, (randn(2, 3) > 0, randn(2, 3), randn(2, 3)), {}, (2, 3), 'float64'), ('eq', primops.eq, (randn(2, 3), randn(2, 3)), {}, (2, 3), 'bool'), + ('ne', primops.ne, (randn(2, 3), randn(2, 3)), {}, (2, 3), 'bool'), + ('gt', primops.gt, (randn(2, 3), randn(2, 3)), {}, (2, 3), 'bool'), + ('ge', primops.ge, (randn(2, 3), randn(2, 3)), {}, (2, 3), 'bool'), ('pow', primops.pow, (randn(2, 3), randn(2, 3)), {}, (2, 3), 'float64'), ('max', primops.max, (randn(2, 3), randn(2, 3)), {}, (2, 3), 'float64'), )) diff --git a/python/paddle/fluid/tests/unittests/autograd/test_transform.py b/python/paddle/fluid/tests/unittests/autograd/test_transform.py index f976ef729cc7a0..6c0aa697550bc3 100644 --- 
a/python/paddle/fluid/tests/unittests/autograd/test_transform.py +++ b/python/paddle/fluid/tests/unittests/autograd/test_transform.py @@ -290,8 +290,8 @@ def init_data(self): 'index_select' ] self.orig2prim_ops = [ - 'broadcast_p', 'add_p', 'reshape_p', 'mul_p', 'reduce_p', 'sqrt_p', - 'broadcast_p', 'sub_p', 'concat_p', 'gather_p' + 'broadcast_p', 'add_p', 'reshape_p', 'mul_p', 'reduce_sum_p', + 'sqrt_p', 'broadcast_p', 'sub_p', 'concat_p', 'gather_p' ] self.linearize_ops = self.orig2prim_ops + [ # call fill_const() in linearize() function @@ -306,7 +306,7 @@ def init_data(self): 'mul_p', 'mul_p', 'add_p', - 'reduce_p', + 'reduce_sum_p', 'fill_constant_p', # 'sqrt_p', Will not append sqrt_p op when apply JVP for sqrt_p 'mul_p', 'div_p', @@ -326,7 +326,7 @@ def init_data(self): 'fill_constant_p', 'mul_p', # transposed op - 'reduce_p', + 'reduce_sum_p', 'reshape_p', 'reshape_p', 'mul_p', @@ -334,7 +334,7 @@ def init_data(self): 'reshape_p', 'broadcast_p', 'div_p', - 'reduce_p', + 'reduce_sum_p', 'reshape_p', 'fill_constant_p', 'sub_p', diff --git a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt index 4431f16d7b6e52..69cfef8e58fba1 100644 --- a/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/CMakeLists.txt @@ -71,14 +71,14 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_allreduce_api MODULES test_collective_allreduce_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_allreduce_api - PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( test_collective_alltoall_api MODULES test_collective_alltoall_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_alltoall_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) bash_test_modules( @@ -98,7 +98,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_alltoall_single_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_alltoall_single_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -125,7 +125,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_broadcast_api MODULES test_collective_broadcast_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_broadcast_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -154,7 +154,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_isend_irecv_api MODULES test_collective_isend_irecv_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_isend_irecv_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -187,7 +187,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_api MODULES test_collective_reduce_api ENVS 
"http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_reduce_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) bash_test_modules( @@ -207,7 +207,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_reduce_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_reduce_scatter_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -221,7 +221,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_scatter_api MODULES test_collective_scatter_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_scatter_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "180" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -235,7 +235,7 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_collective_sendrecv_api MODULES test_collective_sendrecv_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") set_tests_properties(test_collective_sendrecv_api - PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST") + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( @@ -268,17 +268,82 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_eager_dist_api MODULES test_eager_dist_api ENVS - "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") - set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT "120" LABELS - "RUN_TYPE=DIST") + test_communication_stream_allgather_api MODULES + test_communication_stream_allgather_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_allgather_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_new_group_api MODULES test_new_group_api ENVS + test_communication_stream_allreduce_api MODULES + test_communication_stream_allreduce_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_allreduce_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_alltoall_api MODULES + test_communication_stream_alltoall_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_alltoall_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_alltoall_single_api MODULES + test_communication_stream_alltoall_single_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_alltoall_single_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_broadcast_api MODULES + test_communication_stream_broadcast_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + 
set_tests_properties(test_communication_stream_broadcast_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_reduce_api MODULES + test_communication_stream_reduce_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_reduce_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_reduce_scatter_api MODULES + test_communication_stream_reduce_scatter_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_reduce_scatter_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_scatter_api MODULES + test_communication_stream_scatter_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_scatter_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_communication_stream_sendrecv_api MODULES + test_communication_stream_sendrecv_api ENVS + "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") + set_tests_properties(test_communication_stream_sendrecv_api + PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") +endif() +if((WITH_GPU OR WITH_ROCM) AND (LINUX)) + py_test_modules( + test_eager_dist_api MODULES test_eager_dist_api ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") - set_tests_properties(test_new_group_api PROPERTIES TIMEOUT "120" LABELS - "RUN_TYPE=DIST") + set_tests_properties(test_eager_dist_api PROPERTIES TIMEOUT "120" LABELS + "RUN_TYPE=DIST") endif() if((WITH_GPU OR WITH_ROCM @@ -298,11 +363,21 @@ if((WITH_GPU endif() if((WITH_GPU OR WITH_ROCM) AND (LINUX)) py_test_modules( - test_communication_stream_allreduce_api MODULES - test_communication_stream_allreduce_api ENVS - "PYTHONPATH=..:${PADDLE_BINARY_DIR}/python;http_proxy=;https_proxy=") - set_tests_properties(test_communication_stream_allreduce_api - PROPERTIES TIMEOUT "120" LABELS "RUN_TYPE=DIST") + test_new_group_api MODULES test_new_group_api ENVS + "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python") + set_tests_properties(test_new_group_api PROPERTIES TIMEOUT "120" LABELS + "RUN_TYPE=DIST") +endif() +if((WITH_ROCM OR WITH_GPU) AND (LINUX)) + bash_test_modules( + test_world_size_and_rank + START_BASH + test_world_size_and_rank.sh + LABELS + "RUN_TYPE=DIST" + ENVS + "PADDLE_DIST_UT_PORT=21532;http_proxy=;https_proxy=") + set_tests_properties(test_world_size_and_rank PROPERTIES TIMEOUT "120") endif() add_subdirectory(fleet) add_subdirectory(multinode) diff --git a/python/paddle/fluid/tests/unittests/collective/README.md b/python/paddle/fluid/tests/unittests/collective/README.md index 2370ce07e05b4a..e4d3c90c309dd1 100644 --- a/python/paddle/fluid/tests/unittests/collective/README.md +++ b/python/paddle/fluid/tests/unittests/collective/README.md @@ -8,11 +8,11 @@ * `name`: the test's name * `os`: The supported operator system, ignoring case. If the test run in multiple operator systems, use ";" to split systems, for example, `apple;linux` means the test runs on both Apple and Linux. The supported values are `linux`,`win32` and `apple`. 
If the value is empty, this means the test runs on all operating systems. * `arch`: the device's architecture. Similar to `os`, multiple values are separated by ";" and are case-insensitive. The supported architectures are `gpu`, `xpu`, `ASCEND`, `ASCEND_CL` and `rocm`. -* `timeout`: timeout of a unittest, whose unit is second. Blank means defalut. -* `run_type`: run_type of a unittest. Supported values are `NIGHTLY`, `EXCLUSIVE`, `CINN`, `DIST`, `GPUPS`, `INFER`, `EXCLUSIVE:NIGHTLY`, `DIST:NIGHTLY`,which are case-insensitive. +* `timeout`: timeout of a unittest, in seconds. Blank means the default. +* `run_type`: run_type of a unittest. Supported values are `NIGHTLY`, `EXCLUSIVE`, `CINN`, `DIST`, `GPUPS`, `INFER`, `EXCLUSIVE:NIGHTLY`, `DIST:NIGHTLY`, which are case-insensitive. * `launcher`: the test launcher. Supported values are test_runner.py, dist_test.sh and custom scripts' names. Blank means test_runner.py. -* `num_port`: the number of port used in a distributed unit test. Blank means automatically distributed port. -* `run_serial`: whether in serial mode. the value can be 1 or 0.Default (empty) is 0. Blank means defalut. +* `num_port`: the number of ports used in a distributed unit test. Blank means ports are assigned automatically. +* `run_serial`: whether to run in serial mode. The value can be 1 or 0. Blank means the default, which is 0. * `ENVS`: required environment variables. Multiple environment variables are separated by ";". * `conditions`: extra conditions required by some tests. The value is a list of boolean expressions in CMake syntax, separated by ";". For example, the value can be `WITH_DGC;NOT WITH_NCCL` or `WITH_NCCL;${NCCL_VERSION} VERSION_GREATER_EQUAL 2212`. The expressions are combined as a conjunction. diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py index d485fd23d95710..38e1cc555da752 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allgather_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,10 +28,18 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) tensor_list = [] - paddle.distributed.all_gather(tensor_list, tindata) - return [tensor.numpy() for tensor in tensor_list] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_gather(tensor_list, tindata) + return [ + tensor.cast("float32").numpy() for tensor in tensor_list + ] + else: + tindata = paddle.to_tensor(indata) + dist.all_gather(tensor_list, tindata) + return [tensor.numpy() for tensor in tensor_list] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py index 83588d450a7c91..92cc5a1623d9f8 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_api_dygraph.py
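The dygraph cases above and below share the workaround flagged by their NOTE comments: when the test payload is bfloat16, the data is loaded as float32, cast to uint16 (the dtype Paddle uses to carry bfloat16 values), passed through the collective, and cast back to float32 before the numpy comparison. A minimal sketch of that dtype round-trip, with the collective call omitted and a plain float32 array standing in for the generated test data:

import numpy as np
import paddle

# stand-in for the bfloat16 payload produced by create_test_data()
data = np.random.random((2, 3)).astype("float32")

# load as float32, then cast to uint16 -- the carrier dtype for bfloat16 in Paddle
tindata = paddle.to_tensor(data, "float32").cast("uint16")

# a collective such as dist.all_gather(tensor_list, tindata) would run here

# cast back to float32 before converting to numpy; values agree within bfloat16 precision
out = tindata.cast("float32").numpy()
print(np.allclose(out, data, rtol=1e-2, atol=1e-2))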
@@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,9 +28,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.all_reduce(tindata) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.all_reduce(tindata) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.all_reduce(tindata) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_new_group_api.py b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_new_group_api.py index 9d4e21aefff56c..b773826169963b 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_allreduce_new_group_api.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_allreduce_new_group_api.py @@ -49,9 +49,7 @@ def get_model(self, main_prog, startup_program, rank): shape=[10, 1000], dtype='float32') gp = paddle.distributed.new_group([0, 1]) - paddle.distributed.all_reduce(tindata, - group=gp, - use_calc_stream=True) + paddle.distributed.all_reduce(tindata, group=gp, sync_op=True) return [tindata] diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py index fcabaffd614d03..da6c5ec1b3ad2a 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_api_dygraph.py @@ -25,30 +25,31 @@ from six import string_types import math import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import paddle.fluid.profiler as profiler -import paddle.fluid.unique_name as nameGen -from paddle.fluid import core -import unittest -from multiprocessing import Process -import paddle.fluid.layers as layers -from functools import reduce -from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import test_collective_api_base as test_base -class TestCollectiveAllToAllAPI(TestCollectiveAPIRunnerBase): +class TestCollectiveAllToAllAPI(test_base.TestCollectiveAPIRunnerBase): def __init__(self): self.global_ring_id = 0 def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - tindata = paddle.split(tindata, 2, axis=0) toutdata = [] - paddle.distributed.alltoall(tindata, toutdata) - return [data.numpy() for data in toutdata] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.cast("float32").numpy() for data in toutdata] + else: + tindata = paddle.to_tensor(indata) + tindata = paddle.split(tindata, 2, axis=0) + dist.alltoall(tindata, toutdata) + return [data.numpy() for data in toutdata] if __name__ == "__main__": - 
runtime_main(TestCollectiveAllToAllAPI, "alltoall") + test_base.runtime_main(TestCollectiveAllToAllAPI, "alltoall") diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py index cb6777d20bc25b..3b52ac0e03ff6f 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single.py @@ -69,7 +69,7 @@ def test_collective_alltoall_single(self): output, in_split_sizes, out_split_sizes, - use_calc_stream=False, + sync_op=False, group=group) task.wait() diff --git a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py index 5fac73989a6060..1100a4a481b5a8 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_alltoall_single_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import test_collective_api_base as test_base @@ -26,10 +27,17 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - toutdata = paddle.to_tensor(indata) - paddle.distributed.alltoall_single(tindata, toutdata) - return [toutdata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + toutdata = paddle.to_tensor(tindata, "float32").cast("uint16") + dist.alltoall_single(tindata, toutdata) + return [toutdata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + toutdata = paddle.to_tensor(indata) + dist.alltoall_single(tindata, toutdata) + return [toutdata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py index 29f0b74bb405b8..de80e3b99a2d56 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_broadcast_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,9 +28,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.broadcast(tindata, src=1) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.broadcast(tindata, src=1) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.broadcast(tindata, src=1) + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_global_gather.py 
b/python/paddle/fluid/tests/unittests/collective/collective_global_gather.py index 60909f63211de9..fd6e8106da70f4 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_global_gather.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_global_gather.py @@ -24,6 +24,7 @@ from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main import pickle from paddle.fluid.framework import _enable_legacy_dygraph +import paddle.distributed.utils.moe_utils as moe_utils paddle.enable_static() @@ -51,8 +52,9 @@ def get_model(self, main_prog, startup_program, rank, indata=None): shape=[tot_expert], dtype="int64") - output = paddle.distributed.utils.global_gather( - local_input_buf, local_expert_count, global_expert_count) + output = moe_utils.global_gather(local_input_buf, + local_expert_count, + global_expert_count) return [output] diff --git a/python/paddle/fluid/tests/unittests/collective/collective_global_gather_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_global_gather_dygraph.py index 0b264f5ba89669..39749b81277948 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_global_gather_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_global_gather_dygraph.py @@ -22,6 +22,7 @@ import unittest import paddle.fluid.layers as layers from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import paddle.distributed.utils.moe_utils as moe_utils class TestCollectiveGlobalGatherAPI(TestCollectiveAPIRunnerBase): @@ -51,8 +52,9 @@ def get_model(self, main_prog, startup_program, rank, indata=None): in_feat).astype("float32") local_input_buf = paddle.to_tensor(local_input_buf) local_input_buf.stop_gradient = False - output = paddle.distributed.utils.global_gather( - local_input_buf, local_expert_count, global_expert_count) + output = moe_utils.global_gather(local_input_buf, + local_expert_count, + global_expert_count) output.stop_gradient = False c = output * output c.stop_gradient = False diff --git a/python/paddle/fluid/tests/unittests/collective/collective_global_scatter.py b/python/paddle/fluid/tests/unittests/collective/collective_global_scatter.py index c4950025877df1..dd6245df2aceab 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_global_scatter.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_global_scatter.py @@ -23,6 +23,7 @@ import paddle.fluid.layers as layers from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main import pickle +import paddle.distributed.utils.moe_utils as moe_utils paddle.enable_static() @@ -51,8 +52,9 @@ def get_model(self, main_prog, startup_program, rank, indata=None): paddle.split(local_expert_count, 2, axis=0), global_expert_count) global_expert_count = paddle.concat(global_expert_count, axis=0) - output = paddle.distributed.utils.global_scatter( - local_input_buf, local_expert_count, global_expert_count) + output = moe_utils.global_scatter(local_input_buf, + local_expert_count, + global_expert_count) return [output] def run_trainer(self, args): diff --git a/python/paddle/fluid/tests/unittests/collective/collective_global_scatter_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_global_scatter_dygraph.py index 82816c899e2cb4..e775bf50eb9e72 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_global_scatter_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_global_scatter_dygraph.py @@ -22,6 +22,7 @@ import unittest 
import paddle.fluid.layers as layers from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import paddle.distributed.utils.moe_utils as moe_utils class TestCollectiveGlobalScatterAPI(TestCollectiveAPIRunnerBase): @@ -50,8 +51,9 @@ def get_model(self, main_prog, startup_program, rank, indata=None): global_expert_count) global_expert_count = paddle.concat(global_expert_count, axis=0) local_input_buf.stop_gradient = False - output = paddle.distributed.utils.global_scatter( - local_input_buf, local_expert_count, global_expert_count) + output = moe_utils.global_scatter(local_input_buf, + local_expert_count, + global_expert_count) output.stop_gradient = False c = output * output c.backward() diff --git a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py index 70437216a8f856..da3d4c064300c7 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_isend_irecv_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,13 +28,23 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - task = paddle.distributed.isend(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.cast("float32").numpy()] else: - task = paddle.distributed.irecv(tindata, src=0) - task.wait() - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + task = dist.isend(tindata, dst=1) + else: + task = dist.irecv(tindata, src=0) + task.wait() + return [tindata.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py index 257fc27ceee9f2..6e2b1e86bcd14f 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,9 +28,15 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - paddle.distributed.reduce(tindata, dst=0) - return [tindata.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + dist.reduce(tindata, dst=0) + return [tindata.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + dist.reduce(tindata, dst=0) + return [tindata.numpy()] if __name__ == "__main__": diff --git 
a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py index 0e36296e4089cf..f10aff4752bd26 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter.py @@ -83,8 +83,9 @@ def test_collective_reduce_scatter_base(self): # [1, 2, 3, 4] # Rank-1 output = paddle.empty(shape=[2], dtype=input.dtype) - task = paddle.distributed.collective._reduce_scatter_base( - output, input, use_calc_stream=False) + task = paddle.distributed.collective._reduce_scatter_base(output, + input, + sync_op=False) task.wait() diff --git a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py index 1b0eb6aef9d47a..c5b5756ac74bac 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_reduce_scatter_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,10 +28,17 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - paddle.distributed.reduce_scatter(subdata1, [subdata1, subdata2]) - return [subdata1.numpy()] + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.cast("float32").numpy()] + else: + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + dist.reduce_scatter(subdata1, [subdata1, subdata2]) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py index f37f5653806ec8..255b4e7e0ac598 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_scatter_api_dygraph.py @@ -15,6 +15,7 @@ from __future__ import print_function import paddle +import paddle.distributed as dist import paddle.fluid as fluid import unittest import test_collective_api_base as test_base @@ -27,15 +28,27 @@ def __init__(self): def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - subdata1, subdata2 = paddle.split(tindata, 2, axis=0) - if rank == 0: - paddle.distributed.scatter(subdata1, src=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return 
[subdata1.cast("float32").numpy()] else: - paddle.distributed.scatter(subdata1, - tensor_list=[subdata1, subdata2], - src=1) - return [subdata1.numpy()] + tindata = paddle.to_tensor(indata) + subdata1, subdata2 = paddle.split(tindata, 2, axis=0) + if rank == 0: + dist.scatter(subdata1, src=1) + else: + dist.scatter(subdata1, + tensor_list=[subdata1, subdata2], + src=1) + return [subdata1.numpy()] if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py index 8508c3d043c93d..f4ae715a349fcc 100644 --- a/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py +++ b/python/paddle/fluid/tests/unittests/collective/collective_sendrecv_api_dygraph.py @@ -25,31 +25,34 @@ from six import string_types import math import paddle +import paddle.distributed as dist import paddle.fluid as fluid -import paddle.fluid.profiler as profiler -import paddle.fluid.unique_name as nameGen -from paddle.fluid import core -import unittest -from multiprocessing import Process -import paddle.fluid.layers as layers -from functools import reduce -from test_collective_api_base import TestCollectiveAPIRunnerBase, runtime_main +import test_collective_api_base as test_base -class TestCollectiveSendRecvAPI(TestCollectiveAPIRunnerBase): +class TestCollectiveSendRecvAPI(test_base.TestCollectiveAPIRunnerBase): def __init__(self): self.global_ring_id = 0 def get_model(self, main_prog, startup_program, rank, indata=None): with fluid.program_guard(main_prog, startup_program): - tindata = paddle.to_tensor(indata) - if rank == 0: - paddle.distributed.send(tindata, dst=1) + # NOTE: this is a hack relying on an undocumented behavior that `to_tensor` uses uint16 to replace bfloat16 + if indata.dtype == "bfloat16": + tindata = paddle.to_tensor(indata, "float32").cast("uint16") + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.cast("float32").numpy()] else: - paddle.distributed.recv(tindata, src=0) - return [tindata.numpy()] + tindata = paddle.to_tensor(indata) + if rank == 0: + dist.send(tindata, dst=1) + else: + dist.recv(tindata, src=0) + return [tindata.numpy()] if __name__ == "__main__": - runtime_main(TestCollectiveSendRecvAPI, "sendrecv") + test_base.runtime_main(TestCollectiveSendRecvAPI, "sendrecv") diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_allgather_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_allgather_api_dygraph.py new file mode 100644 index 00000000000000..d0e46600b8b50f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_allgather_api_dygraph.py @@ -0,0 +1,91 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.distributed as dist +import test_communication_api_base as test_base +import test_collective_api_base as test_collective_base + + +class StreamAllgatherTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + + # case 1: pass an empty tensor list + empty_tensor_list = [] + task = dist.stream.all_gather(empty_tensor_list, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + assert np.allclose(empty_tensor_list, + test_data_list, + rtol=1e-05, + atol=1e-05) + + # case 2: pass a pre-sized tensor list + full_tensor_list = [paddle.empty_like(tensor) for _ in test_data_list] + task = dist.stream.all_gather(full_tensor_list, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + assert np.allclose(full_tensor_list, + test_data_list, + rtol=1e-05, + atol=1e-05) + + # case 3: pass a pre-sized tensor + result_tensor = paddle.concat( + [paddle.to_tensor(data) for data in test_data_list]) + out_tensor = paddle.empty_like(result_tensor) + task = dist.stream.all_gather(out_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + assert np.allclose(out_tensor, result_tensor, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamAllgatherTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_api_dygraph.py new file mode 100644 index 00000000000000..8e65ea8d8aee5e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_api_dygraph.py @@ -0,0 +1,113 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
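Each of these communication_stream_* cases reads its configuration from environment variables rather than command-line flags: sync_op, use_calc_stream, backend, shape, dtype and seeds are fetched with os.getenv() (and eval()'d where needed) in __init__, and the harness that spawns the ranks is assumed to export them beforehand. A hedged sketch of the values one such case expects, for a hypothetical two-rank NCCL run:

import os

# hypothetical settings mirroring what the launcher is assumed to export
os.environ["sync_op"] = "True"           # blocking collective call
os.environ["use_calc_stream"] = "False"  # run on the dedicated communication stream
os.environ["backend"] = "nccl"           # "nccl" or "gloo"
os.environ["shape"] = "(10, 20)"         # eval()'d into a tuple
os.environ["dtype"] = "float32"
os.environ["seeds"] = "(2022, 2023)"     # one seed per rank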
+ +import os +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.distributed as dist +import test_communication_api_base as test_base +import test_collective_api_base as test_collective_base + + +class StreamAllToAllTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + nranks = len(test_data_list) + data1 = test_data_list[0] + data2 = test_data_list[1] + result1 = np.vstack( + [data1[0:data1.shape[0] // 2, :], data2[0:data2.shape[0] // 2, :]]) + result2 = np.vstack( + [data1[data1.shape[0] // 2:, :], data2[data2.shape[0] // 2:, :]]) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + t1, t2 = paddle.split(tensor, nranks, axis=0) + + # case 1: pass an empty tensor list + empty_tensor_list = [] + task = dist.stream.alltoall(empty_tensor_list, [t1, t2], + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + result_tensor_list = np.vstack(empty_tensor_list) + if rank == 0: + assert np.allclose(result_tensor_list, + result1, + rtol=1e-05, + atol=1e-05) + else: + assert np.allclose(result_tensor_list, + result2, + rtol=1e-05, + atol=1e-05) + + # case 2: pass a pre-sized tensor list + full_tensor_list = [paddle.empty_like(t1) for _ in test_data_list] + task = dist.stream.alltoall(full_tensor_list, [t1, t2], + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + result_tensor_list = np.vstack(full_tensor_list) + if rank == 0: + assert np.allclose(result_tensor_list, + result1, + rtol=1e-05, + atol=1e-05) + else: + assert np.allclose(result_tensor_list, + result2, + rtol=1e-05, + atol=1e-05) + + # case 3: pass a pre-sized tensor + out_tensor = paddle.empty_like(tensor) + task = dist.stream.alltoall(out_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(out_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(out_tensor, result2, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamAllToAllTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_single_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_single_api_dygraph.py new file mode 100644 index 00000000000000..9bdfe124b0b492 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_alltoall_single_api_dygraph.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import paddle +import paddle.fluid as fluid +import paddle.distributed as dist +import test_communication_api_base as test_base +import test_collective_api_base as test_collective_base + + +class StreamAllToAllSingleTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + nranks = len(test_data_list) + data1 = paddle.to_tensor(test_data_list[0]) + data2 = paddle.to_tensor(test_data_list[1]) + result1 = np.vstack( + (data1[0:data1.shape[0] // 2, :], data2[0:data2.shape[0] // 2, :])) + result2 = np.vstack( + (data1[data1.shape[0] // 2:, :], data2[data2.shape[0] // 2:, :])) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + + out_tensor = paddle.empty_like(tensor) + task = dist.stream.alltoall_single( + out_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(out_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(out_tensor, result2, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamAllToAllSingleTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_broadcast_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_broadcast_api_dygraph.py new file mode 100644 index 00000000000000..487dfd6ae68942 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_broadcast_api_dygraph.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import numpy as np +import paddle +import paddle.distributed as dist +import test_collective_api_base as test_collective_base + + +class StreamBroadcastTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + src_rank = 1 + result = test_collective_base.create_test_data( + shape=self._shape, dtype=self._dtype, seed=self._seeds[src_rank]) + tensor = paddle.to_tensor(result) + task = dist.stream.broadcast(tensor, + src=src_rank, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + + assert np.allclose(tensor, result, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamBroadcastTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_api_dygraph.py new file mode 100644 index 00000000000000..a487eac566ab5e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_api_dygraph.py @@ -0,0 +1,66 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import numpy as np +import paddle +import paddle.distributed as dist +import test_collective_api_base as test_collective_base + + +class StreamReduceTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + task = dist.stream.reduce(tensor, + dst=1, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + + result = sum(test_data_list) + if rank == 1: + assert np.allclose(tensor, result, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(tensor, + test_data_list[rank], + rtol=1e-05, + atol=1e-05) + + +if __name__ == "__main__": + StreamReduceTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py new file mode 100644 index 00000000000000..effaf1cb6c99a4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_reduce_scatter_api_dygraph.py @@ -0,0 +1,94 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import numpy as np +import paddle +import paddle.distributed as dist +import test_collective_api_base as test_collective_base +from paddle.distributed.communication.stream.reduce_scatter import _reduce_scatter_base + + +class StreamReduceScatterTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + reduce_result = sum(test_data_list) + result1 = reduce_result[0:reduce_result.shape[0] // 2] + result2 = reduce_result[reduce_result.shape[0] // 2:] + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + + # case 1: pass a pre-sized tensor list + t1, t2 = paddle.split(tensor, 2, axis=0) + result_tensor = paddle.empty_like(t1) + task = dist.stream.reduce_scatter(result_tensor, [t1, t2], + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(result_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(result_tensor, result2, rtol=1e-05, atol=1e-05) + + # case 2: pass a pre-sized tensor + result_tensor = paddle.empty_like(t1) + task = dist.stream.reduce_scatter(result_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(result_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(result_tensor, result2, rtol=1e-05, atol=1e-05) + + # case 3: test the legacy API + result_tensor = paddle.empty_like(t1) + task = _reduce_scatter_base(result_tensor, + tensor, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == 0: + assert np.allclose(result_tensor, result1, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(result_tensor, result2, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamReduceScatterTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_scatter_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_scatter_api_dygraph.py new file mode 100644 index 00000000000000..6060e5050ca09b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_scatter_api_dygraph.py @@ -0,0 +1,84 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import numpy as np +import paddle +import paddle.distributed as dist +import test_collective_api_base as test_collective_base + + +class StreamScatterTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + src_rank = 1 + src_data = test_data_list[src_rank] + result1 = src_data[0:src_data.shape[0] // 2] + result2 = src_data[src_data.shape[0] // 2:] + + rank = dist.get_rank() + + # case 1: pass a pre-sized tensor list + tensor = paddle.to_tensor(test_data_list[rank]) + t1, t2 = paddle.split(tensor, 2, axis=0) + task = dist.stream.scatter(t1, [t1, t2], + src=src_rank, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == src_rank: + assert np.allclose(t1, result2, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(t1, result1, rtol=1e-05, atol=1e-05) + + # case 2: pass a pre-sized tensor + tensor = paddle.to_tensor(src_data) + t1 = paddle.empty_like(t1) + task = dist.stream.scatter(t1, + tensor, + src=src_rank, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + if rank == src_rank: + assert np.allclose(t1, result2, rtol=1e-05, atol=1e-05) + else: + assert np.allclose(t1, result1, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamScatterTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py b/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py new file mode 100644 index 00000000000000..c22e734adf2fbd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/communication_stream_sendrecv_api_dygraph.py @@ -0,0 +1,71 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import numpy as np +import paddle +import paddle.distributed as dist +import paddle.fluid as fluid +import test_collective_api_base as test_collective_base +import test_communication_api_base as test_base + + +class StreamSendRecvTestCase(): + + def __init__(self): + self._sync_op = eval(os.getenv("sync_op")) + self._use_calc_stream = eval(os.getenv("use_calc_stream")) + self._backend = os.getenv("backend") + self._shape = eval(os.getenv("shape")) + self._dtype = os.getenv("dtype") + self._seeds = eval(os.getenv("seeds")) + if self._backend not in ["nccl", "gloo"]: + raise NotImplementedError( + "Only support nccl and gloo as the backend for now.") + os.environ["PADDLE_DISTRI_BACKEND"] = self._backend + + def run_test_case(self): + dist.init_parallel_env() + + test_data_list = [] + for seed in self._seeds: + test_data_list.append( + test_collective_base.create_test_data(shape=self._shape, + dtype=self._dtype, + seed=seed)) + + src_rank = 0 + dst_rank = 1 + + rank = dist.get_rank() + tensor = paddle.to_tensor(test_data_list[rank]) + if rank == 0: + task = dist.stream.send(tensor, + dst=dst_rank, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + else: + task = dist.stream.recv(tensor, + src=src_rank, + sync_op=self._sync_op, + use_calc_stream=self._use_calc_stream) + if not self._sync_op: + task.wait() + + result = test_data_list[src_rank] + assert np.allclose(tensor, result, rtol=1e-05, atol=1e-05) + + +if __name__ == "__main__": + StreamSendRecvTestCase().run_test_case() diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt b/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt index d2cc96fd3e1774..b47e4b5b530f97 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/collective/fleet/CMakeLists.txt @@ -938,3 +938,8 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX)) test_fleet_checkpoint PROPERTIES TIMEOUT "200" LABELS "RUN_TYPE=EXCLUSIVE:NIGHTLY") endif() +if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT) + py_test_modules( + test_fleet_log MODULES test_fleet_log ENVS + "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python") +endif() diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/auto_parallel_parallelizer.py b/python/paddle/fluid/tests/unittests/collective/fleet/auto_parallel_parallelizer.py index 688a31b78de002..b570e866bf661b 100755 --- a/python/paddle/fluid/tests/unittests/collective/fleet/auto_parallel_parallelizer.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/auto_parallel_parallelizer.py @@ -23,7 +23,7 @@ import paddle.utils as utils from paddle.fluid import layers from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr import paddle.fluid.core as core @@ -82,11 +82,7 @@ def mlp_pretrain_forward(train_program, start_program): shape=[batch_size, sequence_len, 1], dtype='float32') - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mappig": [-1, -1, -1] - }) + auto.shard_tensor(input, _global_process_mesh, [None, None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -106,7 +102,7 @@ class TestMLPAutoParallelizer(unittest.TestCase): def test_mlp_serial(self): global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 
1], dim_names=["x"]) dist_strategy = fleet.DistributedStrategy() dist_strategy.amp = False diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage2_comm_overlap.py b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage2_comm_overlap.py new file mode 100644 index 00000000000000..81ab434e5e88df --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/fleet/dygraph_group_sharded_stage2_comm_overlap.py @@ -0,0 +1,245 @@ +# -*- coding: UTF-8 -*- + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import shutil +import numpy as np +import argparse +import tempfile +import ast +import time +import paddle +import paddle.fluid as fluid +from paddle.fluid.dygraph.nn import Linear +from paddle.distributed import fleet +from paddle.fluid.dygraph import nn +from paddle.fluid.framework import _test_eager_guard + +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_optimizer_stage2 import GroupShardedOptimizerStage2 +from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_stage2 import GroupShardedStage2 + +seed = 2022 +epoch = 2 +linear_size = 1000 + +np.random.seed(seed) +paddle.seed(seed) + + +class MLP(fluid.Layer): + + def __init__(self, linear_size=1000, param_attr=None, bias_attr=None): + super(MLP, self).__init__() + + self._linear1 = Linear(linear_size, linear_size) + self._linear2 = Linear(linear_size, linear_size) + self._linear3 = Linear(linear_size, 10) + + def forward(self, inputs): + y = self._linear1(inputs) + y = self._linear2(y) + y = self._linear3(y) + return y + + +def reader_decorator(linear_size=1000): + + def __reader__(): + for _ in range(100): + img = np.random.rand(linear_size).astype('float32') + label = np.ones(1).astype('int64') + yield img, label + + return __reader__ + + +def optimizer_setting(model, use_pure_fp16, opt_group=False): + clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) + optimizer = paddle.optimizer.AdamW(parameters=[{ + "params": model.parameters(), + }] if opt_group else model.parameters(), + learning_rate=0.001, + weight_decay=0.00001, + grad_clip=clip, + multi_precision=use_pure_fp16) + + return optimizer + + +def train_mlp(model, + sharding_stage, + batch_size=100, + use_pure_fp16=False, + accumulate_grad=False, + opt_group=False, + save_model=False, + test_minimize=False): + if sharding_stage != "dp": + group = paddle.distributed.new_group([0, 1], backend="nccl") + if opt_group: + optimizer = optimizer_setting(model=model, + use_pure_fp16=use_pure_fp16, + opt_group=opt_group) + else: + optimizer = optimizer_setting(model=model, use_pure_fp16=use_pure_fp16) + + if sharding_stage == 2: + origin_model = model + optimizer = GroupShardedOptimizerStage2( + params=optimizer._parameter_list, optim=optimizer, group=group) + model = GroupShardedStage2(model, + optimizer, + group=group, + buffer_max_size=2**21) + model._set_reduce_overlap(True) + optimizer._set_broadcast_overlap(True, 
model) + else: + model = paddle.DataParallel(model) + + # check optimizer.minimize() error + if test_minimize: + try: + optimizer.minimize() + except: + print( + "====== Find sharding_stage2_optimizer.minimize() error ======") + return + + train_reader = paddle.batch(reader_decorator(), + batch_size=batch_size, + drop_last=True) + + train_loader = paddle.io.DataLoader.from_generator(capacity=32, + use_double_buffer=True, + iterable=True, + return_list=True, + use_multiprocess=True) + train_loader.set_sample_list_generator(train_reader) + + if sharding_stage == 2: + model.to(device="gpu") + + for eop in range(epoch): + model.train() + + for batch_id, data in enumerate(train_loader()): + img, label = data + label.stop_gradient = True + img.stop_gradient = True + + out = model(img) + loss = paddle.nn.functional.cross_entropy(input=out, label=label) + + avg_loss = paddle.mean(x=loss.cast(dtype=paddle.float32)) + if batch_size == 20: + avg_loss = avg_loss / 5 + avg_loss.backward() + + if not accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + if accumulate_grad: + optimizer.step() + optimizer.clear_grad() + + paddle.device.cuda.synchronize() + + if save_model: + return model, optimizer + return model.parameters() + + +def test_dp_stage2(): + paddle.distributed.init_parallel_env() + mlp = MLP() + state_dict = mlp.state_dict() + mlp1 = MLP() + mlp2 = MLP() + mlp3 = MLP() + mlp4 = MLP() + mlp5 = MLP() + mlp6 = MLP() + mlp7 = MLP() + mlp1.set_state_dict(state_dict) + mlp2.set_state_dict(state_dict) + mlp3.set_state_dict(state_dict) + mlp4.set_state_dict(state_dict) + mlp5.set_state_dict(state_dict) + mlp6.set_state_dict(state_dict) + mlp7.set_state_dict(state_dict) + + # DP VS stage2 + dp_params = train_mlp(mlp1, + sharding_stage="dp", + use_pure_fp16=False, + opt_group=False) + stage2_params = train_mlp(mlp2, + sharding_stage=2, + use_pure_fp16=False, + opt_group=False) + for i in range(len(dp_params)): + np.testing.assert_allclose(dp_params[i].numpy(), + stage2_params[i].numpy(), + rtol=1e-6) + + # stage2 accumulate grad + stage2_params = train_mlp(mlp3, sharding_stage=2, accumulate_grad=True) + stage2_accumulate_grad = train_mlp(mlp4, + sharding_stage=2, + batch_size=20, + accumulate_grad=True) + for i in range(len(stage2_params)): + np.testing.assert_allclose(stage2_params[i].numpy(), + stage2_accumulate_grad[i].numpy(), + rtol=1e-5, + atol=1e-5) + + # stage2 param list VS param group + stage2_params = train_mlp(mlp5, + sharding_stage=2, + use_pure_fp16=False, + opt_group=True) + for i in range(len(dp_params)): + np.testing.assert_allclose(dp_params[i].numpy(), + stage2_params[i].numpy(), + rtol=1e-6) + + # save/load model + output_dir = tempfile.mkdtemp() + model_file = os.path.join(output_dir, "model.pdmodel") + optimizer_file = os.path.join(output_dir, "model.pdopt") + model_stage2, optimizer_stage2 = train_mlp(mlp6, + sharding_stage=2, + use_pure_fp16=False, + opt_group=False, + save_model=True) + paddle.save(model_stage2.state_dict(), model_file) + paddle.save(optimizer_stage2.state_dict(), optimizer_file) + m_state_dict = paddle.load(model_file) + opt_state_dict = paddle.load(optimizer_file) + model_stage2.set_state_dict(m_state_dict) + optimizer_stage2.set_state_dict(opt_state_dict) + shutil.rmtree(output_dir) + + # check optimizer.minimize() error + train_mlp(mlp7, sharding_stage=2, test_minimize=True) + return + + +if __name__ == '__main__': + with _test_eager_guard(): + test_dp_stage2() diff --git 
a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_communicate_group.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_communicate_group.py index f290705c312e09..d84ad27e2636d2 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_communicate_group.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_communicate_group.py @@ -53,24 +53,21 @@ def test_all(self): paddle.distributed.scatter(result, [self.tensor2, self.tensor1], src=dp_src_rank, group=dp_gp, - use_calc_stream=True) + sync_op=True) if dp_rank == 0: assert np.array_equal(result, self.tensor2) elif dp_rank == 1: assert np.array_equal(result, self.tensor1) print("test scatter api ok") - paddle.distributed.broadcast(result, - src=1, - group=dp_gp, - use_calc_stream=True) + paddle.distributed.broadcast(result, src=1, group=dp_gp, sync_op=True) assert np.array_equal(result, self.tensor1) print("test broadcast api ok") paddle.distributed.reduce(result, dst=dp_src_rank, group=dp_gp, - use_calc_stream=True) + sync_op=True) if dp_rank == 0: assert np.array_equal(result, paddle.add(self.tensor1, self.tensor1)) @@ -78,7 +75,7 @@ def test_all(self): assert np.array_equal(result, self.tensor1) print("test reduce api ok") - paddle.distributed.all_reduce(result, use_calc_stream=True) + paddle.distributed.all_reduce(result, sync_op=True) assert np.array_equal( result, paddle.add(paddle.add(self.tensor1, self.tensor1), self.tensor1)) @@ -92,7 +89,7 @@ def test_all(self): paddle.distributed.all_gather(result, self.tensor1, group=dp_gp, - use_calc_stream=True) + sync_op=True) assert np.array_equal(result[0], self.tensor1) assert np.array_equal(result[1], self.tensor1) print("test all_gather api ok") diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py new file mode 100644 index 00000000000000..1db15407a5fad4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_pp_transformer_unbalanced_data.py @@ -0,0 +1,67 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import numpy as np +import paddle.distributed as dist +import paddle.distributed.fleet as fleet +from hybrid_parallel_pp_transformer import ( + TestDistPPTraning, + set_random_seed, + ModelPipe, + batch_size, + length, + micro_batch_size, + vocab_size, +) + + +class TestDistPPTraningUnbalancedData(TestDistPPTraning): + def test_pp_model(self): + hcg = fleet.get_hybrid_communicate_group() + word_size = hcg.get_model_parallel_world_size() + dp_id = hcg.get_data_parallel_rank() + pp_id = hcg.get_stage_id() + rank_id = dist.get_rank() + topology = hcg.topology() + set_random_seed(1024, dp_id, rank_id) + + model = ModelPipe(topology) + scheduler = paddle.optimizer.lr.PiecewiseDecay( + boundaries=[2], values=[0.001, 0.002], verbose=True + ) + optimizer = paddle.optimizer.SGD( + learning_rate=scheduler, parameters=model.parameters() + ) + + model = fleet.distributed_model(model) + optimizer = fleet.distributed_optimizer(optimizer) + + for step_id in range(5): + x = [] + for _ in range(batch_size // micro_batch_size): + size = micro_batch_size + x_data = np.random.randint(0, vocab_size, size=[size, length]) + x.append(paddle.to_tensor(x_data)) + e_loss = model.eval_batch([x, x], True) + loss = model.train_batch([x, x], optimizer, scheduler) + + # TODO(shenliang03) add utest for loss + if pp_id != 0: + np.testing.assert_allclose(loss.numpy(), e_loss.numpy()) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_qat.py b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_qat.py index aefe03b2610805..69bcdf56f6ddb3 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_qat.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/hybrid_parallel_qat.py @@ -26,7 +26,7 @@ import unittest import paddle.nn as nn from paddle.fluid.contrib.slim.quantization import ImperativeQuantAware -from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc +from paddle.distributed.utils.launch_utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc def set_random_seed(seed, dp_id, rank_id): diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/new_group.py b/python/paddle/fluid/tests/unittests/collective/fleet/new_group.py index 56ef510c3047fe..28a2568f8e594c 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/new_group.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/new_group.py @@ -36,21 +36,18 @@ def test_all(self): paddle.distributed.scatter(result, [self.tensor2, self.tensor1], src=0, group=gp, - use_calc_stream=True) + sync_op=True) if gp.rank == 0: assert np.array_equal(result, self.tensor2) elif gp.rank == 1: assert np.array_equal(result, self.tensor1) print("test scatter api ok") - paddle.distributed.broadcast(result, - src=1, - group=gp, - use_calc_stream=True) + paddle.distributed.broadcast(result, src=1, group=gp, sync_op=True) assert np.array_equal(result, self.tensor1) print("test broadcast api ok") - paddle.distributed.reduce(result, dst=0, group=gp, use_calc_stream=True) + paddle.distributed.reduce(result, dst=0, group=gp, sync_op=True) if gp.rank == 0: assert np.array_equal(result, paddle.add(self.tensor1, self.tensor1)) @@ -58,7 +55,7 @@ def test_all(self): assert np.array_equal(result, self.tensor1) print("test reduce api ok") - paddle.distributed.all_reduce(result, use_calc_stream=True) + paddle.distributed.all_reduce(result, sync_op=True) 
assert np.array_equal( result, paddle.add(paddle.add(self.tensor1, self.tensor1), self.tensor1)) @@ -72,7 +69,7 @@ def test_all(self): paddle.distributed.all_gather(result, self.tensor1, group=gp, - use_calc_stream=True) + sync_op=True) assert np.array_equal(result[0], self.tensor1) assert np.array_equal(result[1], self.tensor1) print("test all_gather api ok") diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute.py index 11ca15fd33104b..f5f59cf10279ae 100755 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute.py @@ -21,6 +21,7 @@ from paddle.autograd import PyLayer from paddle.distributed.fleet.utils import recompute import random +from paddle.incubate.distributed.fleet import recompute_sequential import paddle.fluid.layers as layers @@ -53,48 +54,66 @@ class Naive_fc_net(paddle.nn.Layer): def __init__(self, input_size=10, recompute_blocks=[1, 3], + use_fleet_sq=False, + segments=1, + use_raw_recompute=False, recompute_kwargs={}): super(Naive_fc_net, self).__init__() self.recompute_blocks = recompute_blocks self.recompute_kwargs = recompute_kwargs + self.use_fleet_sq = use_fleet_sq + self.use_raw_recompute = use_raw_recompute + self.segments = segments + self.runfunc0 = get_fc_block(0, input_size, is_last=False) self.runfunc1 = get_fc_block(1, input_size, is_last=False) self.runfunc2 = get_fc_block(2, input_size, is_last=False) self.runfunc3 = get_fc_block(3, input_size, is_last=False) self.runfunc4 = get_fc_block(4, input_size, is_last=True) - def forward(self, inputs): + if self.use_fleet_sq and not use_raw_recompute: + self.runfuncs = paddle.nn.Sequential(self.runfunc0, self.runfunc1, + self.runfunc2, self.runfunc3, + self.runfunc4) - if 0 in self.recompute_blocks: - inputs = recompute(self.runfunc0, inputs) - else: - inputs = self.runfunc0(inputs) + self.layers = [ + self.runfunc0, self.runfunc1, self.runfunc2, self.runfunc3, + self.runfunc4 + ] - if 1 in self.recompute_blocks: - inputs = recompute(self.runfunc1, inputs) - else: - inputs = self.runfunc1(inputs) + # default segments = 2 + if use_raw_recompute: + self.layers = [ + paddle.nn.Sequential(self.runfunc0, self.runfunc1), + paddle.nn.Sequential(self.runfunc2, self.runfunc3, + self.runfunc4) + ] - if 2 in self.recompute_blocks: - inputs = recompute(self.runfunc2, inputs, **self.recompute_kwargs) - else: - inputs = self.runfunc2(inputs) + def forward(self, inputs): - if 3 in self.recompute_blocks: - inputs = recompute(self.runfunc3, inputs) - else: - inputs = self.runfunc3(inputs) + if self.use_fleet_sq and not self.use_raw_recompute: + return recompute_sequential({"segments": self.segments}, + self.runfuncs, inputs) - if 4 in self.recompute_blocks: - inputs = recompute(self.runfunc4, inputs) - else: - inputs = self.runfunc4(inputs) + if self.use_raw_recompute: + inputs = recompute(self.layers[0], inputs) + return self.layers[1](inputs) + + for i in range(len(self.layers)): + if i in self.recompute_blocks: + inputs = recompute(self.layers[i], inputs, + **self.recompute_kwargs) + else: + inputs = self.layers[i](inputs) return inputs def run_model(recompute_block=[], recompute_kwargs={}, + use_fleet_sq=False, + use_raw_recompute=False, + segments=1, enable_autocast=False, pure_fp16=False): gen = paddle.seed(10) @@ -105,6 +124,9 @@ def run_model(recompute_block=[], batch_size, input_size = 1, 10 model = 
Naive_fc_net(input_size, recompute_blocks=recompute_block, + use_fleet_sq=use_fleet_sq, + use_raw_recompute=use_raw_recompute, + segments=segments, recompute_kwargs=recompute_kwargs) loss_fn = paddle.nn.MSELoss(reduction='mean') optimizer = paddle.optimizer.SGD(learning_rate=0.01, @@ -179,6 +201,34 @@ def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad): pure_fp16=pure_fp16) check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + # recompute second & fourth block using fleet + loss, param, grad = run_model(recompute_block=[1, 3], + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # recompute using recompute_sequential, segments=1 + loss, param, grad = run_model(recompute_block=[], + use_fleet_sq=True, + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # with base recompute, and segments=2 + loss_ref, param_ref, grad_ref = run_model( + recompute_block=[], + enable_autocast=enable_autocast, + use_raw_recompute=True, + pure_fp16=pure_fp16) + + # recompute using recompute_sequential, segments=2 + loss, param, grad = run_model(recompute_block=[], + use_fleet_sq=True, + segments=2, + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + def test_fc_net_with_dropout(self): self.test_base_case() @@ -191,7 +241,7 @@ def test_fc_net_with_fp16(self): def test_recompute_kwargs(self): paddle.set_device("gpu") kwargs = {"is_test": False} - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): loss_ref, param_ref, grad_ref = run_model(recompute_block=[2], recompute_kwargs=kwargs) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute_for_eager.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute_for_eager.py index bc97d53485be99..4b0c73370d3612 100755 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute_for_eager.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_recompute_for_eager.py @@ -25,6 +25,7 @@ from paddle.autograd import PyLayer from paddle.distributed.fleet.utils import recompute import random +from paddle.incubate.distributed.fleet import recompute_sequential import paddle.fluid.layers as layers @@ -57,48 +58,66 @@ class Naive_fc_net(paddle.nn.Layer): def __init__(self, input_size=10, recompute_blocks=[1, 3], + use_fleet_sq=False, + segments=1, + use_raw_recompute=False, recompute_kwargs={}): super(Naive_fc_net, self).__init__() self.recompute_blocks = recompute_blocks self.recompute_kwargs = recompute_kwargs + self.use_fleet_sq = use_fleet_sq + self.use_raw_recompute = use_raw_recompute + self.segments = segments + self.runfunc0 = get_fc_block(0, input_size, is_last=False) self.runfunc1 = get_fc_block(1, input_size, is_last=False) self.runfunc2 = get_fc_block(2, input_size, is_last=False) self.runfunc3 = get_fc_block(3, input_size, is_last=False) self.runfunc4 = get_fc_block(4, input_size, is_last=True) - def forward(self, inputs): + if self.use_fleet_sq and not use_raw_recompute: + self.runfuncs = paddle.nn.Sequential(self.runfunc0, self.runfunc1, + self.runfunc2, self.runfunc3, + self.runfunc4) - if 0 in self.recompute_blocks: - inputs = recompute(self.runfunc0, inputs) - else: - inputs = self.runfunc0(inputs) + self.layers = [ + self.runfunc0, self.runfunc1, self.runfunc2, self.runfunc3, + self.runfunc4 
+ ] - if 1 in self.recompute_blocks: - inputs = recompute(self.runfunc1, inputs) - else: - inputs = self.runfunc1(inputs) + # default segments = 2 + if use_raw_recompute: + self.layers = [ + paddle.nn.Sequential(self.runfunc0, self.runfunc1), + paddle.nn.Sequential(self.runfunc2, self.runfunc3, + self.runfunc4) + ] - if 2 in self.recompute_blocks: - inputs = recompute(self.runfunc2, inputs, **self.recompute_kwargs) - else: - inputs = self.runfunc2(inputs) + def forward(self, inputs): - if 3 in self.recompute_blocks: - inputs = recompute(self.runfunc3, inputs) - else: - inputs = self.runfunc3(inputs) + if self.use_fleet_sq and not self.use_raw_recompute: + return paddle.incubate.distributed.fleet.recompute_sequential( + {"segments": self.segments}, self.runfuncs, inputs) - if 4 in self.recompute_blocks: - inputs = recompute(self.runfunc4, inputs) - else: - inputs = self.runfunc4(inputs) + if self.use_raw_recompute: + inputs = recompute(self.layers[0], inputs) + return self.layers[1](inputs) + + for i in range(len(self.layers)): + if i in self.recompute_blocks: + inputs = recompute(self.layers[i], inputs, + **self.recompute_kwargs) + else: + inputs = self.layers[i](inputs) return inputs def run_model(recompute_block=[], recompute_kwargs={}, + use_fleet_sq=False, + use_raw_recompute=False, + segments=1, enable_autocast=False, pure_fp16=False): gen = paddle.seed(10) @@ -109,6 +128,9 @@ def run_model(recompute_block=[], batch_size, input_size = 1, 10 model = Naive_fc_net(input_size, recompute_blocks=recompute_block, + use_fleet_sq=use_fleet_sq, + use_raw_recompute=use_raw_recompute, + segments=segments, recompute_kwargs=recompute_kwargs) loss_fn = paddle.nn.MSELoss(reduction='mean') optimizer = paddle.optimizer.SGD(learning_rate=0.01, @@ -183,6 +205,28 @@ def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad): pure_fp16=pure_fp16) check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + # recompute_sequential with segments=1 using fleet + loss, param, grad = run_model(recompute_block=[], + use_fleet_sq=True, + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # with base recompute, and segments=2 + loss_ref, param_ref, grad_ref = run_model( + recompute_block=[], + enable_autocast=enable_autocast, + use_raw_recompute=True, + pure_fp16=pure_fp16) + + # recompute using paddle.incubate.distributed.fleet.recompute_sequential, segments=2 + loss, param, grad = run_model(recompute_block=[], + use_fleet_sq=True, + segments=2, + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + def test_fc_net_with_dropout(self): self.test_base_case() @@ -201,7 +245,7 @@ def test_fc_net_with_fp16(self): def test_recompute_kwargs(self): paddle.set_device("gpu") kwargs = {"is_test": False} - with self.assertRaises(ValueError): + with self.assertRaises(TypeError): loss_ref, param_ref, grad_ref = run_model(recompute_block=[2], recompute_kwargs=kwargs) diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_sharding_stage2.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_sharding_stage2.py index 9d842d8719fe3a..aeeae15fe06534 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_sharding_stage2.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_dygraph_sharding_stage2.py @@ -33,6 +33,9 @@ def test_dygraph_sharding_stage2_offload(self): 
self.run_mnist_2gpu('dygraph_sharding_stage2_offload.py', eager_mode=False) + def test_dygraph_sharding_stage2_with_comm_overlap(self): + self.run_mnist_2gpu('dygraph_group_sharded_stage2_comm_overlap.py') + if __name__ == "__main__": os.environ["FLAGS_enable_eager_mode"] = "1" diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_log.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_log.py new file mode 100644 index 00000000000000..03cb281cf37c93 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_fleet_log.py @@ -0,0 +1,43 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +from paddle.distributed import fleet +from paddle.distributed.fleet.utils.log_util import logger +import logging +import unittest + + +class TestFleetLog(unittest.TestCase): + + def setUp(self): + fleet.init(log_level="DEBUG") + + def test_log_level(self): + + # check correctly initialized + assert fleet.get_log_level_code() == logging._nameToLevel["DEBUG"] + assert logger.getEffectiveLevel() == logging._nameToLevel["DEBUG"] + + # test set name + fleet.set_log_level("WARNING") + debug1 = fleet.get_log_level_code() + debug2 = logging._nameToLevel["WARNING"] + assert debug1 == debug2 + + # test set int + fleet.set_log_level(debug2) + + # check the logger is changed + assert logger.getEffectiveLevel() == logging._nameToLevel["WARNING"] diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pipeline_parallel.py index 9bbae0928a7793..78bc37dd206e5c 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_pipeline_parallel.py @@ -22,13 +22,14 @@ class TestHybridPipeParallel(TestMultipleGpus): - def test_hybrid_parallel_pp_layer(self): self.run_mnist_2gpu( - os.path.abspath('../../hybrid_parallel_pp_layer.py')) + os.path.abspath('../../hybrid_parallel_pp_layer.py') + ) self.run_mnist_2gpu( os.path.abspath('../../hybrid_parallel_pp_layer.py'), - eager_mode=False) + eager_mode=False, + ) def test_hybrid_parallel_pp_tuple_inputs(self): self.run_mnist_2gpu('hybrid_parallel_pp_embedding.py') @@ -36,8 +37,9 @@ def test_hybrid_parallel_pp_tuple_inputs(self): def test_hybrid_parallel_shared_weight(self): self.run_mnist_2gpu('hybrid_parallel_shared_weight.py') - self.run_mnist_2gpu('hybrid_parallel_shared_weight.py', - eager_mode=False) + self.run_mnist_2gpu( + 'hybrid_parallel_shared_weight.py', eager_mode=False + ) def test_pipeline_parallel_amp(self): self.run_mnist_2gpu('hybrid_parallel_pp_amp.py') @@ -49,8 +51,9 @@ def test_pipeline_parallel_fp16(self): def test_hybrid_parallel_transformer(self): self.run_mnist_2gpu('hybrid_parallel_pp_transformer.py') - self.run_mnist_2gpu('hybrid_parallel_pp_transformer.py', - 
eager_mode=False) + self.run_mnist_2gpu( + 'hybrid_parallel_pp_transformer.py', eager_mode=False + ) def test_hybrid_parallel_save_load(self): self.run_mnist_2gpu('hybrid_parallel_pp_save_load.py') @@ -64,6 +67,13 @@ def test_hybrid_parallel_pp_clip_grad(self): self.run_mnist_2gpu('hybrid_parallel_pp_clip_grad.py') self.run_mnist_2gpu('hybrid_parallel_pp_clip_grad.py', eager_mode=False) + def test_hybrid_parallel_transformer_unbalanced_data(self): + self.run_mnist_2gpu('hybrid_parallel_pp_transformer_unbalanced_data.py') + self.run_mnist_2gpu( + 'hybrid_parallel_pp_transformer_unbalanced_data.py', + eager_mode=False, + ) + if __name__ == "__main__": os.environ["FLAGS_enable_eager_mode"] = "1" diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_qat.py b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_qat.py index a5b2da46740ddf..b0e981babb6652 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_qat.py +++ b/python/paddle/fluid/tests/unittests/collective/fleet/test_parallel_dygraph_qat.py @@ -22,7 +22,7 @@ import os import subprocess -from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc +from paddle.distributed.utils.launch_utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc def get_cluster_from_args(selected_gpus): diff --git a/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv b/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv index cdc856a9adaf43..c7fa5463225738 100644 --- a/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/fleet/testslist.csv @@ -82,3 +82,4 @@ test_hdfs1,LINUX,,200,EXCLUSIVE:NIGHTLY,../../dist_test.sh,2,,http_proxy=;https_ test_hdfs2,LINUX,,200,EXCLUSIVE:NIGHTLY,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_hdfs3,LINUX,,200,EXCLUSIVE:NIGHTLY,../../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=../.., test_fleet_checkpoint,LINUX,GPU;ROCM,200,EXCLUSIVE:NIGHTLY,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., +test_fleet_log,,,,DIST,test_runner.py,,,http_proxy=;https_proxy=;PYTHONPATH=../.., diff --git a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py index 1635eb6c951bd7..ac6bbcd3ce82a3 100644 --- a/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py +++ b/python/paddle/fluid/tests/unittests/collective/process_group_nccl.py @@ -90,13 +90,13 @@ def test_create_process_group_nccl(self): if pg.rank() == 0: task = dist.all_reduce(tensor_x, dist.ReduceOp.MAX, - use_calc_stream=False) + sync_op=False) task.wait() assert np.array_equal(tensor_x, max_result) else: task = dist.all_reduce(tensor_y, dist.ReduceOp.MAX, - use_calc_stream=False) + sync_op=False) task.wait() assert np.array_equal(tensor_y, max_result) @@ -115,13 +115,13 @@ def test_create_process_group_nccl(self): if pg.rank() == 0: task = dist.all_reduce(tensor_x, dist.ReduceOp.MIN, - use_calc_stream=False) + sync_op=False) task.wait() assert np.array_equal(tensor_x, min_result) else: task = dist.all_reduce(tensor_y, dist.ReduceOp.MIN, - use_calc_stream=False) + sync_op=False) task.wait() assert np.array_equal(tensor_y, min_result) @@ -140,13 +140,13 @@ def test_create_process_group_nccl(self): if pg.rank() == 0: task = dist.all_reduce(tensor_x, dist.ReduceOp.PROD, - use_calc_stream=False) + 
sync_op=False) task.wait() assert np.array_equal(tensor_x, prod_result) else: task = dist.all_reduce(tensor_y, dist.ReduceOp.PROD, - use_calc_stream=False) + sync_op=False) task.wait() assert np.array_equal(tensor_y, prod_result) @@ -162,7 +162,7 @@ def test_create_process_group_nccl(self): broadcast_result = paddle.assign(tensor_x) if pg.rank() == 0: - task = dist.broadcast(tensor_x, 0, use_calc_stream=False) + task = dist.broadcast(tensor_x, 0, sync_op=False) task.synchronize() paddle.device.cuda.synchronize() assert task.is_completed() @@ -205,9 +205,7 @@ def test_create_process_group_nccl(self): paddle.empty_like(tensor_x), paddle.empty_like(tensor_x) ] - task = dist.all_gather(tensor_out_list, - tensor_y, - use_calc_stream=False) + task = dist.all_gather(tensor_out_list, tensor_y, sync_op=False) paddle.device.cuda.synchronize() tensor_out = paddle.concat(tensor_out_list) out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) @@ -224,9 +222,7 @@ def test_create_process_group_nccl(self): # rank 1 else: tensor_out_list = [] - task = dist.all_gather(tensor_out_list, - tensor_y, - use_calc_stream=False) + task = dist.all_gather(tensor_out_list, tensor_y, sync_op=False) paddle.device.cuda.synchronize() tensor_out = paddle.concat(tensor_out_list) out_1 = paddle.slice(tensor_out, [0], [0], [out_shape[0] // 2]) @@ -310,11 +306,11 @@ def test_create_process_group_nccl(self): tensor_y = paddle.to_tensor(y) sum_result = tensor_x + tensor_y if pg.rank() == 0: - task = dist.reduce(tensor_x, 0, use_calc_stream=True) + task = dist.reduce(tensor_x, 0, sync_op=True) paddle.device.cuda.synchronize() # rank 1 else: - task = dist.reduce(tensor_y, 0, use_calc_stream=False) + task = dist.reduce(tensor_y, 0, sync_op=False) task.wait() paddle.device.cuda.synchronize() if pg.rank() == 0: @@ -335,14 +331,14 @@ def test_create_process_group_nccl(self): task = dist.reduce(tensor_x, 0, dist.ReduceOp.MAX, - use_calc_stream=False) + sync_op=False) task.wait() assert np.array_equal(tensor_x, max_result) else: task = dist.reduce(tensor_y, 0, dist.ReduceOp.MAX, - use_calc_stream=False) + sync_op=False) task.wait() print("test reduce max api ok") @@ -361,14 +357,14 @@ def test_create_process_group_nccl(self): task = dist.reduce(tensor_x, 0, dist.ReduceOp.MIN, - use_calc_stream=False) + sync_op=False) task.wait() assert np.array_equal(tensor_x, min_result) else: task = dist.reduce(tensor_y, 0, dist.ReduceOp.MIN, - use_calc_stream=False) + sync_op=False) task.wait() print("test reduce min api ok") @@ -387,14 +383,14 @@ def test_create_process_group_nccl(self): task = dist.reduce(tensor_x, 0, dist.ReduceOp.PROD, - use_calc_stream=False) + sync_op=False) task.wait() assert np.array_equal(tensor_x, prod_result) else: task = dist.reduce(tensor_y, 0, dist.ReduceOp.PROD, - use_calc_stream=False) + sync_op=False) task.wait() print("test reduce prod api ok") @@ -408,14 +404,12 @@ def test_create_process_group_nccl(self): tensor_y = paddle.to_tensor(y) if pg.rank() == 0: in_1, in_2 = paddle.split(tensor_x, 2) - task = dist.scatter(tensor_y, [in_1, in_2], - 0, - use_calc_stream=True) + task = dist.scatter(tensor_y, [in_1, in_2], 0, sync_op=True) #task.wait() paddle.device.cuda.synchronize() # rank 1 else: - task = dist.scatter(tensor_y, [], 0, use_calc_stream=False) + task = dist.scatter(tensor_y, [], 0, sync_op=False) task.wait() paddle.device.cuda.synchronize() out1 = paddle.slice(tensor_x, [0], [0], [self.shape[0]]) @@ -436,10 +430,10 @@ def test_create_process_group_nccl(self): tensor_y = paddle.to_tensor(y) if 
pg.rank() == 0: - task = dist.send(tensor_x, 1, use_calc_stream=False) + task = dist.send(tensor_x, 1, sync_op=False) task.wait() else: - task = dist.recv(tensor_y, 0, use_calc_stream=False) + task = dist.recv(tensor_y, 0, sync_op=False) task.wait() assert np.array_equal(tensor_y, tensor_x) @@ -454,9 +448,9 @@ def test_create_process_group_nccl(self): tensor_y = paddle.to_tensor(y) if pg.rank() == 0: - task = dist.send(tensor_x, 1, use_calc_stream=True) + task = dist.send(tensor_x, 1, sync_op=True) else: - task = dist.recv(tensor_y, 0, use_calc_stream=True) + task = dist.recv(tensor_y, 0, sync_op=True) assert np.array_equal(tensor_y, tensor_x) print("test send api ok") diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py index a01a96a0d6b29a..78ecf0816b67f0 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allgather_api.py @@ -28,213 +28,55 @@ def _setup_config(self): pass def test_allgather_nccl(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "nccl", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "nccl", + dtype=dtype) def test_allgather_gloo(self): - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float16") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="float64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="bool") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="uint8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int8") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int32") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="int64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex64") - self.check_with_place("collective_allgather_api.py", - "allgather", - "gloo", - "3", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", 
"int8", "uint8", + "bool", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api.py", + "allgather", + "gloo", + "3", + dtype=dtype) def test_allgatther_nccl_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "nccl", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "complex64", "complex128" + ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "nccl", + static_mode="0", + dtype=dtype) def test_allgather_gloo_dygraph(self): - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float16") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="float64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="bool") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="uint8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int8") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int32") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="int64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex64") - self.check_with_place("collective_allgather_api_dygraph.py", - "allgather", - "gloo", - "3", - static_mode="0", - dtype="complex128") + dtypes_to_test = [ + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16", "complex64", "complex128" + ] + for dtype in dtypes_to_test: + self.check_with_place("collective_allgather_api_dygraph.py", + "allgather", + "gloo", + "3", + static_mode="0", + dtype=dtype) -if __name__ == '__main__': +if __name__ == 
"__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py index 2598606fc9cc25..65754989d3f7e5 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_allreduce_api.py @@ -43,9 +43,11 @@ def test_allreduce_gloo(self): def test_allreduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", "allreduce", @@ -55,8 +57,8 @@ def test_allreduce_nccl_dygraph(self): def test_allreduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_allreduce_api_dygraph.py", @@ -67,5 +69,5 @@ def test_allreduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py index e079e99efebf57..35e3bf323964d2 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_api.py @@ -32,9 +32,11 @@ def test_alltoall_nccl(self): def test_alltoall_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_api_dygraph.py", "alltoall", @@ -43,5 +45,5 @@ def test_alltoall_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py index fb1e5e9da22ef1..23d2a998d8e320 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_alltoall_single_api.py @@ -24,9 +24,11 @@ def _setup_config(self): def test_alltooall_single_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_alltoall_single_api_dygraph.py", "alltoall", @@ -35,5 +37,5 @@ def test_alltooall_single_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py index 
2d21be144a68b6..e08930cefe9ca2 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_broadcast_api.py @@ -37,9 +37,11 @@ def test_broadcast_gloo(self): def test_broadcast_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", "broadcast", @@ -49,8 +51,8 @@ def test_broadcast_nccl_dygraph(self): def test_broadcast_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_broadcast_api_dygraph.py", @@ -61,5 +63,5 @@ def test_broadcast_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py index f9613abc240636..28e502821aa52c 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_isend_irecv_api.py @@ -24,9 +24,11 @@ def _setup_config(self): def test_isend_irecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_isend_irecv_api_dygraph.py", "sendrecv", @@ -35,5 +37,5 @@ def test_isend_irecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py index 2fa84ea2ed7f18..cc6093a3f431ca 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_api.py @@ -40,9 +40,11 @@ def test_reduce_gloo(self): def test_reduce_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", "reduce", @@ -52,8 +54,8 @@ def test_reduce_nccl_dygraph(self): def test_reduce_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_reduce_api_dygraph.py", @@ -64,5 +66,5 @@ def test_reduce_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py index 1d25527407f453..283f73020f7491 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_reduce_scatter_api.py @@ -24,9 +24,11 @@ def _setup_config(self): def test_reduce_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_reduce_scatter_api_dygraph.py", "reduce_scatter", @@ -35,5 +37,5 @@ def test_reduce_scatter_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py index 4093b8ed69093e..82ef4bd80e2d8d 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_scatter_api.py @@ -36,9 +36,11 @@ def test_scatter_nccl(self): def test_scatter_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", "scatter", @@ -48,8 +50,8 @@ def test_scatter_nccl_dygraph(self): def test_scatter_gloo_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool", "bfloat16" ] for dtype in dtypes_to_test: self.check_with_place("collective_scatter_api_dygraph.py", @@ -60,5 +62,5 @@ def test_scatter_gloo_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py index 940d6ec709bf1f..c2fc98ed18e38e 100644 --- a/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py +++ b/python/paddle/fluid/tests/unittests/collective/test_collective_sendrecv_api.py @@ -34,9 +34,11 @@ def _setup_config(self): def test_sendrecv_nccl_dygraph(self): dtypes_to_test = [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'int8', 'uint8', - 'bool' + "float16", "float32", "float64", "int32", "int64", "int8", "uint8", + "bool" ] + if self._nccl_version >= 2100: + dtypes_to_test.append("bfloat16") for dtype in dtypes_to_test: self.check_with_place("collective_sendrecv_api_dygraph.py", "sendrecv", @@ -45,5 +47,5 @@ def test_sendrecv_nccl_dygraph(self): dtype=dtype) -if __name__ == '__main__': +if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_allgather_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_allgather_api.py new file mode 100644 index 00000000000000..254b64907ce07f --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_allgather_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamAllgatherAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamAllgatherAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_allgather_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_allgather_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamAllgatherAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_api.py new file mode 100644 index 00000000000000..4fa55d86840bc8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamAllToAllAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamAllToAllAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_alltoall_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_alltoall_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamAllToAllAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_single_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_single_api.py new file mode 100644 index 00000000000000..f1f099b9571f8f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_alltoall_single_api.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamAllToAllSingleAPI( + test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamAllToAllSingleAPI, + self).setUp(num_of_devices=2, timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_alltoall_single_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case( + "communication_stream_alltoall_single_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamAllToAllSingleAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_broadcast_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_broadcast_api.py new file mode 100644 index 00000000000000..07537a480e851a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_broadcast_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamBroadcastAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamBroadcastAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_broadcast_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_broadcast_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamBroadcastAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_api.py new file mode 100644 index 00000000000000..c8a04c8d893e16 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamReduceAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamReduceAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_reduce_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_reduce_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamReduceAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_scatter_api.py new file mode 100644 index 00000000000000..a90e634860d95b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_reduce_scatter_api.py @@ -0,0 +1,53 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamReduceScatterAPI( + test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamReduceScatterAPI, + self).setUp(num_of_devices=2, timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_reduce_scatter_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case( + "communication_stream_reduce_scatter_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamReduceScatterAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_scatter_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_scatter_api.py new file mode 100644 index 00000000000000..d96d931f43fbf4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_scatter_api.py @@ -0,0 +1,51 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import itertools +import test_communication_api_base as test_base + + +class TestCommunicationStreamScatterAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamScatterAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_reduce_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_scatter_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamScatterAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py new file mode 100644 index 00000000000000..9590519bc2e13d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_communication_stream_sendrecv_api.py @@ -0,0 +1,50 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import paddle +import test_communication_api_base as test_base + + +class TestCommunicationStreamSendRecvAPI(test_base.CommunicationTestDistBase): + + def setUp(self): + super(TestCommunicationStreamSendRecvAPI, self).setUp(num_of_devices=2, + timeout=120) + self._default_envs = { + "backend": "nccl", + "shape": "(100, 200)", + "dtype": "float32", + "seeds": str(self._seeds) + } + self._changeable_envs = { + "sync_op": ["True", "False"], + "use_calc_stream": ["True", "False"] + } + + def test_sendrecv_stream(self): + envs_list = test_base.gen_product_envs_list(self._default_envs, + self._changeable_envs) + for envs in envs_list: + if eval(envs["use_calc_stream"]) and not eval(envs["sync_op"]): + continue + self.run_test_case("communication_stream_sendrecv_api_dygraph.py", + user_defined_envs=envs) + + def tearDown(self): + super(TestCommunicationStreamSendRecvAPI, self).tearDown() + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/collective/test_world_size_and_rank.sh b/python/paddle/fluid/tests/unittests/collective/test_world_size_and_rank.sh new file mode 100644 index 00000000000000..c559c4bd26cffc --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/test_world_size_and_rank.sh @@ -0,0 +1,17 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +set -e + +CUDA_VISIBLE_DEVICES=0,1 python -m paddle.distributed.launch --gpus=0,1 world_size_and_rank.py diff --git a/python/paddle/fluid/tests/unittests/collective/testslist.csv b/python/paddle/fluid/tests/unittests/collective/testslist.csv index fc08f861e90774..2cf632a29d0ac1 100644 --- a/python/paddle/fluid/tests/unittests/collective/testslist.csv +++ b/python/paddle/fluid/tests/unittests/collective/testslist.csv @@ -7,32 +7,41 @@ test_c_split,linux,gpu;rocm,120,DIST,test_runner.py,2,,PYTHONPATH=..;http_proxy= test_collective_split_embedding,linux,rocm;gpu,300,DIST,../dist_test.sh,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_collective_allgather_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_allgather_object_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_allreduce_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_alltoall_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_allreduce_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_alltoall_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_alltoall_single,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_alltoall_single_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_alltoall_single_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_barrier_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_batch_isend_irecv,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_broadcast_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_broadcast_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_cpu_barrier_with_gloo,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_global_gather,linux,gpu;rocm,200,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_global_scatter,linux,gpu;rocm,200,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_isend_irecv_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_isend_irecv_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_optimizer,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_process_group,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_reduce,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_reduce_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_reduce_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_reduce_scatter,linux,gpu;rocm,350,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_reduce_scatter_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., 
+test_collective_reduce_scatter_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_scatter,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_scatter_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_scatter_api,linux,gpu;rocm,180,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_sendrecv,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_collective_sendrecv_api,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_collective_sendrecv_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_col_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_embedding_none_divisible,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_split_row_linear,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_collective_wait,linux,gpu;rocm,300,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_communication_stream_allgather_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_alltoall_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_alltoall_single_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_broadcast_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_reduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_reduce_scatter_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_scatter_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_communication_stream_sendrecv_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, test_eager_dist_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., test_gen_nccl_id_op,,gpu;rocm;ASCEND;ASCEND_CL,,DIST,../dist_test.sh,2,,http_proxy=;https_proxy=;PYTHONPATH=.., -test_communication_stream_allreduce_api,linux,gpu;rocm,120,DIST,,2,,PYTHONPATH=..;http_proxy=;https_proxy=, +test_new_group_api,linux,gpu;rocm,120,DIST,test_runner.py,2,,http_proxy=;https_proxy=;PYTHONPATH=.., +test_world_size_and_rank,linux,rocm;gpu,120,DIST,test_world_size_and_rank.sh,2,,http_proxy=;https_proxy=, diff --git a/python/paddle/fluid/tests/unittests/collective/world_size_and_rank.py b/python/paddle/fluid/tests/unittests/collective/world_size_and_rank.py new file mode 100644 index 00000000000000..69620238a94377 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/collective/world_size_and_rank.py @@ -0,0 +1,44 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import paddle.distributed as dist + + +class TestWorldSizeAndRankAPI(unittest.TestCase): + + def setUp(self): + self._num_of_ranks = 2 + self._subgroup_ranks = [0, 1] + dist.init_parallel_env() + self._subgroup = dist.new_group(self._subgroup_ranks) + self._global_rank = dist.get_rank() + + def test_default_env_world_size(self): + self.assertEqual(dist.get_world_size(), self._num_of_ranks) + + def test_given_group_world_size(self): + world_size = 2 if self._global_rank in self._subgroup_ranks else -1 + self.assertEqual(dist.get_world_size(self._subgroup), world_size) + + def test_given_group_rank(self): + rank = self._subgroup_ranks.index( + self._global_rank + ) if self._global_rank in self._subgroup_ranks else -1 + self.assertEqual(dist.get_rank(self._subgroup), rank) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py b/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py index ec879e77611cd4..e741cab8d7a6e4 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/auto_parallel_pass_test_base.py @@ -25,7 +25,7 @@ from dist_pass_test_base import DistPassTestBase import paddle.distributed.fleet as fleet -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto sys.path.append("..") import auto_parallel_gpt_model as modeling @@ -86,7 +86,7 @@ def _run_gpu_main(self, model, apply_pass, dump_file, **kwargs): paddle.static.Program()): with paddle.static.scope_guard(scope): with paddle.fluid.unique_name.guard(): - main_prog, startup_prog, inputs, outputs, reader = self.get_model( + main_prog, startup_prog, inputs, outputs, data_loader = self.get_model( place, **kwargs) inputs = self._to_var_names(inputs) outputs = self._to_var_names(outputs) @@ -95,27 +95,57 @@ def _run_gpu_main(self, model, apply_pass, dump_file, **kwargs): exe = paddle.static.Executor(place) with paddle.static.scope_guard(scope): exe.run(startup_prog) - for batch_id, input_data in enumerate(reader()): - assert len(input_data) == len(inputs), "{} vs {}".format( - len(input_data), len(inputs)) - feed = dict(zip(inputs, input_data)) - fetch_values = exe.run(main_prog, feed=feed, fetch_list=outputs) - if paddle.distributed.get_rank() == 0: - output_dict = OrderedDict(zip(outputs, fetch_values)) - print('batch {}, outputs {}'.format(batch_id, output_dict)) - all_fetch_values.append(fetch_values) + data_loader.start() + batch_id = 0 + while True: + try: + fetch_values = exe.run(main_prog, fetch_list=outputs) + if paddle.distributed.get_rank() == 0: + output_dict = OrderedDict(zip(outputs, fetch_values)) + print('batch {}, outputs {}'.format( + batch_id, output_dict)) + all_fetch_values.append(fetch_values) + batch_id += 1 + except paddle.fluid.core.EOFException: + data_loader.reset() + break with open(dump_file, "wb") as f: pickle.dump(all_fetch_values, f) def get_gpt_model(self, strategy, place, batch_size, sequence_len, 
vocab_size, **kwargs): + + def gen_data(): + np.random.seed(2021) + for _ in range(10): + tokens = [] + position_ids = [] + attention_mask = [] + labels = [] + loss_mask = [] + for _ in range(batch_size): + tokens.append( + np.random.randint(vocab_size, + size=sequence_len).astype("int64")) + position_ids.append(np.arange(sequence_len).astype("int64")) + attention_mask.append( + [np.tril(np.ones(sequence_len)).astype("float32")]) + labels.append( + np.random.randint(vocab_size, + size=sequence_len).astype("int64")) + loss_mask.append(np.ones(sequence_len).astype("float32")) + + yield tokens, position_ids, attention_mask, labels, loss_mask + modeling.init_global() if strategy == "dp": modeling._global_parallel_strategy = "dp" - modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1], + dim_names=["x"]) elif strategy == "mp": modeling._global_parallel_strategy = "mp" - modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + modeling._global_process_mesh = auto.ProcessMesh(mesh=[0, 1], + dim_names=["x"]) else: raise ValueError("'get_gpt_model' only support dp and mp.") @@ -137,23 +167,17 @@ def get_gpt_model(self, strategy, place, batch_size, sequence_len, dtype='float32') data_holder = [tokens, position_ids, attention_mask, labels, loss_mask] + data_loader = paddle.fluid.io.DataLoader.from_generator( + feed_list=data_holder, capacity=70, iterable=False) + data_loader.set_batch_generator(gen_data, paddle.static.cuda_places()) + if modeling._global_parallel_strategy == "dp": - auto.shard_tensor(tokens, - dist_attr={ - "process_mesh": modeling._global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(tokens, modeling._global_process_mesh, + ["x", None]) elif modeling._global_parallel_strategy == "pp": - auto.shard_tensor(tokens, - dist_attr={ - "process_mesh": modeling.PP_MESH_LIST[0], - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(attention_mask, - dist_attr={ - "process_mesh": modeling.PP_MESH_LIST[0], - "dims_mapping": [-1, -1, -1, -1] - }) + auto.shard_tensor(tokens, modeling.PP_MESH_LIST[0], [None, None]) + auto.shard_tensor(attention_mask, modeling.PP_MESH_LIST[0], + [None, None, None, None]) gpt = GPTModel(vocab_size=1000, hidden_size=64, @@ -178,40 +202,21 @@ def get_gpt_model(self, strategy, place, batch_size, sequence_len, preds = model(tokens, position_ids, attention_mask) criterion = GPTPretrainingCriterion() loss = criterion(preds, labels, loss_mask) - clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) + clip = paddle.nn.ClipGradByNorm(clip_norm=1.0) if kwargs.get('optimizer', None) == "LarsMomentum": optimizer = paddle.fluid.optimizer.LarsMomentumOptimizer( learning_rate=0.001, momentum=0.9) else: - optimizer = paddle.fluid.optimizer.AdamOptimizer( - learning_rate=0.00001, - beta1=0.9, - beta2=0.999, - epsilon=1e-08, - grad_clip=clip) + optimizer = paddle.optimizer.Adam(learning_rate=0.00001, + beta1=0.9, + beta2=0.999, + epsilon=1e-08, + grad_clip=clip) optimizer = fleet.distributed_optimizer(optimizer) startup_program = paddle.static.default_startup_program() _, _, dist_startup_prog, dist_main_prog = optimizer.minimize( loss, startup_program) - def gen_data(): - np.random.seed(2021) - for _ in range(10): - tokens = [] - position_ids = [] - attention_mask = [] - labels = [] - loss_mask = [] - for _ in range(batch_size): - tokens.append( - np.random.randint(vocab_size, size=sequence_len)) - position_ids.append(np.arange(sequence_len)) - 
attention_mask.append([np.tril(np.ones(sequence_len))]) - labels.append( - np.random.randint(vocab_size, size=sequence_len)) - loss_mask.append(np.ones(sequence_len)) - - yield tokens, position_ids, attention_mask, labels, loss_mask - - return dist_main_prog, dist_startup_prog, data_holder, [loss], gen_data + return dist_main_prog, dist_startup_prog, data_holder, [loss + ], data_loader diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_data_parallel_optimization_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_data_parallel_optimization_pass.py index f8fe59f6979b12..1f94d7381450dd 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_data_parallel_optimization_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_data_parallel_optimization_pass.py @@ -20,7 +20,7 @@ import paddle import paddle.nn as nn import paddle.distributed.fleet as fleet -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context from paddle.distributed.passes import new_pass, PassManager, PassContext from auto_parallel_pass_test_base import AutoPallelPassTestBase diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_fp16_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_fp16_pass.py index 5ac78cc5fec4de..4c20153ccbfd99 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_fp16_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_fp16_pass.py @@ -20,10 +20,19 @@ import paddle import paddle.distributed.fleet as fleet from auto_parallel_pass_test_base import AutoPallelPassTestBase -from test_auto_parallel_amp_pass import TestAMPPass -class TestPF16Pass(TestAMPPass): +class TestPF16Pass(AutoPallelPassTestBase): + + def init(self): + if paddle.is_compiled_with_cuda(): + paddle.set_flags({'FLAGS_cudnn_deterministic': 1}) + self.rtol = 1e-5 + self.atol = 1e-8 + + paddle.seed(2021) + random.seed(2021) + np.random.seed(2021) def apply_passes(self): dist_strategy = fleet.DistributedStrategy() @@ -34,14 +43,30 @@ def apply_passes(self): 'layer_norm', 'gelu', ], - "custom_black_list": ['c_softmax_with_cross_entropy'], - "init_loss_scaling": 32768, - "use_dynamic_loss_scaling": True, - "use_pure_fp16": True + "custom_black_list": + ['c_softmax_with_cross_entropy', 'elementwise_div', 'reduce_sum'], + "init_loss_scaling": + 32768, + "use_dynamic_loss_scaling": + True, + "use_pure_fp16": + True, + "use_fp16_guard": + False } dist_strategy.semi_auto = True fleet.init(is_collective=True, strategy=dist_strategy) + def test_bs_8(self): + self.check_main(gpus=[0, 1], + batch_size=8, + sequence_len=512, + vocab_size=1000) + + def get_model(self, place, batch_size, sequence_len, vocab_size): + return self.get_gpt_model("mp", place, batch_size, sequence_len, + vocab_size) + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py index 50e18718201865..de7ed4efb7f895 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py +++ 
b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_gradient_merge_pass.py @@ -26,7 +26,7 @@ import paddle.static as static import paddle.nn.functional as F import paddle.distributed.fleet as fleet -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.fluid.initializer import NumpyArrayInitializer from auto_parallel_pass_test_base import AutoPallelPassTestBase @@ -97,11 +97,8 @@ def forward(self, input): def mlp_forward(input, label, hidden_size): - auto.shard_tensor(input, - dist_attr={ - "process_mesh": [0], - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, auto.ProcessMesh([0], dim_names=["x"]), + [None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, initializer_range=0.02) @@ -160,6 +157,12 @@ def test_result(self): def get_model(self, place, batch_size, hidden_size, max_step): + def gen_data(): + for i in range(max_step): + x_data = input_data[i * batch_size:(i + 1) * batch_size, :] + y_data = label_data[i * batch_size:(i + 1) * batch_size, :] + yield x_data, y_data + train_program = static.Program() startup_program = static.Program() with static.program_guard(train_program, startup_program), \ @@ -171,6 +174,12 @@ def get_model(self, place, batch_size, hidden_size, max_step): shape=[batch_size, 1], dtype='float32') input.stop_gradient = False + data_holder = [input, label] + data_loader = paddle.fluid.io.DataLoader.from_generator( + feed_list=data_holder, capacity=70, iterable=False) + data_loader.set_batch_generator(gen_data, + paddle.static.cuda_places()) + loss = mlp_forward(input, label, hidden_size) optimizer = paddle.fluid.optimizer.AdamOptimizer(learning_rate=0.01) @@ -181,13 +190,8 @@ def get_model(self, place, batch_size, hidden_size, max_step): input_data = np.random.random(size=(128, hidden_size)).astype('float32') label_data = np.random.random(size=(128, 1)).astype('float32') - def reader(): - for i in range(max_step): - x_data = input_data[i * batch_size:(i + 1) * batch_size, :] - y_data = label_data[i * batch_size:(i + 1) * batch_size, :] - yield x_data, y_data - - return dist_main_prog, dist_startup_prog, [input, label], [loss], reader + return dist_main_prog, dist_startup_prog, [input, + label], [loss], data_loader if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_recompute_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_recompute_pass.py index 7afa10d49dbf54..84084ed709c2de 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_recompute_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_recompute_pass.py @@ -20,7 +20,7 @@ import paddle import paddle.nn as nn import paddle.distributed.fleet as fleet -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.passes import new_pass, PassManager from auto_parallel_pass_test_base import AutoPallelPassTestBase diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py index 16d63b09643607..7bc4c811c316d5 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_auto_parallel_sharding_pass.py @@ -20,7 +20,7 @@ import paddle import paddle.nn 
as nn import paddle.distributed.fleet as fleet -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.passes import new_pass, PassManager from auto_parallel_pass_test_base import AutoPallelPassTestBase diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_adam_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_adam_pass.py index 85c3bf321a3b1b..c1cdd0b7336e30 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_adam_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_adam_pass.py @@ -21,6 +21,8 @@ import unittest from dist_pass_test_base import DistPassTestBase +paddle.enable_static() + class DemoNet(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py index a0090f6d8c310b..12aeae57b09aad 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_act_pass.py @@ -21,6 +21,8 @@ import unittest from dist_pass_test_base import DistPassTestBase +paddle.enable_static() + class BatchNormActNet(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_add_act_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_add_act_pass.py index eb9a901a40a9ff..72dfd60d4a0bd9 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_add_act_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_bn_add_act_pass.py @@ -21,6 +21,8 @@ import unittest from dist_pass_test_base import DistPassTestBase +paddle.enable_static() + class BatchNormAddActNet(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_momentum_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_momentum_pass.py index 11bd4f5d2b13a6..c725a9b91569c2 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_momentum_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_momentum_pass.py @@ -21,6 +21,8 @@ import unittest from dist_pass_test_base import DistPassTestBase +paddle.enable_static() + class DemoNet(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py index 0a7442a18d7765..61897b37ea7c58 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_relu_depthwise_conv_pass.py @@ -21,6 +21,8 @@ import unittest from dist_pass_test_base import DistPassTestBase +paddle.enable_static() + class ReluDepthwiseConvNet(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_sgd_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_sgd_pass.py index 3e96e9d3440e0e..ab752c60ed3429 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_sgd_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_fuse_sgd_pass.py @@ -21,6 +21,8 @@ import unittest from dist_pass_test_base import 
DistPassTestBase +paddle.enable_static() + class DemoNet(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_inplace_addto_pass.py b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_inplace_addto_pass.py index 32bb1ca83a9b36..0431c53c11c27c 100644 --- a/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_inplace_addto_pass.py +++ b/python/paddle/fluid/tests/unittests/distributed_passes/test_dist_inplace_addto_pass.py @@ -21,6 +21,8 @@ import unittest from dist_pass_test_base import DistPassTestBase +paddle.enable_static() + class DemoNet(nn.Layer): diff --git a/python/paddle/fluid/tests/unittests/dygraph_recompute_hybrid.py b/python/paddle/fluid/tests/unittests/dygraph_recompute_hybrid.py new file mode 100755 index 00000000000000..cc90f0433b9217 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_recompute_hybrid.py @@ -0,0 +1,221 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle +from paddle.autograd import PyLayer +from paddle.distributed.fleet.utils import recompute +from paddle.incubate.distributed.fleet import recompute_hybrid +import random +from paddle.distributed import fleet + +import paddle.fluid.layers as layers + + +def get_fc_block(block_idx, input_size, is_last=False): + block_name = "block_" + str(block_idx) + block = paddle.nn.Sequential( + (block_name + "_fc_0", + paddle.nn.Linear(input_size, input_size, bias_attr=False)), + (block_name + "_dropout", paddle.nn.Dropout(p=0.5)), + (block_name + "_relu_1", paddle.nn.ReLU()), + (block_name + "_fc_1", + paddle.nn.Linear(input_size, input_size, bias_attr=False)), + (block_name + "_relu_2", paddle.nn.ReLU()), + ) + if is_last: + block.add_sublayer(block_name + "_fc_2", + paddle.nn.Linear(input_size, 1, + bias_attr=False)) # add sublayer + else: + block.add_sublayer(block_name + "_fc_2", + paddle.nn.Linear(input_size, + input_size, + bias_attr=False)) # add sublayer + return block + + +class Naive_fc_net(paddle.nn.Layer): + + def __init__(self, + input_size=10, + recompute_blocks=[1, 3], + offload=False, + partition=False, + recompute_kwargs={}): + super(Naive_fc_net, self).__init__() + self.recompute_blocks = recompute_blocks + self.recompute_kwargs = recompute_kwargs + self.offload = offload + self.partition = partition + + self.runfunc0 = get_fc_block(0, input_size, is_last=False) + self.runfunc1 = get_fc_block(1, input_size, is_last=False) + self.runfunc2 = get_fc_block(2, input_size, is_last=False) + self.runfunc3 = get_fc_block(3, input_size, is_last=False) + self.runfunc4 = get_fc_block(4, input_size, is_last=True) + + self.layers = [ + self.runfunc0, self.runfunc1, self.runfunc2, self.runfunc3, + self.runfunc4 + ] + + def forward(self, inputs): + for i in range(len(self.layers)): + if i in self.recompute_blocks: + inputs = recompute_hybrid( + { + "mp_group": 
fleet.fleet._hcg.get_model_parallel_group(), + "offload": self.offload, + "partition": self.partition + }, self.layers[i], inputs, **self.recompute_kwargs) + else: + inputs = self.layers[i](inputs) + + return inputs + + +def run_model(recompute_block=[], + recompute_kwargs={}, + offload=False, + partition=False, + enable_autocast=False, + pure_fp16=False): + gen = paddle.seed(10) + gen.manual_seed(10) + np.random.seed(10) + random.seed(10) + + batch_size, input_size = 1, 10 + model = Naive_fc_net(input_size, + recompute_blocks=recompute_block, + offload=offload, + partition=partition, + recompute_kwargs=recompute_kwargs) + loss_fn = paddle.nn.MSELoss(reduction='mean') + optimizer = paddle.optimizer.SGD(learning_rate=0.01, + parameters=model.parameters()) + + model = fleet.distributed_model(model) + optimizer = fleet.distributed_optimizer(optimizer) + + if enable_autocast: + scaler = paddle.amp.GradScaler() + scaler = fleet.distributed_scaler(scaler) + + loss_ = [] + param_ = [] + grad_ = [] + for step in range(10): + + x_data = np.random.randn(batch_size, input_size).astype(np.float32) + x = paddle.to_tensor(x_data) + # x.stop_gradient = False + level = 'O2' if pure_fp16 else 'O1' + with paddle.amp.auto_cast(True, level=level): + y_pred = model(x) + loss = y_pred.mean() + if enable_autocast: + scaler.scale(loss).backward() + scaler.minimize(optimizer, loss) + else: + loss_.append(np.asarray(loss).tolist()) + loss.backward() + optimizer.step() + + param_.append(np.asarray(model.parameters()[9]).tolist()) + grad_.append(np.asarray(model.parameters()[3]._grad_ivar()).tolist()) + + optimizer.clear_grad() + return loss_, param_, grad_ + + +class TestPyLayer(unittest.TestCase): + + def setUp(self): + strategy = fleet.DistributedStrategy() + self.model_parallel_size = 2 + self.data_parallel_size = 1 + self.pipeline_parallel_size = 1 + strategy.hybrid_configs = { + "dp_degree": self.data_parallel_size, + "mp_degree": self.model_parallel_size, + "pp_degree": self.pipeline_parallel_size, + } + fleet.init(is_collective=True, strategy=strategy) + + def test_base_case(self, enable_autocast=False, pure_fp16=False): + + def check_identical(loss_ref, param_ref, grad_ref, loss, param, grad): + self.assertEqual(loss_ref, loss) + self.assertEqual(param_ref, param) + self.assertEqual(grad_ref, grad) + + # without recompute + loss_ref, param_ref, grad_ref = run_model( + recompute_block=[], + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + + # with recompute, offload=False, partition=False + loss, param, grad = run_model(recompute_block=[1, 3], + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # with recompute, offload=True, partition=False + loss, param, grad = run_model(recompute_block=[1, 2, 3], + offload=True, + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # with recompute, offload=False, partition=True + loss, param, grad = run_model(recompute_block=[1], + partition=True, + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + # with recompute, offload=True, partition=True + loss, param, grad = run_model(recompute_block=[1, 3, 4], + offload=True, + partition=True, + enable_autocast=enable_autocast, + pure_fp16=pure_fp16) + check_identical(loss_ref, param_ref, grad_ref, loss, param, grad) + + def test_fc_net_with_dropout(self): + self.test_base_case() + + def 
test_fc_net_with_amp(self): + self.test_base_case(enable_autocast=True) + + def test_fc_net_with_fp16(self): + self.test_base_case(enable_autocast=True, pure_fp16=True) + + def test_recompute_kwargs(self): + paddle.set_device("gpu") + kwargs = {"is_test": False} + with self.assertRaises(TypeError): + loss_ref, param_ref, grad_ref = run_model(recompute_block=[2], + recompute_kwargs=kwargs) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index e1611d524ab8ea..dba92870d3ab72 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -34,7 +34,7 @@ set(TEST_EAGER_OPS list(REMOVE_ITEM TEST_OPS test_lac) # NOTE(Aurelius84): In case of Windows CI, if open ON_INFER, RWLOCK of Scope # will be removed and will cause some random failed in multi-thread. -if(NOT ON_INFER) +if(WITH_PYTHON) py_test_modules(test_lac MODULES test_lac ENVS FLAGS_enable_eager_mode=1) set_tests_properties(test_lac PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/decos.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/decos.py new file mode 100644 index 00000000000000..6e3333c15a0ce1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/decos.py @@ -0,0 +1,46 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy +import paddle + +from functools import wraps + + +def deco1(fun): + + @wraps(fun) + def inner(*args, **kwargs): + print('in decos.deco1, added 1') + _t = paddle.to_tensor([1]) + _tt = fun(*args, **kwargs) + return paddle.add(_t, _tt) + + return inner + + +def deco2(x=0): + + def inner_deco(func): + + @wraps(func) + def inner(*args, **kwargs): + print('in decos.deco2, added {}'.format(x)) + _t = paddle.to_tensor(x) + _tt = func(*args, **kwargs) + return paddle.add(_t, _tt) + + return inner + + return inner_deco diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py index b37accce9d1b84..482206b906abd6 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/ifelse_simple_func.py @@ -164,7 +164,7 @@ def nested_if_else(x_v): if paddle.mean(y).numpy()[0] < batch_size: y = fluid.layers.abs(y) else: - tmp = fluid.layers.fill_constant([feat_size], + tmp = fluid.layers.fill_constant(y.shape, dtype='float32', value=-1) y = y - tmp @@ -273,7 +273,7 @@ def forward(self, input): [hidden_dim], dtype='float32', value=9) y = fluid.layers.abs(y) else: - tmp = fluid.layers.fill_constant([5], + tmp = fluid.layers.fill_constant(y.shape, dtype='float32', value=-1) y = y - tmp diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py index 52e67932326701..227191a68fe38e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_closure_analysis.py @@ -20,6 +20,9 @@ from paddle.fluid.dygraph.dygraph_to_static.utils import FunctionNameLivenessAnalysis from paddle.utils import gast import inspect +from numpy import append + +global_a = [] class JudgeVisitor(gast.NodeVisitor): @@ -257,5 +260,70 @@ def init_dygraph_func(self): }] +class TestPushPopTrans(unittest.TestCase): + + def test(self): + + def vlist_of_dict(x): + ma = {'a': []} + for i in range(3): + ma['a'].append(1) + return ma + + x = paddle.to_tensor([3]) + print(paddle.jit.to_static(vlist_of_dict).code) + print(paddle.jit.to_static(vlist_of_dict)(x)) + + def test2(self): + import numpy as np + + def vlist_of_dict(x): + a = np.array([1, 2, 3]) + for i in range(3): + np.append(a, 4) + return a + + x = paddle.to_tensor([3]) + print(paddle.jit.to_static(vlist_of_dict).code) + print(paddle.jit.to_static(vlist_of_dict)(x)) + + def test3(self): + import numpy as np + + def vlist_of_dict(x): + a = np.array([1, 2, 3]) + if True: + pass + return a + + x = paddle.to_tensor([3]) + print(paddle.jit.to_static(vlist_of_dict).code) + print(paddle.jit.to_static(vlist_of_dict)(x)) + + def test4(self): + + def vlist_of_dict(x): + a = np.array([1, 2, 3]) + for i in range(3): + append(a, 4) + return a + + x = paddle.to_tensor([3]) + print(paddle.jit.to_static(vlist_of_dict).code) + print(paddle.jit.to_static(vlist_of_dict)(x)) + + def test5(self): + + def vlist_of_dict(x): + a = np.array([1, 2, 3]) + for i in range(3): + global_a.append(4) + return a + + x = paddle.to_tensor([3]) + print(paddle.jit.to_static(vlist_of_dict).code) + print(paddle.jit.to_static(vlist_of_dict)(x)) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py index b3cdde6363924b..48d7d3eb20c458 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_convert_call.py @@ -65,6 +65,23 @@ def dyfunc_with_third_library_logging(x_v): return x_v +class A: + + @staticmethod + def add(a, b): + """ + dygraph mode, return a numpy object. + static mode, return a variable object. + """ + return paddle.to_tensor(a.numpy() + b.numpy()) + + +@paddle.jit.to_static +def dyfunc_with_staticmethod(x_v): + a = A() + return a.add(x_v, x_v) + + class TestRecursiveCall1(unittest.TestCase): def setUp(self): @@ -188,6 +205,12 @@ def set_func(self): self.dygraph_func = dyfunc_with_third_library_logging +class TestStaticMethod(TestRecursiveCall2): + + def set_func(self): + self.dygraph_func = dyfunc_with_staticmethod + + # Situation 2 : test not_to_static @@ -290,8 +313,10 @@ def set_answer_func(self): class StaticCode(): def func_convert_then_not_to_static(x): + __return_value_0 = None y = _jst.Call(func_not_to_static)(x) - return y + __return_value_0 = y + return __return_value_0 self.answer_func = StaticCode.func_convert_then_not_to_static diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_decorator_transform.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_decorator_transform.py new file mode 100644 index 00000000000000..4acc789a451bb0 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_decorator_transform.py @@ -0,0 +1,223 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import paddle +import unittest +import numpy as np +import decos +import warnings +from functools import wraps +from contextlib import contextmanager + + +def deco1(func): + + @wraps(func) + def inner(*args, **kwargs): + print('in deco1, added 1') + _x = 2 + if (_x < 1): + _x += 1 + else: + _x -= 1 + _t = paddle.to_tensor([1]) + _tt = func(*args, **kwargs) + return paddle.add(_t, _tt) + + return inner + + +def deco2(fun): + + @wraps(fun) + def inner(*args, **kwargs): + print('in deco2, added 2') + _t = paddle.to_tensor([2]) + _tt = fun(*args, **kwargs) + return paddle.add(_t, _tt) + + return inner + + +def deco3(x=3): + + def inner_deco(func): + + @wraps(func) + def inner(*args, **kwargs): + print('in deco3, added {}'.format(x)) + _t = paddle.to_tensor(x) + _tt = func(*args, **kwargs) + return paddle.add(_t, _tt) + + return inner + + return inner_deco + + +def deco4(func=None, x=0): + + def decorated(pyfunc): + + @wraps(pyfunc) + def inner_deco(*args, **kwargs): + print('in deco4, added {}'.format(x)) + _t = paddle.to_tensor(x) + _tt = pyfunc(*args, **kwargs) + return paddle.add(_t, _tt) + + return inner_deco + + if func == None: + return decorated + return decorated(func) + + +def deco5(): + return deco2 + + +def deco6(x=0): + return deco2 + + +@deco2 +def fun1(x, y=0): + a = paddle.to_tensor(y) + print('in fun1, x=%d' % (x)) + return a + + +@deco1 +@deco2 +def fun2(x, y=0): + a = paddle.to_tensor(y) + print('in fun2, x=%d' % (x)) + return a + + +@deco3(3) +def fun3(x, y=0): + a = paddle.to_tensor(y) + print('in fun3, x=%d' % (x)) + return a + + +@deco4(x=4) +def fun4(x, y=0): + a = paddle.to_tensor(y) + print('in fun4, x=%d' % (x)) + return a + + +@deco2 +@deco4() +def fun5(x, y=0): + a = paddle.to_tensor(y) + print('in fun5, x=%d' % (x)) + return a + + +@decos.deco1 +@decos.deco2(2) +def fun6(x, y=0): + a = paddle.to_tensor(y) + print('in fun6, x=%d' % (x)) + return a + + +@deco5() +def fun7(x, y=0): + a = paddle.to_tensor(y) + print('in fun7, x=%d' % (x)) + return a + + +@deco6(2) +def fun8(x, y=0): + a = paddle.to_tensor(y) + print('in fun8, x=%d' % (x)) + return a + + +@paddle.jit.to_static +def forward(): + funcs = [fun1, fun2, fun3, fun4, fun5, fun6, fun7, fun8] + out = [] + for idx, fun in enumerate(funcs): + out.append(fun(idx + 1, idx + 1)) + return out + + +@contextmanager +def contextmanager_warning(): + yield + + +@contextmanager_warning() +def fun9(): + print('in fun9 want contextmanager warning') + + +@paddle.jit.to_static +def warn1(): + fun9() + + +@paddle.no_grad() +def fun10(): + print('in fun10, paddle api decorated') + return True + + +@paddle.jit.to_static +def deco_with_paddle_api(): + return fun10() + + +class TestDecoratorTransform(unittest.TestCase): + + def test_deco_transform(self): + outs = forward() + np.testing.assert_allclose(outs[0], np.array(3), rtol=1e-05) + np.testing.assert_allclose(outs[1], np.array(5), rtol=1e-05) + np.testing.assert_allclose(outs[2], np.array(6), rtol=1e-05) + np.testing.assert_allclose(outs[3], np.array(8), rtol=1e-05) + np.testing.assert_allclose(outs[4], np.array(7), rtol=1e-05) + np.testing.assert_allclose(outs[5], np.array(9), rtol=1e-05) + np.testing.assert_allclose(outs[6], np.array(9), rtol=1e-05) + np.testing.assert_allclose(outs[7], np.array(10), rtol=1e-05) + + def test_contextmanager_warning(self): + paddle.disable_static() + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + warn1() + flag = False + for warn in w: + if 
(issubclass(warn.category, UserWarning) + ) and "A context manager decorator is used" in str( + warn.message): + flag = True + break + self.assertTrue(flag) + + def test_deco_with_paddle_api(self): + self.assertTrue(deco_with_paddle_api()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py index 27d7389b903cc4..97f0cf99b5f65d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py @@ -399,16 +399,6 @@ def test_error(self): # # Situation 4: NotImplementedError -class TestErrorInOther(unittest.TestCase): - def test(self): - paddle.disable_static() - prog_trans = paddle.jit.ProgramTranslator() - with self.assertRaises(NotImplementedError): - prog_trans.get_output(func_decorated_by_other_1) - - with self.assertRaises(NotImplementedError): - func_decorated_by_other_2() - class TestSuggestionErrorInRuntime(TestErrorBase): def set_func(self): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py index b422164cf38166..e7435922b1c609 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_origin_info.py @@ -65,7 +65,7 @@ def set_test_func(self): self.func = simple_func def set_static_lineno(self): - self.static_abs_lineno_list = [9, 10, 11] + self.static_abs_lineno_list = [9, 11, 12] def set_dygraph_info(self): self.line_num = 3 @@ -93,7 +93,6 @@ def _get_OriginInfo_map(self): self.static_func, _ = ast_to_func(transformed_ast, self.dygraph_func) info_map = create_and_update_origin_info_map(dygraph_ast, self.static_func) - return info_map def test_origin_info_map(self): @@ -149,7 +148,7 @@ def set_test_func(self): self.func = nested_func def set_static_lineno(self): - self.static_abs_lineno_list = [9, 11, 12, 13, 14] + self.static_abs_lineno_list = [9, 12, 14, 16, 17] def set_dygraph_info(self): self.line_num = 5 @@ -174,7 +173,7 @@ def set_test_func(self): self.func = decorated_func def set_static_lineno(self): - self.static_abs_lineno_list = [9, 10] + self.static_abs_lineno_list = [9, 11] def set_dygraph_info(self): self.line_num = 2 @@ -208,7 +207,7 @@ def set_test_func(self): self.func = decorated_func2 def set_static_lineno(self): - self.static_abs_lineno_list = [9, 10] + self.static_abs_lineno_list = [9, 11] def set_dygraph_info(self): self.line_num = 2 diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_place.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_place.py new file mode 100644 index 00000000000000..7e177ee0c0cc01 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_place.py @@ -0,0 +1,32 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import numpy as np +import paddle +import unittest + + +class TestPlace(unittest.TestCase): + + def test_place(self): + + paddle.enable_static() + x = paddle.to_tensor([1, 2, 3, 4]) + self.assertTrue(x.place() == None) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py index 2905bd07439cb0..748ff59534a3a4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py @@ -224,6 +224,49 @@ def test_diff_return(x): return y, z +@to_static +def test_return_if_else_2(x): + rr = 0 + if True: + rr = 1 + return 1 + else: + a = 0 + + +@to_static +def test_return_in_while_2(x): + while True: + a = 12 + return 12 + return 10 + + +@to_static +def test_return_in_for_2(x): + a = 12 + for i in range(10): + return 12 + return 10 + + +@to_static +def test_return_nested(x): + + def func(): + rr = 0 + if True: + rr = 1 + return 1 + rr = 2 + else: + a = 0 + return 4 + return 3 + + return func() + + class TestReturnBase(unittest.TestCase): def setUp(self): @@ -256,7 +299,6 @@ def _test_value_impl(self): np.testing.assert_allclose(dygraph_res[i], static_res[i], rtol=1e-05) - elif isinstance(dygraph_res, np.ndarray): np.testing.assert_allclose(dygraph_res, static_res, rtol=1e-05) else: @@ -282,6 +324,24 @@ def init_dygraph_func(self): self.dygraph_func = test_return_if +class TestReturnOnlyIf(TestReturnBase): + + def init_dygraph_func(self): + self.dygraph_func = test_return_if_else_2 + + +class TestReturnInFor(TestReturnBase): + + def init_dygraph_func(self): + self.dygraph_func = test_return_in_for + + +class TestReturnInWhile(TestReturnBase): + + def init_dygraph_func(self): + self.dygraph_func = test_return_in_while + + class TestReturnIfDiff(TestReturnBase): def init_dygraph_func(self): @@ -294,16 +354,18 @@ def init_dygraph_func(self): self.dygraph_func = test_return_if_else -class TestReturnInWhile(TestReturnBase): +class TestReturnInWhile2(TestReturnBase): def init_dygraph_func(self): - self.dygraph_func = test_return_in_while + self.dygraph_func = test_return_in_while_2 + self.error = "Found return statement in While or For body and loop" -class TestReturnInFor(TestReturnBase): +class TestReturnInFor2(TestReturnBase): def init_dygraph_func(self): - self.dygraph_func = test_return_in_for + self.dygraph_func = test_return_in_for_2 + self.error = "Found return statement in While or For body and loop" class TestRecursiveReturn(TestReturnBase): @@ -371,6 +433,12 @@ def init_dygraph_func(self): self.dygraph_func = test_return_tuple_many_values +class TestReturnNested(TestReturnBase): + + def init_dygraph_func(self): + self.dygraph_func = test_return_nested + + class TestReturnSpecial(TestReturnBase): def init_dygraph_func(self): diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py index 3a7a1dc1b0eaef..6d8f0b14407847 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_slice.py @@ -284,5 +284,23 @@ def test_compare_paddle_strided_slice_with_numpy(self): np.testing.assert_array_equal(sl.numpy(), array_slice) +def 
slice_zero_shape_tensor(x): + y = x[1:2] + return y + + +class TestSliceZeroShapeTensor(unittest.TestCase): + + def test_slice(self): + paddle.disable_static() + x = paddle.ones([0, 0, 0, 0]) + y = slice_zero_shape_tensor(x) + np.testing.assert_equal(y.shape, [0, 0, 0, 0]) + + static_func = paddle.jit.to_static(slice_zero_shape_tensor) + y = static_func(x) + np.testing.assert_equal(y.shape, [0, 0, 0, 0]) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_methods.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_methods.py index 2ad9153fbaaf21..a21a155d6004dc 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_methods.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tensor_methods.py @@ -62,6 +62,29 @@ def test_to_static_numpy_report_error(self): static_res = self._run(to_static=True) +@paddle.jit.to_static +def tensor_item(x): + x = paddle.to_tensor(x) + y = x.clone() + return y.item() + + +class TestTensorItem(unittest.TestCase): + + def _run(self, to_static): + prog_trans = paddle.jit.ProgramTranslator() + prog_trans.enable(to_static) + x = paddle.ones([1]) + if to_static: + return tensor_item(x).numpy() + return tensor_item(x) + + def test_tensor_clone(self): + dygraph_res = self._run(to_static=False) + static_res = self._run(to_static=True) + np.testing.assert_allclose(dygraph_res, static_res) + + @paddle.jit.to_static def tensor_size(x): x = paddle.to_tensor(x) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_typehint.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_typehint.py new file mode 100644 index 00000000000000..b8addd53d5b180 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_typehint.py @@ -0,0 +1,79 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import paddle.fluid as fluid +import unittest + +from paddle.fluid.dygraph.jit import declarative + +SEED = 2020 +np.random.seed(SEED) + + +class A: + pass + + +def function(x: A) -> A: + t: A = A() + return 2 * x + + +class TestTransformWhileLoop(unittest.TestCase): + + def setUp(self): + self.place = fluid.CUDAPlace( + 0) if fluid.is_compiled_with_cuda() else fluid.CPUPlace() + self.x = np.zeros(shape=(1), dtype=np.int32) + self._init_dyfunc() + + def _init_dyfunc(self): + self.dyfunc = function + + def _run_static(self): + return self._run(to_static=True) + + def _run_dygraph(self): + return self._run(to_static=False) + + def _run(self, to_static): + with fluid.dygraph.guard(self.place): + # Set the input of dyfunc to VarBase + tensor_x = fluid.dygraph.to_variable(self.x, zero_copy=False) + if to_static: + ret = declarative(self.dyfunc)(tensor_x) + else: + ret = self.dyfunc(tensor_x) + if hasattr(ret, "numpy"): + return ret.numpy() + else: + return ret + + def test_ast_to_func(self): + static_numpy = self._run_static() + dygraph_numpy = self._run_dygraph() + print(static_numpy, dygraph_numpy) + np.testing.assert_allclose(dygraph_numpy, static_numpy, rtol=1e-05) + + +class TestTypeHint(TestTransformWhileLoop): + + def _init_dyfunc(self): + self.dyfunc = function + + +if __name__ == '__main__': + with fluid.framework._test_eager_guard(): + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_warning.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_warning.py new file mode 100644 index 00000000000000..2e520f322a8b04 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_warning.py @@ -0,0 +1,79 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import paddle +import warnings +from paddle.fluid.dygraph.dygraph_to_static.program_translator import ( + convert_to_static, +) +from paddle.fluid.layers.control_flow import cond + + +@paddle.jit.to_static +def fun1(): + a = paddle.to_tensor(1) + b = paddle.to_tensor(2) + if a > b: + b = paddle.to_tensor(3) + else: + b = None + + +def true_fn(): + return [paddle.to_tensor(1), [paddle.to_tensor(2), paddle.to_tensor(3)]] + + +def false_fn(): + return [paddle.to_tensor(3), [None, paddle.to_tensor(4)]] + + +class TestReturnNoneInIfelse(unittest.TestCase): + def test_dy2static_warning(self): + paddle.disable_static() + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + fun1() + flag = False + for warn in w: + if ( + issubclass(warn.category, UserWarning) + ) and "Set var to 'None' in ifelse block might lead to error." 
in str( + warn.message + ): + flag = True + break + self.assertTrue(flag) + + def test_cond_warning(self): + paddle.enable_static() + with warnings.catch_warnings(record=True) as w: + warnings.simplefilter("always") + a = paddle.to_tensor(1) + b = paddle.to_tensor(2) + cond(a < b, true_fn, false_fn, return_names=['ret1', 'ret2']) + flag = False + for warn in w: + if ( + issubclass(warn.category, UserWarning) + ) and "Set var to 'None' in ifelse block might lead to error." in str( + warn.message + ): + flag = True + break + self.assertTrue(flag) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/gradient_checker.py b/python/paddle/fluid/tests/unittests/gradient_checker.py index be1fa92f0888e0..15320c395e2db9 100644 --- a/python/paddle/fluid/tests/unittests/gradient_checker.py +++ b/python/paddle/fluid/tests/unittests/gradient_checker.py @@ -268,6 +268,9 @@ def fail_test(msg): for v in x: v.stop_gradient = False v.persistable = True + for u in y: + u.stop_gradient = False + u.persistable = True if place is None: place = fluid.CPUPlace() if program is None: @@ -364,6 +367,9 @@ def double_grad_check(x, v.stop_gradient = False v.persistable = True y = _as_list(y) + for u in y: + u.stop_gradient = False + u.persistable = True if program is None: program = fluid.default_main_program() @@ -445,6 +451,9 @@ def triple_grad_check(x, v.stop_gradient = False v.persistable = True y = _as_list(y) + for u in y: + u.stop_gradient = False + u.persistable = True if program is None: program = fluid.default_main_program() @@ -578,6 +587,9 @@ def get_static_double_grad(x, for v in x: v.stop_gradient = False v.persistable = True + for u in y: + u.stop_gradient = False + u.persistable = True if place is None: place = fluid.CPUPlace() if program is None: @@ -692,7 +704,8 @@ def get_eager_double_grad(func, allow_unused=True) if return_mid_result: - return dd_inputs, inputs + ddys + return [dd_input for dd_input in dd_inputs + if dd_input is not None], inputs + ddys else: return [ dd_input.numpy() for dd_input in dd_inputs if dd_input is not None @@ -735,7 +748,9 @@ def fail_test(msg): v.stop_gradient = False v.persistable = True y = _as_list(y) - + for u in y: + u.stop_gradient = False + u.persistable = True y_grads_init = [] for yi in y: np_type = dtype_to_np_dtype(yi.dtype) @@ -857,8 +872,13 @@ def get_eager_triple_grad(func, dddy = paddle.ones(shape=dd_yi.shape, dtype=dd_yi.dtype) dddy.stop_gradient = False dddys.append(dddy) - ddd_inputs = paddle.grad(outputs=dd_y, inputs=dd_x, grad_outputs=dddys) - return [ddd_input.numpy() for ddd_input in ddd_inputs] + ddd_inputs = paddle.grad(outputs=dd_y, + inputs=dd_x, + grad_outputs=dddys, + allow_unused=True) + return [ + ddd_input.numpy() for ddd_input in ddd_inputs if ddd_input is not None + ] def triple_grad_check_for_dygraph(func, @@ -897,7 +917,9 @@ def fail_test(msg): v.stop_gradient = False v.persistable = True y = _as_list(y) - + for u in y: + u.stop_gradient = False + u.persistable = True y_grads_init = [] for yi in y: np_type = dtype_to_np_dtype(yi.dtype) diff --git a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt index 2d2f1548f076d9..9572d236fe329b 100644 --- a/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ipu/CMakeLists.txt @@ -11,10 +11,14 @@ if(WITH_IPU) endforeach() set_tests_properties(test_conv_op_ipu PROPERTIES TIMEOUT 300) - set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 
300) - set_tests_properties(test_reduce_x_op_ipu PROPERTIES TIMEOUT 600) + set_tests_properties(test_elemetwise_x_op_ipu PROPERTIES TIMEOUT 600) + set_tests_properties(test_reduce_x_op_ipu PROPERTIES TIMEOUT 800) set_tests_properties(test_save_load_ipu PROPERTIES TIMEOUT 600) set_tests_properties(test_activation_ops_ipu PROPERTIES TIMEOUT 600) + set_tests_properties(test_unary_ops_ipu PROPERTIES TIMEOUT 600) + set_tests_properties(test_greater_op_ipu PROPERTIES TIMEOUT 400) + set_tests_properties(test_conv2d_transpose_op_ipu PROPERTIES TIMEOUT 300) + set_tests_properties(test_model_parallel_ipu PROPERTIES TIMEOUT 300) add_subdirectory(custom_ops) diff --git a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py index 0226ca4ae74325..d3473bc2b0d4a1 100644 --- a/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/op_test_ipu.py @@ -204,6 +204,14 @@ def run_op_test(self, exec_mode, ipu_strategy=None): if self.is_fp16_mode(exec_mode): ipu_strategy.set_precision_config(enable_fp16=True) IPUOpTest.cast_model_to_fp16(self.main_prog) + + # TODO(ipu) remove in the future version of popart + # keep the log clean, no side effects for tests without profiling + ipu_strategy.set_options( + {'engine_options': { + 'debug.retainDebugInformation': 'false' + }}) + program = paddle.static.IpuCompiledProgram( self.main_prog, ipu_strategy=ipu_strategy).compile(self.feed_list, diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py index 21bcb7b7314ab7..7118466a521019 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_inference_ipu.py @@ -43,12 +43,6 @@ def set_feed_attr(self): self.feed_shape = [x.shape for x in self.feed_fp32.values()] self.feed_list = list(self.feed_fp32.keys()) - def dtype_check(self, program, to_fp16_var_names): - block = program.global_block() - assert len(to_fp16_var_names) > 0 - for var_name in to_fp16_var_names: - assert (block.var(var_name).dtype, paddle.float16) - def set_attrs(self): self.num_ipus = 1 self.enable_pipelining = False @@ -84,7 +78,6 @@ def run_model(self, exec_mode): amp_list.unsupported_list = {} to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( self.main_prog, amp_list, use_fp16_guard=True) - self.dtype_check(self.main_prog, to_fp16_var_names) if self.is_ipu_mode(exec_mode): place = paddle.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py index a733a26d606164..51a0e91a29c3bc 100644 --- a/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py +++ b/python/paddle/fluid/tests/unittests/ipu/test_mixed_precision_training_ipu.py @@ -55,12 +55,6 @@ def set_attrs(self): self.enable_manual_shard = False self.batches_per_step = 1 - def dtype_check(self, program, to_fp16_var_names): - block = program.global_block() - assert len(to_fp16_var_names) > 0 - for var_name in to_fp16_var_names: - assert (block.var(var_name).dtype, paddle.float16) - @IPUOpTest.static_graph def build_model(self): x = paddle.static.data(name=self.feed_list[0], @@ -94,7 +88,6 @@ def run_model(self, exec_mode): amp_list.unsupported_list = {} to_fp16_var_names = paddle.static.amp.cast_model_to_fp16( self.main_prog, amp_list) - 
self.dtype_check(self.main_prog, to_fp16_var_names) if self.is_ipu_mode(exec_mode): place = paddle.CPUPlace() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index 085ed55a979be7..5f3bfa62ebc1a6 100755 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -171,8 +171,6 @@ if(WITH_GPU AND TENSORRT_FOUND) 240) set_tests_properties(test_trt_squeeze2_matmul_fuse_pass PROPERTIES TIMEOUT 240) - set_tests_properties(test_trt_reshape2_matmul_fuse_pass PROPERTIES TIMEOUT - 240) set_tests_properties(test_shuffle_channel_detect_pass PROPERTIES TIMEOUT 120) if(WIN32) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_layernorm_shift_partition_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_layernorm_shift_partition_pass.py index 2d5dd9fe4bd5b6..d2a93adc2a36f9 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_layernorm_shift_partition_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_layernorm_shift_partition_pass.py @@ -15,6 +15,7 @@ from auto_scan_test import PassAutoScanTest, IgnoreReasons from program_config import TensorConfig, ProgramConfig, OpConfig import numpy as np +import math import paddle.inference as paddle_infer from functools import partial from typing import Optional, List, Callable, Dict, Any, Set @@ -61,6 +62,24 @@ def sample_predictor_configs(self, program_config): }) yield config, ['layernorm_shift_partition'], (1e-5, 1e-5) + # trt dynamic_shape + config = self.create_trt_inference_config() + config.enable_tensorrt_engine( + max_batch_size=1, + workspace_size=102400, + min_subgraph_size=0, + precision_mode=paddle_infer.PrecisionType.Half, + use_static=False, + use_calib_mode=False) + config.set_trt_dynamic_shape_info({ + "input_data": [1, 9, 96], + }, { + "input_data": [4, 3136, 768], + }, { + "input_data": [1, 784, 384], + }) + yield config, ['layernorm_shift_partition'], (1e-3, 1e-3) + def sample_program_config(self, draw): axis = [0, 1, 3, 2, 4, 5] epsilon = draw(st.floats(min_value=0.0000001, max_value=0.001)) @@ -198,10 +217,220 @@ def generate_weight(attrs): def test(self): self.run_and_statis(quant=False, - max_examples=20, + max_examples=50, + passes=["layernorm_shift_partition_fuse_pass"], + max_duration=250, + min_success_num=50) + + +class TestLayernormShiftPartition2Pass(PassAutoScanTest): + """ + | + layer_norm + | + reshape2 + | + roll + | + reshape2 + | + transpose2 + | + reshape2 + | + reshape2 + | + """ + + def sample_predictor_configs(self, program_config): + # trt dynamic_shape + config = self.create_trt_inference_config() + config.enable_tensorrt_engine( + max_batch_size=1, + workspace_size=102400, + min_subgraph_size=0, + precision_mode=paddle_infer.PrecisionType.Float32, + use_static=False, + use_calib_mode=False) + config.set_trt_dynamic_shape_info({ + "input_data": [1, 9, 96], + }, { + "input_data": [4, 3136, 768], + }, { + "input_data": [1, 784, 384], + }) + yield config, ['layernorm_shift_partition'], (1e-5, 1e-5) + + # trt dynamic_shape + config = self.create_trt_inference_config() + config.enable_tensorrt_engine( + max_batch_size=4, + workspace_size=102400, + min_subgraph_size=0, + precision_mode=paddle_infer.PrecisionType.Half, + use_static=False, + use_calib_mode=False) + config.set_trt_dynamic_shape_info({ + "input_data": [1, 9, 96], + }, { + "input_data": [4, 3136, 768], + }, { + "input_data": [1, 784, 
384], + }) + yield config, ['layernorm_shift_partition'], (1e-3, 1e-3) + + def sample_program_config(self, draw): + axis = [0, 1, 3, 2, 4, 5] + epsilon = draw(st.floats(min_value=0.0000001, max_value=0.001)) + # begin_norm_axis has to be 2 + begin_norm_axis = 2 + batch_size = draw(st.integers(min_value=1, max_value=4)) + + window_size = draw(st.sampled_from([3, 5, 7])) + move_shape = draw(st.integers(min_value=1, max_value=8)) + dim = draw(st.sampled_from([96, 192, 384, 768])) + + def generate_input(attrs): + return np.random.random( + [attrs[1]["batch_size"], + *attrs[1]["input_dim"]]).astype(np.float32) + + def generate_weight(attrs): + return np.random.random(attrs[1]['input_dim'][-1]).astype( + np.float32) + + attrs = [{ + 'begin_norm_axis': begin_norm_axis, + 'epsilon': epsilon, + }, { + 'batch_size': batch_size, + 'input_dim': [(window_size * move_shape)**2, dim], + }, { + 'axis': axis, + 'input_resolution': window_size * move_shape, + 'move_shape': move_shape, + 'window_size': window_size, + }] + + layer_norm_op = OpConfig(type="layer_norm", + inputs={ + "X": ["input_data"], + "Bias": ["layer_norm_bias"], + "Scale": ["layer_norm_scale"] + }, + outputs={ + "Y": ["layer_norm_output1"], + "Mean": ["layer_norm_output2"], + "Variance": ["layer_norm_output3"] + }, + attrs={ + "begin_norm_axis": + attrs[0]["begin_norm_axis"], + "epsilon": attrs[0]["epsilon"], + }) + reshape_op2 = OpConfig(type="reshape2", + inputs={ + "X": ["layer_norm_output1"], + }, + outputs={ + "Out": ["reshape_output2"], + "XShape": ["reshape_output2_xshape"], + }, + attrs={ + 'shape': [ + -1, attrs[2]["input_resolution"], + attrs[2]["input_resolution"], + attrs[1]["input_dim"][-1] + ] + }) + roll_op1 = OpConfig(type="roll", + inputs={"X": ["reshape_output2"]}, + outputs={"Out": ["roll_output1"]}, + attrs={ + "axis": [1, 2], + "shifts": [ + -math.floor( + (attrs[2]["window_size"]) / 2.0), + -math.floor((attrs[2]["window_size"]) / 2.0) + ] + }) + reshape_op3 = OpConfig(type="reshape2", + inputs={ + "X": ["roll_output1"], + }, + outputs={ + "Out": ["reshape_output3"], + "XShape": ["reshape_output3_xshape"], + }, + attrs={ + 'shape': [ + -1, attrs[2]["move_shape"], + attrs[2]["window_size"], + attrs[2]["move_shape"], + attrs[2]["window_size"], + attrs[1]["input_dim"][-1] + ] + }) + transpose_op4 = OpConfig(type='transpose2', + inputs={ + "X": ["reshape_output3"], + }, + outputs={"Out": ["transpose_output4"]}, + attrs={"axis": attrs[2]['axis']}) + reshape_op5 = OpConfig(type="reshape2", + inputs={ + "X": ["transpose_output4"], + }, + outputs={ + "Out": ["reshape_output5"], + "XShape": ["reshape_output5_xshape"], + }, + attrs={ + 'shape': [ + -1, attrs[2]["window_size"], + attrs[2]["window_size"], + attrs[1]["input_dim"][-1] + ] + }) + reshape_op6 = OpConfig( + type="reshape2", + inputs={ + "X": ["reshape_output5"], + }, + outputs={ + "Out": ["reshape_output6"], + "XShape": ["reshape_output6_xshape"], + }, + attrs={ + 'shape': + [-1, attrs[2]["window_size"]**2, attrs[1]["input_dim"][-1]] + }) + + program_config = ProgramConfig( + ops=[ + layer_norm_op, reshape_op2, roll_op1, reshape_op3, + transpose_op4, reshape_op5, reshape_op6 + ], + weights={ + "layer_norm_bias": + TensorConfig(data_gen=partial(generate_weight, attrs)), + "layer_norm_scale": + TensorConfig(data_gen=partial(generate_weight, attrs)) + }, + inputs={ + "input_data": + TensorConfig(data_gen=partial(generate_input, attrs)), + }, + outputs=["reshape_output6"]) + + return program_config + + def test(self): + self.run_and_statis(quant=False, + max_examples=50, 
passes=["layernorm_shift_partition_fuse_pass"], max_duration=250, - min_success_num=20) + min_success_num=50) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py index b894fc708b4243..80b13cb82dd017 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_activation_fuse_pass.py @@ -21,7 +21,6 @@ class TestMatmulActivationMkldnnFusePass(PassAutoScanTest): - def sample_program_config(self, draw): transpose_X = draw(st.booleans()) transpose_Y = draw(st.booleans()) @@ -30,11 +29,25 @@ def sample_program_config(self, draw): channel = draw(st.sampled_from([8])) input_dim = draw(st.sampled_from([32])) activation_type = draw( - st.sampled_from([ - 'relu', 'gelu', 'swish', 'mish', 'sqrt', 'hard_swish', - 'sigmoid', 'abs', 'relu6', 'clip', 'tanh', 'hard_sigmoid', - 'leaky_relu' - ])) + st.sampled_from( + [ + 'relu', + 'gelu', + 'swish', + 'mish', + 'sqrt', + 'hard_swish', + 'sigmoid', + 'abs', + 'relu6', + 'clip', + 'tanh', + 'hard_sigmoid', + 'leaky_relu', + 'scale', + ] + ) + ) def generate_input(type): if transpose_X and transpose_Y: @@ -55,50 +68,60 @@ def generate_input(type): else: return np.random.random(shape_y).astype(np.float32) - matmul_op = OpConfig(type='matmul', - inputs={ - 'X': ['matmul_X'], - 'Y': ['matmul_Y'] - }, - outputs={'Out': ['matmul_output']}, - attrs={ - 'transpose_X': transpose_X, - 'transpose_Y': transpose_Y, - 'alpha': alpha - }) + matmul_op = OpConfig( + type='matmul', + inputs={'X': ['matmul_X'], 'Y': ['matmul_Y']}, + outputs={'Out': ['matmul_output']}, + attrs={ + 'transpose_X': transpose_X, + 'transpose_Y': transpose_Y, + 'alpha': alpha, + 'use_mkldnn': True, + }, + ) if activation_type == "relu6": - activation_op = OpConfig(activation_type, - inputs={"X": ["matmul_output"]}, - outputs={"Out": ["activation_output"]}, - threshold=draw( - st.floats(min_value=1.0, - max_value=10.0))) + activation_op = OpConfig( + activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + threshold=draw(st.floats(min_value=1.0, max_value=10.0)), + ) elif activation_type == "leaky_relu": - activation_op = OpConfig(activation_type, - inputs={"X": ["matmul_output"]}, - outputs={"Out": ["activation_output"]}, - alpha=draw( - st.floats(min_value=0.1, - max_value=1.0))) + activation_op = OpConfig( + activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + alpha=draw(st.floats(min_value=0.1, max_value=1.0)), + ) + elif activation_type == "scale": + activation_op = OpConfig( + activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + scale=draw(st.sampled_from([0.125, 0.4, 0.875, 2])), + ) elif activation_type == "swish": - activation_op = OpConfig(activation_type, - inputs={"X": ["matmul_output"]}, - outputs={"Out": ["activation_output"]}, - beta=draw( - st.floats(min_value=0.1, - max_value=1.0))) + activation_op = OpConfig( + activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + beta=draw(st.floats(min_value=0.1, max_value=1.0)), + ) elif activation_type == "clip": activation_op = OpConfig( activation_type, inputs={"X": ["matmul_output"]}, outputs={"Out": ["activation_output"]}, min=draw(st.floats(min_value=0.1, max_value=0.49)), - 
max=draw(st.floats(min_value=0.5, max_value=1.0))) + max=draw(st.floats(min_value=0.5, max_value=1.0)), + ) else: - activation_op = OpConfig(activation_type, - inputs={"X": ["matmul_output"]}, - outputs={"Out": ["activation_output"]}) + activation_op = OpConfig( + activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + ) model_net = [matmul_op, activation_op] @@ -107,20 +130,32 @@ def generate_input(type): weights={}, inputs={ 'matmul_X': TensorConfig(data_gen=partial(generate_input, 'x')), - 'matmul_Y': TensorConfig(data_gen=partial(generate_input, 'y')) + 'matmul_Y': TensorConfig(data_gen=partial(generate_input, 'y')), }, - outputs=['activation_output']) + outputs=['activation_output'], + ) return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config( + use_mkldnn=True, + passes=[ + 'matmul_activation_mkldnn_fuse_pass', + 'operator_scale_onednn_fuse_pass', + ], + ) yield config, ['matmul'], (1e-5, 1e-5) def test(self): - self.run_and_statis(quant=False, - max_examples=30, - passes=['matmul_activation_mkldnn_fuse_pass']) + self.run_and_statis( + quant=False, + max_examples=50, + passes=[ + 'matmul_activation_mkldnn_fuse_pass', + 'operator_scale_onednn_fuse_pass', + ], + ) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py index 153b81fa797af5..84fc91e01620be 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_matmul_v2_activation_fuse_pass.py @@ -21,7 +21,6 @@ class TestMatmulv2ActivationMkldnnFusePass(PassAutoScanTest): - def sample_program_config(self, draw): transpose_X = draw(st.booleans()) transpose_Y = draw(st.booleans()) @@ -29,11 +28,25 @@ def sample_program_config(self, draw): channel = draw(st.sampled_from([16, 32, 64])) input_dim = draw(st.sampled_from([16, 32, 64])) activation_type = draw( - st.sampled_from([ - 'relu', 'gelu', 'swish', 'mish', 'sqrt', 'hard_swish', - 'sigmoid', 'abs', 'relu6', 'clip', 'tanh', 'hard_sigmoid', - 'leaky_relu' - ])) + st.sampled_from( + [ + 'relu', + 'gelu', + 'swish', + 'mish', + 'sqrt', + 'hard_swish', + 'sigmoid', + 'abs', + 'relu6', + 'clip', + 'tanh', + 'hard_sigmoid', + 'leaky_relu', + 'scale', + ] + ) + ) def generate_input(type): broadcast_X = st.booleans() @@ -60,49 +73,59 @@ def generate_input(type): else: return np.random.random(shape_y).astype(np.float32) - matmul_op = OpConfig(type='matmul_v2', - inputs={ - 'X': ['matmul_X'], - 'Y': ['matmul_Y'] - }, - outputs={'Out': ['matmul_output']}, - attrs={ - 'trans_x': transpose_X, - 'trans_y': transpose_Y - }) + matmul_op = OpConfig( + type='matmul_v2', + inputs={'X': ['matmul_X'], 'Y': ['matmul_Y']}, + outputs={'Out': ['matmul_output']}, + attrs={ + 'trans_x': transpose_X, + 'trans_y': transpose_Y, + 'use_mkldnn': True, + }, + ) if activation_type == 'relu6': - activation_op = OpConfig(activation_type, - inputs={'X': ['matmul_output']}, - outputs={'Out': ['activation_output']}, - threshold=draw( - st.floats(min_value=1.0, - max_value=10.0))) + activation_op = OpConfig( + activation_type, + inputs={'X': ['matmul_output']}, + outputs={'Out': ['activation_output']}, + threshold=draw(st.floats(min_value=1.0, max_value=10.0)), + ) elif activation_type 
== 'leaky_relu': - activation_op = OpConfig(activation_type, - inputs={'X': ['matmul_output']}, - outputs={'Out': ['activation_output']}, - alpha=draw( - st.floats(min_value=0.1, - max_value=1.0))) + activation_op = OpConfig( + activation_type, + inputs={'X': ['matmul_output']}, + outputs={'Out': ['activation_output']}, + alpha=draw(st.floats(min_value=0.1, max_value=1.0)), + ) + elif activation_type == "scale": + activation_op = OpConfig( + activation_type, + inputs={"X": ["matmul_output"]}, + outputs={"Out": ["activation_output"]}, + scale=draw(st.sampled_from([0.125, 0.4, 0.875, 2])), + ) elif activation_type == 'swish': - activation_op = OpConfig(activation_type, - inputs={'X': ['matmul_output']}, - outputs={'Out': ['activation_output']}, - beta=draw( - st.floats(min_value=0.1, - max_value=1.0))) + activation_op = OpConfig( + activation_type, + inputs={'X': ['matmul_output']}, + outputs={'Out': ['activation_output']}, + beta=draw(st.floats(min_value=0.1, max_value=1.0)), + ) elif activation_type == 'clip': activation_op = OpConfig( activation_type, inputs={'X': ['matmul_output']}, outputs={'Out': ['activation_output']}, min=draw(st.floats(min_value=0.1, max_value=0.49)), - max=draw(st.floats(min_value=0.5, max_value=1.0))) + max=draw(st.floats(min_value=0.5, max_value=1.0)), + ) else: - activation_op = OpConfig(activation_type, - inputs={'X': ['matmul_output']}, - outputs={'Out': ['activation_output']}) + activation_op = OpConfig( + activation_type, + inputs={'X': ['matmul_output']}, + outputs={'Out': ['activation_output']}, + ) model_net = [matmul_op, activation_op] @@ -111,20 +134,32 @@ def generate_input(type): weights={}, inputs={ 'matmul_X': TensorConfig(data_gen=partial(generate_input, 'X')), - 'matmul_Y': TensorConfig(data_gen=partial(generate_input, 'Y')) + 'matmul_Y': TensorConfig(data_gen=partial(generate_input, 'Y')), }, - outputs=['activation_output']) + outputs=['activation_output'], + ) return program_config def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_mkldnn=True) + config = self.create_inference_config( + use_mkldnn=True, + passes=[ + 'matmul_activation_mkldnn_fuse_pass', + 'operator_scale_onednn_fuse_pass', + ], + ) yield config, ['matmul_v2'], (1e-5, 1e-5) def test(self): - self.run_and_statis(quant=False, - max_examples=30, - passes=['matmul_activation_mkldnn_fuse_pass']) + self.run_and_statis( + quant=False, + max_examples=50, + passes=[ + 'matmul_activation_mkldnn_fuse_pass', + 'operator_scale_onednn_fuse_pass', + ], + ) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py index 82b73609b2e11c..a4054a9bd6dc2b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_activation.py @@ -22,12 +22,10 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): if dims == 1: return np.random.random([32]).astype(np.float32) @@ -41,11 +39,19 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): for dims in [1, 2, 3, 4]: for batch in [1, 4]: for op_type in [ - "relu", "sigmoid", "tanh", "relu6", "elu", "selu", - "softsign", "stanh", "thresholded_relu", "softplus" + "relu", + "sigmoid", + 
"tanh", + "relu6", + "elu", + "selu", + "softsign", + "stanh", + "thresholded_relu", + "softplus", ]: # few samples to reduce time - #for beta in [-0.2, 0.5, 0.67, 3]: + # for beta in [-0.2, 0.5, 0.67, 3]: # for alpha in [-0.2, 0.5, 0.67, 3]: for beta in [0.67]: for alpha in [0.67]: @@ -62,33 +68,34 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): if op_type == "softplus": dics = [{"beta": beta}] - ops_config = [{ - "op_type": op_type, - "op_inputs": { - "X": ["input_data"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": op_type, + "op_inputs": {"X": ["input_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig(data_gen=partial( - generate_input1, dims, batch, dics)) + "input_data": TensorConfig( + data_gen=partial( + generate_input1, dims, batch, dics + ) + ) }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} @@ -131,19 +138,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py index 0a2877b9a2327e..1e5fd74879003f 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_anchor_generator.py @@ -22,60 +22,66 @@ class TrtConvertAnchorGeneratorTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(batch, attrs: List[Dict[str, Any]]): return np.random.random([batch, 3, 64, 64]).astype(np.float32) for batch in [1, 2, 4]: for anchor_sizes in [[64.0, 128.0, 256.0, 512.0]]: for aspect_ratios in [[0.5, 1, 2], [0.4, 1.2, 3]]: - for variances in [[1.0, 1.0, 1.0, 1.0], - [0.5, 1.0, 0.5, 1.0]]: + for variances in [ + [1.0, 1.0, 1.0, 1.0], + [0.5, 1.0, 0.5, 1.0], + ]: for stride in [[16.0, 16.0], [16.0, 32.0]]: for offset in [0.5, 0.8]: - dics = [{ - "anchor_sizes": anchor_sizes, - "aspect_ratios": aspect_ratios, - "variances": variances, - "stride": stride, - "offset": offset - }] - - ops_config = [{ - "op_type": "anchor_generator", - "op_inputs": { - "Input": ["input_data"] - }, - 
"op_outputs": { - "Anchors": ["output_anchors"], - "Variances": ["output_variances"] - }, - "op_attrs": dics[0] - }] + dics = [ + { + "anchor_sizes": anchor_sizes, + "aspect_ratios": aspect_ratios, + "variances": variances, + "stride": stride, + "offset": offset, + } + ] + + ops_config = [ + { + "op_type": "anchor_generator", + "op_inputs": {"Input": ["input_data"]}, + "op_outputs": { + "Anchors": ["output_anchors"], + "Variances": ["output_variances"], + }, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig(data_gen=partial( - generate_input1, batch, dics)) + "input_data": TensorConfig( + data_gen=partial( + generate_input1, batch, dics + ) + ) }, outputs=[ - "output_anchors", "output_variances" - ]) + "output_anchors", + "output_variances", + ], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} @@ -100,19 +106,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_arg_max.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_arg_max.py index 8d01029c78a7d9..a19132571468a0 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_arg_max.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_arg_max.py @@ -22,7 +22,6 @@ class TrtConvertArgMaxTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: input_shape = program_config.inputs["arg_max_input"].shape axis = program_config.ops[0].attrs["axis"] @@ -33,7 +32,6 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(rank, batch): dims = [batch] for i in range(rank - 1): @@ -48,36 +46,37 @@ def generate_input(rank, batch): self.rank = rank flatten = False dtype = 2 - ops_config = [{ - "op_type": "arg_max", - "op_inputs": { - "X": ["arg_max_input"] - }, - "op_outputs": { - "Out": ["arg_max_out"] - }, - "op_attrs": { - "axis": axis, - "keepdims": keepdims, - "flatten": flatten, - "dtype": dtype + ops_config = [ + { + "op_type": "arg_max", + "op_inputs": {"X": ["arg_max_input"]}, + "op_outputs": {"Out": ["arg_max_out"]}, + "op_attrs": { + "axis": axis, + "keepdims": keepdims, + "flatten": flatten, + "dtype": dtype, + }, } - }] + ] ops = self.generate_op_config(ops_config) program_config = 
ProgramConfig( ops=ops, weights={}, inputs={ - "arg_max_input": - TensorConfig(data_gen=partial( - generate_input, rank, batch)) + "arg_max_input": TensorConfig( + data_gen=partial( + generate_input, rank, batch + ) + ) }, - outputs=["arg_max_out"]) + outputs=["arg_max_out"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.rank == 3: self.dynamic_shape.min_input_shape = { @@ -117,19 +116,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_bmm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_bmm.py new file mode 100644 index 00000000000000..fb5c607b233c1d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_bmm.py @@ -0,0 +1,128 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List +import unittest +import os + + +class TrtConvertBmmTest_dynamic(TrtLayerAutoScanTest): + def sample_program_configs(self): + def generate_input(shape): + return np.random.random(shape).astype(np.float32) + + for batch in [10, 11, 12, 13, 14, 15]: + input1_shape = [batch, 350, 75] + input2_shape = [batch, 75, 25] + dics = [{}] + ops_config = [ + { + "op_type": "bmm", + "op_inputs": {"X": ["input1_data"], "Y": ["input2_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input1_data": TensorConfig( + data_gen=partial(generate_input, input1_shape) + ), + "input2_data": TensorConfig( + data_gen=partial(generate_input, input2_shape) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input1_data": [10, 350, 75], + "input2_data": [10, 75, 25], + } + self.dynamic_shape.max_input_shape = { + "input1_data": [100, 350, 75], + "input2_data": [100, 75, 25], + } + self.dynamic_shape.opt_input_shape = { + "input1_data": [15, 350, 75], + "input2_data": [15, 75, 25], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + if dynamic_shape: + return 1, 3 + else: + return 0, 4 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False + ), 1e-3 + + # The output has little diff between gpu and trt in CI-Windows-Inference + tol_fp32 = 1e-4 + tol_half = 1e-4 + if os.name == 'nt': + tol_fp32 = 1e-2 + tol_half = 1e-2 + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), tol_fp32 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True + ), tol_half + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cast.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cast.py index 8dca14c02aa74a..3d01a0712aecc1 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cast.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_cast.py @@ -49,9 +49,15 @@ def generate_input(type): else: return np.ones([1, 3, 64, 64]).astype(np.float32) - for in_dtype in [0, 2, 4, 5, 6]: - for out_dtype in [0, 2, 4, 5, 6]: - dics = [{"in_dtype": in_dtype, "out_dtype": out_dtype}] + for in_dtype in [0, 2, 5, 6]: 
+ for out_dtype in [0, 2, 5, 6]: + dics = [{ + "in_dtype": in_dtype, + "out_dtype": out_dtype + }, { + "in_dtype": out_dtype, + "out_dtype": in_dtype + }] ops_config = [{ "op_type": "cast", @@ -59,10 +65,20 @@ def generate_input(type): "X": ["input_data"] }, "op_outputs": { - "Out": ["cast_output_data"] + "Out": ["cast_output_data0"] }, "op_attrs": dics[0] + }, { + "op_type": "cast", + "op_inputs": { + "X": ["cast_output_data0"] + }, + "op_outputs": { + "Out": ["cast_output_data1"] + }, + "op_attrs": dics[1] }] + ops = self.generate_op_config(ops_config) program_config = ProgramConfig( @@ -72,7 +88,7 @@ def generate_input(type): "input_data": TensorConfig(data_gen=partial(generate_input, in_dtype)) }, - outputs=["cast_output_data"]) + outputs=["cast_output_data1"]) yield program_config diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py index aec2f3efd4f23d..c8b6688aedcb0d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_clip.py @@ -22,12 +22,10 @@ class TrtConvertClipTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): if dims == 1: return np.ones([32]).astype(np.float32) @@ -46,52 +44,52 @@ def generate_weight2(attrs: List[Dict[str, Any]]): for dims in [1, 2, 3, 4]: for batch in [1, 4]: - for op_inputs in [{ - "X": ["input_data"] - }, { - "X": ["input_data"], - "Min": ["Min_"], - "Max": ["Max_"] - }]: + for op_inputs in [ + {"X": ["input_data"]}, + {"X": ["input_data"], "Min": ["Min_"], "Max": ["Max_"]}, + ]: self.input_num = len(op_inputs) self.dims = dims - dics = [{ - "min": np.random.uniform(1, 10), - "max": np.random.uniform(10, 20) - }, { - "op_inputs": op_inputs - }] - ops_config = [{ - "op_type": "clip", - "op_inputs": op_inputs, - "op_outputs": { - "Out": ["output_data"] + dics = [ + { + "min": np.random.uniform(1, 10), + "max": np.random.uniform(10, 20), }, - "op_attrs": dics[0] - }] + {"op_inputs": op_inputs}, + ] + ops_config = [ + { + "op_type": "clip", + "op_inputs": op_inputs, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={ - "Min_": - TensorConfig( - data_gen=partial(generate_weight1, dics)), - "Max_": - TensorConfig( - data_gen=partial(generate_weight2, dics)) + "Min_": TensorConfig( + data_gen=partial(generate_weight1, dics) + ), + "Max_": TensorConfig( + data_gen=partial(generate_weight2, dics) + ), }, inputs={ - "input_data": - TensorConfig(data_gen=partial( - generate_input1, dims, batch, dics)) + "input_data": TensorConfig( + data_gen=partial( + generate_input1, dims, batch, dics + ) + ) }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs(self, program_config): - def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} @@ -135,19 +133,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield 
self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py index e8c9a65bbfc939..2945648c8da563 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_concat.py @@ -22,7 +22,6 @@ class TrtConvertConcatTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs weights = program_config.weights @@ -31,14 +30,13 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - #The input dimension should be less than or equal to the set axis. + # The input dimension should be less than or equal to the set axis. if len(inputs['concat_input1'].shape) <= attrs[0]['axis']: return False return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]], batch): if self.dims == 4: return np.ones([batch, 3, 24, 24]).astype(np.float32) @@ -79,58 +77,83 @@ def generate_weight1(attrs: List[Dict[str, Any]]): self.num_input = num_input self.dims = dims dics = [{"axis": axis}, {}] - dics_intput = [{ - "X": - ["concat_input1", "concat_input2", "concat_input3"], - "AxisTensor": ["AxisTensor"], - }, { - "X": - ["concat_input1", "concat_input2", "concat_input3"] - }] - dics_inputs = [{ - "concat_input1": - TensorConfig( - data_gen=partial(generate_input1, dics, batch)), - "concat_input2": - TensorConfig( - data_gen=partial(generate_input2, dics, batch)), - "concat_input3": - TensorConfig( - data_gen=partial(generate_input3, dics, batch)), - "AxisTensor": - TensorConfig( - data_gen=partial(generate_weight1, dics)) - }, { - "concat_input1": - TensorConfig( - data_gen=partial(generate_input1, dics, batch)), - "concat_input2": - TensorConfig( - data_gen=partial(generate_input2, dics, batch)), - "concat_input3": - TensorConfig( - data_gen=partial(generate_input3, dics, batch)) - }] - ops_config = [{ - "op_type": "concat", - "op_inputs": dics_intput[num_input], - "op_outputs": { - "Out": ["concat_output"] + dics_intput = [ + { + "X": [ + "concat_input1", + "concat_input2", + "concat_input3", + ], + "AxisTensor": ["AxisTensor"], + }, + { + "X": [ + "concat_input1", + "concat_input2", + "concat_input3", + ] + }, + ] + dics_inputs = [ + { + "concat_input1": TensorConfig( + data_gen=partial( + generate_input1, dics, batch + ) + ), + "concat_input2": TensorConfig( + data_gen=partial( + generate_input2, dics, batch + ) + ), + "concat_input3": TensorConfig( + data_gen=partial( + generate_input3, dics, batch + ) + ), + "AxisTensor": TensorConfig( + data_gen=partial(generate_weight1, dics) + ), + }, + { + "concat_input1": TensorConfig( + data_gen=partial( + generate_input1, dics, batch + ) + ), + "concat_input2": TensorConfig( + data_gen=partial( + generate_input2, dics, batch + ) + ), + "concat_input3": TensorConfig( + 
data_gen=partial( + generate_input3, dics, batch + ) + ), }, - "op_attrs": dics[0] - }] + ] + ops_config = [ + { + "op_type": "concat", + "op_inputs": dics_intput[num_input], + "op_outputs": {"Out": ["concat_output"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs=dics_inputs[num_input], - outputs=["concat_output"]) + outputs=["concat_output"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.num_input == 0: if self.dims == 4: @@ -138,76 +161,76 @@ def generate_dynamic_shape(attrs): "concat_input1": [1, 3, 24, 24], "concat_input2": [1, 3, 24, 24], "concat_input3": [1, 3, 24, 24], - "AxisTensor": [1] + "AxisTensor": [1], } self.dynamic_shape.max_input_shape = { "concat_input1": [4, 3, 48, 48], "concat_input2": [4, 3, 48, 48], "concat_input3": [4, 3, 48, 48], - "AxisTensor": [1] + "AxisTensor": [1], } self.dynamic_shape.opt_input_shape = { "concat_input1": [1, 3, 24, 24], "concat_input2": [1, 3, 24, 24], "concat_input3": [1, 3, 24, 24], - "AxisTensor": [1] + "AxisTensor": [1], } elif self.dims == 3: self.dynamic_shape.min_input_shape = { "concat_input1": [1, 3, 24], "concat_input2": [1, 3, 24], "concat_input3": [1, 3, 24], - "AxisTensor": [1] + "AxisTensor": [1], } self.dynamic_shape.max_input_shape = { "concat_input1": [4, 12, 48], "concat_input2": [4, 12, 48], "concat_input3": [4, 12, 48], - "AxisTensor": [1] + "AxisTensor": [1], } self.dynamic_shape.opt_input_shape = { "concat_input1": [1, 3, 24], "concat_input2": [1, 3, 24], "concat_input3": [1, 3, 24], - "AxisTensor": [1] + "AxisTensor": [1], } elif self.dims == 2: self.dynamic_shape.min_input_shape = { "concat_input1": [1, 24], "concat_input2": [1, 24], "concat_input3": [1, 24], - "AxisTensor": [1] + "AxisTensor": [1], } self.dynamic_shape.max_input_shape = { "concat_input1": [4, 48], "concat_input2": [4, 48], "concat_input3": [4, 48], - "AxisTensor": [1] + "AxisTensor": [1], } self.dynamic_shape.opt_input_shape = { "concat_input1": [1, 24], "concat_input2": [1, 24], "concat_input3": [1, 24], - "AxisTensor": [1] + "AxisTensor": [1], } elif self.dims == 1: self.dynamic_shape.min_input_shape = { "concat_input1": [24], "concat_input2": [24], "concat_input3": [24], - "AxisTensor": [0] + "AxisTensor": [0], } self.dynamic_shape.max_input_shape = { "concat_input1": [48], "concat_input2": [48], "concat_input3": [48], - "AxisTensor": [0] + "AxisTensor": [0], } self.dynamic_shape.opt_input_shape = { "concat_input1": [24], "concat_input2": [24], "concat_input3": [24], - "AxisTensor": [0] + "AxisTensor": [0], } elif self.num_input == 1: if self.dims == 4: @@ -219,60 +242,60 @@ def generate_dynamic_shape(attrs): self.dynamic_shape.max_input_shape = { "concat_input1": [4, 3, 48, 48], "concat_input2": [4, 3, 48, 48], - "concat_input3": [4, 3, 48, 48] + "concat_input3": [4, 3, 48, 48], } self.dynamic_shape.opt_input_shape = { "concat_input1": [1, 3, 24, 24], "concat_input2": [1, 3, 24, 24], - "concat_input3": [1, 3, 24, 24] + "concat_input3": [1, 3, 24, 24], } elif self.dims == 3: self.dynamic_shape.min_input_shape = { "concat_input1": [1, 3, 24], "concat_input2": [1, 3, 24], - "concat_input3": [1, 3, 24] + "concat_input3": [1, 3, 24], } self.dynamic_shape.max_input_shape = { "concat_input1": [4, 12, 48], "concat_input2": [4, 12, 48], - "concat_input3": [4, 12, 48] + 
"concat_input3": [4, 12, 48], } self.dynamic_shape.opt_input_shape = { "concat_input1": [1, 3, 24], "concat_input2": [1, 3, 24], - "concat_input3": [1, 3, 24] + "concat_input3": [1, 3, 24], } elif self.dims == 2: self.dynamic_shape.min_input_shape = { "concat_input1": [1, 24], "concat_input2": [1, 24], - "concat_input3": [1, 24] + "concat_input3": [1, 24], } self.dynamic_shape.max_input_shape = { "concat_input1": [4, 48], "concat_input2": [4, 48], - "concat_input3": [4, 48] + "concat_input3": [4, 48], } self.dynamic_shape.opt_input_shape = { "concat_input1": [1, 24], "concat_input2": [1, 24], - "concat_input3": [1, 24] + "concat_input3": [1, 24], } elif self.dims == 1: self.dynamic_shape.min_input_shape = { "concat_input1": [24], "concat_input2": [24], - "concat_input3": [24] + "concat_input3": [24], } self.dynamic_shape.max_input_shape = { "concat_input1": [48], "concat_input2": [48], - "concat_input3": [48] + "concat_input3": [48], } self.dynamic_shape.opt_input_shape = { "concat_input1": [24], "concat_input2": [24], - "concat_input3": [24] + "concat_input3": [24], } def clear_dynamic_shape(): @@ -296,29 +319,33 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): if len(program_config.inputs) == 4: return True return False - self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, - "INPUT AxisTensor NOT SUPPORT") + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_SUPPORT, "INPUT AxisTensor NOT SUPPORT" + ) def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py index ce69d9d7395a06..bbfaae6514da09 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_conv2d_transpose.py @@ -22,7 +22,6 @@ class TrtConvertConv2dTransposeTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs weights = program_config.weights @@ -30,8 +29,10 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - if inputs['input_data'].shape[ - 1] != weights['conv2d_weight'].shape[1] * attrs[0]['groups']: + if ( + inputs['input_data'].shape[1] + != weights['conv2d_weight'].shape[1] * attrs[0]['groups'] + ): return False if inputs['input_data'].shape[1] != weights['conv2d_weight'].shape[0]: @@ -54,12 +55,13 @@ def generate_input1(batch, num_channels, attrs: List[Dict[str, Any]]): def generate_weight1(num_channels, attrs: List[Dict[str, Any]]): if attrs[0]['groups'] == 1: - 
return np.random.random([num_channels, num_channels, 3, - 3]).astype(np.float32) + return np.random.random( + [num_channels, num_channels, 3, 3] + ).astype(np.float32) else: return np.random.random( - [num_channels, int(num_channels / 2), 3, - 3]).astype(np.float32) + [num_channels, int(num_channels / 2), 3, 3] + ).astype(np.float32) for num_channels in [2, 4, 6]: for batch in [1, 4]: @@ -67,99 +69,113 @@ def generate_weight1(num_channels, attrs: List[Dict[str, Any]]): for paddings in [[0, 3], [1, 2, 3, 4]]: for groups in [2]: for padding_algorithm in [ - 'EXPLICIT', 'SAME', 'VALID' + 'EXPLICIT', + 'SAME', + 'VALID', ]: for dilations in [[2, 2], [1, 2]]: for data_format in ['NCHW']: self.num_channels = num_channels - dics = [{ - "data_fromat": data_format, - "dilations": dilations, - "padding_algorithm": - padding_algorithm, - "groups": groups, - "paddings": paddings, - "strides": strides, - "data_format": data_format, - "output_size": [], - "output_padding": [] - }] - - ops_config = [{ - "op_type": "conv2d_transpose", - "op_inputs": { - "Input": ["input_data"], - "Filter": ["conv2d_weight"] - }, - "op_outputs": { - "Output": ["output_data"] - }, - "op_attrs": dics[0] - }] + dics = [ + { + "data_fromat": data_format, + "dilations": dilations, + "padding_algorithm": padding_algorithm, + "groups": groups, + "paddings": paddings, + "strides": strides, + "data_format": data_format, + "output_size": [], + "output_padding": [], + } + ] + + ops_config = [ + { + "op_type": "conv2d_transpose", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"], + }, + "op_outputs": { + "Output": ["output_data"] + }, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config( - ops_config) + ops_config + ) program_config = ProgramConfig( ops=ops, weights={ - "conv2d_weight": - TensorConfig(data_gen=partial( - generate_weight1, - num_channels, dics)) + "conv2d_weight": TensorConfig( + data_gen=partial( + generate_weight1, + num_channels, + dics, + ) + ) }, inputs={ - "input_data": - TensorConfig(data_gen=partial( - generate_input1, batch, - num_channels, dics)) + "input_data": TensorConfig( + data_gen=partial( + generate_input1, + batch, + num_channels, + dics, + ) + ) }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.num_channels == 2: self.dynamic_shape.min_input_shape = { "input_data": [1, 2, 32, 32], - "output_data": [1, 24, 32, 32] + "output_data": [1, 24, 32, 32], } self.dynamic_shape.max_input_shape = { "input_data": [4, 2, 64, 64], - "output_data": [4, 24, 64, 64] + "output_data": [4, 24, 64, 64], } self.dynamic_shape.opt_input_shape = { "input_data": [1, 2, 64, 64], - "output_data": [1, 24, 64, 64] + "output_data": [1, 24, 64, 64], } elif self.num_channels == 4: self.dynamic_shape.min_input_shape = { "input_data": [1, 4, 32, 32], - "output_data": [1, 24, 32, 32] + "output_data": [1, 24, 32, 32], } self.dynamic_shape.max_input_shape = { "input_data": [4, 4, 64, 64], - "output_data": [4, 24, 64, 64] + "output_data": [4, 24, 64, 64], } self.dynamic_shape.opt_input_shape = { "input_data": [1, 4, 64, 64], - "output_data": [1, 24, 64, 64] + "output_data": [1, 24, 64, 64], } else: self.dynamic_shape.min_input_shape = { "input_data": [1, 6, 32, 32], - "output_data": [1, 24, 32, 32] + "output_data": [1, 24, 32, 32], } 
self.dynamic_shape.max_input_shape = { "input_data": [4, 6, 64, 64], - "output_data": [4, 24, 64, 64] + "output_data": [4, 24, 64, 64], } self.dynamic_shape.opt_input_shape = { "input_data": [1, 6, 64, 64], - "output_data": [1, 24, 64, 64] + "output_data": [1, 24, 64, 64], } def clear_dynamic_shape(): @@ -178,10 +194,12 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-3) + attrs, False + ), (1e-3, 1e-3) # self.trt_param.precision = paddle_infer.PrecisionType.Int8 # yield self.create_inference_config(), generate_trt_nodes_num( # attrs, False), (1e-5, 1e-5) @@ -190,24 +208,26 @@ def generate_trt_nodes_num(attrs, dynamic_shape): generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-5, 1e-3) + attrs, True + ), (1e-3, 1e-3) # self.trt_param.precision = paddle_infer.PrecisionType.Int8 # yield self.create_inference_config(), generate_trt_nodes_num( # attrs, True), (1e-5, 1e-5) def add_skip_trt_case(self): - def teller1(program_config, predictor_config): if self.trt_param.precision == paddle_infer.PrecisionType.Int8: return True return False self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "When precisionType is int8 without relu op, output is different between Trt and Paddle." 
+ teller1, + SkipReasons.TRT_NOT_IMPLEMENTED, + "When precisionType is int8 without relu op, output is different between Trt and Paddle.", ) def test(self): @@ -221,7 +241,6 @@ def test_quant(self): # Special case class TrtConvertConv2dTransposeTest2(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: ver = paddle_infer.get_trt_compile_version() if ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 7000: @@ -241,49 +260,52 @@ def generate_weight1(num_channels, attrs: List[Dict[str, Any]]): batch = 1 self.num_channels = num_channels - dics = [{ - "data_fromat": 'NCHW', - "dilations": [1, 1], - "padding_algorithm": 'EXPLICIT', - "groups": 1, - "paddings": [1, 1], - "strides": [2, 2], - "output_padding": [1, 1], - "output_size": [], - }] - - ops_config = [{ - "op_type": "conv2d_transpose", - "op_inputs": { - "Input": ["input_data"], - "Filter": ["conv2d_weight"] - }, - "op_outputs": { - "Output": ["output_data"] - }, - "op_attrs": dics[0] - }] + dics = [ + { + "data_fromat": 'NCHW', + "dilations": [1, 1], + "padding_algorithm": 'EXPLICIT', + "groups": 1, + "paddings": [1, 1], + "strides": [2, 2], + "output_padding": [1, 1], + "output_size": [], + } + ] + + ops_config = [ + { + "op_type": "conv2d_transpose", + "op_inputs": { + "Input": ["input_data"], + "Filter": ["conv2d_weight"], + }, + "op_outputs": {"Output": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={ - "conv2d_weight": - TensorConfig( - data_gen=partial(generate_weight1, num_channels, dics)) + "conv2d_weight": TensorConfig( + data_gen=partial(generate_weight1, num_channels, dics) + ) }, inputs={ - "input_data": - TensorConfig(data_gen=partial(generate_input1, batch, - num_channels, dics)) + "input_data": TensorConfig( + data_gen=partial(generate_input1, batch, num_channels, dics) + ) }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input_data": [1, 128, 20, 30], @@ -311,19 +333,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-4 + attrs, False + ), 1e-4 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e0, 1e-3) + attrs, False + ), (1e0, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-4 + attrs, True + ), 1e-4 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e0, 1e-3) + attrs, True + ), (1e0, 1e-3) def add_skip_trt_case(self): pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py index 5d8e93ef984f62..94a94371247534 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_dropout.py @@ -22,12 +22,10 @@ class 
TrtConvertDropoutTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): if dims == 1: return np.ones([64]).astype(np.float32) @@ -42,47 +40,57 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): for batch in [1, 2, 4]: for fix_seed in [False, True]: for dropout_implementation in [ - "downgrade_in_infer", "upscale_in_train" + "downgrade_in_infer", + "upscale_in_train", ]: for dropout_prob in [np.random.random()]: for seed in [0, 64, 128, 512]: self.dims = dims - dics = [{ - "fix_seed": fix_seed, - "dropout_implementation": - dropout_implementation, - "dropout_prob": dropout_prob, - "seed": seed, - "is_test": True - }] - - ops_config = [{ - "op_type": "dropout", - "op_inputs": { - "X": ["input_data"], - }, - "op_outputs": { - "Out": ["dropout_output_data"] - }, - "op_attrs": dics[0] - }] + dics = [ + { + "fix_seed": fix_seed, + "dropout_implementation": dropout_implementation, + "dropout_prob": dropout_prob, + "seed": seed, + "is_test": True, + } + ] + + ops_config = [ + { + "op_type": "dropout", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": { + "Out": ["dropout_output_data"] + }, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig(data_gen=partial( - generate_input1, dims, batch, dics)) + "input_data": TensorConfig( + data_gen=partial( + generate_input1, + dims, + batch, + dics, + ) + ) }, - outputs=["dropout_output_data"]) + outputs=["dropout_output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} @@ -128,19 +136,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def add_skip_trt_case(self): pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py index 5a1cc19c618b8d..e084f2791e57a7 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_elementwise.py @@ -24,12 +24,10 @@ # This is the special test case with weight including batch dimension # I don't want to mess up the code written by others, so I wrote a class specifically class TrtConvertElementwiseTest_one_input_special_case0(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: 
return True def sample_program_configs(self): - def generate_input(shape): return np.random.random(shape).astype(np.float32) @@ -38,41 +36,51 @@ def generate_weight(): for batch in [1, 4]: for shape in [[batch, 32, 16, 32]]: - for op_type in ["elementwise_add", "elementwise_mul"]: + for op_type in [ + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + "elementwise_div", + "elementwise_pow", + "elementwise_min", + "elementwise_max", + ]: for axis in [-1]: self.dims = len(shape) dics = [{"axis": axis}] - ops_config = [{ - "op_type": op_type, - "op_inputs": { - "X": ["input_data"], - "Y": ["weight"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["input_data"], + "Y": ["weight"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={ - "weight": - TensorConfig(data_gen=partial(generate_weight)) + "weight": TensorConfig( + data_gen=partial(generate_weight) + ) }, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input, shape)), + "input_data": TensorConfig( + data_gen=partial(generate_input, shape) + ), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): # The input.dims[1] must be equal to the weight's length. if self.dims == 4: @@ -102,19 +110,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-3, 1e-3) + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-3, 1e-3) + attrs, True + ), (1e-3, 1e-3) def add_skip_trt_case(self): pass @@ -126,12 +138,10 @@ def test(self): # This is the special test case class TrtConvertElementwiseTest_one_input_special_case1(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): return np.random.random(shape).astype(np.float32) @@ -139,41 +149,48 @@ def generate_weight(): return np.random.randn(1).astype(np.float32) for shape in [[32]]: - for op_type in ["elementwise_add", "elementwise_mul"]: + for op_type in [ + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + "elementwise_div", + "elementwise_pow", + "elementwise_min", + "elementwise_max", + ]: for axis in [-1]: self.dims = len(shape) dics = [{"axis": axis}] - ops_config = [{ - "op_type": op_type, - "op_inputs": { - "X": ["input_data"], - "Y": ["weight"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": op_type, + "op_inputs": {"X": ["input_data"], "Y": ["weight"]}, + 
"op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={ - "weight": - TensorConfig(data_gen=partial(generate_weight)) + "weight": TensorConfig( + data_gen=partial(generate_weight) + ) }, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input, shape)), + "input_data": TensorConfig( + data_gen=partial(generate_input, shape) + ), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [32]} self.dynamic_shape.max_input_shape = {"input_data": [64]} @@ -197,19 +214,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-3, 1e-3) + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-3, 1e-3) + attrs, True + ), (1e-3, 1e-3) def add_skip_trt_case(self): pass @@ -220,12 +241,10 @@ def test(self): class TrtConvertElementwiseTest_one_input(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): return np.random.random(shape).astype(np.float32) @@ -233,43 +252,57 @@ def generate_weight(): return np.random.randn(32).astype(np.float32) for batch in [1, 4]: - for shape in [[32], [batch, 32], [batch, 32, 32], - [batch, 32, 16, 32]]: - for op_type in ["elementwise_add", "elementwise_mul"]: + for shape in [ + [32], + [batch, 32], + [batch, 32, 32], + [batch, 32, 16, 32], + ]: + for op_type in [ + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + "elementwise_div", + "elementwise_pow", + "elementwise_min", + "elementwise_max", + ]: for axis in [-1 if len(shape) == 1 else 1]: self.dims = len(shape) dics = [{"axis": axis}] - ops_config = [{ - "op_type": op_type, - "op_inputs": { - "X": ["input_data"], - "Y": ["weight"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["input_data"], + "Y": ["weight"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={ - "weight": - TensorConfig(data_gen=partial(generate_weight)) + "weight": TensorConfig( + data_gen=partial(generate_weight) + ) }, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input, shape)), + "input_data": TensorConfig( + data_gen=partial(generate_input, shape) + ), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> 
(paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): # The input.dims[1] must be equal to the weight's length. if self.dims == 1: @@ -313,19 +346,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-3, 1e-3) + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-3, 1e-3) + attrs, True + ), (1e-3, 1e-3) def add_skip_trt_case(self): pass @@ -336,107 +373,112 @@ def test(self): class TrtConvertElementwiseTest_two_input_without_broadcast( - TrtLayerAutoScanTest): - + TrtLayerAutoScanTest +): def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): return np.random.random(shape).astype(np.float32) - for shape in [[4], [4, 32], [2, 64, 32], [1, 8, 16, 32]]: + for shape in [[4], [4, 32], [2, 32, 16], [1, 8, 16, 32]]: for op_type in [ - "elementwise_add", "elementwise_mul", "elementwise_sub", - "elementwise_div", "elementwise_pow" + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + "elementwise_div", + "elementwise_pow", + "elementwise_min", + "elementwise_max", ]: for axis in [0, -1]: self.dims = len(shape) dics = [{"axis": axis}] - ops_config = [{ - "op_type": op_type, - "op_inputs": { - "X": ["input_data1"], - "Y": ["input_data2"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["input_data1"], + "Y": ["input_data2"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data1": - TensorConfig( - data_gen=partial(generate_input, shape)), - "input_data2": - TensorConfig( - data_gen=partial(generate_input, shape)) + "input_data1": TensorConfig( + data_gen=partial(generate_input, shape) + ), + "input_data2": TensorConfig( + data_gen=partial(generate_input, shape) + ), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = { "input_data1": [1], - "input_data2": [1] + "input_data2": [1], } self.dynamic_shape.max_input_shape = { "input_data1": [128], - "input_data2": [128] + "input_data2": [128], } self.dynamic_shape.opt_input_shape = { "input_data1": [32], - "input_data2": [32] + "input_data2": [32], } elif self.dims == 2: self.dynamic_shape.min_input_shape = { "input_data1": [1, 4], - "input_data2": [1, 4] + "input_data2": [1, 4], } self.dynamic_shape.max_input_shape = { "input_data1": [128, 256], - "input_data2": 
[128, 256] + "input_data2": [128, 256], } self.dynamic_shape.opt_input_shape = { "input_data1": [32, 64], - "input_data2": [32, 64] + "input_data2": [32, 64], } elif self.dims == 3: self.dynamic_shape.min_input_shape = { "input_data1": [1, 4, 4], - "input_data2": [1, 4, 4] + "input_data2": [1, 4, 4], } self.dynamic_shape.max_input_shape = { "input_data1": [128, 128, 256], - "input_data2": [128, 128, 256] + "input_data2": [128, 128, 256], } self.dynamic_shape.opt_input_shape = { - "input_data1": [2, 64, 64], - "input_data2": [2, 64, 64] + "input_data1": [2, 32, 16], + "input_data2": [2, 32, 16], } elif self.dims == 4: self.dynamic_shape.min_input_shape = { "input_data1": [1, 4, 4, 4], - "input_data2": [1, 4, 4, 4] + "input_data2": [1, 4, 4, 4], } self.dynamic_shape.max_input_shape = { "input_data1": [8, 128, 64, 128], - "input_data2": [8, 128, 64, 128] + "input_data2": [8, 128, 64, 128], } self.dynamic_shape.opt_input_shape = { "input_data1": [2, 64, 32, 32], - "input_data2": [2, 64, 32, 32] + "input_data2": [2, 64, 32, 32], } def clear_dynamic_shape(): @@ -457,15 +499,17 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-3, 1e-3) + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), (1, 3), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 3), (1e-3, 1e-3) @@ -478,7 +522,6 @@ def test(self): class TrtConvertElementwiseTest_two_input_with_broadcast(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs if len(inputs['input_data1'].shape) != len(inputs['input_data2'].shape): @@ -487,7 +530,6 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): return np.random.random(shape).astype(np.float32) @@ -499,8 +541,12 @@ def generate_input(shape): input2_shape5_list = [[32], [2, 1, 32], [4, 1, 1, 32]] input2_shape6_list = [[1, 32], [1, 32], [1, 1, 1, 32]] input2_shape_list = [ - input2_shape1_list, input2_shape2_list, input2_shape3_list, - input2_shape4_list, input2_shape5_list, input2_shape6_list + input2_shape1_list, + input2_shape2_list, + input2_shape3_list, + input2_shape4_list, + input2_shape5_list, + input2_shape6_list, ] axis1_list = [[-1], [1, -1], [1, -1]] axis2_list = [[-1], [0], [0]] @@ -509,8 +555,12 @@ def generate_input(shape): axis5_list = [[-1, 1], [-1, 0], [-1, 0]] axis6_list = [[-1, 0], [-1, 1], [-1, 0]] axis_list = [ - axis1_list, axis2_list, axis3_list, axis4_list, axis5_list, - axis6_list + axis1_list, + axis2_list, + axis3_list, + axis4_list, + axis5_list, + axis6_list, ] for i in range(3): @@ -518,61 +568,75 @@ def generate_input(shape): for j in range(6): input2_shape = input2_shape_list[j][i] for op_type in [ - "elementwise_add", "elementwise_mul", "elementwise_sub", - "elementwise_div", "elementwise_pow" + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + "elementwise_div", + "elementwise_pow", + "elementwise_min", + 
"elementwise_max", ]: for axis in axis_list[j][i]: self.shape1 = input1_shape self.shape2 = input2_shape dics = [{"axis": axis}] - ops_config = [{ - "op_type": op_type, - "op_inputs": { - "X": ["input_data1"], - "Y": ["input_data2"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["input_data1"], + "Y": ["input_data2"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data1": - TensorConfig(data_gen=partial( - generate_input, input1_shape)), - "input_data2": - TensorConfig(data_gen=partial( - generate_input, input2_shape)) + "input_data1": TensorConfig( + data_gen=partial( + generate_input, input1_shape + ) + ), + "input_data2": TensorConfig( + data_gen=partial( + generate_input, input2_shape + ) + ), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): - max_shape = [[128], [128, 128], [128, 128, 128], - [128, 128, 128, 128]] + max_shape = [ + [128], + [128, 128], + [128, 128, 128], + [128, 128, 128, 128], + ] min_shape = [[1], [1, 1], [1, 1, 1], [1, 1, 1, 1]] opt_shape = [[32], [32, 32], [32, 32, 32], [32, 32, 32, 32]] self.dynamic_shape.min_input_shape = { "input_data1": min_shape[len(self.shape1) - 1], - "input_data2": min_shape[len(self.shape2) - 1] + "input_data2": min_shape[len(self.shape2) - 1], } self.dynamic_shape.max_input_shape = { "input_data1": max_shape[len(self.shape1) - 1], - "input_data2": max_shape[len(self.shape2) - 1] + "input_data2": max_shape[len(self.shape2) - 1], } self.dynamic_shape.opt_input_shape = { "input_data1": opt_shape[len(self.shape1) - 1], - "input_data2": opt_shape[len(self.shape2) - 1] + "input_data2": opt_shape[len(self.shape2) - 1], } def clear_dynamic_shape(): @@ -588,14 +652,14 @@ def clear_dynamic_shape(): clear_dynamic_shape() if self.shape1[0] == self.shape2[0]: self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), (1, 3), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 3), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), (1, 3), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 3), (1e-3, 1e-3) @@ -608,12 +672,10 @@ def test(self): class TrtConvertElementwiseTest_one_input_corner_case(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): return np.random.random(shape).astype(np.float32) @@ -622,49 +684,58 @@ def generate_weight(): return np.random.rand(32).astype(np.float32) for batch in [1, 2, 4]: - for shape in [[32], [batch, 32], [batch, 32, 32], - [batch, 32, 16, 32]]: + for shape in [ + [32], + [batch, 32], + [batch, 32, 32], + [batch, 32, 16, 32], + ]: for op_type in [ - "elementwise_add", - "elementwise_mul", - "elementwise_sub", - "elementwise_div", - 
"elementwise_pow", + "elementwise_add", + "elementwise_mul", + "elementwise_sub", + "elementwise_div", + "elementwise_pow", + "elementwise_min", + "elementwise_max", ]: + self.op_type = op_type for axis in [-1 if len(shape) == 1 else 1]: self.dims = len(shape) dics = [{"axis": axis}] - ops_config = [{ - "op_type": op_type, - "op_inputs": { - "X": ["weight"], - "Y": ["input_data"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": op_type, + "op_inputs": { + "X": ["weight"], + "Y": ["input_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={ - "weight": - TensorConfig(data_gen=partial(generate_weight)) + "weight": TensorConfig( + data_gen=partial(generate_weight) + ) }, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input, shape)), + "input_data": TensorConfig( + data_gen=partial(generate_input, shape) + ), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): # The input.dims[1] must be equal to the weight's length. if self.dims == 1: @@ -704,14 +775,14 @@ def clear_dynamic_shape(): # for static_shape clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 3), 1e-5 + yield self.create_inference_config(), (0, 3), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (0, 3), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (1, 2), 1e-5 + yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), (1, 2), (1e-3, 1e-3) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py index 285ac3f2202d73..8612acc51acbda 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_equal.py @@ -22,7 +22,6 @@ class TrtConvertElementwiseTest_one_input_corner_case(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) @@ -35,7 +34,6 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): return np.random.random(shape).astype(np.float32) @@ -44,86 +42,84 @@ def generate_input(shape): for axis in [-1 if len(shape) == 1 else 1]: self.dims = len(shape) dics = [{"axis": axis}, {"in_dtype": 0, "out_dtype": 5}] - ops_config = [{ - "op_type": "equal", - "op_inputs": { - "X": ["input_data1"], - "Y": ["input_data2"] - }, - "op_outputs": { - "Out": ["compare_output_data"] + ops_config = [ + { + "op_type": "equal", + "op_inputs": { + "X": ["input_data1"], + "Y": ["input_data2"], + }, + "op_outputs": {"Out": ["compare_output_data"]}, + "op_attrs": dics[0], }, - "op_attrs": dics[0] - }, { - "op_type": "cast", - "op_inputs": { - "X": 
["compare_output_data"] + { + "op_type": "cast", + "op_inputs": {"X": ["compare_output_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[1], }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[1] - }] + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data1": - TensorConfig( - data_gen=partial(generate_input, shape)), - "input_data2": - TensorConfig( - data_gen=partial(generate_input, shape)) + "input_data1": TensorConfig( + data_gen=partial(generate_input, shape) + ), + "input_data2": TensorConfig( + data_gen=partial(generate_input, shape) + ), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): # The input.dims[1] must be equal to the weight's length. if self.dims == 2: self.dynamic_shape.min_input_shape = { "input_data1": [1, 1], - "input_data2": [1, 1] + "input_data2": [1, 1], } self.dynamic_shape.max_input_shape = { "input_data1": [4, 1], - "input_data2": [4, 1] + "input_data2": [4, 1], } self.dynamic_shape.opt_input_shape = { "input_data1": [2, 1], - "input_data2": [2, 1] + "input_data2": [2, 1], } elif self.dims == 3: self.dynamic_shape.min_input_shape = { "input_data1": [1, 1, 4], - "input_data2": [1, 1, 4] + "input_data2": [1, 1, 4], } self.dynamic_shape.max_input_shape = { "input_data1": [4, 1, 256], - "input_data2": [1, 1, 256] + "input_data2": [1, 1, 256], } self.dynamic_shape.opt_input_shape = { "input_data1": [2, 1, 16], - "input_data2": [2, 1, 16] + "input_data2": [2, 1, 16], } elif self.dims == 4: self.dynamic_shape.min_input_shape = { "input_data1": [1, 1, 4, 4], - "input_data2": [1, 1, 4, 4] + "input_data2": [1, 1, 4, 4], } self.dynamic_shape.max_input_shape = { "input_data1": [4, 1, 128, 256], - "input_data2": [4, 1, 128, 256] + "input_data2": [4, 1, 128, 256], } self.dynamic_shape.opt_input_shape = { "input_data1": [2, 1, 32, 16], - "input_data2": [2, 1, 32, 16] + "input_data2": [2, 1, 32, 16], } def clear_dynamic_shape(): @@ -144,19 +140,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fc.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fc.py index 0eb88b5c019f14..c8c21c4174cdd9 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fc.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fc.py @@ -23,10 +23,9 @@ class TrtConvertFcTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> 
bool: # The output has diff between gpu and trt in CI windows - if (os.name == 'nt'): + if os.name == 'nt': return False return True @@ -34,12 +33,14 @@ def sample_program_configs(self): self.trt_param.workspace_size = 1073741824 def generate_input1(batch, attrs: List[Dict[str, Any]]): - return np.random.random([batch, 3, 64, (int)(attrs[0]["m"] / 2), - 2]).astype(np.float32) + return np.random.random( + [batch, 3, 64, (int)(attrs[0]["m"] / 2), 2] + ).astype(np.float32) def generate_w(batch, attrs: List[Dict[str, Any]]): - return np.random.random([attrs[0]["m"], - attrs[0]["n"]]).astype(np.float32) + return np.random.random([attrs[0]["m"], attrs[0]["n"]]).astype( + np.float32 + ) def generate_bias(batch, attrs: List[Dict[str, Any]]): return np.random.random([attrs[0]["n"]]).astype(np.float32) @@ -53,7 +54,7 @@ def generate_bias(batch, attrs: List[Dict[str, Any]]): "m": m, "n": n, }, - {} + {}, ] ops_config = [ @@ -62,12 +63,10 @@ def generate_bias(batch, attrs: List[Dict[str, Any]]): "op_inputs": { "Input": ["input_data"], "W": ["w_data"], - "Bias": ["bias_data"] - }, - "op_outputs": { - "Out": ["output_data"] + "Bias": ["bias_data"], }, - "op_attrs": dics[0] + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], }, ] @@ -76,24 +75,26 @@ def generate_bias(batch, attrs: List[Dict[str, Any]]): program_config = ProgramConfig( ops=ops, weights={ - "w_data": - TensorConfig(data_gen=partial(generate_w, batch, dics)), - "bias_data": - TensorConfig( - data_gen=partial(generate_bias, batch, dics)) + "w_data": TensorConfig( + data_gen=partial(generate_w, batch, dics) + ), + "bias_data": TensorConfig( + data_gen=partial(generate_bias, batch, dics) + ), }, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input1, batch, dics)), + "input_data": TensorConfig( + data_gen=partial(generate_input1, batch, dics) + ), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input_data": [1, 3, 32, 16, 2], @@ -121,19 +122,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): # clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-5) + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-5, 1e-5) + attrs, True + ), (1e-3, 1e-3) def test(self): self.run_test() @@ -143,10 +148,9 @@ def test_quant(self): class TrtConvertFcTest2(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: # The output has diff between gpu and trt in CI windows - if (os.name == 'nt'): + if os.name == 'nt': return False return True @@ -157,8 +161,9 @@ def generate_input1(batch, attrs: List[Dict[str, Any]]): return np.random.random([batch, 3, 64, 14]).astype(np.float32) def generate_w(batch, 
attrs: List[Dict[str, Any]]): - return np.random.random([attrs[0]["m"], - attrs[0]["n"]]).astype(np.float32) + return np.random.random([attrs[0]["m"], attrs[0]["n"]]).astype( + np.float32 + ) def generate_bias(batch, attrs: List[Dict[str, Any]]): return np.random.random([attrs[0]["n"]]).astype(np.float32) @@ -172,7 +177,7 @@ def generate_bias(batch, attrs: List[Dict[str, Any]]): "m": m, "n": n, }, - {} + {}, ] ops_config = [ @@ -181,12 +186,10 @@ def generate_bias(batch, attrs: List[Dict[str, Any]]): "op_inputs": { "Input": ["input_data"], "W": ["w_data"], - "Bias": ["bias_data"] + "Bias": ["bias_data"], }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], }, ] @@ -195,24 +198,26 @@ def generate_bias(batch, attrs: List[Dict[str, Any]]): program_config = ProgramConfig( ops=ops, weights={ - "w_data": - TensorConfig(data_gen=partial(generate_w, batch, dics)), - "bias_data": - TensorConfig( - data_gen=partial(generate_bias, batch, dics)) + "w_data": TensorConfig( + data_gen=partial(generate_w, batch, dics) + ), + "bias_data": TensorConfig( + data_gen=partial(generate_bias, batch, dics) + ), }, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input1, batch, dics)), + "input_data": TensorConfig( + data_gen=partial(generate_input1, batch, dics) + ), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(): self.dynamic_shape.min_input_shape = { "input_data": [1, 3, 32, 14], @@ -234,14 +239,14 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (1, 2), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + yield self.create_inference_config(), (1, 2), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (1, 2), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + yield self.create_inference_config(), (1, 2), (1e-3, 1e-3) def test(self): self.run_test() @@ -277,7 +282,7 @@ def generate_bias(batch, attrs: List[Dict[str, Any]]): "m": m, "n": n, }, - {} + {}, ] ops_config = [ @@ -286,12 +291,10 @@ def generate_bias(batch, attrs: List[Dict[str, Any]]): "op_inputs": { "Input": ["input_data"], "W": ["w_data"], - "Bias": ["bias_data"] + "Bias": ["bias_data"], }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], }, ] @@ -300,24 +303,26 @@ def generate_bias(batch, attrs: List[Dict[str, Any]]): program_config = ProgramConfig( ops=ops, weights={ - "w_data": - TensorConfig(data_gen=partial(generate_w, batch, dics)), - "bias_data": - TensorConfig( - data_gen=partial(generate_bias, batch, dics)) + "w_data": TensorConfig( + data_gen=partial(generate_w, batch, dics) + ), + "bias_data": TensorConfig( + data_gen=partial(generate_bias, batch, dics) + ), }, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input1, batch, dics)), + "input_data": TensorConfig( + data_gen=partial(generate_input1, batch, dics) + ), }, - outputs=["output_data"]) + outputs=["output_data"], + 
) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(): self.dynamic_shape.min_input_shape = { "input_data": [1, 14, 1, 2], @@ -339,16 +344,16 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (1, 2), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + yield self.create_inference_config(), (1, 2), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (1, 2), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + yield self.create_inference_config(), (1, 2), (1e-3, 1e-3) self.trt_param.precision = paddle_infer.PrecisionType.Int8 - yield self.create_inference_config(), (1, 2), (1e-5, 1e-5) + yield self.create_inference_config(), (1, 2), (1e-3, 1e-3) def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_constant.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_constant.py index cc686be6d8a83e..b70e91a58508c4 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_constant.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_fill_constant.py @@ -22,12 +22,10 @@ class TrtConvertSplitTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_value_data(attrs: List[Dict[str, Any]]): return np.array([1]).astype(np.int32) @@ -47,21 +45,28 @@ def generate_shapelist_data(attrs: List[Dict[str, Any]]): str_value = str_value else: str_value = "" - dics = [{ - "str_value": str_value, - "value": value, - "shape": shape, - "dtype": dtype - }, { - "axis": -1 - }] - dics_intput = [{ - "ValueTensor": ["value_data"] - }, { - "ShapeTensor": ["shape_data"], - }, { - "ShapeTensorList": ["shapeT1_data", "shapeT2_data"], - }, {}] + dics = [ + { + "str_value": str_value, + "value": value, + "shape": shape, + "dtype": dtype, + }, + {"axis": -1}, + ] + dics_intput = [ + {"ValueTensor": ["value_data"]}, + { + "ShapeTensor": ["shape_data"], + }, + { + "ShapeTensorList": [ + "shapeT1_data", + "shapeT2_data", + ], + }, + {}, + ] ops_config = [ { "op_type": "fill_constant", @@ -69,7 +74,7 @@ def generate_shapelist_data(attrs: List[Dict[str, Any]]): "op_outputs": { "Out": ["out_data"], }, - "op_attrs": dics[0] + "op_attrs": dics[0], }, ] @@ -81,26 +86,31 @@ def generate_input(): ops=ops, weights={}, inputs={ - "value_data": - TensorConfig(data_gen=partial( - generate_value_data, dics)), - "shape_data": - TensorConfig(data_gen=partial( - generate_shape_data, dics)), - "shapeT1_data": - TensorConfig(data_gen=partial( - generate_shapelist_data, dics)), - "shapeT2_data": - TensorConfig(data_gen=partial( - generate_shapelist_data, dics)), + "value_data": TensorConfig( + data_gen=partial(generate_value_data, dics) + ), + "shape_data": TensorConfig( + data_gen=partial(generate_shape_data, dics) + ), + "shapeT1_data": TensorConfig( + data_gen=partial( + generate_shapelist_data, dics + ) + ), + "shapeT2_data": TensorConfig( + data_gen=partial( + generate_shapelist_data, dics + ) + ), }, - outputs=["out_data"]) 
+ outputs=["out_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.input_shape = [1, 1] max_shape = list(self.input_shape) @@ -118,7 +128,7 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if (self.num_input < 3): + if self.num_input < 3: return 0, 6 return 1, 5 @@ -131,10 +141,12 @@ def generate_trt_nodes_num(attrs, dynamic_shape): generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def add_skip_trt_case(self): pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py index e9f9b70b916715..b9f8c4fffc34a2 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_flatten.py @@ -22,16 +22,14 @@ class TrtConvertFlattenTest_dim_2(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(batch): return np.random.random([batch, 32]).astype(np.float32) - for batch in [1, 2, 4]: + for batch in [1, 4]: for axis in [0, 1]: for type in ["flatten", "flatten2"]: if type == "flatten": @@ -39,34 +37,35 @@ def generate_input(batch): else: op_outputs = { "Out": ["output_data"], - "XShape": ["xshape_data"] + "XShape": ["xshape_data"], } dics = [{"axis": axis}] - ops_config = [{ - "op_type": "flatten", - "op_inputs": { - "X": ["input_data"] - }, - "op_outputs": op_outputs, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "flatten", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": op_outputs, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input, batch)) + "input_data": TensorConfig( + data_gen=partial(generate_input, batch) + ) }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 8]} self.dynamic_shape.max_input_shape = {"input_data": [4, 64]} @@ -100,35 +99,37 @@ def generate_trt_nodes_num(attrs, dynamic_shape): # for static_shape clear_dynamic_shape() yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-3, 1e-3) + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision 
= paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-3, 1e-3) + attrs, True + ), (1e-3, 1e-3) def test(self): self.run_test() class TrtConvertFlattenTest_dim_3(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(batch): return np.random.random([batch, 32, 64]).astype(np.float32) - for batch in [1, 2, 4]: + for batch in [1, 4]: for axis in [0, 1, 2]: for type in ["flatten", "flatten2"]: if type == "flatten": @@ -136,38 +137,39 @@ def generate_input(batch): else: op_outputs = { "Out": ["output_data"], - "XShape": ["xshape_data"] + "XShape": ["xshape_data"], } dics = [{"axis": axis}] - ops_config = [{ - "op_type": "flatten", - "op_inputs": { - "X": ["input_data"] - }, - "op_outputs": op_outputs, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "flatten", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": op_outputs, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input, batch)) + "input_data": TensorConfig( + data_gen=partial(generate_input, batch) + ) }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 8, 8]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 64, 768]} - self.dynamic_shape.opt_input_shape = {"input_data": [2, 32, 256]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 64]} + self.dynamic_shape.opt_input_shape = {"input_data": [2, 32, 64]} def clear_dynamic_shape(): self.dynamic_shape.max_input_shape = {} @@ -198,35 +200,37 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-3, 1e-3) + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-3, 1e-3) + attrs, True + ), (1e-3, 1e-3) def test(self): self.run_test() class TrtConvertFlattenTest_dim_4(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(batch): return np.random.random([batch, 8, 8, 8]).astype(np.float32) - for batch in [1, 2, 4]: + for batch in [1, 4]: for axis in [0, 1, 2, 3]: for type in ["flatten", "flatten2"]: if type == "flatten": @@ -234,37 +238,38 @@ def generate_input(batch): else: op_outputs = { "Out": ["output_data"], - "XShape": ["xshape_data"] + "XShape": ["xshape_data"], } dics = [{"axis": axis}] - ops_config = [{ - "op_type": "flatten", - "op_inputs": { - "X": ["input_data"] - }, - "op_outputs": op_outputs, - 
"op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "flatten", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": op_outputs, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input, batch)) + "input_data": TensorConfig( + data_gen=partial(generate_input, batch) + ) }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 4, 4, 4]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 64, 64]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 32, 32]} self.dynamic_shape.opt_input_shape = {"input_data": [2, 16, 16, 8]} def clear_dynamic_shape(): @@ -294,36 +299,39 @@ def generate_trt_nodes_num(attrs, dynamic_shape): # for static_shape clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-3, 1e-3) + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-3, 1e-3) + attrs, True + ), (1e-3, 1e-3) def test(self): self.run_test() class TrtConvertFlattenTest_dim_5(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(batch): return np.random.random([batch, 8, 8, 8]).astype(np.float32) - for batch in [1, 2, 4]: + for batch in [1, 4]: for axis in [0, 1, 2, 3, 4]: for type in ["flatten", "flatten2"]: if type == "flatten": @@ -331,37 +339,38 @@ def generate_input(batch): else: op_outputs = { "Out": ["output_data"], - "XShape": ["xshape_data"] + "XShape": ["xshape_data"], } dics = [{"axis": axis}] - ops_config = [{ - "op_type": "flatten", - "op_inputs": { - "X": ["input_data"] - }, - "op_outputs": op_outputs, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "flatten", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": op_outputs, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input, batch)) + "input_data": TensorConfig( + data_gen=partial(generate_input, batch) + ) }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 4, 4, 4]} - self.dynamic_shape.max_input_shape = {"input_data": [4, 32, 64, 64]} + self.dynamic_shape.max_input_shape = {"input_data": [4, 16, 16, 8]} 
self.dynamic_shape.opt_input_shape = {"input_data": [2, 16, 16, 8]} def clear_dynamic_shape(): @@ -391,20 +400,25 @@ def generate_trt_nodes_num(attrs, dynamic_shape): # for static_shape clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-3, 1e-3) + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-3, 1e-3) + attrs, True + ), (1e-3, 1e-3) def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py index 5405f114651061..784c12fc8eeadf 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather.py @@ -23,7 +23,6 @@ class TrtConvertGatherTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs attrs = [ @@ -35,13 +34,15 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(shape): return np.random.random(shape).astype(np.float32) def generate_input2(index): return np.array(index).astype(np.int32) + def generate_input4(index): + return np.array(index).astype(np.int64) + def generate_input3(axis): return np.array([axis]).astype(np.int32) @@ -49,108 +50,126 @@ def generate_input3(axis): for index in [[1, 4], [4, 8]]: for axis in [0, 1, 2, 3]: for overwrite in [True, False]: - for input in [{ - "X": ["input_data"], - "Index": ["index_data"] - }, { + for input in [ + {"X": ["input_data"], "Index": ["index_data"]}, + { "X": ["input_data"], "Index": ["index_data"], - "Axis": ["axis_data"] - }]: - self.shape = shape - self.axis = axis - self.input_num = len(input) - dics = [{"overwrite": overwrite, "axis": axis}] - ops_config = [{ - "op_type": "gather", - "op_inputs": input, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] - }] - ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": - TensorConfig(data_gen=partial( - generate_input1, shape)), - "index_data": - TensorConfig(data_gen=partial( - generate_input2, index)), - } if len(input) == 2 else { - "input_data": - TensorConfig(data_gen=partial( - generate_input1, shape)), - "index_data": - TensorConfig(data_gen=partial( - generate_input2, index)), - "axis_data": - TensorConfig(data_gen=partial( - generate_input3, axis)), - }, - outputs=["output_data"]) - - yield program_config + "Axis": ["axis_data"], + }, + ]: + for index_type_int32 in [True, False]: + self.shape = shape + self.axis = axis + self.input_num = len(input) + self.index_type_int32 = index_type_int32 + dics = [{"overwrite": overwrite, "axis": axis}] + ops_config = [ + { + "op_type": "gather", + "op_inputs": input, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] 
+ ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial( + generate_input1, shape + ) + ), + "index_data": TensorConfig( + data_gen=partial( + generate_input2 + if index_type_int32 == True + else generate_input4, + index, + ) + ), + } + if len(input) == 2 + else { + "input_data": TensorConfig( + data_gen=partial( + generate_input1, shape + ) + ), + "index_data": TensorConfig( + data_gen=partial( + generate_input2, index + ) + ), + "axis_data": TensorConfig( + data_gen=partial( + generate_input3, axis + ) + ), + }, + outputs=["output_data"], + ) + + yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if len(self.shape) == 1: self.dynamic_shape.min_input_shape = { "input_data": [4], - "index_data": [1] + "index_data": [1], } self.dynamic_shape.max_input_shape = { "input_data": [128], - "index_data": [4] + "index_data": [4], } self.dynamic_shape.opt_input_shape = { "input_data": [16], - "index_data": [2] + "index_data": [2], } elif len(self.shape) == 2: self.dynamic_shape.min_input_shape = { "input_data": [2, 4], - "index_data": [1] + "index_data": [1], } self.dynamic_shape.max_input_shape = { "input_data": [256, 256], - "index_data": [4] + "index_data": [4], } self.dynamic_shape.opt_input_shape = { "input_data": [64, 32], - "index_data": [2] + "index_data": [2], } elif len(self.shape) == 3: self.dynamic_shape.min_input_shape = { "input_data": [2, 4, 4], - "index_data": [1] + "index_data": [1], } self.dynamic_shape.max_input_shape = { "input_data": [128, 256, 256], - "index_data": [4] + "index_data": [4], } self.dynamic_shape.opt_input_shape = { "input_data": [16, 64, 32], - "index_data": [2] + "index_data": [2], } elif len(self.shape) == 4: self.dynamic_shape.min_input_shape = { "input_data": [2, 4, 4, 2], - "index_data": [1] + "index_data": [1], } self.dynamic_shape.max_input_shape = { "input_data": [128, 256, 64, 128], - "index_data": [4] + "index_data": [4], } self.dynamic_shape.opt_input_shape = { "input_data": [16, 64, 16, 32], - "index_data": [2] + "index_data": [2], } def clear_dynamic_shape(): @@ -162,7 +181,7 @@ def generate_trt_nodes_num(dynamic_shape): if self.input_num == 3: return 0, 5 else: - if dynamic_shape: + if dynamic_shape and self.index_type_int32 == True: return 1, 3 else: return 0, 4 @@ -175,10 +194,12 @@ def generate_trt_nodes_num(dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - False), 1e-5 + False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - False), 1e-5 + False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) @@ -194,14 +215,17 @@ def add_skip_trt_case(self): def teller1(program_config, predictor_config): if len(self.dynamic_shape.min_input_shape) != 0: inputs = program_config.inputs - if len(inputs['input_data'].shape) == 1 or len( - inputs['index_data'].shape) == 1: + if ( + len(inputs['input_data'].shape) == 1 + or len(inputs['index_data'].shape) == 1 + ): return True return False self.add_skip_case( - teller1, SkipReasons.TRT_NOT_SUPPORT, - "Need to repair the case: trt reshape out failed for dynamic shape mode when inputs' dims==1. 
under trt7.0 " + teller1, + SkipReasons.TRT_NOT_SUPPORT, + "Need to repair the case: trt reshape out failed for dynamic shape mode when inputs' dims==1. under trt7.0 ", ) def test(self): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py index 9343f1ebd7cd0e..7f2372a8846cdc 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gather_nd.py @@ -19,60 +19,64 @@ from functools import partial from typing import Optional, List, Callable, Dict, Any, Set import unittest +import os class TrtConvertGatherNdTest_dim_4_1(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: + # The output has diff between gpu and trt in CI windows + # if ( and self.trt_param.precision == paddle_infer.PrecisionType.Half): + # return False return True def sample_program_configs(self): - def generate_input1(): return np.random.random([2, 32, 64, 64]).astype(np.float32) def generate_input2(): return np.ones([1]).astype(np.int32) - ops_config = [{ - "op_type": "gather_nd", - "op_inputs": { - "X": ["input_data"], - "Index": ["index_data"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": {} - }] + ops_config = [ + { + "op_type": "gather_nd", + "op_inputs": {"X": ["input_data"], "Index": ["index_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {}, + } + ] ops = self.generate_op_config(ops_config) - - program_config = ProgramConfig( - ops=ops, - weights={}, - inputs={ - "input_data": TensorConfig(data_gen=partial(generate_input1)), - "index_data": TensorConfig(data_gen=partial(generate_input2)), - }, - outputs=["output_data"]) - - yield program_config + for i in range(10): + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1) + ), + "index_data": TensorConfig( + data_gen=partial(generate_input2) + ), + }, + outputs=["output_data"], + ) + + yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input_data": [1, 8, 8, 8], - "index_data": [1] + "index_data": [1], } self.dynamic_shape.max_input_shape = { "input_data": [4, 32, 64, 64], - "index_data": [1] + "index_data": [1], } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 4, 64, 64], - "index_data": [1] + "input_data": [2, 32, 64, 64], + "index_data": [1], } def clear_dynamic_shape(): @@ -89,43 +93,51 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (0, 4), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (0, 4), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-3 + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if 
len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt': + return True + return False + + self.add_skip_case( + teller1, + SkipReasons.TRT_NOT_SUPPORT, + "Under Windows Ci, this case will sporadically fail.", + ) def test(self): + self.add_skip_trt_case() self.run_test() class TrtConvertGatherNdTest_dim_4_1_2(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(): return np.random.random([2, 32, 64, 64]).astype(np.float32) def generate_input2(): return np.array([1, 2]).astype(np.int32) - ops_config = [{ - "op_type": "gather_nd", - "op_inputs": { - "X": ["input_data"], - "Index": ["index_data"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": {} - }] + ops_config = [ + { + "op_type": "gather_nd", + "op_inputs": {"X": ["input_data"], "Index": ["index_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {}, + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( @@ -135,25 +147,26 @@ def generate_input2(): "input_data": TensorConfig(data_gen=partial(generate_input1)), "index_data": TensorConfig(data_gen=partial(generate_input2)), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input_data": [1, 8, 8, 8], - "index_data": [1] + "index_data": [2], } self.dynamic_shape.max_input_shape = { "input_data": [4, 32, 64, 64], - "index_data": [4] + "index_data": [2], } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 4, 64, 64], - "index_data": [2] + "input_data": [2, 32, 64, 64], + "index_data": [2], } def clear_dynamic_shape(): @@ -170,43 +183,51 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (0, 4), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (0, 4), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-3 + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt': + return True + return False + + self.add_skip_case( + teller1, + SkipReasons.TRT_NOT_SUPPORT, + "Under Windows Ci, this case will sporadically fail.", + ) def test(self): + self.add_skip_trt_case() self.run_test() class TrtConvertGatherNdTest_dim_4_2(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(): return np.random.random([2, 32, 64, 64]).astype(np.float32) def generate_input2(): return np.ones([2, 2]).astype(np.int32) - ops_config = [{ - "op_type": "gather_nd", - "op_inputs": { - "X": ["input_data"], - "Index": ["index_data"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": {} - }] + ops_config = [ + { + "op_type": "gather_nd", + "op_inputs": {"X": 
["input_data"], "Index": ["index_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {}, + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( @@ -216,25 +237,26 @@ def generate_input2(): "input_data": TensorConfig(data_gen=partial(generate_input1)), "index_data": TensorConfig(data_gen=partial(generate_input2)), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input_data": [1, 8, 8, 8], - "index_data": [1, 2] + "index_data": [2, 2], } self.dynamic_shape.max_input_shape = { "input_data": [4, 32, 64, 64], - "index_data": [4, 4] + "index_data": [2, 2], } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 4, 64, 64], - "index_data": [2, 2] + "input_data": [2, 32, 64, 64], + "index_data": [2, 2], } def clear_dynamic_shape(): @@ -251,43 +273,51 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (0, 4), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (0, 4), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-3 + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt': + return True + return False + + self.add_skip_case( + teller1, + SkipReasons.TRT_NOT_SUPPORT, + "Under Windows Ci, this case will sporadically fail.", + ) def test(self): + self.add_skip_trt_case() self.run_test() class TrtConvertGatherNdTest_dim_4_3(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(): return np.random.random([2, 32, 64, 64]).astype(np.float32) def generate_input2(): return np.ones([2, 2, 4]).astype(np.int32) - ops_config = [{ - "op_type": "gather_nd", - "op_inputs": { - "X": ["input_data"], - "Index": ["index_data"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": {} - }] + ops_config = [ + { + "op_type": "gather_nd", + "op_inputs": {"X": ["input_data"], "Index": ["index_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {}, + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( @@ -297,25 +327,26 @@ def generate_input2(): "input_data": TensorConfig(data_gen=partial(generate_input1)), "index_data": TensorConfig(data_gen=partial(generate_input2)), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input_data": [1, 8, 8, 8], - "index_data": [1, 2, 2] + "index_data": [2, 2, 4], } self.dynamic_shape.max_input_shape = { "input_data": [4, 32, 64, 64], - 
"index_data": [4, 4, 4] + "index_data": [2, 2, 4], } self.dynamic_shape.opt_input_shape = { - "input_data": [2, 4, 64, 64], - "index_data": [2, 2, 2] + "input_data": [2, 32, 64, 64], + "index_data": [2, 2, 4], } def clear_dynamic_shape(): @@ -332,43 +363,51 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (0, 4), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (0, 4), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-3 + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt': + return True + return False + + self.add_skip_case( + teller1, + SkipReasons.TRT_NOT_SUPPORT, + "Under Windows Ci, this case will sporadically fail.", + ) def test(self): + self.add_skip_trt_case() self.run_test() class TrtConvertGatherNdTest_dim_2_2(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(): return np.random.random([2, 32]).astype(np.float32) def generate_input2(): return np.array([[0, 3], [1, 9]]).astype(np.int32) - ops_config = [{ - "op_type": "gather_nd", - "op_inputs": { - "X": ["input_data"], - "Index": ["index_data"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": {} - }] + ops_config = [ + { + "op_type": "gather_nd", + "op_inputs": {"X": ["input_data"], "Index": ["index_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {}, + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( @@ -378,25 +417,26 @@ def generate_input2(): "input_data": TensorConfig(data_gen=partial(generate_input1)), "index_data": TensorConfig(data_gen=partial(generate_input2)), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input_data": [1, 4], - "index_data": [1, 1] + "index_data": [2, 2], } self.dynamic_shape.max_input_shape = { "input_data": [4, 64], - "index_data": [4, 2] + "index_data": [2, 2], } self.dynamic_shape.opt_input_shape = { "input_data": [2, 8], - "index_data": [2, 2] + "index_data": [2, 2], } def clear_dynamic_shape(): @@ -413,44 +453,53 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (0, 4), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (0, 4), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 
+ yield self.create_inference_config(), (1, 3), 1e-3 + + def add_skip_trt_case(self): + def teller1(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt': + return True + return False + + self.add_skip_case( + teller1, + SkipReasons.TRT_NOT_SUPPORT, + "Under Windows Ci, this case will sporadically fail.", + ) def test(self): + self.add_skip_trt_case() self.run_test() class TrtConvertGatherNdTest_dim_3_3(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(): return np.random.random([16, 32, 256]).astype(np.float32) def generate_input2(): - return np.array([[[2, 5], [3, 8]], [[0, 2], [0, - 3]]]).astype(np.int32) - - ops_config = [{ - "op_type": "gather_nd", - "op_inputs": { - "X": ["input_data"], - "Index": ["index_data"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": {} - }] + return np.array([[[2, 5], [3, 8]], [[0, 2], [0, 3]]]).astype( + np.int32 + ) + + ops_config = [ + { + "op_type": "gather_nd", + "op_inputs": {"X": ["input_data"], "Index": ["index_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {}, + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( @@ -460,25 +509,26 @@ def generate_input2(): "input_data": TensorConfig(data_gen=partial(generate_input1)), "index_data": TensorConfig(data_gen=partial(generate_input2)), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input_data": [1, 4, 4], - "index_data": [1, 1, 1] + "index_data": [1, 1, 1], } self.dynamic_shape.max_input_shape = { "input_data": [16, 64, 512], - "index_data": [4, 2, 4] + "index_data": [4, 2, 4], } self.dynamic_shape.opt_input_shape = { "input_data": [2, 8, 64], - "index_data": [2, 2, 2] + "index_data": [2, 2, 2], } def clear_dynamic_shape(): @@ -495,14 +545,14 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (0, 4), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (0, 4), 1e-5 + yield self.create_inference_config(), (0, 4), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py index 29f656130f793d..818f2a4c1b842e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_gelu.py @@ -22,12 +22,10 @@ class TrtConvertGeluTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(dims, attrs: List[Dict[str, Any]]): if dims == 1: return np.ones([32]).astype(np.float32) @@ -43,33 +41,32 @@ def generate_input1(dims, attrs: 
List[Dict[str, Any]]): self.dims = dims dics = [{"approximate": approximate}] - ops_config = [{ - "op_type": "gelu", - "op_inputs": { - "X": ["input_data"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "gelu", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input1, dims, dics)) + "input_data": TensorConfig( + data_gen=partial(generate_input1, dims, dics) + ) }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} @@ -123,19 +120,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_grid_sampler.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_grid_sampler.py new file mode 100644 index 00000000000000..36b0c1638cb77e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_grid_sampler.py @@ -0,0 +1,110 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List +import unittest + + +class TrtConvertGridSampler(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(): + return np.random.random([1, 3, 32, 32]).astype(np.float32) + + def generate_input2(): + return np.random.random([1, 3, 3, 2]).astype(np.float32) + + ops_config = [ + { + "op_type": "grid_sampler", + "op_inputs": { + "X": ["input_data"], + "Grid": ["grid_data"], + }, + "op_outputs": {"Output": ["output_data"]}, + "op_attrs": {}, + } + ] + + ops = self.generate_op_config(ops_config) + for i in range(10): + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1) + ), + "grid_data": TensorConfig( + data_gen=partial(generate_input2) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 32, 32], + "grid_data": [1, 3, 3, 2], + } + self.dynamic_shape.max_input_shape = { + "input_data": [1, 3, 64, 64], + "grid_data": [1, 3, 4, 4], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 32, 32], + "grid_data": [1, 3, 3, 2], + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (0, 4), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (0, 4), 1e-3 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 3), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 3), 1e-3 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py index 6115ae60eff328..3b3ac529389560 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_group_norm.py @@ -22,7 +22,6 @@ class TrtConvertGroupNormTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs weights = program_config.weights @@ -36,7 +35,6 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(attrs: List[Dict[str, Any]], batch): if attrs[0]['data_layout'] == 'NCHW': return np.random.random([batch, 32, 64, 64]).astype(np.float32) @@ -53,47 +51,56 @@ def generate_bias(): for group in [1, 4, 32, -1]: for epsilon in [0.0001, 0.0007, -1, 1]: for data_layout in ['NCHW']: - dics = [{ - "epsilon": epsilon, - 
"groups": group, - "data_layout": data_layout - }] - ops_config = [{ - "op_type": "group_norm", - "op_inputs": { - "X": ["input_data"], - "Scale": ["scale_weight"], - "Bias": ["bias_weight"] - }, - "op_outputs": { - "Y": ["y_output"], - "Mean": ["mean_output"], - "Variance": ["variance_output"] - }, - "op_attrs": dics[0] - }] + dics = [ + { + "epsilon": epsilon, + "groups": group, + "data_layout": data_layout, + } + ] + ops_config = [ + { + "op_type": "group_norm", + "op_inputs": { + "X": ["input_data"], + "Scale": ["scale_weight"], + "Bias": ["bias_weight"], + }, + "op_outputs": { + "Y": ["y_output"], + "Mean": ["mean_output"], + "Variance": ["variance_output"], + }, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={ - "scale_weight": - TensorConfig(data_gen=partial(generate_scale)), - "bias_weight": - TensorConfig(data_gen=partial(generate_bias)) + "scale_weight": TensorConfig( + data_gen=partial(generate_scale) + ), + "bias_weight": TensorConfig( + data_gen=partial(generate_bias) + ), }, inputs={ - "input_data": - TensorConfig(data_gen=partial( - generate_input, dics, batch)) + "input_data": TensorConfig( + data_gen=partial( + generate_input, dics, batch + ) + ) }, - outputs=["y_output"]) + outputs=["y_output"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 16, 16, 16]} self.dynamic_shape.max_input_shape = { @@ -117,19 +124,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), (1e-3, 1e-3) def add_skip_trt_case(self): pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py index 0980acccb88b5a..8ed6407476acbc 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_sigmoid.py @@ -22,12 +22,10 @@ class TrtConvertHardSigmoidTest_dim_2(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): return np.random.random(shape).astype(np.float32) @@ -37,33 +35,34 @@ def generate_input(shape): for slope in [0.1, 0.5]: for offset in [0.2, 0.7]: dics = [{"slope": slope, "offset": offset}] - ops_config = [{ - "op_type": "hard_sigmoid", - "op_inputs": { - "X": ["input_data"], - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "hard_sigmoid", + 
"op_inputs": { + "X": ["input_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input, shape)) + "input_data": TensorConfig( + data_gen=partial(generate_input, shape) + ) }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.input_dim == 2: self.dynamic_shape.min_input_shape = {"input_data": [1, 8]} @@ -98,14 +97,14 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (1, 2), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 2), 1e-5 + yield self.create_inference_config(), (1, 2), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (1, 2), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 2), 1e-5 + yield self.create_inference_config(), (1, 2), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py index 220611517e0634..106431e740fc6a 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_hard_swish.py @@ -22,7 +22,6 @@ class TrtConvertHardSwishTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs weights = program_config.weights @@ -36,46 +35,46 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]]): return np.ones([1, 3, 32, 32]).astype(np.float32) for threshold in [6.0, 7.0, 100.0, 0.0, -1.0]: for scale in [5.0, 7.0, -1.0, 0.0, 100.0]: for offset in [3.0, 5.0, -1.0, 0.0, 100.0]: - dics = [{ - "threshold": threshold, - "scale": scale, - "offset": offset - }] - - ops_config = [{ - "op_type": "hard_swish", - "op_inputs": { - "X": ["input_data"] - }, - "op_outputs": { - "Out": ["hard_swish_output_data"] - }, - "op_attrs": dics[0] - }] + dics = [ + { + "threshold": threshold, + "scale": scale, + "offset": offset, + } + ] + + ops_config = [ + { + "op_type": "hard_swish", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": {"Out": ["hard_swish_output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input1, dics)) + "input_data": TensorConfig( + data_gen=partial(generate_input1, dics) + ) }, - outputs=["hard_swish_output_data"]) + outputs=["hard_swish_output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 16, 16]} 
self.dynamic_shape.max_input_shape = {"input_data": [2, 3, 32, 32]} @@ -97,19 +96,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-5) + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-5, 1e-5) + attrs, True + ), (1e-3, 1e-3) def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py index 56767b3457791d..99c34a587b9186 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_instance_norm.py @@ -20,6 +20,7 @@ from functools import partial from typing import Optional, List, Callable, Dict, Any, Set import unittest +import os class TrtConvertInstanceNormTest(TrtLayerAutoScanTest): @@ -113,7 +114,9 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if dynamic_shape or self.in_dim != 4: + if dynamic_shape: + return 1, 2 + if self.in_dim != 4: return 0, 3 return 1, 2 @@ -139,7 +142,19 @@ def generate_trt_nodes_num(attrs, dynamic_shape): yield self.create_inference_config(), generate_trt_nodes_num( attrs, True), (1e-3, 1e-3) + def add_skip_trt_case(self): + + def teller2(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt': + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_SUPPORT, + "The output has diff between gpu and trt in Windows.") + def test(self): + self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_inverse.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_inverse.py new file mode 100644 index 00000000000000..6ccb00d1a0f51d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_inverse.py @@ -0,0 +1,99 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List +import unittest + + +class TrtConvertInverse(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(): + return np.random.random([32, 32]).astype(np.float32) + + ops_config = [ + { + "op_type": "inverse", + "op_inputs": { + "Input": ["input_data"], + }, + "op_outputs": {"Output": ["output_data"]}, + "op_attrs": {}, + } + ] + ops = self.generate_op_config(ops_config) + for i in range(10): + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1], + } + self.dynamic_shape.max_input_shape = { + "input_data": [64, 64], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [32, 32], + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (0, 3), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (0, 3), 1e-3 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), 1e-3 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py index 7f33cfc64a8663..0ac59aa8c547db 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_leaky_relu.py @@ -23,12 +23,10 @@ class TrtConvertLeakyReluTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(shape): return np.random.random(shape).astype(np.float32) @@ -37,32 +35,35 @@ def generate_input1(shape): self.input_dim = len(shape) for alpha in [0.02, 1.0, 100.0, -1.0, 0.0]: dics = [{"alpha": alpha}] - ops_config = [{ - "op_type": "leaky_relu", - "op_inputs": { - "X": ["input_data"], - }, - "op_outputs": { - "Out": ["y_data"], - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "leaky_relu", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": { + "Out": ["y_data"], + }, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input1, shape)) + "input_data": TensorConfig( + 
data_gen=partial(generate_input1, shape) + ) }, - outputs=["y_data"]) + outputs=["y_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.input_dim == 2: self.dynamic_shape.min_input_shape = {"input_data": [1, 8]} @@ -101,25 +102,31 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-5) + attrs, False + ), (1e-3, 1e-3) self.trt_param.precision = paddle_infer.PrecisionType.Int8 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-5) + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-5, 1e-5) + attrs, True + ), (1e-3, 1e-3) self.trt_param.precision = paddle_infer.PrecisionType.Int8 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-5, 1e-5) + attrs, True + ), (1e-3, 1e-3) def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py index 76fcffad4592c3..3ba8aad3cae021 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul.py @@ -22,12 +22,10 @@ class TrtConvertMatmulTest_static(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): return np.random.random(shape).astype(np.float32) @@ -47,48 +45,55 @@ def generate_input(shape): input1_shape = [batch, 32, 6] input2_shape = [batch, 6, 11] for alpha in [0.3, 1.0]: - dics = [{ - "transpose_X": trans_x, - "transpose_Y": trans_y, - "alpha": alpha, - "fused_reshape_X": [], - "fused_reshape_Y": [], - "fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [] - }] - ops_config = [{ - "op_type": "matmul", - "op_inputs": { - "X": ["input1_data"], - "Y": ["input2_data"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] - }] + dics = [ + { + "transpose_X": trans_x, + "transpose_Y": trans_y, + "alpha": alpha, + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [], + } + ] + ops_config = [ + { + "op_type": "matmul", + "op_inputs": { + "X": ["input1_data"], + "Y": ["input2_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input1_data": - TensorConfig(data_gen=partial( - generate_input, input1_shape)), - "input2_data": - TensorConfig(data_gen=partial( - 
generate_input, input2_shape)) + "input1_data": TensorConfig( + data_gen=partial( + generate_input, input1_shape + ) + ), + "input2_data": TensorConfig( + data_gen=partial( + generate_input, input2_shape + ) + ), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): pass @@ -102,19 +107,17 @@ def clear_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), (1, 3), 1e-3 def test(self): self.run_test() class TrtConvertMatmulTest_dynamic(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input(shape): return np.random.random(shape).astype(np.float32) @@ -133,60 +136,63 @@ def generate_input(shape): # input1_shape = [batch, 32, 6] # input2_shape = [batch, 6, 11] for alpha in [0.3, 1.0]: - dics = [{ - "transpose_X": trans_x, - "transpose_Y": trans_y, - "alpha": alpha, - "fused_reshape_X": [], - "fused_reshape_Y": [], - "fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [] - }] - ops_config = [{ - "op_type": "matmul", - "op_inputs": { - "X": ["input1_data"], - "Y": ["input2_data"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] - }] + dics = [ + { + "transpose_X": trans_x, + "transpose_Y": trans_y, + "alpha": alpha, + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [], + } + ] + ops_config = [ + { + "op_type": "matmul", + "op_inputs": { + "X": ["input1_data"], + "Y": ["input2_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input1_data": - TensorConfig( - data_gen=partial(generate_input, input1_shape)), - "input2_data": - TensorConfig( - data_gen=partial(generate_input, input2_shape)) + "input1_data": TensorConfig( + data_gen=partial(generate_input, input1_shape) + ), + "input2_data": TensorConfig( + data_gen=partial(generate_input, input2_shape) + ), }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input1_data": [1, 4, 4], - "input2_data": [1, 4, 4] + "input2_data": [1, 4, 4], } self.dynamic_shape.max_input_shape = { "input1_data": [16, 4, 4], - "input2_data": [16, 4, 4] + "input2_data": [16, 4, 4], } self.dynamic_shape.opt_input_shape = { "input1_data": [8, 4, 4], - "input2_data": [8, 4, 4] + "input2_data": [8, 4, 4], } attrs = [ @@ -198,7 +204,7 @@ def generate_dynamic_shape(attrs): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (1, 3), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), 1e-5 + yield self.create_inference_config(), 
(1, 3), 1e-3 def add_skip_trt_case(self): pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py new file mode 100644 index 00000000000000..ec6c9e633071c6 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_matmul_v2.py @@ -0,0 +1,328 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest +import os + + +class TrtConvertMatmulTest_dynamic(TrtLayerAutoScanTest): + def sample_program_configs(self): + def generate_input(shape): + return np.random.random(shape).astype(np.float32) + + for batch in [10, 11, 12, 13, 14, 15]: + for trans_x in [False]: + for trans_y in [False]: + input1_shape = [batch, 64, 350, 75] + input2_shape = [75, 25] + dics = [ + { + "trans_x": trans_x, + "trans_y": trans_y, + } + ] + ops_config = [ + { + "op_type": "matmul_v2", + "op_inputs": { + "X": ["input1_data"], + "Y": ["input2_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input1_data": TensorConfig( + data_gen=partial(generate_input, input1_shape) + ), + "input2_data": TensorConfig( + data_gen=partial(generate_input, input2_shape) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input1_data": [10, 64, 350, 75], + "input2_data": [75, 25], + } + self.dynamic_shape.max_input_shape = { + "input1_data": [100, 64, 350, 75], + "input2_data": [75, 25], + } + self.dynamic_shape.opt_input_shape = { + "input1_data": [15, 64, 350, 75], + "input2_data": [75, 25], + } + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # The output has little diff between gpu and trt in CI-Windows-Inference + tol_fp32 = 1e-5 + tol_half = 1e-5 + if os.name == 'nt': + tol_fp32 = 1e-3 + tol_half = 1e-3 + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 3), tol_fp32 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 3), tol_half + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +class TrtConvertMatmulTest_dynamic2(TrtLayerAutoScanTest): + def sample_program_configs(self): + def generate_input(shape): + return 
np.random.random(shape).astype(np.float32) + + for batch in [10, 11, 12, 13, 14, 15]: + for trans_x in [False]: + for trans_y in [False]: + input1_shape = [60, 40] + input2_shape = [batch, 40, 90] + dics = [ + { + "trans_x": trans_x, + "trans_y": trans_y, + } + ] + ops_config = [ + { + "op_type": "matmul_v2", + "op_inputs": { + "X": ["input1_data"], + "Y": ["input2_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input1_data": TensorConfig( + data_gen=partial(generate_input, input1_shape) + ), + "input2_data": TensorConfig( + data_gen=partial(generate_input, input2_shape) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input1_data": [60, 40], + "input2_data": [10, 40, 90], + } + self.dynamic_shape.max_input_shape = { + "input1_data": [60, 40], + "input2_data": [20, 40, 90], + } + self.dynamic_shape.opt_input_shape = { + "input1_data": [60, 40], + "input2_data": [15, 40, 90], + } + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + # The output has little diff between gpu and trt in CI-Windows-Inference + tol_fp32 = 1e-5 + tol_half = 1e-5 + if os.name == 'nt': + tol_fp32 = 1e-3 + tol_half = 1e-3 + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 3), tol_fp32 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 3), tol_half + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +class TrtConvertMatmulTest_dynamic3(TrtLayerAutoScanTest): + def sample_program_configs(self): + def generate_input(shape): + return np.random.random(shape).astype(np.float32) + + # case0: mat * vec + # case1: vec * mat + # case2: vec * vec + for case in [0, 1, 2]: + for batch in range(20, 23): + for trans_x in [False, True]: + for trans_y in [False, True]: + self.case = case + input1_shape = [] + input2_shape = [] + if case == 0: + input1_shape = [batch, 50] + input2_shape = [50] + elif case == 1: + input1_shape = [50] + input2_shape = [50, batch] + elif case == 2: + input1_shape = [50] + input2_shape = [50] + if case == 0 or case == 1: + dics = [ + { + "trans_x": False, + "trans_y": False, + } + ] + elif case == 2: + dics = [ + { + "trans_x": trans_x, + "trans_y": trans_y, + } + ] + ops_config = [ + { + "op_type": "matmul_v2", + "op_inputs": { + "X": ["input1_data"], + "Y": ["input2_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input1_data": TensorConfig( + data_gen=partial( + generate_input, input1_shape + ) + ), + "input2_data": TensorConfig( + data_gen=partial( + generate_input, input2_shape + ) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(): + if self.case == 0: + self.dynamic_shape.min_input_shape = { + "input1_data": [20, 50], + "input2_data": [50], + } + self.dynamic_shape.max_input_shape = { + 
"input1_data": [30, 50], + "input2_data": [50], + } + self.dynamic_shape.opt_input_shape = { + "input1_data": [25, 50], + "input2_data": [50], + } + elif self.case == 1: + self.dynamic_shape.min_input_shape = { + "input2_data": [50, 20], + "input1_data": [50], + } + self.dynamic_shape.max_input_shape = { + "input2_data": [50, 30], + "input1_data": [50], + } + self.dynamic_shape.opt_input_shape = { + "input2_data": [50, 25], + "input1_data": [50], + } + elif self.case == 2: + self.dynamic_shape.min_input_shape = { + "input2_data": [30], + "input1_data": [50], + } + self.dynamic_shape.max_input_shape = { + "input2_data": [50], + "input1_data": [50], + } + self.dynamic_shape.opt_input_shape = { + "input2_data": [50], + "input1_data": [50], + } + + generate_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 3), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 3), 1e-3 + + def add_skip_trt_case(self): + pass + + def test(self): + self.add_skip_trt_case() + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multiclass_nms.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multiclass_nms.py new file mode 100644 index 00000000000000..f554955786ad77 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multiclass_nms.py @@ -0,0 +1,202 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest, SkipReasons +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import Optional, List, Callable, Dict, Any, Set +import unittest + + +class TrtConvertMulticlassNMSTest(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def create_inference_config(self, use_trt=True) -> paddle_infer.Config: + if use_trt: + config = paddle_infer.Config() + config.disable_glog_info() + config.enable_use_gpu(100, 0) + config.set_optim_cache_dir(self.cache_dir) + config.switch_ir_debug() + config.enable_tensorrt_engine( + max_batch_size=self.trt_param.max_batch_size, + workspace_size=self.trt_param.workspace_size, + min_subgraph_size=self.trt_param.min_subgraph_size, + precision_mode=self.trt_param.precision, + use_static=self.trt_param.use_static, + use_calib_mode=self.trt_param.use_calib_mode) + if len(self.dynamic_shape.min_input_shape + ) != 0 and self.dynamic_shape.min_input_shape.keys( + ) == self.dynamic_shape.max_input_shape.keys( + ) and self.dynamic_shape.min_input_shape.keys( + ) == self.dynamic_shape.opt_input_shape.keys(): + config.set_trt_dynamic_shape_info( + self.dynamic_shape.min_input_shape, + self.dynamic_shape.max_input_shape, + self.dynamic_shape.opt_input_shape, + self.dynamic_shape.disable_trt_plugin_fp16) + return config + else: + config = paddle_infer.Config() + config.switch_ir_debug(True) + config.set_optim_cache_dir(self.cache_dir) + config.disable_glog_info() + return config + + def sample_program_configs(self): + + def generate_boxes(batch, num_boxes): + return np.arange(batch * num_boxes * 4, + dtype=np.float32).reshape([batch, num_boxes, 4]) + + def generate_scores(batch, num_boxes, num_classes): + return np.arange(batch * num_classes * num_boxes, + dtype=np.float32).reshape( + [batch, num_classes, num_boxes]) + # return np.random.rand(batch, num_classes, num_boxes).astype(np.float32) + + for batch in [1, 2]: + self.batch = batch + for nms_eta in [0.8, 1.1]: + for num_boxes, num_classes in [[80, 100], [40, 200], [20, 400]]: + self.num_boxes, self.num_classes = num_boxes, num_classes + for score_threshold in [ + 0.01, + ]: + ops_config = [{ + "op_type": "multiclass_nms", + "op_inputs": { + "BBoxes": ["input_bboxes"], + "Scores": ["input_scores"], + }, + "op_outputs": { + "Out": ["nms_output_boxes"], + }, + "op_attrs": { + "background_label": -1, + "score_threshold": score_threshold, + "nms_top_k": num_boxes, + "keep_top_k": num_boxes, + "nms_threshold": 0.3, + "normalized": False, + "nms_eta": nms_eta + } + }] + ops = self.generate_op_config(ops_config) + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_bboxes": + TensorConfig(data_gen=partial( + generate_boxes, batch, num_boxes)), + "input_scores": + TensorConfig( + data_gen=partial(generate_scores, batch, + num_boxes, num_classes)) + }, + outputs=["nms_output_boxes"]) + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(attrs): + # The last dim of input_bboxes should be static. 
+ self.dynamic_shape.min_input_shape = { + "input_bboxes": [1, self.num_boxes, 4], + "input_scores": [1, self.num_classes, self.num_boxes], + } + self.dynamic_shape.max_input_shape = { + "input_bboxes": [8, self.num_boxes, 4], + "input_scores": [8, self.num_classes, self.num_boxes], + } + self.dynamic_shape.opt_input_shape = { + "input_bboxes": [self.batch, self.num_boxes, 4], + "input_scores": [self.batch, self.num_classes, self.num_boxes], + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + def generate_trt_nodes_num(attrs, dynamic_shape): + return 1, 2 + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, False), 1e-2 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + # self.trt_param.precision = paddle_infer.PrecisionType.Half + # yield self.create_inference_config(), generate_trt_nodes_num( + # attrs, True), (1e-2, 1e-2) + + def assert_tensors_near(self, atol: float, rtol: float, + tensor: Dict[str, np.array], + baseline: Dict[str, np.array]): + # the order of tensorrt outputs are not consistent with paddle + for key, arr in tensor.items(): + if key == "nms_output_boxes": + basline_arr = np.array( + sorted(baseline[key].reshape((-1, 6)), + key=lambda i: [i[0], i[1]])) + arr = np.array( + sorted(arr.reshape((-1, 6)), key=lambda i: [i[0], i[1]])) + else: + basline_arr = np.array(baseline[key].reshape((-1, 1))) + arr = np.array(arr.reshape((-1, 1))) + + self.assertTrue( + basline_arr.shape == arr.shape, + "The output shapes are not equal, the baseline shape is " + + str(basline_arr.shape) + ', but got ' + str(arr.shape)) + diff = abs(basline_arr - arr) + np.testing.assert_allclose( + basline_arr, + arr, + rtol=rtol, + atol=atol, + err_msg='Output has diff, Maximum absolute error: {}'.format( + np.amax(diff))) + + def assert_op_size(self, trt_engine_num, paddle_op_num): + # tensorrt op num is not consistent with paddle + return True + + def test(self): + self.trt_param.workspace_size = 1 << 25 + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multiclass_nms3.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multiclass_nms3.py index 8394a3b7069231..c0b659b41befe2 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multiclass_nms3.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multiclass_nms3.py @@ -71,8 +71,10 @@ def generate_scores(batch, num_boxes, num_classes): # return np.random.rand(batch, num_classes, num_boxes).astype(np.float32) for batch in [1, 2]: - for num_boxes in [4, 12]: - for num_classes in [2, 6]: + self.batch = batch + for nms_eta in [0.8, 1.1]: + for num_boxes, num_classes in [[80, 100], [40, 200], [20, 400]]: + self.num_boxes, self.num_classes = num_boxes, num_classes for score_threshold in [ 0.01, ]: @@ -94,7 +96,7 @@ def generate_scores(batch, num_boxes, num_classes): 
"keep_top_k": num_boxes, "nms_threshold": 0.3, "normalized": False, - "nms_eta": 1.1 + "nms_eta": nms_eta } }] ops = self.generate_op_config(ops_config) @@ -114,12 +116,26 @@ def generate_scores(batch, num_boxes, num_classes): "nms_output_boxes", "nms_output_num", "nms_output_index" ]) - yield program_config def sample_predictor_configs( self, program_config) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + # The last dim of input_bboxes should be static. + self.dynamic_shape.min_input_shape = { + "input_bboxes": [1, self.num_boxes, 4], + "input_scores": [1, self.num_classes, self.num_boxes], + } + self.dynamic_shape.max_input_shape = { + "input_bboxes": [8, self.num_boxes, 4], + "input_scores": [8, self.num_classes, self.num_boxes], + } + self.dynamic_shape.opt_input_shape = { + "input_bboxes": [self.batch, self.num_boxes, 4], + "input_scores": [self.batch, self.num_classes, self.num_boxes], + } + def clear_dynamic_shape(): self.dynamic_shape.min_input_shape = {} self.dynamic_shape.max_input_shape = {} @@ -141,6 +157,15 @@ def generate_trt_nodes_num(attrs, dynamic_shape): yield self.create_inference_config(), generate_trt_nodes_num( attrs, False), 1e-2 + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), generate_trt_nodes_num( + attrs, True), 1e-5 + # self.trt_param.precision = paddle_infer.PrecisionType.Half + # yield self.create_inference_config(), generate_trt_nodes_num( + # attrs, True), (1e-2, 1e-2) + def assert_tensors_near(self, atol: float, rtol: float, tensor: Dict[str, np.array], baseline: Dict[str, np.array]): @@ -176,7 +201,7 @@ def assert_op_size(self, trt_engine_num, paddle_op_num): return True def test(self): - self.trt_param.workspace_size = 1 << 20 + self.trt_param.workspace_size = 1 << 25 self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py index d552692ae4ff93..9731b7667f793d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_multihead_matmul.py @@ -22,12 +22,10 @@ class TrtConvertMultiHeadMatmulTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(batch, dim1): return np.random.random((batch, dim1, 768)).astype(np.float32) @@ -44,103 +42,86 @@ def generate_weight2(): self.batch = batch for reshape_shape in [[0, 0, 12, 64]]: for dim1 in [128]: - input2_shapes = [[batch, reshape_shape[2], dim1, dim1], - [batch, 1, 1, dim1]] + input2_shapes = [ + [batch, reshape_shape[2], dim1, dim1], + [batch, 1, 1, dim1], + ] for input2_shape in input2_shapes: for axis in [0]: - dics = [{ - "x_num_col_dims": 2, - "y_num_col_dims": 1 - }, { - "axis": 2 - }, { - "shape": reshape_shape - }, { - "axis": [0, 2, 1, 3] - }, { - "x_num_col_dims": 2, - "y_num_col_dims": 1 - }, { - "axis": 2 - }, { - "shape": reshape_shape - }, { - "axis": [0, 2, 1, 3] - }, { - "x_num_col_dims": 2, - "y_num_col_dims": 1 - }, { - "axis": 2 - }, { - "shape": reshape_shape - }, { - "axis": [0, 2, 1, 3] - }, { - "scale": 0.125, - "bias": 0.0, - "bias_after_scale": True - }, { - "alpha": 1.0, - "transpose_X": False, - "transpose_Y": True, - "fused_reshape_X": [], - "fused_reshape_Y": [], - 
"fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [] - }, { - "axis": axis - }, { - "axis": -1, - "is_test": True - }, { - "seed": 0, - "dropout_prob": 0.10000000149011612, - "dropout_implementation": "upscale_in_train", - "fix_seed": False, - "is_test": True - }, { - "alpha": 1.0, - "transpose_X": False, - "transpose_Y": False, - "fused_reshape_X": [], - "fused_reshape_Y": [], - "fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [] - }, { - "axis": [0, 2, 1, 3] - }, { - "shape": [0, 0, 768] - }, { - "x_num_col_dims": 2, - "y_num_col_dims": 1 - }] + dics = [ + {"x_num_col_dims": 2, "y_num_col_dims": 1}, + {"axis": 2}, + {"shape": reshape_shape}, + {"axis": [0, 2, 1, 3]}, + {"x_num_col_dims": 2, "y_num_col_dims": 1}, + {"axis": 2}, + {"shape": reshape_shape}, + {"axis": [0, 2, 1, 3]}, + {"x_num_col_dims": 2, "y_num_col_dims": 1}, + {"axis": 2}, + {"shape": reshape_shape}, + {"axis": [0, 2, 1, 3]}, + { + "scale": 0.125, + "bias": 0.0, + "bias_after_scale": True, + }, + { + "alpha": 1.0, + "transpose_X": False, + "transpose_Y": True, + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [], + }, + {"axis": axis}, + {"axis": -1, "is_test": True}, + { + "seed": 0, + "dropout_prob": 0.10000000149011612, + "dropout_implementation": "upscale_in_train", + "fix_seed": False, + "is_test": True, + }, + { + "alpha": 1.0, + "transpose_X": False, + "transpose_Y": False, + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [], + }, + {"axis": [0, 2, 1, 3]}, + {"shape": [0, 0, 768]}, + {"x_num_col_dims": 2, "y_num_col_dims": 1}, + ] ops_config = [ { "op_type": "mul", "op_inputs": { "X": ["input_data1"], - "Y": ["mul1_weight"] + "Y": ["mul1_weight"], }, - "op_outputs": { - "Out": ["mul1_output"] - }, - "op_attrs": dics[0] + "op_outputs": {"Out": ["mul1_output"]}, + "op_attrs": dics[0], }, { "op_type": "elementwise_add", "op_inputs": { "X": ["mul1_output"], - "Y": ["elementwise_add1_weight"] + "Y": ["elementwise_add1_weight"], }, "op_outputs": { "Out": ["elementwise_add1_output"] }, - "op_attrs": dics[1] + "op_attrs": dics[1], }, { "op_type": "reshape2", @@ -149,42 +130,38 @@ def generate_weight2(): }, "op_outputs": { "Out": ["reshape21_output"], - "XShape": ["reshape21_output_xshape"] + "XShape": ["reshape21_output_xshape"], }, - "op_attrs": dics[2] + "op_attrs": dics[2], }, { "op_type": "transpose2", - "op_inputs": { - "X": ["reshape21_output"] - }, + "op_inputs": {"X": ["reshape21_output"]}, "op_outputs": { "Out": ["transpose21_output"], - "XShape": ["transpose21_output_xshape"] + "XShape": ["transpose21_output_xshape"], }, - "op_attrs": dics[3] + "op_attrs": dics[3], }, { "op_type": "mul", "op_inputs": { "X": ["input_data1"], - "Y": ["mul2_weight"] + "Y": ["mul2_weight"], }, - "op_outputs": { - "Out": ["mul2_output"] - }, - "op_attrs": dics[4] + "op_outputs": {"Out": ["mul2_output"]}, + "op_attrs": dics[4], }, { "op_type": "elementwise_add", "op_inputs": { "X": ["mul2_output"], - "Y": ["elementwise_add2_weight"] + "Y": ["elementwise_add2_weight"], }, "op_outputs": { "Out": ["elementwise_add2_output"] }, - "op_attrs": dics[5] + "op_attrs": dics[5], }, { "op_type": "reshape2", @@ -193,42 +170,38 @@ def generate_weight2(): }, "op_outputs": { "Out": ["reshape22_output"], - "XShape": 
["reshape22_output_xshape"] + "XShape": ["reshape22_output_xshape"], }, - "op_attrs": dics[6] + "op_attrs": dics[6], }, { "op_type": "transpose2", - "op_inputs": { - "X": ["reshape22_output"] - }, + "op_inputs": {"X": ["reshape22_output"]}, "op_outputs": { "Out": ["transpose22_output"], - "XShape": ["transpose22_output_xshape"] + "XShape": ["transpose22_output_xshape"], }, - "op_attrs": dics[7] + "op_attrs": dics[7], }, { "op_type": "mul", "op_inputs": { "X": ["input_data1"], - "Y": ["mul3_weight"] + "Y": ["mul3_weight"], }, - "op_outputs": { - "Out": ["mul3_output"] - }, - "op_attrs": dics[8] + "op_outputs": {"Out": ["mul3_output"]}, + "op_attrs": dics[8], }, { "op_type": "elementwise_add", "op_inputs": { "X": ["mul3_output"], - "Y": ["elementwise_add3_weight"] + "Y": ["elementwise_add3_weight"], }, "op_outputs": { "Out": ["elementwise_add3_output"] }, - "op_attrs": dics[9] + "op_attrs": dics[9], }, { "op_type": "reshape2", @@ -237,30 +210,26 @@ def generate_weight2(): }, "op_outputs": { "Out": ["reshape23_output"], - "XShape": ["reshape23_output_xshape"] + "XShape": ["reshape23_output_xshape"], }, - "op_attrs": dics[10] + "op_attrs": dics[10], }, { "op_type": "transpose2", - "op_inputs": { - "X": ["reshape23_output"] - }, + "op_inputs": {"X": ["reshape23_output"]}, "op_outputs": { "Out": ["transpose23_output"], - "XShape": ["transpose23_output_xshape"] + "XShape": ["transpose23_output_xshape"], }, - "op_attrs": dics[11] + "op_attrs": dics[11], }, { "op_type": "scale", "op_inputs": { "X": ["transpose23_output"], }, - "op_outputs": { - "Out": ["scale_output"] - }, - "op_attrs": dics[12] + "op_outputs": {"Out": ["scale_output"]}, + "op_attrs": dics[12], }, { "op_type": "matmul", @@ -268,41 +237,35 @@ def generate_weight2(): "X": ["scale_output"], "Y": ["transpose22_output"], }, - "op_outputs": { - "Out": ["matmul1_output"] - }, - "op_attrs": dics[13] + "op_outputs": {"Out": ["matmul1_output"]}, + "op_attrs": dics[13], }, { "op_type": "elementwise_add", "op_inputs": { "X": ["matmul1_output"], - "Y": ["input_data2"] + "Y": ["input_data2"], }, "op_outputs": { "Out": ["elementwise_add4_output"] }, - "op_attrs": dics[14] + "op_attrs": dics[14], }, { "op_type": "softmax", "op_inputs": { "X": ["elementwise_add4_output"] }, - "op_outputs": { - "Out": ["softmax_output"] - }, - "op_attrs": dics[15] + "op_outputs": {"Out": ["softmax_output"]}, + "op_attrs": dics[15], }, { "op_type": "dropout", "op_inputs": { "X": ["softmax_output"], }, - "op_outputs": { - "Out": ["dropout3_output"] - }, - "op_attrs": dics[16] + "op_outputs": {"Out": ["dropout3_output"]}, + "op_attrs": dics[16], }, { "op_type": "matmul", @@ -310,32 +273,26 @@ def generate_weight2(): "X": ["dropout3_output"], "Y": ["transpose21_output"], }, - "op_outputs": { - "Out": ["matmul2_output"] - }, - "op_attrs": dics[17] + "op_outputs": {"Out": ["matmul2_output"]}, + "op_attrs": dics[17], }, { "op_type": "transpose2", - "op_inputs": { - "X": ["matmul2_output"] - }, + "op_inputs": {"X": ["matmul2_output"]}, "op_outputs": { "Out": ["transpose24_output"], - "XShape": ["transpose24_output_xshape"] + "XShape": ["transpose24_output_xshape"], }, - "op_attrs": dics[18] + "op_attrs": dics[18], }, { "op_type": "reshape2", - "op_inputs": { - "X": ["transpose24_output"] - }, + "op_inputs": {"X": ["transpose24_output"]}, "op_outputs": { "Out": ["reshape24_output"], - "XShape": ["reshape24_output_xshape"] + "XShape": ["reshape24_output_xshape"], }, - "op_attrs": dics[19] + "op_attrs": dics[19], }, # In order to fuse ops with # multihead_matmul_fuse_pass_v2, 
the last op @@ -344,72 +301,75 @@ def generate_weight2(): "op_type": "mul", "op_inputs": { "X": ["reshape24_output"], - "Y": ["mul4_weight"] + "Y": ["mul4_weight"], }, - "op_outputs": { - "Out": ["mul4_output"] - }, - "op_attrs": dics[20] - } + "op_outputs": {"Out": ["mul4_output"]}, + "op_attrs": dics[20], + }, ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={ - "mul1_weight": - TensorConfig( - data_gen=partial(generate_weight1)), - "mul2_weight": - TensorConfig( - data_gen=partial(generate_weight1)), - "mul3_weight": - TensorConfig( - data_gen=partial(generate_weight1)), - "mul4_weight": - TensorConfig( - data_gen=partial(generate_weight1)), - "elementwise_add1_weight": - TensorConfig( - data_gen=partial(generate_weight2)), - "elementwise_add2_weight": - TensorConfig( - data_gen=partial(generate_weight2)), - "elementwise_add3_weight": - TensorConfig( - data_gen=partial(generate_weight2)), + "mul1_weight": TensorConfig( + data_gen=partial(generate_weight1) + ), + "mul2_weight": TensorConfig( + data_gen=partial(generate_weight1) + ), + "mul3_weight": TensorConfig( + data_gen=partial(generate_weight1) + ), + "mul4_weight": TensorConfig( + data_gen=partial(generate_weight1) + ), + "elementwise_add1_weight": TensorConfig( + data_gen=partial(generate_weight2) + ), + "elementwise_add2_weight": TensorConfig( + data_gen=partial(generate_weight2) + ), + "elementwise_add3_weight": TensorConfig( + data_gen=partial(generate_weight2) + ), }, inputs={ - "input_data1": - TensorConfig(data_gen=partial( - generate_input1, batch, dim1)), - "input_data2": - TensorConfig(data_gen=partial( - generate_input2, input2_shape)), + "input_data1": TensorConfig( + data_gen=partial( + generate_input1, batch, dim1 + ) + ), + "input_data2": TensorConfig( + data_gen=partial( + generate_input2, input2_shape + ) + ), }, - outputs=["mul4_output"]) + outputs=["mul4_output"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): # The last dim of input1 and input2 should be static. 
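+ # Profiles cover batch sizes 1-16 and sequence lengths 8-512; opt uses batch 8 with length 128.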
self.dynamic_shape.min_input_shape = { "input_data1": [1, 8, 768], "input_data2": [1, 1, 1, 128], - "reshape24_output": [1, 128, 768] + "reshape24_output": [1, 128, 768], } self.dynamic_shape.max_input_shape = { "input_data1": [16, 512, 768], "input_data2": [16, 256, 512, 128], - "reshape24_output": [1, 128, 768] + "reshape24_output": [1, 128, 768], } self.dynamic_shape.opt_input_shape = { "input_data1": [8, 128, 768], "input_data2": [8, 32, 64, 128], - "reshape24_output": [1, 128, 768] + "reshape24_output": [1, 128, 768], } def clear_dynamic_shape(): @@ -427,7 +387,7 @@ def clear_dynamic_shape(): self.trt_param.workspace_size = 2013265920 yield self.create_inference_config(), (1, 4), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 4), (1e-5, 1e-5) + yield self.create_inference_config(), (1, 4), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) @@ -435,28 +395,33 @@ def clear_dynamic_shape(): self.trt_param.workspace_size = 2013265920 yield self.create_inference_config(), (1, 3), (1e-5, 1e-4) self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 3), (1e-5, 1e-5) + yield self.create_inference_config(), (1, 3), (1e-3, 1e-3) def add_skip_trt_case(self): - def teller1(program_config, predictor_config): if self.trt_param.precision == paddle_infer.PrecisionType.Half: return True return False self.add_skip_case( - teller1, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and trt in fp16 mode.") + teller1, + SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and trt in fp16 mode.", + ) def teller2(program_config, predictor_config): - if self.trt_param.precision == paddle_infer.PrecisionType.Float32 and len( - self.dynamic_shape.min_input_shape) != 0 and self.batch > 2: + if ( + self.trt_param.precision == paddle_infer.PrecisionType.Float32 + and len(self.dynamic_shape.min_input_shape) != 0 + and self.batch > 2 + ): return True return False self.add_skip_case( - teller2, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and trt when dynamic fp32 mode and batch size > 2." 
+ teller2, + SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and trt when dynamic fp32 mode and batch size > 2.", ) def teller3(program_config, predictor_config): @@ -465,8 +430,10 @@ def teller3(program_config, predictor_config): return False self.add_skip_case( - teller3, SkipReasons.TRT_NOT_IMPLEMENTED, - "The output has diff between gpu and trt in int8 mode.") + teller3, + SkipReasons.TRT_NOT_IMPLEMENTED, + "The output has diff between gpu and trt in int8 mode.", + ) def test(self): self.add_skip_trt_case() @@ -474,9 +441,7 @@ def test(self): class TrtConvertMultiHeadMatmulTestInt8(TrtConvertMultiHeadMatmulTest): - def sample_program_configs(self): - def generate_input1(batch, dim1): return np.random.random((batch, dim1, 768)).astype(np.float32) @@ -493,112 +458,110 @@ def generate_weight2(): self.batch = batch for reshape_shape in [[0, 0, 12, 64]]: for dim1 in [128]: - input2_shapes = [[batch, reshape_shape[2], dim1, dim1], - [batch, 1, 1, dim1]] + input2_shapes = [ + [batch, reshape_shape[2], dim1, dim1], + [batch, 1, 1, dim1], + ] for input2_shape in input2_shapes: for axis in [0]: - dics = [{ - "x_num_col_dims": 2, - "y_num_col_dims": 1, - "enable_int8": True, - "Input_scale": 1.0, - }, { - "axis": 2, - "out_threshold": 1.0, - }, { - "shape": reshape_shape - }, { - "axis": [0, 2, 1, 3] - }, { - "x_num_col_dims": 2, - "y_num_col_dims": 1, - "enable_int8": True, - "Input_scale": 1.0, - }, { - "axis": 2, - "out_threshold": 1.0, - }, { - "shape": reshape_shape - }, { - "axis": [0, 2, 1, 3] - }, { - "x_num_col_dims": 2, - "y_num_col_dims": 1, - "enable_int8": True, - "Input_scale": 1.0, - }, { - "axis": 2, - "out_threshold": 1.0, - }, { - "shape": reshape_shape - }, { - "axis": [0, 2, 1, 3] - }, { - "scale": 0.125, - "bias": 0.0, - "bias_after_scale": True - }, { - "alpha": 1.0, - "transpose_X": False, - "transpose_Y": True, - "fused_reshape_X": [], - "fused_reshape_Y": [], - "fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [] - }, { - "axis": axis - }, { - "axis": -1, - "is_test": True - }, { - "seed": 0, - "dropout_prob": 0.10000000149011612, - "dropout_implementation": "upscale_in_train", - "fix_seed": False, - "is_test": True - }, { - "alpha": 1.0, - "transpose_X": False, - "transpose_Y": False, - "fused_reshape_X": [], - "fused_reshape_Y": [], - "fused_transpose_X": [], - "fused_transpose_Y": [], - "fused_reshape_Out": [], - "fused_transpose_Out": [] - }, { - "axis": [0, 2, 1, 3] - }, { - "shape": [0, 0, 768] - }, { - "x_num_col_dims": 2, - "y_num_col_dims": 1 - }] + dics = [ + { + "x_num_col_dims": 2, + "y_num_col_dims": 1, + "enable_int8": True, + "Input_scale": 1.0, + }, + { + "axis": 2, + "out_threshold": 1.0, + }, + {"shape": reshape_shape}, + {"axis": [0, 2, 1, 3]}, + { + "x_num_col_dims": 2, + "y_num_col_dims": 1, + "enable_int8": True, + "Input_scale": 1.0, + }, + { + "axis": 2, + "out_threshold": 1.0, + }, + {"shape": reshape_shape}, + {"axis": [0, 2, 1, 3]}, + { + "x_num_col_dims": 2, + "y_num_col_dims": 1, + "enable_int8": True, + "Input_scale": 1.0, + }, + { + "axis": 2, + "out_threshold": 1.0, + }, + {"shape": reshape_shape}, + {"axis": [0, 2, 1, 3]}, + { + "scale": 0.125, + "bias": 0.0, + "bias_after_scale": True, + }, + { + "alpha": 1.0, + "transpose_X": False, + "transpose_Y": True, + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [], + }, + {"axis": axis}, + {"axis": -1, "is_test": 
True}, + { + "seed": 0, + "dropout_prob": 0.10000000149011612, + "dropout_implementation": "upscale_in_train", + "fix_seed": False, + "is_test": True, + }, + { + "alpha": 1.0, + "transpose_X": False, + "transpose_Y": False, + "fused_reshape_X": [], + "fused_reshape_Y": [], + "fused_transpose_X": [], + "fused_transpose_Y": [], + "fused_reshape_Out": [], + "fused_transpose_Out": [], + }, + {"axis": [0, 2, 1, 3]}, + {"shape": [0, 0, 768]}, + {"x_num_col_dims": 2, "y_num_col_dims": 1}, + ] ops_config = [ { "op_type": "mul", "op_inputs": { "X": ["input_data1"], - "Y": ["mul1_weight"] - }, - "op_outputs": { - "Out": ["mul1_output"] + "Y": ["mul1_weight"], }, - "op_attrs": dics[0] + "op_outputs": {"Out": ["mul1_output"]}, + "op_attrs": dics[0], }, { "op_type": "elementwise_add", "op_inputs": { "X": ["mul1_output"], - "Y": ["elementwise_add1_weight"] + "Y": ["elementwise_add1_weight"], }, "op_outputs": { "Out": ["elementwise_add1_output"] }, - "op_attrs": dics[1] + "op_attrs": dics[1], }, { "op_type": "reshape2", @@ -607,42 +570,38 @@ def generate_weight2(): }, "op_outputs": { "Out": ["reshape21_output"], - "XShape": ["reshape21_output_xshape"] + "XShape": ["reshape21_output_xshape"], }, - "op_attrs": dics[2] + "op_attrs": dics[2], }, { "op_type": "transpose2", - "op_inputs": { - "X": ["reshape21_output"] - }, + "op_inputs": {"X": ["reshape21_output"]}, "op_outputs": { "Out": ["transpose21_output"], - "XShape": ["transpose21_output_xshape"] + "XShape": ["transpose21_output_xshape"], }, - "op_attrs": dics[3] + "op_attrs": dics[3], }, { "op_type": "mul", "op_inputs": { "X": ["input_data1"], - "Y": ["mul2_weight"] + "Y": ["mul2_weight"], }, - "op_outputs": { - "Out": ["mul2_output"] - }, - "op_attrs": dics[4] + "op_outputs": {"Out": ["mul2_output"]}, + "op_attrs": dics[4], }, { "op_type": "elementwise_add", "op_inputs": { "X": ["mul2_output"], - "Y": ["elementwise_add2_weight"] + "Y": ["elementwise_add2_weight"], }, "op_outputs": { "Out": ["elementwise_add2_output"] }, - "op_attrs": dics[5] + "op_attrs": dics[5], }, { "op_type": "reshape2", @@ -651,42 +610,38 @@ def generate_weight2(): }, "op_outputs": { "Out": ["reshape22_output"], - "XShape": ["reshape22_output_xshape"] + "XShape": ["reshape22_output_xshape"], }, - "op_attrs": dics[6] + "op_attrs": dics[6], }, { "op_type": "transpose2", - "op_inputs": { - "X": ["reshape22_output"] - }, + "op_inputs": {"X": ["reshape22_output"]}, "op_outputs": { "Out": ["transpose22_output"], - "XShape": ["transpose22_output_xshape"] + "XShape": ["transpose22_output_xshape"], }, - "op_attrs": dics[7] + "op_attrs": dics[7], }, { "op_type": "mul", "op_inputs": { "X": ["input_data1"], - "Y": ["mul3_weight"] + "Y": ["mul3_weight"], }, - "op_outputs": { - "Out": ["mul3_output"] - }, - "op_attrs": dics[8] + "op_outputs": {"Out": ["mul3_output"]}, + "op_attrs": dics[8], }, { "op_type": "elementwise_add", "op_inputs": { "X": ["mul3_output"], - "Y": ["elementwise_add3_weight"] + "Y": ["elementwise_add3_weight"], }, "op_outputs": { "Out": ["elementwise_add3_output"] }, - "op_attrs": dics[9] + "op_attrs": dics[9], }, { "op_type": "reshape2", @@ -695,30 +650,26 @@ def generate_weight2(): }, "op_outputs": { "Out": ["reshape23_output"], - "XShape": ["reshape23_output_xshape"] + "XShape": ["reshape23_output_xshape"], }, - "op_attrs": dics[10] + "op_attrs": dics[10], }, { "op_type": "transpose2", - "op_inputs": { - "X": ["reshape23_output"] - }, + "op_inputs": {"X": ["reshape23_output"]}, "op_outputs": { "Out": ["transpose23_output"], - "XShape": ["transpose23_output_xshape"] + 
"XShape": ["transpose23_output_xshape"], }, - "op_attrs": dics[11] + "op_attrs": dics[11], }, { "op_type": "scale", "op_inputs": { "X": ["transpose23_output"], }, - "op_outputs": { - "Out": ["scale_output"] - }, - "op_attrs": dics[12] + "op_outputs": {"Out": ["scale_output"]}, + "op_attrs": dics[12], }, { "op_type": "matmul", @@ -726,41 +677,35 @@ def generate_weight2(): "X": ["scale_output"], "Y": ["transpose22_output"], }, - "op_outputs": { - "Out": ["matmul1_output"] - }, - "op_attrs": dics[13] + "op_outputs": {"Out": ["matmul1_output"]}, + "op_attrs": dics[13], }, { "op_type": "elementwise_add", "op_inputs": { "X": ["matmul1_output"], - "Y": ["input_data2"] + "Y": ["input_data2"], }, "op_outputs": { "Out": ["elementwise_add4_output"] }, - "op_attrs": dics[14] + "op_attrs": dics[14], }, { "op_type": "softmax", "op_inputs": { "X": ["elementwise_add4_output"] }, - "op_outputs": { - "Out": ["softmax_output"] - }, - "op_attrs": dics[15] + "op_outputs": {"Out": ["softmax_output"]}, + "op_attrs": dics[15], }, { "op_type": "dropout", "op_inputs": { "X": ["softmax_output"], }, - "op_outputs": { - "Out": ["dropout3_output"] - }, - "op_attrs": dics[16] + "op_outputs": {"Out": ["dropout3_output"]}, + "op_attrs": dics[16], }, { "op_type": "matmul", @@ -768,32 +713,26 @@ def generate_weight2(): "X": ["dropout3_output"], "Y": ["transpose21_output"], }, - "op_outputs": { - "Out": ["matmul2_output"] - }, - "op_attrs": dics[17] + "op_outputs": {"Out": ["matmul2_output"]}, + "op_attrs": dics[17], }, { "op_type": "transpose2", - "op_inputs": { - "X": ["matmul2_output"] - }, + "op_inputs": {"X": ["matmul2_output"]}, "op_outputs": { "Out": ["transpose24_output"], - "XShape": ["transpose24_output_xshape"] + "XShape": ["transpose24_output_xshape"], }, - "op_attrs": dics[18] + "op_attrs": dics[18], }, { "op_type": "reshape2", - "op_inputs": { - "X": ["transpose24_output"] - }, + "op_inputs": {"X": ["transpose24_output"]}, "op_outputs": { "Out": ["reshape24_output"], - "XShape": ["reshape24_output_xshape"] + "XShape": ["reshape24_output_xshape"], }, - "op_attrs": dics[19] + "op_attrs": dics[19], }, # In order to fuse ops with # multihead_matmul_fuse_pass_v2, the last op @@ -802,61 +741,62 @@ def generate_weight2(): "op_type": "mul", "op_inputs": { "X": ["reshape24_output"], - "Y": ["mul4_weight"] + "Y": ["mul4_weight"], }, - "op_outputs": { - "Out": ["mul4_output"] - }, - "op_attrs": dics[20] - } + "op_outputs": {"Out": ["mul4_output"]}, + "op_attrs": dics[20], + }, ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={ - "mul1_weight": - TensorConfig( - data_gen=partial(generate_weight1)), - "mul2_weight": - TensorConfig( - data_gen=partial(generate_weight1)), - "mul3_weight": - TensorConfig( - data_gen=partial(generate_weight1)), - "mul4_weight": - TensorConfig( - data_gen=partial(generate_weight1)), - "elementwise_add1_weight": - TensorConfig( - data_gen=partial(generate_weight2)), - "elementwise_add2_weight": - TensorConfig( - data_gen=partial(generate_weight2)), - "elementwise_add3_weight": - TensorConfig( - data_gen=partial(generate_weight2)), + "mul1_weight": TensorConfig( + data_gen=partial(generate_weight1) + ), + "mul2_weight": TensorConfig( + data_gen=partial(generate_weight1) + ), + "mul3_weight": TensorConfig( + data_gen=partial(generate_weight1) + ), + "mul4_weight": TensorConfig( + data_gen=partial(generate_weight1) + ), + "elementwise_add1_weight": TensorConfig( + data_gen=partial(generate_weight2) + ), + "elementwise_add2_weight": TensorConfig( + 
data_gen=partial(generate_weight2) + ), + "elementwise_add3_weight": TensorConfig( + data_gen=partial(generate_weight2) + ), }, inputs={ - "input_data1": - TensorConfig(data_gen=partial( - generate_input1, batch, dim1)), - "input_data2": - TensorConfig(data_gen=partial( - generate_input2, input2_shape)), + "input_data1": TensorConfig( + data_gen=partial( + generate_input1, batch, dim1 + ) + ), + "input_data2": TensorConfig( + data_gen=partial( + generate_input2, input2_shape + ) + ), }, - outputs=["mul4_output"]) + outputs=["mul4_output"], + ) yield program_config class TrtConvertVitToMultiHeadMatmulTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(batch, length): return np.zeros((batch, length, 768), dtype=np.float32) @@ -868,218 +808,192 @@ def generate_weight2(): for batch in [2, 4]: self.batch = batch - for length in [64, 384]: + for length in [197]: self.length = length - ops_config = [{ - "op_type": "matmul_v2", - "op_inputs": { - "X": ["input_data1"], - "Y": ["matmul1_weight"] - }, - "op_outputs": { - "Out": ["matmul1_output"] - }, - "op_attrs": { - "trans_x": False, - "trans_y": False - } - }, { - "op_type": "elementwise_add", - "op_inputs": { - "X": ["matmul1_output"], - "Y": ["elementwise_add1_weight"] - }, - "op_outputs": { - "Out": ["elementwise_add1_output"] + ops_config = [ + { + "op_type": "matmul_v2", + "op_inputs": { + "X": ["input_data1"], + "Y": ["matmul1_weight"], + }, + "op_outputs": {"Out": ["matmul1_output"]}, + "op_attrs": {"trans_x": False, "trans_y": False}, }, - "op_attrs": { - "Scale_out": 1.0, - "Scale_x": 1.0, - "Scale_y": 1.0, - "axis": 2 - } - }, { - "op_type": "reshape2", - "op_inputs": { - "X": ["elementwise_add1_output"], + { + "op_type": "elementwise_add", + "op_inputs": { + "X": ["matmul1_output"], + "Y": ["elementwise_add1_weight"], + }, + "op_outputs": {"Out": ["elementwise_add1_output"]}, + "op_attrs": { + "Scale_out": 1.0, + "Scale_x": 1.0, + "Scale_y": 1.0, + "axis": 2, + }, }, - "op_outputs": { - "Out": ["reshape1_output"], - "XShape": ["reshape1_output_xshape"] + { + "op_type": "reshape2", + "op_inputs": { + "X": ["elementwise_add1_output"], + }, + "op_outputs": { + "Out": ["reshape1_output"], + "XShape": ["reshape1_output_xshape"], + }, + "op_attrs": {"shape": [-1, self.length, 3, 12, 64]}, }, - "op_attrs": { - "shape": [-1, self.length, 3, 12, 64] - } - }, { - "op_type": "transpose2", - "op_inputs": { - "X": ["reshape1_output"] + { + "op_type": "transpose2", + "op_inputs": {"X": ["reshape1_output"]}, + "op_outputs": { + "Out": ["transpose1_output"], + "XShape": ["transpose1_output_xshape"], + }, + "op_attrs": { + "axis": [2, 0, 3, 1, 4], + "data_format": "AnyLayout", + }, }, - "op_outputs": { - "Out": ["transpose1_output"], - "XShape": ["transpose1_output_xshape"] + { + "op_type": "slice", + "op_inputs": { + "Input": ["transpose1_output"], + }, + "op_outputs": {"Out": ["slice1_output"]}, + "op_attrs": { + "axes": [0], + "starts": [0], + "ends": [1], + "decrease_axis": [0], + "infer_flags": [1], + }, }, - "op_attrs": { - "axis": [2, 0, 3, 1, 4], - "data_format": "AnyLayout" - } - }, { - "op_type": "slice", - "op_inputs": { - "Input": ["transpose1_output"], + { + "op_type": "slice", + "op_inputs": { + "Input": ["transpose1_output"], + }, + "op_outputs": {"Out": ["slice2_output"]}, + "op_attrs": { + "axes": [0], + "starts": [1], + "ends": [2], + "decrease_axis": [0], + "infer_flags": [1], + }, }, - "op_outputs": { - "Out": 
["slice1_output"] + { + "op_type": "slice", + "op_inputs": { + "Input": ["transpose1_output"], + }, + "op_outputs": {"Out": ["slice3_output"]}, + "op_attrs": { + "axes": [0], + "starts": [2], + "ends": [3], + "decrease_axis": [0], + "infer_flags": [1], + }, }, - "op_attrs": { - "axes": [0], - "starts": [0], - "ends": [1], - "decrease_axis": [0], - "infer_flags": [1] - } - }, { - "op_type": "slice", - "op_inputs": { - "Input": ["transpose1_output"], + { + "op_type": "transpose2", + "op_inputs": {"X": ["slice2_output"]}, + "op_outputs": { + "Out": ["transpose2_output"], + }, + "op_attrs": { + "axis": [0, 1, 3, 2], + "data_format": "AnyLayout", + }, }, - "op_outputs": { - "Out": ["slice2_output"] + { + "op_type": "matmul_v2", + "op_inputs": { + "X": ["slice1_output"], + "Y": ["transpose2_output"], + }, + "op_outputs": {"Out": ["matmul2_output"]}, + "op_attrs": {"trans_x": False, "trans_y": False}, }, - "op_attrs": { - "axes": [0], - "starts": [1], - "ends": [2], - "decrease_axis": [0], - "infer_flags": [1] - } - }, { - "op_type": "slice", - "op_inputs": { - "Input": ["transpose1_output"], + { + "op_type": "scale", + "op_inputs": { + "X": ["matmul2_output"], + }, + "op_outputs": {"Out": ["scale_output"]}, + "op_attrs": { + "scale": 0.125, + "bias": 0.0, + "bias_after_scale": True, + }, }, - "op_outputs": { - "Out": ["slice3_output"] + { + "op_type": "softmax", + "op_inputs": {"X": ["scale_output"]}, + "op_outputs": {"Out": ["softmax_output"]}, + "op_attrs": {"axis": -1, "data_format": "AnyLayout"}, }, - "op_attrs": { - "axes": [0], - "starts": [2], - "ends": [3], - "decrease_axis": [0], - "infer_flags": [1] - } - }, { - "op_type": "transpose2", - "op_inputs": { - "X": ["slice2_output"] + { + "op_type": "matmul_v2", + "op_inputs": { + "X": ["softmax_output"], + "Y": ["slice3_output"], + }, + "op_outputs": {"Out": ["matmul3_output"]}, + "op_attrs": {"trans_x": False, "trans_y": False}, }, - "op_outputs": { - "Out": ["transpose2_output"], + { + "op_type": "transpose2", + "op_inputs": {"X": ["matmul3_output"]}, + "op_outputs": { + "Out": ["transpose3_output"], + "XShape": ["transpose3_output_xshape"], + }, + "op_attrs": { + "axis": [0, 2, 1, 3], + "data_format": "AnyLayout", + }, }, - "op_attrs": { - "axis": [0, 1, 3, 2], - "data_format": "AnyLayout" - } - }, { - "op_type": "matmul_v2", - "op_inputs": { - "X": ["slice1_output"], - "Y": ["transpose2_output"] + { + "op_type": "reshape2", + "op_inputs": {"X": ["transpose3_output"]}, + "op_outputs": { + "Out": ["reshape2_output"], + "XShape": ["reshape2_output_xshape"], + }, + "op_attrs": {"shape": [-1, self.length, 768]}, }, - "op_outputs": { - "Out": ["matmul2_output"] - }, - "op_attrs": { - "trans_x": False, - "trans_y": False - } - }, { - "op_type": "scale", - "op_inputs": { - "X": ["matmul2_output"], - }, - "op_outputs": { - "Out": ["scale_output"] - }, - "op_attrs": { - "scale": 0.125, - "bias": 0.0, - "bias_after_scale": True - } - }, { - "op_type": "softmax", - "op_inputs": { - "X": ["scale_output"] - }, - "op_outputs": { - "Out": ["softmax_output"] - }, - "op_attrs": { - "axis": -1, - "data_format": "AnyLayout" - } - }, { - "op_type": "matmul_v2", - "op_inputs": { - "X": ["softmax_output"], - "Y": ["slice3_output"] - }, - "op_outputs": { - "Out": ["matmul3_output"] - }, - "op_attrs": { - "trans_x": False, - "trans_y": False - } - }, { - "op_type": "transpose2", - "op_inputs": { - "X": ["matmul3_output"] - }, - "op_outputs": { - "Out": ["transpose3_output"], - "XShape": ["transpose3_output_xshape"] - }, - "op_attrs": { - "axis": [0, 2, 1, 3], 
- "data_format": "AnyLayout" - } - }, { - "op_type": "reshape2", - "op_inputs": { - "X": ["transpose3_output"] - }, - "op_outputs": { - "Out": ["reshape2_output"], - "XShape": ["reshape2_output_xshape"] - }, - "op_attrs": { - "shape": [-1, self.length, 768] - } - }] + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={ - "matmul1_weight": - TensorConfig(data_gen=partial(generate_weight1)), - "elementwise_add1_weight": - TensorConfig(data_gen=partial(generate_weight2)) + "matmul1_weight": TensorConfig( + data_gen=partial(generate_weight1) + ), + "elementwise_add1_weight": TensorConfig( + data_gen=partial(generate_weight2) + ), }, inputs={ - "input_data1": - TensorConfig( - data_gen=partial(generate_input1, batch, length)) + "input_data1": TensorConfig( + data_gen=partial(generate_input1, batch, length) + ) }, - outputs=["reshape2_output"]) + outputs=["reshape2_output"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): # The last dim of input1 and input2 should be static. self.dynamic_shape.min_input_shape = { @@ -1111,11 +1025,15 @@ def generate_trt_nodes_num(): generate_dynamic_shape(attrs) self.trt_param.workspace_size = 2013265920 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), generate_trt_nodes_num(), (1e-3, - 1e-3) + yield self.create_inference_config(), generate_trt_nodes_num(), ( + 1e-3, + 1e-3, + ) self.trt_param.precision = paddle_infer.PrecisionType.Float32 - yield self.create_inference_config(), generate_trt_nodes_num(), (1e-5, - 1e-5) + yield self.create_inference_config(), generate_trt_nodes_num(), ( + 1e-5, + 1e-5, + ) def add_skip_trt_case(self): pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pad3d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pad3d.py new file mode 100644 index 00000000000000..02429bed44c042 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pad3d.py @@ -0,0 +1,107 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List +import unittest + + +class TrtConvertPad3d(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(): + return np.ones([1, 1, 3, 64, 64]).astype(np.float32) + + for value in [True, False]: + for paddings in [ + [0, 0, 0, 0, 1, 1], + [0, 0, 1, 2, 3, 4], + [1, 1, 1, 1, 1, 1], + [0, 0, -1, -1, 1, 1], + ]: + dics = [{"value": value, "paddings": paddings}, {}] + + ops_config = [ + { + "op_type": "pad3d", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] + + ops = self.generate_op_config(ops_config) + for i in range(10): + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 1, 3, 64, 64] + } + self.dynamic_shape.max_input_shape = { + "input_data": [1, 1, 3, 64, 64] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 1, 3, 64, 64] + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (0, 3), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (0, 3), 1e-3 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), 1e-3 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py index b543484d89251f..03392554a7ffa7 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_pool2d.py @@ -20,10 +20,10 @@ from typing import Optional, List, Callable, Dict, Any, Set import unittest import itertools +import copy class TrtConvertPool2dTest(TrtLayerAutoScanTest): - def is_paddings_valid(self, program_config: ProgramConfig) -> bool: exclusive = program_config.ops[0].attrs['exclusive'] paddings = program_config.ops[0].attrs['paddings'] @@ -65,39 +65,54 @@ def generate_weight1(attrs: List[Dict[str, Any]]): ceil_mode_options = [True, False] configurations = [ - strides_options, paddings_options, pooling_type_options, - padding_algorithm_options, ksize_options, data_format_options, - global_pooling_options, exclusive_options, adaptive_option, - ceil_mode_options + strides_options, + paddings_options, + pooling_type_options, + padding_algorithm_options, + 
ksize_options, + data_format_options, + global_pooling_options, + exclusive_options, + adaptive_option, + ceil_mode_options, ] - for (strides, paddings, pooling_type, padding_algorithm, ksize, - data_format, global_pooling, exclusive, adaptive, - ceil_mode) in itertools.product(*configurations): - - attrs = [{ - "strides": strides, - "paddings": paddings, - "pooling_type": pooling_type, - "padding_algorithm": padding_algorithm, - "ksize": ksize, - "data_format": data_format, - "global_pooling": global_pooling, - "exclusive": exclusive, - "adaptive": adaptive, - "ceil_mode": ceil_mode, - }] - - ops_config = [{ - "op_type": "pool2d", - "op_inputs": { - "X": ["input_data"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": attrs[0] - }] + for ( + strides, + paddings, + pooling_type, + padding_algorithm, + ksize, + data_format, + global_pooling, + exclusive, + adaptive, + ceil_mode, + ) in itertools.product(*configurations): + + attrs = [ + { + "strides": strides, + "paddings": paddings, + "pooling_type": pooling_type, + "padding_algorithm": padding_algorithm, + "ksize": ksize, + "data_format": data_format, + "global_pooling": global_pooling, + "exclusive": exclusive, + "adaptive": adaptive, + "ceil_mode": ceil_mode, + } + ] + + ops_config = [ + { + "op_type": "pool2d", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": attrs[0], + } + ] ops = self.generate_op_config(ops_config) @@ -105,16 +120,18 @@ def generate_weight1(attrs: List[Dict[str, Any]]): ops=ops, weights={}, inputs={ - "input_data": - TensorConfig(data_gen=partial(generate_input1, attrs)) + "input_data": TensorConfig( + data_gen=partial(generate_input1, attrs) + ) }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} self.dynamic_shape.max_input_shape = {"input_data": [1, 3, 64, 64]} @@ -136,36 +153,75 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-3, 1e-3) + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-3, 1e-3) + attrs, True + ), (1e-3, 1e-3) def add_skip_trt_case(self): - def teller(program_config, predictor_config): - if program_config.ops[0].attrs['pooling_type'] == 'avg' and \ - program_config.ops[0].attrs['global_pooling'] == False and \ - program_config.ops[0].attrs['exclusive'] == True and \ - program_config.ops[0].attrs['adaptive'] == False and \ - program_config.ops[0].attrs['ceil_mode'] == True: + if ( + program_config.ops[0].attrs['pooling_type'] == 'avg' + and program_config.ops[0].attrs['global_pooling'] == False + and program_config.ops[0].attrs['exclusive'] == True + and 
program_config.ops[0].attrs['adaptive'] == False + and program_config.ops[0].attrs['ceil_mode'] == True + ): return True return False self.add_skip_case( - teller, SkipReasons.TRT_NOT_IMPLEMENTED, - "The results of some cases are Nan, but the results of TensorRT and GPU are the same." + teller, + SkipReasons.TRT_NOT_IMPLEMENTED, + "The results of some cases are Nan, but the results of TensorRT and GPU are the same.", ) + def assert_tensors_near( + self, + atol: float, + rtol: float, + tensor: Dict[str, np.array], + baseline: Dict[str, np.array], + ): + for key, arr in tensor.items(): + self.assertEqual( + baseline[key].shape, + arr.shape, + 'The output shapes are not equal, the baseline shape is ' + + str(baseline[key].shape) + + ', but got ' + + str(arr.shape), + ) + + # The result of Pool2d may have some elements that is the least value (-65504 for FP16), + # but for FP32 and FP16 precision, their least value are different. + # We set a threshold that is the least value of FP16, + # and make the values less than the threshold to be the threshold. + def align_less_threshold(arr, threshold): + return np.clip(arr, threshold, None) + + fp16_min = np.finfo(np.float16).min + baseline_threshold = align_less_threshold( + copy.deepcopy(baseline[key]), fp16_min + ) + arr_threshold = align_less_threshold(copy.deepcopy(arr), fp16_min) + np.testing.assert_allclose( + baseline_threshold, arr_threshold, rtol=rtol, atol=atol + ) + def test(self): self.add_skip_trt_case() self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py index 42b234827b1e72..eb640ac54029a9 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reduce_sum.py @@ -23,7 +23,6 @@ class TrtConvertReduceSumTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs attrs = [ @@ -41,7 +40,6 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(dtype, attrs: List[Dict[str, Any]]): if dtype == -1 or dtype == 5: return np.random.random([1, 3, 32, 32]).astype(np.float32) @@ -49,39 +47,52 @@ def generate_input1(dtype, attrs: List[Dict[str, Any]]): return np.random.random([1, 3, 32, 32]).astype(np.int32) for keep_dim in [True, False]: - for dim in [[], [1], [0], [0, 1], [1, 2, 3], [-2, 0, 3], [-3], - [-4, 1], [3, 4, 5]]: + for dim in [ + [], + [1], + [0], + [0, 1], + [1, 2, 3], + [-2, 0, 3], + [-3], + [-4, 1], + [3, 4, 5], + ]: for reduce_all in [True, False]: for out_dtype in [-1, 2, 5]: - dics = [{ - "keep_dim": keep_dim, - "dim": dim, - "reduce_all": reduce_all, - "out_dtype": out_dtype, - "in_dtype": out_dtype, - }, {}] - - ops_config = [{ - "op_type": "reduce_sum", - "op_inputs": { - "X": ["input_data"] + dics = [ + { + "keep_dim": keep_dim, + "dim": dim, + "reduce_all": reduce_all, + "out_dtype": out_dtype, + "in_dtype": out_dtype, }, - "op_outputs": { - "Out": ["reduce_output_data"] - }, - "op_attrs": dics[0] - }] + {}, + ] + + ops_config = [ + { + "op_type": "reduce_sum", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": {"Out": ["reduce_output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig(data_gen=partial( - 
generate_input1, out_dtype, dics)) + "input_data": TensorConfig( + data_gen=partial( + generate_input1, out_dtype, dics + ) + ) }, - outputs=["reduce_output_data"]) + outputs=["reduce_output_data"], + ) if not self.is_program_valid(program_config): continue @@ -89,7 +100,6 @@ def generate_input1(dtype, attrs: List[Dict[str, Any]]): yield program_config def sample_predictor_configs(self, program_config): - def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} @@ -120,19 +130,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-5, 1e-5) + attrs, False + ), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), (1e-4, 1e-4) + attrs, False + ), (1e-3, 1e-3) # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-5, 1e-5) + attrs, True + ), (1e-5, 1e-5) self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), (1e-4, 1e-4) + attrs, True + ), (1e-3, 1e-3) pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py index 7902a35a9a6b47..3dfca41b3e3561 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_reshape.py @@ -22,7 +22,6 @@ class TrtConvertReshapeTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) @@ -31,7 +30,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: if len(attrs[0]['shape']) != 1: return False - #To test if the shape contains 0 + # To test if the shape contains 0 if len(attrs[0]['shape']) == 3: if attrs[0]['shape'][1] == 0: if self.dims != 3: @@ -45,7 +44,6 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]]): if self.dims == 4: self.input_shape = [1, 2, 4, 6] @@ -70,9 +68,18 @@ def generate_shapeT2_data(attrs: List[Dict[str, Any]]): return np.array([24]).astype(np.int32) for dims in [4, 3, 2, 1]: - for shape in [[1, 6, 8], [1, 2, 4, 6], [1, 1, 0, 12], [1, 0, 6], - [1, -1, 12], [2, -1], [3, 16], [3, 4, 4], [48], - [-1, 48]]: + for shape in [ + [1, 6, 8], + [1, 2, 4, 6], + [1, 1, 0, 12], + [1, 0, 6], + [1, -1, 12], + [2, -1], + [3, 16], + [3, 4, 4], + [48], + [-1, 48], + ]: dics = [ { "shape": shape, @@ -81,29 +88,31 @@ def generate_shapeT2_data(attrs: List[Dict[str, Any]]): self.dims = dims dics_intput = [{"X": ["reshape_input"]}] - ops_config = [{ - "op_type": "reshape", - "op_inputs": dics_intput[0], - "op_outputs": { - "Out": ["reshape_out"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "reshape", + "op_inputs": dics_intput[0], + "op_outputs": {"Out": ["reshape_out"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, 
inputs={ - "reshape_input": - TensorConfig(data_gen=partial(generate_input1, dics)) + "reshape_input": TensorConfig( + data_gen=partial(generate_input1, dics) + ) }, - outputs=["reshape_out"]) + outputs=["reshape_out"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.dims == 4: self.dynamic_shape.min_input_shape = { @@ -141,13 +150,14 @@ def clear_dynamic_shape(): def generate_trt_nodes_num(attrs, dynamic_shape): # in static shape mode, here is consistent with op_teller.cc - if (not dynamic_shape): - if (attrs[0]['shape'][0] == 0): + if not dynamic_shape: + if attrs[0]['shape'][0] == 0: return 1, 2 - elif (len(attrs[0]['shape']) == 1): + elif len(attrs[0]['shape']) == 1: return 0, 3 - elif (np.prod(attrs[0]['shape'][1:]) == np.prod( - self.input_shape[1:])): + elif np.prod(attrs[0]['shape'][1:]) == np.prod( + self.input_shape[1:] + ): return 1, 2 else: return 0, 3 @@ -161,19 +171,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def add_skip_trt_case(self): pass @@ -185,12 +199,10 @@ def test(self): # reshape having three inputs. 
class TrtConvertReshapeTest2(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]]): if self.dims == 4: return np.random.random([1, 2, 4, 6]).astype(np.float32) @@ -203,9 +215,12 @@ def generate_input1(attrs: List[Dict[str, Any]]): for dims in [4, 3, 2, 1]: for shape in [[-1, 48]]: - dics = [{ - "shape": shape, - }, {}] + dics = [ + { + "shape": shape, + }, + {}, + ] self.dims = dims dics_intput = [ { @@ -217,9 +232,7 @@ def generate_input1(attrs: List[Dict[str, Any]]): { "op_type": "fill_constant", "op_inputs": {}, - "op_outputs": { - "Out": ["shapeT1_data"] - }, + "op_outputs": {"Out": ["shapeT1_data"]}, "op_attrs": { "dtype": 2, "str_value": "2", @@ -229,9 +242,7 @@ def generate_input1(attrs: List[Dict[str, Any]]): { "op_type": "fill_constant", "op_inputs": {}, - "op_outputs": { - "Out": ["shapeT2_data"] - }, + "op_outputs": {"Out": ["shapeT2_data"]}, "op_attrs": { "dtype": 2, "str_value": "24", @@ -241,10 +252,8 @@ def generate_input1(attrs: List[Dict[str, Any]]): { "op_type": "reshape", "op_inputs": dics_intput[0], - "op_outputs": { - "Out": ["reshape_out"] - }, - "op_attrs": dics[0] + "op_outputs": {"Out": ["reshape_out"]}, + "op_attrs": dics[0], }, ] ops = self.generate_op_config(ops_config) @@ -252,16 +261,18 @@ def generate_input1(attrs: List[Dict[str, Any]]): ops=ops, weights={}, inputs={ - "reshape_input": - TensorConfig(data_gen=partial(generate_input1, dics)) + "reshape_input": TensorConfig( + data_gen=partial(generate_input1, dics) + ) }, - outputs=["reshape_out"]) + outputs=["reshape_out"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(): if self.dims == 4: self.dynamic_shape.min_input_shape = { @@ -297,7 +308,7 @@ def generate_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (1, 2), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 2), 1e-5 + yield self.create_inference_config(), (1, 2), 1e-3 def add_skip_trt_case(self): pass @@ -309,12 +320,10 @@ def test(self): # reshape having 2 inputs. 
class TrtConvertReshapeTest3(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]]): if self.dims == 4: return np.random.random([1, 2, 12, 6]).astype(np.float32) @@ -327,9 +336,12 @@ def generate_input1(attrs: List[Dict[str, Any]]): for dims in [4, 3, 2, 1]: for shape in [[-1, 144]]: - dics = [{ - "shape": shape, - }, {}] + dics = [ + { + "shape": shape, + }, + {}, + ] self.dims = dims dics_intput = [ { @@ -341,9 +353,7 @@ def generate_input1(attrs: List[Dict[str, Any]]): { "op_type": "fill_constant", "op_inputs": {}, - "op_outputs": { - "Out": ["shape_data"] - }, + "op_outputs": {"Out": ["shape_data"]}, "op_attrs": { "dtype": 2, "str_value": "12", @@ -353,10 +363,8 @@ def generate_input1(attrs: List[Dict[str, Any]]): { "op_type": "reshape", "op_inputs": dics_intput[0], - "op_outputs": { - "Out": ["reshape_out"] - }, - "op_attrs": dics[0] + "op_outputs": {"Out": ["reshape_out"]}, + "op_attrs": dics[0], }, ] ops = self.generate_op_config(ops_config) @@ -364,16 +372,18 @@ def generate_input1(attrs: List[Dict[str, Any]]): ops=ops, weights={}, inputs={ - "reshape_input": - TensorConfig(data_gen=partial(generate_input1, dics)) + "reshape_input": TensorConfig( + data_gen=partial(generate_input1, dics) + ) }, - outputs=["reshape_out"]) + outputs=["reshape_out"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(): if self.dims == 4: self.dynamic_shape.min_input_shape = { @@ -409,7 +419,7 @@ def generate_dynamic_shape(): self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), (1, 2), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), (1, 2), 1e-5 + yield self.create_inference_config(), (1, 2), 1e-3 def add_skip_trt_case(self): pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py index ca12fe876ca394..f59ce47e97d395 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roi_align.py @@ -22,12 +22,10 @@ class TrtConvertRoiAlignTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]], batch): return np.ones([batch, 256, 32, 32]).astype(np.float32) @@ -47,92 +45,111 @@ def generate_input3(attrs: List[Dict[str, Any]], batch): self.num_input = num_input if num_input == 1: batch = 1 - dics = [{ - "spatial_scale": spatial_scale, - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "sampling_ratio": sampling_ratio, - "aligned": aligned - }, {}] - dics_input = [{ - "X": ["roi_align_input"], - "ROIs": ["ROIs"], - "RoisNum": ["RoisNum"] - }, { - "X": ["roi_align_input"], - "ROIs": ["ROIs"] - }] - program_input = [{ - "roi_align_input": - TensorConfig(data_gen=partial( - generate_input1, dics, batch)), - "ROIs": - TensorConfig(data_gen=partial( - generate_input2, dics, batch)), - "RoisNum": - TensorConfig(data_gen=partial( - generate_input3, dics, batch)) - }, { - "roi_align_input": - TensorConfig(data_gen=partial( - generate_input1, 
dics, batch)), - "ROIs": - TensorConfig(data_gen=partial( - generate_input2, dics, batch), - lod=[[32, 3]]) - }] - ops_config = [{ - "op_type": - "roi_align", - "op_inputs": - dics_input[num_input], - "op_outputs": { - "Out": ["roi_align_out"] + dics = [ + { + "spatial_scale": spatial_scale, + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "sampling_ratio": sampling_ratio, + "aligned": aligned, + }, + {}, + ] + dics_input = [ + { + "X": ["roi_align_input"], + "ROIs": ["ROIs"], + "RoisNum": ["RoisNum"], + }, + { + "X": ["roi_align_input"], + "ROIs": ["ROIs"], + }, + ] + program_input = [ + { + "roi_align_input": TensorConfig( + data_gen=partial( + generate_input1, dics, batch + ) + ), + "ROIs": TensorConfig( + data_gen=partial( + generate_input2, dics, batch + ) + ), + "RoisNum": TensorConfig( + data_gen=partial( + generate_input3, dics, batch + ) + ), + }, + { + "roi_align_input": TensorConfig( + data_gen=partial( + generate_input1, dics, batch + ) + ), + "ROIs": TensorConfig( + data_gen=partial( + generate_input2, dics, batch + ), + lod=[[32, 3]], + ), }, - "op_attrs": - dics[0] - }] + ] + ops_config = [ + { + "op_type": "roi_align", + "op_inputs": dics_input[num_input], + "op_outputs": { + "Out": ["roi_align_out"] + }, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs=program_input[num_input], - outputs=["roi_align_out"]) + outputs=["roi_align_out"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.num_input == 0: self.dynamic_shape.min_input_shape = { "roi_align_input": [1, 256, 32, 32], "ROIs": [3, 4], - "RoisNum": [1] + "RoisNum": [1], } self.dynamic_shape.max_input_shape = { "roi_align_input": [1, 256, 64, 64], "ROIs": [3, 4], - "RoisNum": [1] + "RoisNum": [1], } self.dynamic_shape.opt_input_shape = { "roi_align_input": [1, 256, 64, 64], "ROIs": [3, 4], - "RoisNum": [1] + "RoisNum": [1], } elif self.num_input == 1: self.dynamic_shape.min_input_shape = { "roi_align_input": [1, 256, 32, 32], - "ROIs": [3, 4] + "ROIs": [3, 4], } self.dynamic_shape.max_input_shape = { "roi_align_input": [1, 256, 64, 64], - "ROIs": [3, 4] + "ROIs": [3, 4], } self.dynamic_shape.opt_input_shape = { "roi_align_input": [1, 256, 64, 64], - "ROIs": [3, 4] + "ROIs": [3, 4], } def clear_dynamic_shape(): @@ -159,29 +176,33 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): if len(program_config.inputs) == 3: return True return False - self.add_skip_case(teller1, SkipReasons.TRT_NOT_SUPPORT, - "INPUT RoisNum NOT 
SUPPORT") + self.add_skip_case( + teller1, SkipReasons.TRT_NOT_SUPPORT, "INPUT RoisNum NOT SUPPORT" + ) def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py index 675054317d9b17..8217b3e8d8506e 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_roll.py @@ -22,7 +22,6 @@ class TrtConvertRollTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs weights = program_config.weights @@ -32,43 +31,44 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]]): return np.ones([1, 56, 56, 192]).astype(np.float32) for axis in [[1, 2]]: for shifts in [[-1, -1], [-3, -3]]: - dics = [{ - "axis": axis, - "shifts": shifts, - }] - - ops_config = [{ - "op_type": "roll", - "op_inputs": { - "X": ["input_data"] - }, - "op_outputs": { - "Out": ["roll_output_data"] - }, - "op_attrs": dics[0] - }] + dics = [ + { + "axis": axis, + "shifts": shifts, + } + ] + + ops_config = [ + { + "op_type": "roll", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": {"Out": ["roll_output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig(data_gen=partial(generate_input1, dics)) + "input_data": TensorConfig( + data_gen=partial(generate_input1, dics) + ) }, - outputs=["roll_output_data"]) + outputs=["roll_output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "input_data": [1, 56, 56, 192] @@ -103,19 +103,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-4 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-4 + attrs, True + ), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scatter_nd_add.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scatter_nd_add.py new file mode 100644 index 00000000000000..9376ade22a2be5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_scatter_nd_add.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List +import unittest + + +class TrtConvertScatterNd(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(): + return np.random.random([6]).astype(np.float32) + + def generate_input2(): + return np.random.random([4, 1]).astype(np.int32) + + def generate_input3(): + return np.random.random([4]).astype(np.float32) + + ops_config = [ + { + "op_type": "scatter_nd_add", + "op_inputs": { + "X": ["input_data"], + "Index": ["index_data"], + "Updates": ["update_data"], + }, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": {}, + } + ] + ops = self.generate_op_config(ops_config) + for i in range(10): + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1) + ), + "index_data": TensorConfig( + data_gen=partial(generate_input2) + ), + "update_data": TensorConfig( + data_gen=partial(generate_input3) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1], + "index_data": [2, 1], + "update_data": [1], + } + self.dynamic_shape.max_input_shape = { + "input_data": [6], + "index_data": [4, 1], + "update_data": [4], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [6], + "index_data": [4, 1], + "update_data": [4], + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (0, 5), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (0, 5), 1e-3 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 4), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 4), 1e-3 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shape.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shape.py index 610f4a9425fbaf..363152a4d00ec2 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shape.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shape.py @@ -22,12 +22,10 @@ class TrtConvertSumTest(TrtLayerAutoScanTest): - def is_program_valid(self, 
program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(batch): if self.dims == 4: return np.ones([batch, 3, 24, 24]).astype(np.float32) @@ -41,31 +39,31 @@ def generate_input1(batch): for dims in [1, 2, 3, 4]: for batch in [1, 4]: self.dims = dims - ops_config = [{ - "op_type": "shape", - "op_inputs": { - "Input": ["input1"] - }, - "op_outputs": { - "Out": ["output"] - }, - "op_attrs": {} - }] + ops_config = [ + { + "op_type": "shape", + "op_inputs": {"Input": ["input1"]}, + "op_outputs": {"Out": ["output"]}, + "op_attrs": {}, + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input1": - TensorConfig(data_gen=partial(generate_input1, batch)) + "input1": TensorConfig( + data_gen=partial(generate_input1, batch) + ) }, - outputs=["output"]) + outputs=["output"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(): if self.dims == 4: self.dynamic_shape.min_input_shape = {"input1": [1, 3, 24, 24]} @@ -87,7 +85,7 @@ def generate_dynamic_shape(): } def generate_trt_nodes_num(dynamic_shape): - if (not dynamic_shape): + if not dynamic_shape: return 0, 3 return 1, 2 @@ -100,17 +98,19 @@ def clear_dynamic_shape(): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - False), 1e-5 + False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - False), 1e-5 + False + ), 1e-3 # for dynamic_shape generate_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num(True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), generate_trt_nodes_num(True), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num(True), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py index a53b61a00727bc..f27bed02771029 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_shuffle_channel.py @@ -22,44 +22,41 @@ class TrtConvertShuffleChannelTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]], batch): return np.ones([batch, 6, 24, 24]).astype(np.float32) for batch in [1, 2, 4]: for group in [1, 2, 3]: dics = [{"group": group}, {}] - ops_config = [{ - "op_type": "shuffle_channel", - "op_inputs": { - "X": ["shuffle_channel_input"] - }, - "op_outputs": { - "Out": ["shuffle_channel_out"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "shuffle_channel", + "op_inputs": {"X": ["shuffle_channel_input"]}, + "op_outputs": {"Out": ["shuffle_channel_out"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "shuffle_channel_input": - TensorConfig( - data_gen=partial(generate_input1, dics, batch)) + "shuffle_channel_input": 
TensorConfig( + data_gen=partial(generate_input1, dics, batch) + ) }, - outputs=["shuffle_channel_out"]) + outputs=["shuffle_channel_out"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = { "shuffle_channel_input": [1, 6, 24, 24] @@ -78,8 +75,10 @@ def clear_dynamic_shape(): def generate_trt_nodes_num(attrs, dynamic_shape): ver = paddle_infer.get_trt_compile_version() - if ver[0] * 1000 + ver[1] * 100 + ver[ - 2] * 10 < 8000 and dynamic_shape == True: + if ( + ver[0] * 1000 + ver[1] * 100 + ver[2] * 10 < 8000 + and dynamic_shape == True + ): return 0, 3 else: return 1, 2 @@ -92,19 +91,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def add_skip_trt_case(self): pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py index deac7ef9d2a14c..a41af25b1a6b44 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_slice.py @@ -22,7 +22,6 @@ class TrtConvertSliceTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs weights = program_config.weights @@ -34,13 +33,17 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: start = 0 end = 0 if attrs[0]["starts"][x] < 0: - start = attrs[0]["starts"][x] + inputs['input_data'].shape[ - attrs[0]["axes"][x]] + start = ( + attrs[0]["starts"][x] + + inputs['input_data'].shape[attrs[0]["axes"][x]] + ) else: start = attrs[0]["starts"][x] if attrs[0]["ends"][x] < 0: - end = attrs[0]["ends"][x] + inputs['input_data'].shape[ - attrs[0]["axes"][x]] + end = ( + attrs[0]["ends"][x] + + inputs['input_data'].shape[attrs[0]["axes"][x]] + ) else: end = attrs[0]["ends"][x] start = max(0, start) @@ -51,12 +54,11 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: for x in attrs[0]["decrease_axis"]: if x < 0: return False - if (out_shape[x] != 1): + if out_shape[x] != 1: return False return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]]): return np.random.random([6, 6, 64, 64]).astype(np.float32) @@ -65,41 +67,44 @@ def generate_input1(attrs: List[Dict[str, Any]]): for ends in [[2, 2], [5, 5], [1, -1]]: for decrease_axis in [[], [1], [2], [-1], [-100]]: for infer_flags in [[-1]]: - dics = [{ - "axes": axes, - "starts": starts, - "ends": ends, - "decrease_axis": decrease_axis, - "infer_flags": infer_flags - }] - - ops_config = [{ - "op_type": "slice", - 
"op_inputs": { - "Input": ["input_data"] - }, - "op_outputs": { - "Out": ["slice_output_data"] - }, - "op_attrs": dics[0] - }] + dics = [ + { + "axes": axes, + "starts": starts, + "ends": ends, + "decrease_axis": decrease_axis, + "infer_flags": infer_flags, + } + ] + + ops_config = [ + { + "op_type": "slice", + "op_inputs": {"Input": ["input_data"]}, + "op_outputs": { + "Out": ["slice_output_data"] + }, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig( - data_gen=partial(generate_input1, dics)) + "input_data": TensorConfig( + data_gen=partial(generate_input1, dics) + ) }, - outputs=["slice_output_data"]) + outputs=["slice_output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} self.dynamic_shape.max_input_shape = {"input_data": [8, 8, 64, 64]} @@ -125,19 +130,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-4 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-4 + attrs, True + ), 1e-3 def test(self): # TODO(inference): fix. 
diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py index b6cef5ca17bdca..6a65382baf4b5d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_softmax.py @@ -126,7 +126,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, False), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) @@ -135,7 +135,7 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py index e8c283acc3b8fe..0d81712209bbb9 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_split.py @@ -22,7 +22,6 @@ class TrtConvertSplitTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs weights = program_config.weights @@ -35,13 +34,13 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: if len(inputs['split_input'].shape) <= attrs[0]['axis']: return False - #Sections and num cannot both be equal to 0. + # Sections and num cannot both be equal to 0. if len(attrs[0]['sections']) == 0: if attrs[0]['num'] == 0: return False - #When sections and num are not both equal to 0, sections has higher priority. - #The sum of sections should be equal to the input size. + # When sections and num are not both equal to 0, sections has higher priority. + # The sum of sections should be equal to the input size. if len(attrs[0]['sections']) != 0: if attrs[0]['num'] != 0: return False @@ -53,16 +52,18 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: if sum != inputs['split_input'].shape[attrs[0]['axis']]: return False - #The size of num should be equal to the input dimension. + # The size of num should be equal to the input dimension. 
if attrs[0]['num'] != 0: if len(outputs) != attrs[0]['num']: return False - #Test AxisTensor and SectionsTensorList + # Test AxisTensor and SectionsTensorList if self.num_input == 0: - if self.dims == 2 and attrs[0]['sections'] == [ - 10, 14 - ] and len(outputs) == 2: + if ( + self.dims == 2 + and attrs[0]['sections'] == [10, 14] + and len(outputs) == 2 + ): return True else: return False @@ -70,7 +71,6 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]], batch): if self.dims == 4: return np.random.random([batch, 3, 3, 24]).astype(np.float32) @@ -93,72 +93,95 @@ def generate_SectionsTensorList2(attrs: List[Dict[str, Any]]): for num_input in [0, 1]: for dims in [1, 2, 3, 4]: for batch in [3, 6, 9]: - for Out in [["output_var0", "output_var1"], - ["output_var0", "output_var1", "output_var2"]]: - for sections in [[], [1, 2], [2, 1], [10, 14], - [1, 1, 1], [2, 2, 2], [3, 3, 3], - [3, 7, 14]]: + for Out in [ + ["output_var0", "output_var1"], + ["output_var0", "output_var1", "output_var2"], + ]: + for sections in [ + [], + [1, 2], + [2, 1], + [10, 14], + [1, 1, 1], + [2, 2, 2], + [3, 3, 3], + [3, 7, 14], + ]: for num in [0, 3]: for axis in [0, 1, 2, 3]: self.batch = batch self.num_input = num_input self.dims = dims - dics = [{ - "sections": sections, - "num": num, - "axis": axis - }, {}] - - dics_intput = [{ - "X": ["split_input"], - "AxisTensor": ["AxisTensor"], - "SectionsTensorList": [ - "SectionsTensorList1", - "SectionsTensorList2" - ] - }, { - "X": ["split_input"] - }] - dics_intputs = [{ - "AxisTensor": - TensorConfig(data_gen=partial( - generate_AxisTensor, dics)), - "SectionsTensorList1": - TensorConfig(data_gen=partial( - generate_SectionsTensorList1, - dics)), - "SectionsTensorList2": - TensorConfig(data_gen=partial( - generate_SectionsTensorList2, dics)) - }, {}] - - ops_config = [{ - "op_type": - "split", - "op_inputs": - dics_intput[num_input], - "op_outputs": { - "Out": Out + dics = [ + { + "sections": sections, + "num": num, + "axis": axis, + }, + {}, + ] + + dics_intput = [ + { + "X": ["split_input"], + "AxisTensor": ["AxisTensor"], + "SectionsTensorList": [ + "SectionsTensorList1", + "SectionsTensorList2", + ], }, - "op_attrs": - dics[0] - }] + {"X": ["split_input"]}, + ] + dics_intputs = [ + { + "AxisTensor": TensorConfig( + data_gen=partial( + generate_AxisTensor, dics + ) + ), + "SectionsTensorList1": TensorConfig( + data_gen=partial( + generate_SectionsTensorList1, + dics, + ) + ), + "SectionsTensorList2": TensorConfig( + data_gen=partial( + generate_SectionsTensorList2, + dics, + ) + ), + }, + {}, + ] + + ops_config = [ + { + "op_type": "split", + "op_inputs": dics_intput[num_input], + "op_outputs": {"Out": Out}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights=dics_intputs[num_input], inputs={ - "split_input": - TensorConfig(data_gen=partial( - generate_input1, dics, batch)) + "split_input": TensorConfig( + data_gen=partial( + generate_input1, dics, batch + ) + ) }, - outputs=Out) + outputs=Out, + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.dims == 4: self.dynamic_shape.min_input_shape = { @@ -216,30 +239,35 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() 
self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def add_skip_trt_case(self): - def teller1(program_config, predictor_config): if len(program_config.weights) == 3: return True return False self.add_skip_case( - teller1, SkipReasons.TRT_NOT_SUPPORT, - "INPUT AxisTensor AND SectionsTensorList NOT SUPPORT.") + teller1, + SkipReasons.TRT_NOT_SUPPORT, + "INPUT AxisTensor AND SectionsTensorList NOT SUPPORT.", + ) def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_squeeze2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_squeeze2.py index f82791a5912335..b8a905f0471187 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_squeeze2.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_squeeze2.py @@ -22,7 +22,6 @@ class TrtConvertSplitTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs attrs = [ @@ -40,25 +39,25 @@ def sample_program_configs(self): self.dims = dims self.axes = axes dics = [{"axes": axes}] - ops_config = [{ - "op_type": "squeeze2", - "op_inputs": { - "X": ["in_data"] - }, - "op_outputs": { - "Out": ["out_data"], - "XShape": ["XShape_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "squeeze2", + "op_inputs": {"X": ["in_data"]}, + "op_outputs": { + "Out": ["out_data"], + "XShape": ["XShape_data"], + }, + "op_attrs": dics[0], + } + ] # new_axes is the update of axes new_axes = list(axes) for i in range(len(new_axes)): - if (new_axes[i] < 0): + if new_axes[i] < 0: new_axes[i] += dims - if (max(new_axes) >= dims): + if max(new_axes) >= dims: continue - # generate input data + # generate input data self.input_shape = [1] * dims for i in range(dims): self.input_shape[i] = np.random.randint(1, 20) @@ -68,24 +67,26 @@ def generate_input1(attrs: List[Dict[str, Any]], batch): for i in new_axes: self.input_shape[i] = 1 return np.random.random(self.input_shape).astype( - np.float32) + np.float32 + ) ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "in_data": - TensorConfig( - data_gen=partial(generate_input1, dics, batch)) + "in_data": TensorConfig( + data_gen=partial(generate_input1, dics, batch) + ) }, - outputs=["out_data"]) + outputs=["out_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): max_shape = list(self.input_shape) min_shape = list(self.input_shape) @@ -112,19 +113,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), 
generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def add_skip_trt_case(self): pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py index cfae56fc2b630e..90047400a33724 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_stack.py @@ -22,7 +22,6 @@ class TrtConvertStackTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs weights = program_config.weights @@ -31,14 +30,13 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - #The input dimension should be less than the set axis. + # The input dimension should be less than the set axis. if len(inputs['stack_input1'].shape) < attrs[0]['axis']: return False return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]], batch): if self.dims == 4: return np.random.random([batch, 3, 24, 24]).astype(np.float32) @@ -74,103 +72,107 @@ def generate_input3(attrs: List[Dict[str, Any]], batch): for axis in [-2, -1, 0, 1, 2, 3]: self.dims = dims dics = [{"axis": axis}, {}] - ops_config = [{ - "op_type": "stack", - "op_inputs": { - "X": - ["stack_input1", "stack_input2", "stack_input3"] - }, - "op_outputs": { - "Y": ["stack_output"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "stack", + "op_inputs": { + "X": [ + "stack_input1", + "stack_input2", + "stack_input3", + ] + }, + "op_outputs": {"Y": ["stack_output"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "stack_input1": - TensorConfig( - data_gen=partial(generate_input1, dics, batch)), - "stack_input2": - TensorConfig( - data_gen=partial(generate_input2, dics, batch)), - "stack_input3": - TensorConfig( - data_gen=partial(generate_input3, dics, batch)) + "stack_input1": TensorConfig( + data_gen=partial(generate_input1, dics, batch) + ), + "stack_input2": TensorConfig( + data_gen=partial(generate_input2, dics, batch) + ), + "stack_input3": TensorConfig( + data_gen=partial(generate_input3, dics, batch) + ), }, - outputs=["stack_output"]) + outputs=["stack_output"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.dims == 4: self.dynamic_shape.min_input_shape = { "stack_input1": [1, 3, 24, 24], "stack_input2": [1, 3, 24, 24], - "stack_input3": [1, 3, 24, 24] + "stack_input3": [1, 3, 24, 24], } self.dynamic_shape.max_input_shape = { "stack_input1": [4, 3, 48, 48], "stack_input2": [4, 3, 48, 48], - "stack_input3": [4, 3, 48, 
48] + "stack_input3": [4, 3, 48, 48], } self.dynamic_shape.opt_input_shape = { "stack_input1": [1, 3, 24, 24], "stack_input2": [1, 3, 24, 24], - "stack_input3": [1, 3, 24, 24] + "stack_input3": [1, 3, 24, 24], } elif self.dims == 3: self.dynamic_shape.min_input_shape = { "stack_input1": [1, 3, 24], "stack_input2": [1, 3, 24], - "stack_input3": [1, 3, 24] + "stack_input3": [1, 3, 24], } self.dynamic_shape.max_input_shape = { "stack_input1": [4, 3, 48], "stack_input2": [4, 3, 48], - "stack_input3": [4, 3, 48] + "stack_input3": [4, 3, 48], } self.dynamic_shape.opt_input_shape = { "stack_input1": [1, 3, 24], "stack_input2": [1, 3, 24], - "stack_input3": [1, 3, 24] + "stack_input3": [1, 3, 24], } elif self.dims == 2: self.dynamic_shape.min_input_shape = { "stack_input1": [1, 24], "stack_input2": [1, 24], - "stack_input3": [1, 24] + "stack_input3": [1, 24], } self.dynamic_shape.max_input_shape = { "stack_input1": [4, 48], "stack_input2": [4, 48], - "stack_input3": [4, 48] + "stack_input3": [4, 48], } self.dynamic_shape.opt_input_shape = { "stack_input1": [1, 24], "stack_input2": [1, 24], - "stack_input3": [1, 24] + "stack_input3": [1, 24], } elif self.dims == 1: self.dynamic_shape.min_input_shape = { "stack_input1": [24], "stack_input2": [24], - "stack_input3": [24] + "stack_input3": [24], } self.dynamic_shape.max_input_shape = { "stack_input1": [48], "stack_input2": [48], - "stack_input3": [48] + "stack_input3": [48], } self.dynamic_shape.opt_input_shape = { "stack_input1": [24], "stack_input2": [24], - "stack_input3": [24] + "stack_input3": [24], } def clear_dynamic_shape(): @@ -191,19 +193,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def add_skip_trt_case(self): pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py index beea119c79fc0b..319a4be5a745bc 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_strided_slice.py @@ -34,7 +34,7 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]]): - return np.ones([1, 56, 56, 192]).astype(np.float32) + return np.random.random([1, 56, 56, 192]).astype(np.float32) for axes in [[1, 2]]: for starts in [[1, 1]]: @@ -130,5 +130,88 @@ def test(self): self.run_test() +class TrtConvertStridedSliceTest2(TrtLayerAutoScanTest): + + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + + def generate_input1(attrs: List[Dict[str, Any]]): + return np.random.random([1, 56, 56, 192]).astype(np.float32) + + for axes in 
[[1, 2], [2, 3], [1, 3]]: + for starts in [[-10, 1], [-10, 20], [-10, 15], [-10, 16], [-10, + 20]]: + for ends in [[-9, 10000], [-9, -1], [-9, 40]]: + for decrease_axis in [[]]: + for infer_flags in [[1, 1]]: + for strides in [[2, 2]]: + dics = [{ + "axes": axes, + "starts": starts, + "ends": ends, + "decrease_axis": [axes[0]], + "infer_flags": infer_flags, + "strides": strides + }] + + ops_config = [{ + "op_type": "strided_slice", + "op_inputs": { + "Input": ["input_data"] + }, + "op_outputs": { + "Out": ["slice_output_data"] + }, + "op_attrs": dics[0] + }] + ops = self.generate_op_config(ops_config) + + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": + TensorConfig(data_gen=partial( + generate_input1, dics)) + }, + outputs=["slice_output_data"]) + + yield program_config + + def sample_predictor_configs( + self, program_config) -> (paddle_infer.Config, List[int], float): + + def generate_dynamic_shape(): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 56, 56, 192] + } + self.dynamic_shape.max_input_shape = { + "input_data": [8, 100, 100, 200] + } + self.dynamic_shape.opt_input_shape = { + "input_data": [4, 56, 56, 192] + } + + def clear_dynamic_shape(): + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + + # for dynamic_shape + generate_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + + def test(self): + self.run_test() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_sum.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_sum.py index a982a26362f473..819115fb595026 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_sum.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_sum.py @@ -22,12 +22,10 @@ class TrtConvertSumTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(batch): if self.dims == 4: return np.ones([batch, 3, 24, 24]).astype(np.float32) @@ -61,99 +59,101 @@ def generate_input3(batch): for dims in [1, 2, 3, 4]: for batch in [1, 4]: self.dims = dims - ops_config = [{ - "op_type": "sum", - "op_inputs": { - "X": ["input1", "input2", "input3"] - }, - "op_outputs": { - "Out": ["output"] - }, - "op_attrs": {} - }] + ops_config = [ + { + "op_type": "sum", + "op_inputs": {"X": ["input1", "input2", "input3"]}, + "op_outputs": {"Out": ["output"]}, + "op_attrs": {}, + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input1": - TensorConfig(data_gen=partial(generate_input1, batch)), - "input2": - TensorConfig(data_gen=partial(generate_input2, batch)), - "input3": - TensorConfig(data_gen=partial(generate_input3, batch)) + "input1": TensorConfig( + data_gen=partial(generate_input1, batch) + ), + "input2": TensorConfig( + data_gen=partial(generate_input2, batch) + ), + "input3": TensorConfig( + data_gen=partial(generate_input3, batch) + ), }, - outputs=["output"]) + outputs=["output"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - 
+ self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(): if self.dims == 4: self.dynamic_shape.min_input_shape = { "input1": [1, 3, 24, 24], "input2": [1, 3, 24, 24], - "input3": [1, 3, 24, 24] + "input3": [1, 3, 24, 24], } self.dynamic_shape.max_input_shape = { "input1": [4, 3, 48, 48], "input2": [4, 3, 48, 48], - "input3": [4, 3, 48, 48] + "input3": [4, 3, 48, 48], } self.dynamic_shape.opt_input_shape = { "input1": [1, 3, 24, 24], "input2": [1, 3, 24, 24], - "input3": [1, 3, 24, 24] + "input3": [1, 3, 24, 24], } elif self.dims == 3: self.dynamic_shape.min_input_shape = { "input1": [1, 3, 24], "input2": [1, 3, 24], - "input3": [1, 3, 24] + "input3": [1, 3, 24], } self.dynamic_shape.max_input_shape = { "input1": [4, 3, 48], "input2": [4, 3, 48], - "input3": [4, 3, 48] + "input3": [4, 3, 48], } self.dynamic_shape.opt_input_shape = { "input1": [1, 3, 24], "input2": [1, 3, 24], - "input3": [1, 3, 24] + "input3": [1, 3, 24], } elif self.dims == 2: self.dynamic_shape.min_input_shape = { "input1": [1, 24], "input2": [1, 24], - "input3": [1, 24] + "input3": [1, 24], } self.dynamic_shape.max_input_shape = { "input1": [4, 48], "input2": [4, 48], - "input3": [4, 48] + "input3": [4, 48], } self.dynamic_shape.opt_input_shape = { "input1": [1, 24], "input2": [1, 24], - "input3": [1, 24] + "input3": [1, 24], } elif self.dims == 1: self.dynamic_shape.min_input_shape = { "input1": [24], "input2": [24], - "input3": [24] + "input3": [24], } self.dynamic_shape.max_input_shape = { "input1": [48], "input2": [48], - "input3": [48] + "input3": [48], } self.dynamic_shape.opt_input_shape = { "input1": [24], "input2": [24], - "input3": [24] + "input3": [24], } def clear_dynamic_shape(): @@ -162,7 +162,7 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(dynamic_shape): - if (self.dims == 1 and not dynamic_shape): + if self.dims == 1 and not dynamic_shape: return 0, 5 return 1, 4 @@ -170,17 +170,19 @@ def generate_trt_nodes_num(dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - False), 1e-5 + False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - False), 1e-5 + False + ), 1e-3 # for dynamic_shape generate_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num(True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), generate_trt_nodes_num(True), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num(True), 1e-3 def test(self): self.run_test() @@ -188,12 +190,10 @@ def test(self): # special case when sum having olny one input class TrtConvertSumTest1(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self): - def generate_input1(batch): if self.dims == 4: return np.ones([batch, 3, 24, 24]).astype(np.float32) @@ -207,31 +207,31 @@ def generate_input1(batch): for dims in [1, 2, 3, 4]: for batch in [1, 4]: self.dims = dims - ops_config = [{ - "op_type": "sum", - "op_inputs": { - "X": ["input1"] - }, - "op_outputs": { - "Out": ["output"] - }, - "op_attrs": {} - }] + ops_config = [ + { + "op_type": "sum", + "op_inputs": {"X": ["input1"]}, + "op_outputs": {"Out": ["output"]}, + "op_attrs": {}, + } + ] ops = 
self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input1": - TensorConfig(data_gen=partial(generate_input1, batch)), + "input1": TensorConfig( + data_gen=partial(generate_input1, batch) + ), }, - outputs=["output"]) + outputs=["output"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(): if self.dims == 4: self.dynamic_shape.min_input_shape = {"input1": [1, 3, 24, 24]} @@ -268,7 +268,7 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(dynamic_shape): - if (self.dims == 1 and not dynamic_shape): + if self.dims == 1 and not dynamic_shape: return 0, 3 return 1, 2 @@ -276,17 +276,19 @@ def generate_trt_nodes_num(dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - False), 1e-5 + False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - False), 1e-5 + False + ), 1e-3 # for dynamic_shape generate_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num(True), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half - yield self.create_inference_config(), generate_trt_nodes_num(True), 1e-5 + yield self.create_inference_config(), generate_trt_nodes_num(True), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py index 82c707869f88c1..1176b3df3bac70 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_tile.py @@ -26,7 +26,6 @@ class TrtConvertTileTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs attrs = [ @@ -39,38 +38,37 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: return True def sample_program_configs(self, *args, **kwargs): - def generate_input1(attrs: List[Dict[str, Any]]): return np.ones([1, 2, 3, 4]).astype(np.float32) dics = [{"repeat_times": kwargs['repeat_times']}] - ops_config = [{ - "op_type": "tile", - "op_inputs": { - "X": ["input_data"] - }, - "op_outputs": { - "Out": ["tile_output_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "tile", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": {"Out": ["tile_output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig(data_gen=partial(generate_input1, dics)) + "input_data": TensorConfig( + data_gen=partial(generate_input1, dics) + ) }, - outputs=["tile_output_data"]) + outputs=["tile_output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): self.dynamic_shape.min_input_shape = {"input_data": [1, 3, 32, 32]} self.dynamic_shape.max_input_shape = {"input_data": [4, 3, 64, 64]} @@ -99,19 +97,23 @@ def 
generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-4 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-4 + attrs, True + ), 1e-3 @given(repeat_times=st.sampled_from([[100], [1, 2], [0, 3], [1, 2, 100]])) def test(self, *args, **kwargs): diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k.py index 28509d42ee30b1..8f779a64bf488d 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k.py @@ -22,7 +22,6 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -44,34 +43,37 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): for k in [1, 3]: self.dims = dims dics = [{"k": k}] - ops_config = [{ - "op_type": "top_k", - "op_inputs": { - "X": ["input_data"] - }, - "op_outputs": { - "Out": ["output_data"], - "Indices": ["indices_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "top_k", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": { + "Out": ["output_data"], + "Indices": ["indices_data"], + }, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig(data_gen=partial( - generate_input1, dims, batch, dics)) + "input_data": TensorConfig( + data_gen=partial( + generate_input1, dims, batch, dics + ) + ) }, - outputs=["output_data", "indices_data"]) + outputs=["output_data", "indices_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} @@ -114,19 +116,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 ## for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def test(self): self.run_test() diff --git 
a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py index 651cc00d2cd7a6..33d6ca0a74eb78 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_top_k_v2.py @@ -22,7 +22,6 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs attrs = [ @@ -53,40 +52,48 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): for sort in [True, False]: self.dims = dims self.sort = sort - dics = [{ - "k": k, - "axis": axis, - "largest": largest, - "sorted": sort - }] - ops_config = [{ - "op_type": "top_k_v2", - "op_inputs": { - "X": ["input_data"] - }, - "op_outputs": { - "Out": ["output_data"], - "Indices": ["indices_data"] - }, - "op_attrs": dics[0] - }] + dics = [ + { + "k": k, + "axis": axis, + "largest": largest, + "sorted": sort, + } + ] + ops_config = [ + { + "op_type": "top_k_v2", + "op_inputs": {"X": ["input_data"]}, + "op_outputs": { + "Out": ["output_data"], + "Indices": ["indices_data"], + }, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig(data_gen=partial( - generate_input1, dims, batch, dics)) + "input_data": TensorConfig( + data_gen=partial( + generate_input1, + dims, + batch, + dics, + ) + ) }, - outputs=["output_data", "indices_data"]) + outputs=["output_data", "indices_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} @@ -131,19 +138,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py index e9604925e4ac50..5766f939396d43 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_transpose.py @@ -22,7 +22,6 @@ class TrtConvertTransposeTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: inputs = program_config.inputs weights = program_config.weights @@ -32,14 +31,13 @@ def is_program_valid(self, program_config: ProgramConfig) -> bool: program_config.ops[i].attrs for i in range(len(program_config.ops)) ] - #The 
shape of input and axis should be equal. + # The shape of input and axis should be equal. if len(inputs['transpose_input'].shape) != len(attrs[0]['axis']): return False return True def sample_program_configs(self): - def generate_input1(attrs: List[Dict[str, Any]], batch): if self.dims == 4: return np.ones([batch, 3, 24, 24]).astype(np.float32) @@ -50,37 +48,43 @@ def generate_input1(attrs: List[Dict[str, Any]], batch): for dims in [2, 3, 4]: for batch in [1, 2, 4]: - for axis in [[0, 1, 3, 2], [0, 3, 2, 1], [3, 2, 0, 1], - [0, 1, 2, 3], [0, 1, 2], [2, 0, 1], [1, 0], [0, - 1]]: + for axis in [ + [0, 1, 3, 2], + [0, 3, 2, 1], + [3, 2, 0, 1], + [0, 1, 2, 3], + [0, 1, 2], + [2, 0, 1], + [1, 0], + [0, 1], + ]: self.dims = dims dics = [{"axis": axis}, {}] - ops_config = [{ - "op_type": "transpose", - "op_inputs": { - "X": ["transpose_input"] - }, - "op_outputs": { - "Out": ["transpose_out"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "transpose", + "op_inputs": {"X": ["transpose_input"]}, + "op_outputs": {"Out": ["transpose_out"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "transpose_input": - TensorConfig( - data_gen=partial(generate_input1, dics, batch)) + "transpose_input": TensorConfig( + data_gen=partial(generate_input1, dics, batch) + ) }, - outputs=["transpose_out"]) + outputs=["transpose_out"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.dims == 4: self.dynamic_shape.min_input_shape = { @@ -134,19 +138,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py index ca4231a3561bcf..40326fc8ca4bda 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unary.py @@ -22,7 +22,6 @@ class TrtConvertActivationTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -42,40 +41,54 @@ def generate_input1(dims, batch, attrs: List[Dict[str, Any]]): for dims in [1, 2, 3, 4]: for batch in [1, 4]: for op_type in [ - "exp", "log", "sqrt", "abs", "sin", "cos", "tan", - "sinh", "cosh", "asin", "acos", "atan", "asinh", - "atanh", "ceil", "floor" + "exp", + "log", + "sqrt", + "abs", + "sin", + "cos", + "tan", + "sinh", + "cosh", + "asin", + "acos", + "atan", + "asinh", + "atanh", + "ceil", + "floor", ]: self.dims = dims 
dics = [{}] - ops_config = [{ - "op_type": op_type, - "op_inputs": { - "X": ["input_data"] - }, - "op_outputs": { - "Out": ["output_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": op_type, + "op_inputs": {"X": ["input_data"]}, + "op_outputs": {"Out": ["output_data"]}, + "op_attrs": dics[0], + } + ] ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "input_data": - TensorConfig(data_gen=partial( - generate_input1, dims, batch, dics)) + "input_data": TensorConfig( + data_gen=partial( + generate_input1, dims, batch, dics + ) + ) }, - outputs=["output_data"]) + outputs=["output_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): if self.dims == 1: self.dynamic_shape.min_input_shape = {"input_data": [1]} @@ -118,19 +131,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def test(self): self.run_test() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unfold.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unfold.py new file mode 100644 index 00000000000000..5ec187daef4e91 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unfold.py @@ -0,0 +1,104 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from trt_layer_auto_scan_test import TrtLayerAutoScanTest +from program_config import TensorConfig, ProgramConfig +import numpy as np +import paddle.inference as paddle_infer +from functools import partial +from typing import List +import unittest + + +class TrtConvertUnfold(TrtLayerAutoScanTest): + def is_program_valid(self, program_config: ProgramConfig) -> bool: + return True + + def sample_program_configs(self): + def generate_input1(): + return np.random.random([1, 3, 24, 24]).astype(np.float32) + + ops_config = [ + { + "op_type": "unfold", + "op_inputs": { + "X": ["input_data"], + }, + "op_outputs": {"Y": ["output_data"]}, + "op_attrs": { + "dilations": [1, 1], + "kernel_sizes": [4, 4], + "paddings": [0, 0, 0, 0], + "strides": [1, 1], + }, + } + ] + ops = self.generate_op_config(ops_config) + for i in range(10): + program_config = ProgramConfig( + ops=ops, + weights={}, + inputs={ + "input_data": TensorConfig( + data_gen=partial(generate_input1) + ), + }, + outputs=["output_data"], + ) + + yield program_config + + def sample_predictor_configs( + self, program_config + ) -> (paddle_infer.Config, List[int], float): + def generate_dynamic_shape(attrs): + self.dynamic_shape.min_input_shape = { + "input_data": [1, 3, 4, 4], + } + self.dynamic_shape.max_input_shape = { + "input_data": [1, 3, 24, 24], + } + self.dynamic_shape.opt_input_shape = { + "input_data": [1, 3, 24, 24], + } + + def clear_dynamic_shape(): + self.dynamic_shape.max_input_shape = {} + self.dynamic_shape.min_input_shape = {} + self.dynamic_shape.opt_input_shape = {} + + attrs = [ + program_config.ops[i].attrs for i in range(len(program_config.ops)) + ] + + # for static_shape + clear_dynamic_shape() + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (0, 3), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (0, 3), 1e-3 + + # for dynamic_shape + generate_dynamic_shape(attrs) + self.trt_param.precision = paddle_infer.PrecisionType.Float32 + yield self.create_inference_config(), (1, 2), 1e-5 + self.trt_param.precision = paddle_infer.PrecisionType.Half + yield self.create_inference_config(), (1, 2), 1e-3 + + def test(self): + self.run_test() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unsqueeze2.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unsqueeze2.py index fc99da714f6846..9c92ea5493cf08 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unsqueeze2.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_unsqueeze2.py @@ -22,7 +22,6 @@ class TrtConvertSplitTest(TrtLayerAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: return True @@ -34,17 +33,17 @@ def sample_program_configs(self): self.dims = dims self.axes = axes dics = [{"axes": axes}] - ops_config = [{ - "op_type": "unsqueeze2", - "op_inputs": { - "X": ["in_data"] - }, - "op_outputs": { - "Out": ["out_data"], - "XShape": ["XShape_data"] - }, - "op_attrs": dics[0] - }] + ops_config = [ + { + "op_type": "unsqueeze2", + "op_inputs": {"X": ["in_data"]}, + "op_outputs": { + "Out": ["out_data"], + "XShape": ["XShape_data"], + }, + "op_attrs": dics[0], + } + ] # generate input data self.input_shape = [1] * dims @@ -54,24 +53,26 @@ def sample_program_configs(self): def generate_input1(attrs: List[Dict[str, Any]], batch): self.input_shape[0] = batch return 
np.random.random(self.input_shape).astype( - np.float32) + np.float32 + ) ops = self.generate_op_config(ops_config) program_config = ProgramConfig( ops=ops, weights={}, inputs={ - "in_data": - TensorConfig( - data_gen=partial(generate_input1, dics, batch)) + "in_data": TensorConfig( + data_gen=partial(generate_input1, dics, batch) + ) }, - outputs=["out_data"]) + outputs=["out_data"], + ) yield program_config def sample_predictor_configs( - self, program_config) -> (paddle_infer.Config, List[int], float): - + self, program_config + ) -> (paddle_infer.Config, List[int], float): def generate_dynamic_shape(attrs): max_shape = list(self.input_shape) min_shape = list(self.input_shape) @@ -98,19 +99,23 @@ def generate_trt_nodes_num(attrs, dynamic_shape): clear_dynamic_shape() self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, False), 1e-5 + attrs, False + ), 1e-3 # for dynamic_shape generate_dynamic_shape(attrs) self.trt_param.precision = paddle_infer.PrecisionType.Float32 yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-5 self.trt_param.precision = paddle_infer.PrecisionType.Half yield self.create_inference_config(), generate_trt_nodes_num( - attrs, True), 1e-5 + attrs, True + ), 1e-3 def add_skip_trt_case(self): pass diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py index cebede99e6f82d..ddab5b8c52218b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_convert_yolo_box.py @@ -19,6 +19,7 @@ from functools import partial from typing import Optional, List, Callable, Dict, Any, Set import unittest +import os class TrtConvertYoloBoxTest(TrtLayerAutoScanTest): @@ -107,24 +108,30 @@ def generate_dynamic_shape(attrs): if attrs[0]['iou_aware'] == True: channel = 3 * (attrs[0]['class_num'] + 6) self.dynamic_shape.min_input_shape = { - "scale_input": [1, channel, 12, 12] + "yolo_box_input": [1, channel, 12, 12], + "imgsize": [1, 2] } self.dynamic_shape.max_input_shape = { - "scale_input": [4, channel, 24, 24] + "yolo_box_input": [4, channel, 24, 24], + "imgsize": [4, 2] } self.dynamic_shape.opt_input_shape = { - "scale_input": [1, channel, 24, 24] + "yolo_box_input": [1, channel, 24, 24], + "imgsize": [1, 2] } else: channel = 3 * (attrs[0]['class_num'] + 5) self.dynamic_shape.min_input_shape = { - "scale_input": [1, channel, 12, 12] + "yolo_box_input": [1, channel, 12, 12], + "imgsize": [1, 2] } self.dynamic_shape.max_input_shape = { - "scale_input": [4, channel, 24, 24] + "yolo_box_input": [4, channel, 24, 24], + "imgsize": [4, 2] } self.dynamic_shape.opt_input_shape = { - "scale_input": [1, channel, 24, 24] + "yolo_box_input": [1, channel, 24, 24], + "imgsize": [1, 2] } def clear_dynamic_shape(): @@ -133,10 +140,7 @@ def clear_dynamic_shape(): self.dynamic_shape.opt_input_shape = {} def generate_trt_nodes_num(attrs, dynamic_shape): - if dynamic_shape == True: - return 0, 5 - else: - return 1, 4 + return 1, 4 attrs = [ program_config.ops[i].attrs for i in range(len(program_config.ops)) @@ -160,7 +164,15 @@ def generate_trt_nodes_num(attrs, dynamic_shape): attrs, True), 1e-3 def 
add_skip_trt_case(self): - pass + + def teller2(program_config, predictor_config): + if len(self.dynamic_shape.min_input_shape) != 0 and os.name == 'nt': + return True + return False + + self.add_skip_case( + teller2, SkipReasons.TRT_NOT_SUPPORT, + "The output has diff between gpu and trt in Windows.") def test(self): self.add_skip_trt_case() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape2_matmul_fuse_pass.py deleted file mode 100644 index d2dca92345ad33..00000000000000 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_reshape2_matmul_fuse_pass.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from auto_scan_test import PassAutoScanTest, IgnoreReasons -from program_config import TensorConfig, ProgramConfig, OpConfig -import numpy as np -import paddle.inference as paddle_infer -from functools import partial -from typing import Optional, List, Callable, Dict, Any, Set -import unittest - -import hypothesis -from hypothesis import given, settings, seed, example, assume, reproduce_failure -import hypothesis.strategies as st - - -class TestReshape2MatmulFusePass(PassAutoScanTest): - """ - x_var - | - reshape2 - \ - reshape2_out_var y_var - \ / - matmul bias_var - \ / - elementwise_add - """ - - def sample_predictor_configs(self, program_config): - # TRT - config = self.create_trt_inference_config() - config.enable_tensorrt_engine( - max_batch_size=10, - workspace_size=102400, - min_subgraph_size=0, - precision_mode=paddle_infer.PrecisionType.Float32, - use_static=False, - use_calib_mode=False) - yield config, ['mul', 'elementwise_add'], (1e-4, 1e-1) - - def add_ignore_pass_case(self): - # Here we put some skip rules to avoid known bugs - def teller1(program_config, predictor_config): - y_shape = list(program_config.weights["matmul_y"].shape) - bias_shape = program_config.weights["bias"].shape - axis = program_config.ops[2].attrs["axis"] - # bias should be [mul_y_shape[-1]] - if axis == 0 or bias_shape[0] != y_shape[1] or len(bias_shape) != 1: - return True - return False - - self.add_ignore_check_case( - teller1, - IgnoreReasons.PASS_ACCURACY_ERROR, - "The pass error on TRT while shape of bias is not [out_size].", - ) - - def sample_program_config(self, draw): - # 1. Generate shape and attr of reshape2 - reshape = draw( - st.lists(st.integers(min_value=1, max_value=10), - min_size=2, - max_size=2)) - x_shape = reshape + [1, 1] - - # 2. Generate attr:transpose_X/transpose_Y/alpha of matmul - alpha = 1.0 - transpose_X = False - transpose_Y = False - - # 3. Generate legal shape of input:Y of matmul - y_shape = draw( - st.lists(st.integers(min_value=1, max_value=8), - min_size=2, - max_size=2)) - y_shape[0] = x_shape[1] - - # 4. 
Generate legal attr:axis of elementwise_add - axis = draw(st.integers(min_value=-1, max_value=1)) - if axis == 0: - axis = -1 - bias_shape = [ - y_shape[1], - ] - # if axis == -1: - # if draw(st.booleans()): - # bias_shape = [y_shape[1], ] - # else: - # bias_shape = [x_shape[0], y_shape[1]] - - reshape2_op = OpConfig( - "reshape2", - inputs={ - "X": ["reshape2_x"], - }, - shape=reshape, - outputs={ - "Out": ["reshape2_out"], - "XShape": ["xshape"] - }, - ) - matmul_op = OpConfig( - "matmul", - inputs={ - "X": ["reshape2_out"], - "Y": ["matmul_y"] - }, - outputs={"Out": ["matmul_out"]}, - alpha=alpha, - transpose_X=transpose_X, - transpose_Y=transpose_Y, - fused_reshape_X=[], - fused_reshape_Y=[], - fused_transpose_X=[], - fused_transpose_Y=[], - fused_reshape_Out=[], - fused_transpose_Out=[], - ) - - add_op = OpConfig( - "elementwise_add", - inputs={ - "X": ["matmul_out"], - "Y": ["bias"] - }, - outputs={"Out": ["add_out"]}, - axis=axis, - ) - - ops = [reshape2_op, matmul_op, add_op] - - program_config = ProgramConfig( - ops=ops, - weights={ - "matmul_y": TensorConfig(shape=y_shape), - "bias": TensorConfig(shape=bias_shape), - }, - inputs={ - "reshape2_x": TensorConfig(shape=x_shape), - }, - outputs=ops[-1].outputs["Out"], - ) - - return program_config - - def test(self): - self.run_and_statis(quant=False, - max_examples=50, - passes=["trt_reshape2_matmul_fuse_pass"]) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_squeeze2_matmul_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_squeeze2_matmul_fuse_pass.py index a52dd0aed84659..0dfb24fde660ce 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_squeeze2_matmul_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_squeeze2_matmul_fuse_pass.py @@ -47,7 +47,8 @@ def sample_predictor_configs(self, program_config): min_subgraph_size=0, precision_mode=paddle_infer.PrecisionType.Float32, use_static=False, - use_calib_mode=False) + use_calib_mode=False, + ) yield config, ['mul', 'elementwise_add'], (1e-4, 1e-1) def add_ignore_pass_case(self): @@ -70,9 +71,10 @@ def teller1(program_config, predictor_config): def sample_program_config(self, draw): # 1. Generate shape of input:X of squeeze2 x_shape = draw( - st.lists(st.integers(min_value=1, max_value=8), - min_size=2, - max_size=2)) + st.lists( + st.integers(min_value=1, max_value=8), min_size=2, max_size=2 + ) + ) # axes of squeeze2 == [2, 3] x_shape += [1, 1] axes = [2, 3] @@ -84,9 +86,10 @@ def sample_program_config(self, draw): # 3. Generate legal shape of input:Y of matmul y_shape = draw( - st.lists(st.integers(min_value=1, max_value=8), - min_size=2, - max_size=2)) + st.lists( + st.integers(min_value=1, max_value=8), min_size=2, max_size=2 + ) + ) y_shape[0] = x_shape[1] # 4. 
Generate legal attr:axis of elementwise_add @@ -108,17 +111,11 @@ def sample_program_config(self, draw): "X": ["squeeze2_x"], }, axes=axes, - outputs={ - "Out": ["squeeze2_out"], - "XShape": ["xshape"] - }, + outputs={"Out": ["squeeze2_out"], "XShape": ["xshape"]}, ) matmul_op = OpConfig( "matmul", - inputs={ - "X": ["squeeze2_out"], - "Y": ["matmul_y"] - }, + inputs={"X": ["squeeze2_out"], "Y": ["matmul_y"]}, outputs={"Out": ["matmul_out"]}, alpha=alpha, transpose_X=transpose_X, @@ -133,10 +130,7 @@ def sample_program_config(self, draw): add_op = OpConfig( "elementwise_add", - inputs={ - "X": ["matmul_out"], - "Y": ["bias"] - }, + inputs={"X": ["matmul_out"], "Y": ["bias"]}, outputs={"Out": ["add_out"]}, axis=axis, ) @@ -157,9 +151,11 @@ def sample_program_config(self, draw): return program_config def test(self): - self.run_and_statis(quant=False, - max_examples=50, - passes=["trt_squeeze2_matmul_fuse_pass"]) + self.run_and_statis( + quant=False, + max_examples=25, + passes=["trt_squeeze2_matmul_fuse_pass"], + ) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/ir/test_convert_to_mixed_precision.py b/python/paddle/fluid/tests/unittests/ir/test_convert_to_mixed_precision.py index deb4990cf5db36..6a4f77005b4b3f 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_convert_to_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/ir/test_convert_to_mixed_precision.py @@ -20,7 +20,7 @@ from paddle.jit import to_static from paddle.static import InputSpec -from paddle.inference import PrecisionType, BackendType +from paddle.inference import PrecisionType, PlaceType from paddle.inference import convert_to_mixed_precision @@ -38,7 +38,7 @@ def test_convert_to_fp16(self): 'resnet50/inference.pdiparams', 'mixed/inference.pdmodel', 'mixed/inference.pdiparams', - PrecisionType.Half, BackendType.GPU, True) + PrecisionType.Half, PlaceType.GPU, True) def test_convert_to_fp16_with_fp16_input(self): model = resnet50(True) @@ -49,7 +49,7 @@ def test_convert_to_fp16_with_fp16_input(self): 'resnet50/inference.pdiparams', 'mixed1/inference.pdmodel', 'mixed1/inference.pdiparams', - PrecisionType.Half, BackendType.GPU, False) + PrecisionType.Half, PlaceType.GPU, False) def test_convert_to_fp16_with_blacklist(self): model = resnet50(True) @@ -60,7 +60,7 @@ def test_convert_to_fp16_with_blacklist(self): 'resnet50/inference.pdiparams', 'mixed2/inference.pdmodel', 'mixed2/inference.pdiparams', - PrecisionType.Half, BackendType.GPU, False, + PrecisionType.Half, PlaceType.GPU, False, set('conv2d')) def test_convert_to_bf16(self): @@ -72,8 +72,7 @@ def test_convert_to_bf16(self): 'resnet50/inference.pdiparams', 'mixed3/inference.pdmodel', 'mixed3/inference.pdiparams', - PrecisionType.Bfloat16, BackendType.GPU, - True) + PrecisionType.Bfloat16, PlaceType.GPU, True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py b/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py index 40697f0a6e3b15..0411432aa369ef 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py +++ b/python/paddle/fluid/tests/unittests/ir/test_fuse_resnet_unit.py @@ -24,13 +24,15 @@ np.random.seed(0) -@unittest.skipIf(not paddle.is_compiled_with_cuda() - or paddle.get_cudnn_version() < 8000 - or paddle.device.cuda.get_device_capability()[0] < 7, - "only support with cuda and cudnn version is at least 8.0 " - "and device's compute capability is at least 7.0") +@unittest.skipIf( + not paddle.is_compiled_with_cuda() + or 
paddle.get_cudnn_version() < 8000 + or paddle.device.cuda.get_device_capability()[0] < 7 + or paddle.device.cuda.get_device_capability()[0] >= 9, + "only support with cuda and cudnn version is at least 8.0 " + "and device's compute capability is at least 7.0 and less than 9.0", +) class TestFuseResNetUnit(unittest.TestCase): - def test_fuse_resenet_unit(self): place = paddle.CUDAPlace(0) program = paddle.static.Program() @@ -38,14 +40,12 @@ def test_fuse_resenet_unit(self): with paddle.static.amp.fp16_guard(): with paddle.static.program_guard(program, startup_program): x = paddle.static.data("x", [1, 64, 64, 8]) - conv2d = paddle.nn.Conv2D(8, - 32, - 1, - bias_attr=False, - data_format='NHWC') - batch_norm = paddle.nn.BatchNorm(32, - act='relu', - data_layout='NHWC') + conv2d = paddle.nn.Conv2D( + 8, 32, 1, bias_attr=False, data_format='NHWC' + ) + batch_norm = paddle.nn.BatchNorm( + 32, act='relu', data_layout='NHWC' + ) out = batch_norm(conv2d(x)) graph = core.Graph(program.desc) core.get_pass("fuse_resnet_unit").apply(graph) @@ -54,15 +54,15 @@ def test_fuse_resenet_unit(self): after_params = paddle.static.amp.cast_model_to_fp16(after_program) exe = paddle.static.Executor(place) exe.run(startup_program) - paddle.static.amp.cast_parameters_to_fp16(place, - program, - to_fp16_var_names=params) paddle.static.amp.cast_parameters_to_fp16( - place, after_program, to_fp16_var_names=after_params) + place, program, to_fp16_var_names=params + ) + paddle.static.amp.cast_parameters_to_fp16( + place, after_program, to_fp16_var_names=after_params + ) feed = {"x": np.random.randn(1, 64, 64, 8).astype("float16")} before_out = exe.run(program, feed=feed, fetch_list=[out.name]) after_out = exe.run(after_program, feed=feed, fetch_list=[out.name]) - np.testing.assert_allclose(before_out[0], - after_out[0], - rtol=1e-05, - atol=0.005) + np.testing.assert_allclose( + before_out[0], after_out[0], rtol=1e-05, atol=0.005 + ) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py index ac880b02c4557e..b2e58514dca78b 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py @@ -128,19 +128,11 @@ def init_dtype(self): self.dtype = np.float32 -@skip_check_grad_ci(reason="not implemented yet") class TestMKLDNNHardSwishDim2(TestHardSwish): def setUp(self): super(TestMKLDNNHardSwishDim2, self).setUp() - - self.attrs["use_mkldnn"] = True - - def init_dtype(self): - self.dtype = np.float32 - - def test_check_grad(self): - pass + self.attrs = {"use_mkldnn": True} class TestMKLDNNSigmoidDim2(TestSigmoid): @@ -317,11 +309,14 @@ def init_dtype(self): def ref_hardswish(x, threshold=6.0, scale=6.0, offset=3.0): + x_dtype = x.dtype + if x_dtype == 'float16': + x_dtype = 'float16' + x = x.astype('float32') return (x * np.minimum(np.maximum(x + offset, 0.), threshold) / - scale).astype(x.dtype) + scale).astype(x_dtype) -@skip_check_grad_ci(reason="not implemented yet") class TestMKLDNNHardSwishDim4(TestHardSwish): def setUp(self): @@ -343,9 +338,6 @@ def setUp(self): def init_dtype(self): self.dtype = np.float32 - def test_check_grad(self): - pass - class TestMKLDNNMish(TestActivation): diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py index e70cc8e3779672..6029c777330308 100644 --- 
a/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_elementwise_sub_mkldnn_op.py @@ -20,6 +20,8 @@ from paddle.fluid.framework import _current_expected_place import paddle.fluid.core as core +import sys + @OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)), "GPU is not supported") @@ -108,6 +110,23 @@ def init_axis(self): self.axis = 1 +class TestMKLDNNElementwiseSubOp40(TestMKLDNNElementwiseSubOp): + + def init_input_output(self): + self.x = np.random.uniform(0.1, 2, [180, 1]).astype(self.dtype) + self.y = np.random.uniform(0.1, 1, [1, 256]).astype(self.dtype) + self.out = np.subtract(self.x, self.y) + + def test_check_grad_normal(self): + self.check_grad(['X', 'Y'], 'Out') + + def test_check_grad_ignore_x(self): + self.check_grad(['Y'], 'Out', no_grad_set=set("X")) + + def test_check_grad_ignore_y(self): + self.check_grad(['X'], 'Out', no_grad_set=set('Y')) + + class TestElementwiseSubOp_xsize_lessthan_ysize_sub(TestMKLDNNElementwiseSubOp): def init_input_output(self): diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_onnx_format_quantization_mobilenetv1.py b/python/paddle/fluid/tests/unittests/mkldnn/test_onnx_format_quantization_mobilenetv1.py index e59b70ec60c7a0..5667253e5cb73a 100755 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_onnx_format_quantization_mobilenetv1.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_onnx_format_quantization_mobilenetv1.py @@ -18,8 +18,6 @@ import random import math import functools -import contextlib -import tempfile import numpy as np from PIL import Image, ImageEnhance import paddle @@ -151,13 +149,11 @@ def setUp(self): self.infer_iterations = 50000 if os.environ.get( 'DATASET') == 'full' else 2 - self.root_path = tempfile.TemporaryDirectory() - self.int8_model = os.path.join(self.root_path.name, - "post_training_quantization") - print("self.int8_model: ", self.int8_model) + self.int8_model = "post_training_quantization" def tearDown(self): - self.root_path.cleanup() + cmd = 'rm -rf post_training_quantization' + os.system(cmd) pass def cache_unzipping(self, target_folder, zip_path): @@ -268,16 +264,8 @@ def generate_quantized_model(self, is_use_cache_file=False, is_optimize_model=False, onnx_format=False): - try: - os.system("mkdir " + self.int8_model) - except Exception as e: - print("Failed to create {} due to {}".format( - self.int8_model, str(e))) - sys.exit(-1) - place = fluid.CPUPlace() exe = fluid.Executor(place) - scope = fluid.global_scope() val_reader = val() ptq = PostTrainingQuantization(executor=exe, @@ -292,24 +280,6 @@ def generate_quantized_model(self, is_use_cache_file=is_use_cache_file) ptq.quantize() ptq.save_quantized_model(self.int8_model) - if onnx_format: - try: - collect_dict = ptq._calibration_scales - save_quant_table_path = os.path.join(self.int8_model, - 'calibration_table.txt') - with open(save_quant_table_path, 'w') as txt_file: - for tensor_name in collect_dict.keys(): - write_line = '{} {}'.format( - tensor_name, - collect_dict[tensor_name]['scale']) + '\n' - txt_file.write(write_line) - print( - "Quantization clip ranges of tensors is save in: {}".format( - save_quant_table_path)) - except: - print( - "Unable to generate `calibration_table.txt`, please update PaddlePaddle >= 2.3.3" - ) def run_test(self, model, @@ -329,12 +299,6 @@ def run_test(self, model_cache_folder = self.download_data(data_urls, data_md5s, model) - print("Start FP32 inference for {0} on {1} images ...".format( - 
model, infer_iterations * batch_size)) - (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program( - os.path.join(model_cache_folder, "model"), batch_size, - infer_iterations) - print("Start INT8 post training quantization for {0} on {1} images ...". format(model, sample_iterations * batch_size)) self.generate_quantized_model(os.path.join(model_cache_folder, "model"), @@ -342,6 +306,12 @@ def run_test(self, is_full_quantize, is_use_cache_file, is_optimize_model, onnx_format) + print("Start FP32 inference for {0} on {1} images ...".format( + model, infer_iterations * batch_size)) + (fp32_throughput, fp32_latency, fp32_acc1) = self.run_program( + os.path.join(model_cache_folder, "model"), batch_size, + infer_iterations) + print("Start INT8 inference for {0} on {1} images ...".format( model, infer_iterations * batch_size)) (int8_throughput, int8_latency, @@ -365,10 +335,10 @@ def run_test(self, self.assertLess(delta_value, diff_threshold) -class TestMKLDNNInt8ForMobilenetv1AvgONNXFormat(TestPostTrainingQuantization): +class TestMKLDNNInt8ForResnet50AvgONNXFormat(TestPostTrainingQuantization): - def test_onnx_format_avg_mobilenetv1(self): - model = "MobileNet-V1" + def test_onnx_format_avg_resnet50(self): + model = "resnet50" algo = "avg" round_type = "round" data_urls = [ @@ -397,96 +367,5 @@ def test_onnx_format_avg_mobilenetv1(self): onnx_format=True) -class TestMKLDNNInt8ForMobilenetv1Avg(TestPostTrainingQuantization): - - def test_avg_mobilenetv1(self): - model = "MobileNet-V1" - algo = "avg" - round_type = "round" - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - quantizable_op_type = [ - "conv2d", - "depthwise_conv2d", - "mul", - ] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = False - diff_threshold = 0 - self.run_test(model, - algo, - round_type, - data_urls, - data_md5s, - quantizable_op_type, - is_full_quantize, - is_use_cache_file, - is_optimize_model, - diff_threshold, - onnx_format=False) - - -class TestMKLDNNInt8ForMobilenetv1AbsMaxONNXFormat(TestPostTrainingQuantization - ): - - def test_onnx_format_abs_max_mobilenetv1(self): - model = "MobileNet-V1" - algo = "abs_max" - round_type = "round" - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = False - # The accuracy diff of post-training quantization (abs_max) maybe bigger - diff_threshold = 0 - self.run_test(model, - algo, - round_type, - data_urls, - data_md5s, - quantizable_op_type, - is_full_quantize, - is_use_cache_file, - is_optimize_model, - diff_threshold, - onnx_format=True) - - -class TestMKLDNNInt8ForMobilenetv1AbsMax(TestPostTrainingQuantization): - - def test_abs_max_mobilenetv1(self): - model = "MobileNet-V1" - algo = "abs_max" - round_type = "round" - data_urls = [ - 'http://paddle-inference-dist.bj.bcebos.com/int8/mobilenetv1_int8_model.tar.gz' - ] - data_md5s = ['13892b0716d26443a8cdea15b3c6438b'] - quantizable_op_type = ["conv2d", "depthwise_conv2d", "mul"] - is_full_quantize = False - is_use_cache_file = False - is_optimize_model = False - # The accuracy diff of post-training quantization (abs_max) maybe bigger - diff_threshold = 0 - self.run_test(model, - algo, - round_type, - data_urls, - data_md5s, - quantizable_op_type, - 
is_full_quantize, - is_use_cache_file, - is_optimize_model, - diff_threshold, - onnx_format=False) - - if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py index 5c1b8b602f2699..e7c50ec2880dc9 100644 --- a/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/sync_batch_norm_op_mlu.py @@ -34,7 +34,10 @@ from multiprocessing import Process import paddle.fluid.layers as layers from functools import reduce -from test_sync_batch_norm_base_mlu import TestSyncBatchNormRunnerBase, runtime_main +from test_sync_batch_norm_base_mlu import ( + TestSyncBatchNormRunnerBase, + runtime_main, +) from op_test import OpTest, _set_use_system_allocator from test_sync_batch_norm_op import create_or_get_tensor @@ -44,11 +47,11 @@ class TestSyncBatchNormOpTraining(TestSyncBatchNormRunnerBase): - def __init__(self): self.global_ring_id = 0 self.dtype = np.float32 + self.bn_dtype = np.float32 self.N = 8 self.C = 16 self.H = 32 @@ -56,29 +59,36 @@ def __init__(self): self.dshape = [self.N, self.C, self.H, self.W] self.atol = 1e-3 - def get_model(self, - main, - startup, - place, - layout, - seed, - sync_bn=False, - only_forward=False): + def get_model( + self, + main, + startup, + place, + layout, + seed, + sync_bn=False, + only_forward=False, + ): """Build program.""" use_cudnn = False with fluid.unique_name.guard(): with fluid.program_guard(main, startup): - data = fluid.layers.data(name='input', - shape=self.dshape, - dtype=self.dtype, - append_batch_size=False) + data = fluid.layers.data( + name='input', + shape=self.dshape, + dtype=self.dtype, + append_batch_size=False, + ) conv = fluid.layers.conv2d( input=data, num_filters=32, filter_size=1, param_attr=fluid.ParamAttr(name='conv2d_weight'), bias_attr=False, - use_cudnn=use_cudnn) + use_cudnn=use_cudnn, + ) + if self.bn_dtype == np.float16: + conv = fluid.layers.cast(conv, 'float16') bn = fluid.layers.batch_norm( conv, param_attr=fluid.ParamAttr(name='bn_scale'), @@ -86,9 +96,10 @@ def get_model(self, moving_mean_name='bn_moving_mean', moving_variance_name='bn_moving_variance', data_layout=layout, - is_test=only_forward) - # if self.dtype == np.float16: - # bn = fluid.layers.cast(bn, 'float32') + is_test=only_forward, + ) + if self.bn_dtype == np.float16: + bn = fluid.layers.cast(bn, 'float32') sigmoid = fluid.layers.sigmoid(bn) out = fluid.layers.reduce_sum(sigmoid) # if not sync_bn: diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py index 1b3ce96111573d..5ee747907564cb 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_api_base_mlu.py @@ -41,10 +41,10 @@ def DataTypeCast(date_type): class TestCollectiveAPIRunnerBase(object): - def get_model(self, train_prog, startup_prog, rank, indata=None): raise NotImplementedError( - "get model should be implemented by child class.") + "get model should be implemented by child class." 
+ ) def run_trainer(self, args): train_prog = fluid.Program() @@ -66,12 +66,12 @@ def run_trainer(self, args): fetch_list = [] for elem in result: fetch_list.append(elem.name) - out = exe.run(train_prog, - feed={'tindata': indata}, - fetch_list=fetch_list) + out = exe.run( + train_prog, feed={'tindata': indata}, fetch_list=fetch_list + ) else: out = self.get_model(train_prog, startup_prog, rank, indata) - #print(out, sys.stderr) + # print(out, sys.stderr) sys.stdout.buffer.write(pickle.dumps(out)) @@ -96,19 +96,20 @@ def runtime_main(test_class, col_type): class TestDistBase(unittest.TestCase): - def setUp(self): self._port_set = set() self._trainers = 2 self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( - self._find_free_port(), self._find_free_port()) + self._find_free_port(), + self._find_free_port(), + ) self._python_interp = sys.executable def _find_free_port(self): - def __free_port(): - with closing(socket.socket(socket.AF_INET, - socket.SOCK_STREAM)) as s: + with closing( + socket.socket(socket.AF_INET, socket.SOCK_STREAM) + ) as s: s.bind(('', 0)) return s.getsockname()[1] @@ -121,13 +122,13 @@ def __free_port(): def _run_cluster(self, model_file, envs): worker_endpoints = self._ps_endpoints.split(",") w0_ep, w1_ep = worker_endpoints - #print("w0_ep:",w0_ep," w1_ep:",w1_ep) + # print("w0_ep:",w0_ep," w1_ep:",w1_ep) env0 = { "FLAGS_selected_mlus": "0", "PADDLE_TRAINER_ID": "0", "PADDLE_TRAINERS_NUM": "2", "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, - "PADDLE_CURRENT_ENDPOINT": w0_ep + "PADDLE_CURRENT_ENDPOINT": w0_ep, } env1 = { @@ -135,9 +136,9 @@ def _run_cluster(self, model_file, envs): "PADDLE_TRAINER_ID": "1", "PADDLE_TRAINERS_NUM": "2", "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, - "PADDLE_CURRENT_ENDPOINT": w1_ep + "PADDLE_CURRENT_ENDPOINT": w1_ep, } - #update environment + # update environment env0.update(envs) env1.update(envs) if os.getenv('WITH_COVERAGE', 'OFF') == 'ON': @@ -148,16 +149,20 @@ def _run_cluster(self, model_file, envs): tr1_cmd = tr_cmd % (self._python_interp, model_file) tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w") tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w") - #print(tr0_cmd) - tr0_proc = subprocess.Popen(tr0_cmd.strip().split(), - stdout=subprocess.PIPE, - stderr=tr0_pipe, - env=env0) - - tr1_proc = subprocess.Popen(tr0_cmd.strip().split(), - stdout=subprocess.PIPE, - stderr=tr1_pipe, - env=env1) + # print(tr0_cmd) + tr0_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr0_pipe, + env=env0, + ) + + tr1_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr1_pipe, + env=env1, + ) tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() @@ -170,17 +175,23 @@ def _run_cluster(self, model_file, envs): sys.stderr.write('trainer 0 stderr file: %s\n' % f.read()) with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f: sys.stderr.write('trainer 1 stderr file: %s\n' % f.read()) - return pickle.loads(tr0_out), pickle.loads( - tr1_out), tr0_proc.pid, tr1_proc.pid - - def check_with_place(self, - model_file, - col_type, - data_type, - path_id="0", - static_mode="1", - check_error_log=False, - need_envs={}): + return ( + pickle.loads(tr0_out), + pickle.loads(tr1_out), + tr0_proc.pid, + tr1_proc.pid, + ) + + def check_with_place( + self, + model_file, + col_type, + data_type, + path_id="0", + static_mode="1", + check_error_log=False, + need_envs={}, + ): required_envs = { "FLAGS_fraction_of_gpu_memory_to_use": "0.15", 
"FLAGS_eager_delete_tensor_gb": "0.0", @@ -194,7 +205,7 @@ def check_with_place(self, "PADDLE_WITH_GLOO": '0', "BACKEND": "cncl", "PATH_ID": path_id, - "DATA_TYPE": data_type + "DATA_TYPE": data_type, } required_envs.update(need_envs) if check_error_log: @@ -202,7 +213,8 @@ def check_with_place(self, required_envs["GLOG_logtostderr"] = "1" required_envs["GLOO_LOG_LEVEL"] = "TRACE" tr0_out, tr1_out, pid0, pid1 = self._run_cluster( - model_file, required_envs) + model_file, required_envs + ) np_data_type = DataTypeCast(data_type) np.random.seed(pid0) input1 = np.random.random((10, 1000)).astype(np_data_type) @@ -210,21 +222,19 @@ def check_with_place(self, input2 = np.random.random((10, 1000)).astype(np_data_type) if col_type == "broadcast": need_result = input2 - np.testing.assert_allclose(tr0_out, need_result) - np.testing.assert_allclose(tr1_out, need_result) + np.testing.assert_allclose(tr0_out[0], need_result) + np.testing.assert_allclose(tr1_out[0], need_result) elif col_type == "allreduce": need_result = input1 + input2 - np.testing.assert_allclose(tr0_out, - need_result, - rtol=1e-05, - atol=1e-05) - np.testing.assert_allclose(tr1_out, - need_result, - rtol=1e-05, - atol=1e-05) + np.testing.assert_allclose( + tr0_out[0], need_result, rtol=1e-05, atol=1e-05 + ) + np.testing.assert_allclose( + tr1_out[0], need_result, rtol=1e-05, atol=1e-05 + ) elif col_type == "reduce": need_result = input1 + input2 - np.testing.assert_allclose(tr0_out, need_result) + np.testing.assert_allclose(tr0_out[0], need_result) elif col_type == "allgather": need_result = np.vstack((input1, input2)) tr_out0 = np.vstack((tr0_out[0], tr0_out[1])) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py index 47fb3a1a2305cd..cb6444056b09e0 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_collective_base_mlu.py @@ -53,10 +53,10 @@ def DataTypeCast(date_type): class TestCollectiveRunnerBase(object): - def get_model(self, train_prog, startup_prog, col_type): raise NotImplementedError( - "get model should be implemented by child class.") + "get model should be implemented by child class." 
+ ) def wait_server_ready(self, endpoints): while True: @@ -64,13 +64,15 @@ def wait_server_ready(self, endpoints): not_ready_endpoints = [] for ep in endpoints: ip_port = ep.split(":") - with closing(socket.socket(socket.AF_INET, - socket.SOCK_STREAM)) as sock: + with closing( + socket.socket(socket.AF_INET, socket.SOCK_STREAM) + ) as sock: sock.settimeout(2) sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) if hasattr(socket, 'SO_REUSEPORT'): - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, - 1) + sock.setsockopt( + socket.SOL_SOCKET, socket.SO_REUSEPORT, 1 + ) result = sock.connect_ex((ip_port[0], int(ip_port[1]))) if result != 0: @@ -78,44 +80,51 @@ def wait_server_ready(self, endpoints): not_ready_endpoints.append(ep) if not all_ok: sys.stderr.write("server not ready, wait 3 sec to retry...\n") - sys.stderr.write("not ready endpoints:" + - str(not_ready_endpoints) + "\n") + sys.stderr.write( + "not ready endpoints:" + str(not_ready_endpoints) + "\n" + ) sys.stderr.flush() time.sleep(3) else: break + # endpoints should be ["ip1:port1","ip2:port2"] -#endpoints should be ["ip1:port1","ip2:port2"] - - def initCommunicator(self, program, rank, nranks, wait_port, - current_endpoint, endpoints): + def initCommunicator( + self, program, rank, nranks, wait_port, current_endpoint, endpoints + ): other_endpoints = endpoints[:] other_endpoints.remove(current_endpoint) if rank == 0 and wait_port: self.wait_server_ready(other_endpoints) block = program.global_block() - cncl_id_var = block.create_var(name=nameGen.generate('cncl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW) - - block.append_op(type='c_gen_cncl_id', - inputs={}, - outputs={'Out': cncl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints - }) - - block.append_op(type='c_comm_init', - inputs={'X': cncl_id_var}, - outputs={}, - attrs={ - 'nranks': nranks, - 'rank': rank, - 'ring_id': self.global_ring_id - }) + cncl_id_var = block.create_var( + name=nameGen.generate('cncl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW, + ) + + block.append_op( + type='c_gen_cncl_id', + inputs={}, + outputs={'Out': cncl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + }, + ) + + block.append_op( + type='c_comm_init', + inputs={'X': cncl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': rank, + 'ring_id': self.global_ring_id, + }, + ) def run_trainer(self, args): train_prog = fluid.Program() @@ -124,8 +133,9 @@ def run_trainer(self, args): rank = args["trainerid"] current_endpoint = args["currentendpoint"] nranks = 2 - self.initCommunicator(startup_prog, rank, nranks, True, - current_endpoint, endpoints) + self.initCommunicator( + startup_prog, rank, nranks, True, current_endpoint, endpoints + ) self.rank = rank result = self.get_model(train_prog, startup_prog, args["col_type"]) device_id = int(os.getenv("FLAGS_selected_mlus", "0")) @@ -135,9 +145,9 @@ def run_trainer(self, args): np.random.seed(os.getpid()) np_data_type = DataTypeCast(args["data_type"]) indata = np.random.random((10, 1000)).astype(np_data_type) - out = exe.run(train_prog, - feed={'tindata': indata}, - fetch_list=[result.name]) + out = exe.run( + train_prog, feed={'tindata': indata}, fetch_list=[result.name] + ) sys.stdout.buffer.write(pickle.dumps(out)) @@ -160,19 +170,20 @@ def runtime_main(test_class): class TestDistBase(unittest.TestCase): - def setUp(self): self._port_set = set() self._trainers = 2 self._ps_endpoints = 
"127.0.0.1:%s,127.0.0.1:%s" % ( - self._find_free_port(), self._find_free_port()) + self._find_free_port(), + self._find_free_port(), + ) self._python_interp = sys.executable def _find_free_port(self): - def __free_port(): - with closing(socket.socket(socket.AF_INET, - socket.SOCK_STREAM)) as s: + with closing( + socket.socket(socket.AF_INET, socket.SOCK_STREAM) + ) as s: s.bind(('', 0)) return s.getsockname()[1] @@ -191,7 +202,7 @@ def _run_cluster(self, model_file, envs): "PADDLE_TRAINER_ID": "0", "PADDLE_TRAINERS_NUM": "2", "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, - "PADDLE_CURRENT_ENDPOINT": w0_ep + "PADDLE_CURRENT_ENDPOINT": w0_ep, } env1 = { @@ -199,9 +210,9 @@ def _run_cluster(self, model_file, envs): "PADDLE_TRAINER_ID": "1", "PADDLE_TRAINERS_NUM": "2", "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, - "PADDLE_CURRENT_ENDPOINT": w1_ep + "PADDLE_CURRENT_ENDPOINT": w1_ep, } - #update environment + # update environment env0.update(envs) env1.update(envs) tr_cmd = "%s %s" @@ -210,15 +221,19 @@ def _run_cluster(self, model_file, envs): tr0_pipe = open("/tmp/tr0_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb") - tr0_proc = subprocess.Popen(tr0_cmd.strip().split(), - stdout=subprocess.PIPE, - stderr=tr0_pipe, - env=env0) + tr0_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr0_pipe, + env=env0, + ) - tr1_proc = subprocess.Popen(tr0_cmd.strip().split(), - stdout=subprocess.PIPE, - stderr=tr1_pipe, - env=env1) + tr1_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr1_pipe, + env=env1, + ) tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() @@ -227,15 +242,21 @@ def _run_cluster(self, model_file, envs): # close trainer file tr0_pipe.close() tr1_pipe.close() - return pickle.loads(tr0_out), pickle.loads( - tr1_out), tr0_proc.pid, tr1_proc.pid - - def check_with_place(self, - model_file, - col_type, - data_type, - check_error_log=False, - need_envs={}): + return ( + pickle.loads(tr0_out), + pickle.loads(tr1_out), + tr0_proc.pid, + tr1_proc.pid, + ) + + def check_with_place( + self, + model_file, + col_type, + data_type, + check_error_log=False, + need_envs={}, + ): required_envs = { "FLAGS_eager_delete_tensor_gb": "0.0", "PATH": os.getenv("PATH"), @@ -251,7 +272,8 @@ def check_with_place(self, required_envs["GLOG_v"] = "3" required_envs["GLOG_logtostderr"] = "1" tr0_out, tr1_out, pid0, pid1 = self._run_cluster( - model_file, required_envs) + model_file, required_envs + ) np_data_type = DataTypeCast(data_type) np.random.seed(pid0) input1 = np.random.random((10, 1000)).astype(np_data_type) @@ -259,63 +281,55 @@ def check_with_place(self, input2 = np.random.random((10, 1000)).astype(np_data_type) if col_type == "broadcast": need_result = input2 - np.testing.assert_allclose(tr0_out, need_result) - np.testing.assert_allclose(tr1_out, need_result) + np.testing.assert_allclose(tr0_out[0], need_result) + np.testing.assert_allclose(tr1_out[0], need_result) elif col_type == "allreduce_sum": need_result = input1 + input2 - np.testing.assert_allclose(tr0_out, - need_result, - rtol=1e-05, - atol=1e-05) - np.testing.assert_allclose(tr1_out, - need_result, - rtol=1e-05, - atol=1e-05) + np.testing.assert_allclose( + tr0_out[0], need_result, rtol=1e-05, atol=1e-05 + ) + np.testing.assert_allclose( + tr1_out[0], need_result, rtol=1e-05, atol=1e-05 + ) elif col_type == "allreduce_prod": need_result = input1 * input2 - np.testing.assert_allclose(tr0_out, - need_result, - rtol=1e-05, - 
atol=1e-05) - np.testing.assert_allclose(tr1_out, - need_result, - rtol=1e-05, - atol=1e-05) + np.testing.assert_allclose( + tr0_out[0], need_result, rtol=1e-05, atol=1e-05 + ) + np.testing.assert_allclose( + tr1_out[0], need_result, rtol=1e-05, atol=1e-05 + ) elif col_type == "allreduce_max": need_result = np.maximum(input1, input2) - np.testing.assert_allclose(tr0_out, - need_result, - rtol=1e-05, - atol=1e-05) - np.testing.assert_allclose(tr1_out, - need_result, - rtol=1e-05, - atol=1e-05) + np.testing.assert_allclose( + tr0_out[0], need_result, rtol=1e-05, atol=1e-05 + ) + np.testing.assert_allclose( + tr1_out[0], need_result, rtol=1e-05, atol=1e-05 + ) elif col_type == "allreduce_min": need_result = np.minimum(input1, input2) - np.testing.assert_allclose(tr0_out, - need_result, - rtol=1e-05, - atol=1e-05) - np.testing.assert_allclose(tr1_out, - need_result, - rtol=1e-05, - atol=1e-05) + np.testing.assert_allclose( + tr0_out[0], need_result, rtol=1e-05, atol=1e-05 + ) + np.testing.assert_allclose( + tr1_out[0], need_result, rtol=1e-05, atol=1e-05 + ) elif col_type == "reduce_sum": need_result = input1 + input2 - np.testing.assert_allclose(tr1_out, need_result) + np.testing.assert_allclose(tr1_out[0], need_result) elif col_type == "reduce_prod": need_result = input1 * input2 - np.testing.assert_allclose(tr1_out, need_result) + np.testing.assert_allclose(tr1_out[0], need_result) elif col_type == "reduce_max": need_result = np.maximum(input1, input2) - np.testing.assert_allclose(tr1_out, need_result) + np.testing.assert_allclose(tr1_out[0], need_result) elif col_type == "reduce_min": need_result = np.minimum(input1, input2) - np.testing.assert_allclose(tr1_out, need_result) + np.testing.assert_allclose(tr1_out[0], need_result) elif col_type == "allgather": need_result = np.vstack((input1, input2)) - np.testing.assert_allclose(tr0_out, need_result) - np.testing.assert_allclose(tr1_out, need_result) + np.testing.assert_allclose(tr0_out[0], need_result) + np.testing.assert_allclose(tr1_out[0], need_result) else: pass diff --git a/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py index 8497853561d878..bedcde9def6e3f 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_dropout_op_mlu.py @@ -29,26 +29,44 @@ class TestDropoutOp(OpTest): - def setUp(self): - self.op_type = "dropout" self.set_mlu() self.init_dtype() - self.inputs = {'X': np.random.random((32, 64)).astype(self.dtype)} + self.init_inputs_shape() + self.init_attrs() + self.op_type = 'dropout' + self.inputs = {'X': np.random.random(self.shape).astype(self.dtype)} self.attrs = { - 'dropout_prob': 0.0, - 'fix_seed': True, - 'is_test': False, - 'dropout_implementation': 'upscale_in_train' - } - self.outputs = { - 'Out': self.inputs['X'], - 'Mask': np.ones((32, 64)).astype('uint8') + 'dropout_prob': self.dropout_prob, + 'fix_seed': self.fix_seed, + 'is_test': self.is_test, + 'dropout_implementation': self.dropout_implementation, } + out = self.inputs['X'] * (1.0 - self.dropout_prob) + if self.is_test == False: + mask = None + if self.dropout_prob == 0.0: + mask = np.ones(self.shape).astype('uint8') + elif self.dropout_prob == 1.0: + mask = np.zeros(self.shape).astype('uint8') + self.outputs = {'Out': out, 'Mask': mask} + else: + self.outputs = {'Out': out} + def init_dtype(self): self.dtype = np.float32 + def init_inputs_shape(self): + self.shape = [32, 64] + + def init_attrs(self): + 
self.__class__.no_need_check_grad = False + self.dropout_prob = 0.0 + self.fix_seed = True + self.is_test = False + self.dropout_implementation = "upscale_in_train" + def set_mlu(self): self.__class__.use_mlu = True self.place = paddle.device.MLUPlace(0) @@ -57,84 +75,107 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad_normal(self): + if ( + hasattr(self.__class__, "no_need_check_grad") + and self.__class__.no_need_check_grad == True + ): + return + self.check_grad_with_place(self.place, ['X'], 'Out') class TestDropoutOpInput1d(TestDropoutOp): - # change input shape - def setUp(self): - self.op_type = "dropout" - self.set_mlu() - self.init_dtype() - self.inputs = {'X': np.random.random((3, 62)).astype(self.dtype)} - self.attrs = { - 'dropout_prob': 0.0, - 'fix_seed': True, - 'is_test': False, - 'dropout_implementation': 'upscale_in_train' - } - self.outputs = { - 'Out': self.inputs['X'], - 'Mask': np.ones((3, 62)).astype('uint8') - } - - -class TestDropoutOpInput1d_1(TestDropoutOp): - # the input is 1-D - def setUp(self): - self.op_type = "dropout" - self.set_mlu() - self.init_dtype() - self.inputs = {'X': np.random.random((2000)).astype(self.dtype)} - self.attrs = { - 'dropout_prob': 0.0, - 'fix_seed': True, - 'is_test': False, - 'dropout_implementation': 'upscale_in_train' - } - self.outputs = { - 'Out': self.inputs['X'], - 'Mask': np.ones((2000)).astype('uint8') - } + def init_inputs_shape(self): + self.shape = [2000] class TestDropoutOp2(TestDropoutOp): - # the dropout_prob is 1.0 - def setUp(self): - self.op_type = "dropout" - self.set_mlu() - self.init_dtype() - self.inputs = {'X': np.random.random((32, 64)).astype(self.dtype)} - self.attrs = { - 'dropout_prob': 1.0, - 'fix_seed': True, - 'is_test': False, - 'dropout_implementation': 'upscale_in_train' - } - self.outputs = { - 'Out': np.zeros((32, 64)).astype('float32'), - 'Mask': np.zeros((32, 64)).astype('uint8') - } + def init_inputs_shape(self): + self.shape = [32, 64] + + def init_attrs(self): + self.dropout_prob = 1.0 + self.fix_seed = True + self.is_test = False + self.dropout_implementation = "upscale_in_train" class TestDropoutOp3(TestDropoutOp): - # the input dim is 3 + def init_inputs_shape(self): + self.shape = [32, 64, 2] + + +class TestDropoutOp4(TestDropoutOp): + def init_attrs(self): + self.__class__.no_need_check_grad = True + self.dropout_prob = 0.35 + self.fix_seed = True + self.is_test = True + self.dropout_implementation = "downgrade_in_infer" + + +class TestDropoutOp5(TestDropoutOp): + def init_inputs_shape(self): + self.shape = [32, 64, 3] + + def init_attrs(self): + self.__class__.no_need_check_grad = True + self.dropout_prob = 0.75 + self.fix_seed = True + self.is_test = True + self.dropout_implementation = "downgrade_in_infer" + + +class TestDropoutOp6(TestDropoutOp): + def init_attrs(self): + self.__class__.no_need_check_grad = True + self.dropout_prob = 0.0 + self.fix_seed = True + self.is_test = False + self.dropout_implementation = "downgrade_in_infer" + + +class TestDropoutOpWithSeed(TestDropoutOp): + # the seed is a Tensor def setUp(self): self.op_type = "dropout" self.set_mlu() - self.init_dtype() - self.inputs = {'X': np.random.random((32, 64, 2)).astype(self.dtype)} + self.dtype = np.float32 + self.inputs = { + "X": np.random.random((32, 64)).astype(self.dtype), + "Seed": np.asarray([125], dtype="int32"), + } self.attrs = { 'dropout_prob': 0.0, - 'fix_seed': True, 'is_test': False, - 'dropout_implementation': 'upscale_in_train' + 'dropout_implementation': 
'upscale_in_train', } self.outputs = { 'Out': self.inputs['X'], - 'Mask': np.ones((32, 64, 2)).astype('uint8') + 'Mask': np.ones((32, 64)).astype('uint8'), } + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad_normal(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + +class TestDropoutOpFp16(TestDropoutOp): + # float16 + def init_dtype(self): + self.dtype = np.float16 + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.device.MLUPlace(0) + self.__class__.no_need_check_grad = True + @skip_check_grad_ci(reason="For inference, check_grad is not required.") class TestDropoutOpInference(OpTest): @@ -148,7 +189,7 @@ def setUp(self): 'dropout_prob': 0.35, 'fix_seed': True, 'is_test': True, - 'dropout_implementation': 'upscale_in_train' + 'dropout_implementation': 'upscale_in_train', } self.outputs = {'Out': self.inputs['X']} @@ -165,7 +206,6 @@ def test_check_output(self): @skip_check_grad_ci(reason="For inference, check_grad is not required.") class TestDropoutOpInference2(TestDropoutOpInference): - def setUp(self): self.op_type = "dropout" self.set_mlu() @@ -174,45 +214,12 @@ def setUp(self): self.attrs = { 'dropout_prob': 0.75, 'is_test': True, - 'dropout_implementation': 'upscale_in_train' + 'dropout_implementation': 'upscale_in_train', } self.outputs = {'Out': self.inputs['X']} -class TestDropoutOpWithSeed(TestDropoutOp): - # the seed is a Tensor - def setUp(self): - self.op_type = "dropout" - self.set_mlu() - self.init_dtype() - self.inputs = { - "X": np.random.random((32, 64)).astype(self.dtype), - "Seed": np.asarray([125], dtype="int32") - } - self.attrs = { - 'dropout_prob': 0.0, - 'is_test': False, - 'dropout_implementation': 'upscale_in_train' - } - self.outputs = { - 'Out': self.inputs['X'], - 'Mask': np.ones((32, 64)).astype('uint8') - } - - -class TestDropoutOpFp16(TestDropoutOp): - # float16 - def init_dtype(self): - self.dtype = np.float16 - - def set_mlu(self): - self.__class__.use_mlu = True - self.place = paddle.device.MLUPlace(0) - self.__class__.no_need_check_grad = True - - class TestDropoutAPI(unittest.TestCase): - def setUp(self): np.random.seed(123) self.places = [fluid.CPUPlace(), paddle.device.MLUPlace(0)] @@ -220,43 +227,44 @@ def setUp(self): def check_static_result(self, place): with fluid.program_guard(fluid.Program(), fluid.Program()): input = fluid.data(name="input", shape=[40, 40], dtype="float32") - res1 = paddle.nn.functional.dropout(x=input, - p=0., - training=False, - mode='upscale_in_train') - res2 = paddle.nn.functional.dropout(x=input, - p=0., - axis=0, - training=True, - mode='upscale_in_train') - res3 = paddle.nn.functional.dropout(x=input, - p=0., - axis=0, - training=False, - mode='upscale_in_train') - res4 = paddle.nn.functional.dropout(x=input, - p=0., - axis=[0, 1], - training=True, - mode='upscale_in_train') - res5 = paddle.nn.functional.dropout(x=input, - p=0., - axis=[0, 1], - training=False, - mode='upscale_in_train') - res6 = paddle.nn.functional.dropout(x=input, - p=1., - training=True, - mode='upscale_in_train') + res1 = paddle.nn.functional.dropout( + x=input, p=0.0, training=False, mode='upscale_in_train' + ) + res2 = paddle.nn.functional.dropout( + x=input, p=0.0, axis=0, training=True, mode='upscale_in_train' + ) + res3 = paddle.nn.functional.dropout( + x=input, p=0.0, axis=0, training=False, mode='upscale_in_train' + ) + res4 = paddle.nn.functional.dropout( + 
x=input, + p=0.0, + axis=[0, 1], + training=True, + mode='upscale_in_train', + ) + res5 = paddle.nn.functional.dropout( + x=input, + p=0.0, + axis=[0, 1], + training=False, + mode='upscale_in_train', + ) + res6 = paddle.nn.functional.dropout( + x=input, p=1.0, training=True, mode='upscale_in_train' + ) res7 = paddle.fluid.layers.dropout( x=input, - dropout_prob=0., - dropout_implementation='upscale_in_train') - res8 = paddle.nn.functional.dropout(x=input, - p=0., - axis=(0, 1), - training=False, - mode='upscale_in_train') + dropout_prob=0.0, + dropout_implementation='upscale_in_train', + ) + res8 = paddle.nn.functional.dropout( + x=input, + p=0.0, + axis=(0, 1), + training=False, + mode='upscale_in_train', + ) in_np = np.random.random([40, 40]).astype("float32") res_np = in_np @@ -265,13 +273,17 @@ def check_static_result(self, place): exe = fluid.Executor(place) res_list = [res1, res2, res3, res4, res5, res7, res8] for res in res_list: - fetches = exe.run(fluid.default_main_program(), - feed={"input": in_np}, - fetch_list=[res]) + fetches = exe.run( + fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res], + ) np.testing.assert_allclose(fetches[0], res_np) - fetches2 = exe.run(fluid.default_main_program(), - feed={"input": in_np}, - fetch_list=[res6]) + fetches2 = exe.run( + fluid.default_main_program(), + feed={"input": in_np}, + fetch_list=[res6], + ) np.testing.assert_allclose(fetches2[0], res_np2) def test_static(self): diff --git a/python/paddle/fluid/tests/unittests/mlu/test_grid_sampler_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_grid_sampler_op_mlu.py index 032c2e9a506f7c..df173ebf18c6e3 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_grid_sampler_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_grid_sampler_op_mlu.py @@ -28,12 +28,15 @@ def AffineGrid(theta, grid_shape): n = grid_shape[0] h = grid_shape[1] w = grid_shape[2] - h_idx = np.repeat(np.linspace(-1, 1, h)[np.newaxis, :], w, - axis=0).T[:, :, np.newaxis] - w_idx = np.repeat(np.linspace(-1, 1, w)[np.newaxis, :], h, - axis=0)[:, :, np.newaxis] - grid = np.concatenate([w_idx, h_idx, np.ones([h, w, 1])], - axis=2) # h * w * 3 + h_idx = np.repeat(np.linspace(-1, 1, h)[np.newaxis, :], w, axis=0).T[ + :, :, np.newaxis + ] + w_idx = np.repeat(np.linspace(-1, 1, w)[np.newaxis, :], h, axis=0)[ + :, :, np.newaxis + ] + grid = np.concatenate( + [w_idx, h_idx, np.ones([h, w, 1])], axis=2 + ) # h * w * 3 grid = np.repeat(grid[np.newaxis, :], n, axis=0) # n * h * w *3 ret = np.zeros([n, h * w, 2]) @@ -53,13 +56,17 @@ def getGridPointValue(data, x, y): out_H = x.shape[1] out_W = x.shape[2] - #out = np.zeros(data_shape, dtype='float32') + # out = np.zeros(data_shape, dtype='float32') out = np.zeros([N, C, out_H, out_W], dtype='float32') for i in range(N): for j in range(out_H): for k in range(out_W): - if y[i, j, k] < 0 or y[i, j, k] > in_H - 1 or x[ - i, j, k] < 0 or x[i, j, k] > in_W - 1: + if ( + y[i, j, k] < 0 + or y[i, j, k] > in_H - 1 + or x[i, j, k] < 0 + or x[i, j, k] > in_W - 1 + ): out[i, :, j, k] = 0 else: out[i, :, j, k] = data[i, :, y[i, j, k], x[i, j, k]] @@ -75,27 +82,28 @@ def unnormalizeAndClip(grid_slice, max_val, align_corners, padding_mode): if align_corners: grid_slice = 0.5 * ((grid_slice.astype('float32') + 1.0) * max_val) else: - grid_slice = 0.5 * ((grid_slice.astype('float32') + 1.0) * - (max_val + 1)) - 0.5 + grid_slice = ( + 0.5 * ((grid_slice.astype('float32') + 1.0) * (max_val + 1)) - 0.5 + ) if padding_mode == "border": grid_slice = clip(grid_slice, 0, 
max_val) elif padding_mode == "reflection": double_range = 2 * max_val if align_corners else (max_val + 1) * 2 - grid_abs = np.abs(grid_slice) if align_corners else np.abs(grid_slice + - 0.5) + grid_abs = ( + np.abs(grid_slice) if align_corners else np.abs(grid_slice + 0.5) + ) extra = grid_abs - np.floor(grid_abs / double_range) * double_range grid_slice = np.minimum(extra, double_range - extra) - grid_slice = grid_slice if align_corners else clip( - grid_slice - 0.5, 0, max_val) + grid_slice = ( + grid_slice if align_corners else clip(grid_slice - 0.5, 0, max_val) + ) return grid_slice -def GridSampler(data, - grid, - align_corners=True, - mode="bilinear", - padding_mode="zeros"): +def GridSampler( + data, grid, align_corners=True, mode="bilinear", padding_mode="zeros" +): dims = data.shape N = dims[0] in_C = dims[1] @@ -119,14 +127,18 @@ def GridSampler(data, y0 = np.floor(y).astype('int32') y1 = y0 + 1 - wa = np.tile(((x1 - x) * (y1 - y)).reshape((N, 1, out_H, out_W)), - (1, in_C, 1, 1)) - wb = np.tile(((x1 - x) * (y - y0)).reshape((N, 1, out_H, out_W)), - (1, in_C, 1, 1)) - wc = np.tile(((x - x0) * (y1 - y)).reshape((N, 1, out_H, out_W)), - (1, in_C, 1, 1)) - wd = np.tile(((x - x0) * (y - y0)).reshape((N, 1, out_H, out_W)), - (1, in_C, 1, 1)) + wa = np.tile( + ((x1 - x) * (y1 - y)).reshape((N, 1, out_H, out_W)), (1, in_C, 1, 1) + ) + wb = np.tile( + ((x1 - x) * (y - y0)).reshape((N, 1, out_H, out_W)), (1, in_C, 1, 1) + ) + wc = np.tile( + ((x - x0) * (y1 - y)).reshape((N, 1, out_H, out_W)), (1, in_C, 1, 1) + ) + wd = np.tile( + ((x - x0) * (y - y0)).reshape((N, 1, out_H, out_W)), (1, in_C, 1, 1) + ) va = getGridPointValue(data, x0, y0) vb = getGridPointValue(data, x0, y1) @@ -142,7 +154,6 @@ def GridSampler(data, class TestGridSamplerOp(OpTest): - def setUp(self): self.place = paddle.device.MLUPlace(0) self.__class__.use_mlu = True @@ -166,12 +177,12 @@ def setUp(self): 'use_cudnn': False, "align_corners": self.align_corners, "padding_mode": self.padding_mode, - "mode": self.mode + "mode": self.mode, } self.outputs = { - 'Output': - GridSampler(x, grid, self.align_corners, self.mode, - self.padding_mode) + 'Output': GridSampler( + x, grid, self.align_corners, self.mode, self.padding_mode + ) } def test_check_output(self): @@ -186,20 +197,17 @@ def initTestCase(self): self.mode = "bilinear" -# TODO(fwg): Test this case when cnnl support align_corners = True. -# class Case1(TestGridSamplerOp): -# -# def initTestCase(self): -# self.x_shape = (2, 3, 5, 6) -# self.grid_shape = (2, 8, 9, 2) -# self.theta_shape = (2, 2, 3) -# self.align_corners = True -# self.padding_mode = "zeros" -# self.mode = "bilinear" +class Case1(TestGridSamplerOp): + def initTestCase(self): + self.x_shape = (2, 3, 5, 6) + self.grid_shape = (2, 8, 9, 2) + self.theta_shape = (2, 2, 3) + self.align_corners = True + self.padding_mode = "zeros" + self.mode = "bilinear" class LargeInputCase(TestGridSamplerOp): - def initTestCase(self): self.x_shape = (2, 3, 128, 128) self.grid_shape = (2, 130, 130, 2) @@ -209,16 +217,15 @@ def initTestCase(self): self.mode = "bilinear" -# TODO(fwg): Test this case when cnnl support align_corners = True. 
-# class Case2(LargeInputCase):
-#
-#     def initTestCase(self):
-#         self.x_shape = (2, 3, 128, 128)
-#         self.grid_shape = (2, 130, 130, 2)
-#         self.theta_shape = (2, 2, 3)
-#         self.align_corners = True
-#         self.padding_mode = "zeros"
-#         self.mode = "bilinear"
+class Case2(LargeInputCase):
+    def initTestCase(self):
+        self.x_shape = (2, 3, 128, 128)
+        self.grid_shape = (2, 130, 130, 2)
+        self.theta_shape = (2, 2, 3)
+        self.align_corners = True
+        self.padding_mode = "zeros"
+        self.mode = "bilinear"
+

 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/mlu/test_huber_loss_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_huber_loss_op_mlu.py
new file mode 100644
index 00000000000000..2f2c10be7b6d76
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/mlu/test_huber_loss_op_mlu.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+import sys
+
+sys.path.append("..")
+from op_test import OpTest
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid import compiler, Program, program_guard
+
+paddle.enable_static()
+
+
+def huber_loss_forward(val, delta):
+    abs_val = abs(val)
+    if abs_val <= delta:
+        return 0.5 * val * val
+    else:
+        return delta * (abs_val - 0.5 * delta)
+
+
+class TestHuberLossOp(OpTest):
+    def setUp(self):
+        self.op_type = 'huber_loss'
+        self.set_mlu()
+        self.python_api = paddle.fluid.layers.huber_loss
+        self.python_out_sig = ["Out"]
+        self.delta = 1.0
+        self.init_input()
+        shape = self.set_shape()
+        residual = self.inputs['Y'] - self.inputs['X']
+        loss = np.vectorize(huber_loss_forward)(residual, self.delta).astype(
+            'float32'
+        )
+        self.attrs = {'delta': self.delta}
+        self.outputs = {'Residual': residual, 'Out': loss.reshape(shape)}
+
+    def init_input(self):
+        shape = self.set_shape()
+        self.inputs = {
+            'X': np.random.uniform(0, 1.0, shape).astype('float32'),
+            'Y': np.random.uniform(0, 1.0, shape).astype('float32'),
+        }
+
+    def set_mlu(self):
+        self.__class__.use_mlu = True
+        self.place = paddle.MLUPlace(0)
+
+    def set_shape(self):
+        return (100, 1)
+
+    def test_check_output(self):
+        self.check_output_with_place(self.place, atol=1e-3)
+
+    def test_check_grad_normal(self):
+        self.check_grad_with_place(self.place, ['X', 'Y'], 'Out')
+
+    def test_check_grad_ingore_x(self):
+        self.check_grad_with_place(
+            self.place,
+            ['Y'],
+            'Out',
+            max_relative_error=0.008,
+            no_grad_set=set("residual"),
+        )
+
+    def test_check_grad_ingore_y(self):
+        self.check_grad_with_place(
+            self.place,
+            ['X'],
+            'Out',
+            max_relative_error=0.008,
+            no_grad_set=set('residual'),
+        )
+
+
+class TestHuberLossOp1(TestHuberLossOp):
+    def set_shape(self):
+        return 64
+
+
+class TestHuberLossOp2(TestHuberLossOp):
+    def set_shape(self):
+        return (6, 6)
+
+
+class TestHuberLossOp3(TestHuberLossOp):
+    def set_shape(self):
+        return (6, 6, 1)
+
+
+class TestHuberLossOpError(unittest.TestCase):
+    def 
test_errors(self): + with program_guard(Program(), Program()): + # the input and label must be Variable + xw = np.random.random((6, 6)).astype("float32") + xr = fluid.data(name='xr', shape=[None, 6], dtype="float32") + lw = np.random.random((6, 6)).astype("float32") + lr = fluid.data(name='lr', shape=[None, 6], dtype="float32") + delta = 1.0 + self.assertRaises(TypeError, fluid.layers.huber_loss, xr, lw, delta) + self.assertRaises(TypeError, fluid.layers.huber_loss, xw, lr, delta) + + # the dtype of input and label must be float32 or float64 + xw2 = fluid.data(name='xw2', shape=[None, 6], dtype="int32") + lw2 = fluid.data(name='lw2', shape=[None, 6], dtype="int32") + self.assertRaises( + TypeError, fluid.layers.huber_loss, xw2, lr, delta + ) + self.assertRaises( + TypeError, fluid.layers.huber_loss, xr, lw2, delta + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_merged_adam_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_merged_adam_op_mlu.py new file mode 100644 index 00000000000000..242e1c8e663f49 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_merged_adam_op_mlu.py @@ -0,0 +1,228 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import sys
+
+sys.path.append('..')
+import unittest
+import paddle
+import numpy as np
+from paddle import _C_ops, _legacy_C_ops
+from paddle.fluid.framework import in_dygraph_mode
+
+
+def run_adam_op(
+    params,
+    grads,
+    lrs,
+    moment1s,
+    moment2s,
+    beta1_pows,
+    beta2_pows,
+    master_params,
+    epsilon,
+    beta1,
+    beta2,
+    place,
+    multi_precision=False,
+    use_merged=False,
+):
+    assert len(params) == len(grads)
+    assert len(params) == len(lrs)
+    assert len(params) == len(moment1s)
+    assert len(params) == len(moment2s)
+    assert len(params) == len(beta1_pows)
+    assert len(params) == len(beta2_pows)
+    assert len(params) == len(master_params)
+    paddle.disable_static()
+    # paddle.set_device(place)
+
+    param_vars = [paddle.fluid.dygraph.to_variable(p) for p in params]
+    grad_vars = [paddle.fluid.dygraph.to_variable(g) for g in grads]
+    lr_vars = [paddle.fluid.dygraph.to_variable(l) for l in lrs]
+    moment1_vars = [paddle.fluid.dygraph.to_variable(m) for m in moment1s]
+    moment2_vars = [paddle.fluid.dygraph.to_variable(m) for m in moment2s]
+    beta1_pow_vars = [paddle.fluid.dygraph.to_variable(b) for b in beta1_pows]
+    beta2_pow_vars = [paddle.fluid.dygraph.to_variable(b) for b in beta2_pows]
+    master_param_vars = [
+        paddle.fluid.dygraph.to_variable(m_p) for m_p in master_params
+    ]
+
+    if not use_merged:
+        for i in range(len(param_vars)):
+            _, _, _, _, _, _ = _legacy_C_ops.adam(
+                param_vars[i],
+                grad_vars[i],
+                lr_vars[i],
+                moment1_vars[i],
+                moment2_vars[i],
+                beta1_pow_vars[i],
+                beta2_pow_vars[i],
+                master_param_vars[i],
+                param_vars[i],
+                moment1_vars[i],
+                moment2_vars[i],
+                beta1_pow_vars[i],
+                beta2_pow_vars[i],
+                master_param_vars[i],
+                'epsilon',
+                epsilon,
+                'beta1',
+                beta1,
+                'beta2',
+                beta2,
+                'multi_precision',
+                multi_precision,
+            )
+    else:
+        if in_dygraph_mode():
+            _, _, _, _, _, _ = _C_ops.merged_adam_(
+                param_vars,
+                grad_vars,
+                lr_vars,
+                moment1_vars,
+                moment2_vars,
+                beta1_pow_vars,
+                beta2_pow_vars,
+                master_param_vars,
+                beta1,
+                beta2,
+                epsilon,
+                multi_precision,
+                False,
+            )
+        else:
+            _, _, _, _, _, _ = _legacy_C_ops.merged_adam(
+                param_vars,
+                grad_vars,
+                lr_vars,
+                moment1_vars,
+                moment2_vars,
+                beta1_pow_vars,
+                beta2_pow_vars,
+                master_param_vars,
+                param_vars,
+                moment1_vars,
+                moment2_vars,
+                beta1_pow_vars,
+                beta2_pow_vars,
+                master_param_vars,
+                'epsilon',
+                epsilon,
+                'beta1',
+                beta1,
+                'beta2',
+                beta2,
+                'multi_precision',
+                multi_precision,
+            )
+
+    outputs = {
+        'ParamOut': param_vars,
+        'Moment1Out': moment1_vars,
+        'Moment2Out': moment2_vars,
+        'Beta1PowOut': beta1_pow_vars,
+        'Beta2PowOut': beta2_pow_vars,
+        'MasterParamOut': master_param_vars,
+    }
+
+    return outputs
+
+
+class TestMergedAdam(unittest.TestCase):
+    def setUp(self):
+        paddle.disable_static()
+        self.shapes = [[3, 4], [2, 7], [5, 6], [7, 8]]
+        self.seed = 10
+        self.place = paddle.device.MLUPlace(0)
+        self.__class__.use_mlu = True
+
+    def gen_rand_data(self, shapes, dtype):
+        return [np.random.random(s).astype(dtype) for s in shapes]
+
+    def prepare_data(self, shapes, multi_precision, seed, place):
+        np.random.seed(seed)
+        mp_dtype = np.float32
+        # dtype = np.float16 if multi_precision and place == 'mlu' else np.float32
+        dtype = np.float32
+        params = self.gen_rand_data(shapes, dtype)
+        grads = self.gen_rand_data(shapes, dtype)
+        lrs = self.gen_rand_data([[1], [1], [1], [1]], mp_dtype)
+        moment1s = self.gen_rand_data(shapes, mp_dtype)
+        moment2s = self.gen_rand_data(shapes, mp_dtype)
+        beta1_pows = self.gen_rand_data([[1], [1], [1], [1]], mp_dtype)
+        beta2_pows = 
self.gen_rand_data([[1], [1], [1], [1]], mp_dtype) + master_params = [p.astype(mp_dtype) for p in params] + return ( + params, + grads, + lrs, + moment1s, + moment2s, + beta1_pows, + beta2_pows, + master_params, + ) + + def check_with_place(self, place, multi_precision): + ( + params, + grads, + lrs, + moment1s, + moment2s, + beta1_pows, + beta2_pows, + master_params, + ) = self.prepare_data(self.shapes, multi_precision, self.seed, place) + + def run_op(use_merged): + return run_adam_op( + params=params, + grads=grads, + lrs=lrs, + moment1s=moment1s, + moment2s=moment2s, + beta1_pows=beta1_pows, + beta2_pows=beta2_pows, + master_params=master_params, + epsilon=0.9, + beta1=0.9, + beta2=0.99, + place=place, + multi_precision=multi_precision, + use_merged=use_merged, + ) + + outs1 = run_op(True) + outs2 = run_op(False) + self.assertEqual(len(outs1), len(outs2)) + + for key in outs1.keys(): + value1 = outs1[key] + value2 = outs2[key] + for i in range(len(value1)): + if place == 'mlu': + np.testing.assert_array_equal(value1[i], value2[i]) + else: + np.testing.assert_allclose( + value1[i], value2[i], rtol=1e-05, atol=1e-07 + ) + + def test_main(self): + for multi_precision in [False, True]: + self.check_with_place(self.place, multi_precision) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_prior_box_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_prior_box_op_mlu.py new file mode 100644 index 00000000000000..68df3067f0cf25 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_prior_box_op_mlu.py @@ -0,0 +1,214 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import sys + +sys.path.append('..') +import numpy as np +from op_test import OpTest +import paddle.fluid as fluid +import paddle +import math + +paddle.enable_static() + + +class TestMLUPriorBox(OpTest): + def setUp(self): + self.op_type = "prior_box" + self.set_mlu() + self.init_dtype() + self.set_data() + + def test_check_output(self): + self.check_output_with_place(self.place) + + def set_mlu(self): + self.__class__.use_mlu = True + self.place = paddle.MLUPlace(0) + + def init_dtype(self): + self.dtype = np.float32 + + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = {'Input': self.input, 'Image': self.image} + + self.attrs = { + 'min_sizes': self.min_sizes, + 'aspect_ratios': self.aspect_ratios, + 'variances': self.variances, + 'flip': self.flip, + 'clip': self.clip, + 'min_max_aspect_ratios_order': self.min_max_aspect_ratios_order, + 'step_w': self.step_w, + 'step_h': self.step_h, + 'offset': self.offset, + } + if len(self.max_sizes) > 0: + self.attrs['max_sizes'] = self.max_sizes + + self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} + + def set_max_sizes(self): + max_sizes = [5, 10] + self.max_sizes = np.array(max_sizes).astype('float32').tolist() + + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = True + + def init_test_params(self): + self.layer_w = 32 + self.layer_h = 32 + + self.image_w = 40 + self.image_h = 40 + + self.step_w = float(self.image_w) / float(self.layer_w) + self.step_h = float(self.image_h) / float(self.layer_h) + + self.input_channels = 2 + self.image_channels = 3 + self.batch_size = 10 + + self.min_sizes = [2, 4] + self.min_sizes = np.array(self.min_sizes).astype('float32').tolist() + self.set_max_sizes() + self.aspect_ratios = [2.0, 3.0] + self.flip = True + self.set_min_max_aspect_ratios_order() + self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] + self.aspect_ratios = np.array( + self.aspect_ratios, dtype=np.float64 + ).flatten() + self.variances = [0.1, 0.1, 0.2, 0.2] + self.variances = np.array(self.variances, dtype=np.float64).flatten() + + self.clip = True + self.num_priors = len(self.real_aspect_ratios) * len(self.min_sizes) + if len(self.max_sizes) > 0: + self.num_priors += len(self.max_sizes) + self.offset = 0.5 + + def init_test_input(self): + self.image = np.random.random( + (self.batch_size, self.image_channels, self.image_w, self.image_h) + ).astype('float32') + + self.input = np.random.random( + (self.batch_size, self.input_channels, self.layer_w, self.layer_h) + ).astype('float32') + + def init_test_output(self): + out_dim = (self.layer_h, self.layer_w, self.num_priors, 4) + out_boxes = np.zeros(out_dim).astype('float32') + out_var = np.zeros(out_dim).astype('float32') + + idx = 0 + for h in range(self.layer_h): + for w in range(self.layer_w): + c_x = (w + self.offset) * self.step_w + c_y = (h + self.offset) * self.step_h + idx = 0 + for s in range(len(self.min_sizes)): + min_size = self.min_sizes[s] + if not self.min_max_aspect_ratios_order: + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, + (c_y - c_h) / self.image_h, + (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h, + ] + idx += 1 + + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: 
aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, + (c_y - c_h) / self.image_h, + (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h, + ] + idx += 1 + else: + c_w = c_h = min_size / 2.0 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, + (c_y - c_h) / self.image_h, + (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h, + ] + idx += 1 + if len(self.max_sizes) > 0: + max_size = self.max_sizes[s] + # second prior: aspect_ratio = 1, + c_w = c_h = math.sqrt(min_size * max_size) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, + (c_y - c_h) / self.image_h, + (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h, + ] + idx += 1 + + # rest of priors + for r in range(len(self.real_aspect_ratios)): + ar = self.real_aspect_ratios[r] + if abs(ar - 1.0) < 1e-6: + continue + c_w = min_size * math.sqrt(ar) / 2 + c_h = (min_size / math.sqrt(ar)) / 2 + out_boxes[h, w, idx, :] = [ + (c_x - c_w) / self.image_w, + (c_y - c_h) / self.image_h, + (c_x + c_w) / self.image_w, + (c_y + c_h) / self.image_h, + ] + idx += 1 + + # clip the prior's coordidate such that it is within[0, 1] + if self.clip: + out_boxes = np.clip(out_boxes, 0.0, 1.0) + # set the variance. + out_var = np.tile( + self.variances, (self.layer_h, self.layer_w, self.num_priors, 1) + ) + self.out_boxes = out_boxes.astype('float32') + self.out_var = out_var.astype('float32') + + +class TestMLUPriorBoxWithoutMaxSize(TestMLUPriorBox): + def set_max_sizes(self): + self.max_sizes = [] + + +class TestMLUPriorBoxWithoutSpecifiedOutOrder(TestMLUPriorBox): + def set_min_max_aspect_ratios_order(self): + self.min_max_aspect_ratios_order = False + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py index ab984187443373..af0882be46a995 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_reduce_sum_op_mlu.py @@ -26,7 +26,6 @@ class TestMLUReduceSumOp(OpTest): - def setUp(self): self.init_op_type() self.initTestCase() @@ -34,16 +33,16 @@ def setUp(self): self.attrs = { 'dim': self.axis, 'keep_dim': self.keep_dim, - 'reduce_all': self.reduce_all + 'reduce_all': self.reduce_all, } self.inputs = {'X': np.random.random(self.shape).astype("float32")} if self.attrs['reduce_all']: self.outputs = {'Out': self.inputs['X'].sum()} else: self.outputs = { - 'Out': - self.inputs['X'].sum(axis=self.axis, - keepdims=self.attrs['keep_dim']) + 'Out': self.inputs['X'].sum( + axis=self.axis, keepdims=self.attrs['keep_dim'] + ) } def set_mlu(self): @@ -64,100 +63,92 @@ def init_op_type(self): def initTestCase(self): self.shape = (5, 6, 10) - self.axis = (0, ) + self.axis = (0,) class TestSumOp5D(TestMLUReduceSumOp): - def initTestCase(self): self.shape = (1, 2, 5, 6, 10) - self.axis = (0, ) + self.axis = (0,) class TestSumOp6D(TestMLUReduceSumOp): - def initTestCase(self): self.shape = (1, 1, 2, 5, 6, 10) - self.axis = (0, ) + self.axis = (0,) class TestSumOp8D(TestMLUReduceSumOp): - def initTestCase(self): self.shape = (1, 3, 1, 2, 1, 4, 3, 10) self.axis = (0, 3) class Test1DReduce(TestMLUReduceSumOp): - def initTestCase(self): self.shape = 120 - self.axis = (0, ) + self.axis = (0,) class Test2DReduce0(TestMLUReduceSumOp): - def initTestCase(self): self.shape = (20, 10) - self.axis = (0, ) + self.axis = (0,) class Test2DReduce1(TestMLUReduceSumOp): - 
def initTestCase(self): self.shape = (20, 10) - self.axis = (1, ) + self.axis = (1,) class Test3DReduce0(TestMLUReduceSumOp): - def initTestCase(self): self.shape = (5, 6, 7) - self.axis = (1, ) + self.axis = (1,) class Test3DReduce1(TestMLUReduceSumOp): - def initTestCase(self): self.shape = (5, 6, 7) - self.axis = (2, ) + self.axis = (2,) class Test3DReduce2(TestMLUReduceSumOp): - def initTestCase(self): self.shape = (5, 6, 7) - self.axis = (-2, ) + self.axis = (-2,) class Test3DReduce3(TestMLUReduceSumOp): - def initTestCase(self): self.shape = (5, 6, 7) self.axis = (1, 2) class TestKeepDimReduce(TestMLUReduceSumOp): - def initTestCase(self): self.shape = (5, 6, 10) - self.axis = (1, ) + self.axis = (1,) self.keep_dim = True class TestKeepDim8DReduce(TestMLUReduceSumOp): - def initTestCase(self): self.shape = (2, 5, 3, 2, 2, 3, 4, 2) self.axis = (3, 4, 5) self.keep_dim = True + def test_check_grad(self): + self.check_grad_with_place( + self.place, ['X'], 'Out', max_relative_error=0.03 + ) + class TestReduceAll(TestMLUReduceSumOp): - def initTestCase(self): self.shape = (5, 6, 2, 10) - self.axis = (0, ) + self.axis = (0,) self.reduce_all = True diff --git a/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py index 71116b4d3cebbd..3d61fd3fc1f742 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_slice_op_mlu.py @@ -31,7 +31,6 @@ # Situation 1: starts(list, no tensor), ends(list, no tensor) # 1.1 without attr(decrease) class TestSliceOp(OpTest): - def setUp(self): self.op_type = "slice" self.set_mlu() @@ -42,7 +41,7 @@ def setUp(self): 'axes': self.axes, 'starts': self.starts, 'ends': self.ends, - 'infer_flags': self.infer_flags + 'infer_flags': self.infer_flags, } def config(self): @@ -57,9 +56,9 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ['Input'], - 'Out', - max_relative_error=0.006) + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006 + ) def set_mlu(self): self.__class__.use_mlu = True @@ -67,7 +66,6 @@ def set_mlu(self): class TestCase1(TestSliceOp): - def config(self): self.input = np.random.random([3, 4, 5, 6]).astype("float32") self.starts = [-3, 0, 2] @@ -78,7 +76,6 @@ def config(self): class TestCase2(TestSliceOp): - def config(self): self.input = np.random.random([3, 4, 5, 6]).astype("float32") self.starts = [-3, 0, 2] @@ -90,7 +87,6 @@ def config(self): # 1.2 with attr(decrease) class TestSliceOp_decs_dim(OpTest): - def setUp(self): self.op_type = "slice" self.set_mlu() @@ -118,9 +114,9 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ['Input'], - 'Out', - max_relative_error=0.006) + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006 + ) def set_mlu(self): self.__class__.use_mlu = True @@ -128,7 +124,6 @@ def set_mlu(self): class TestSliceOp_decs_dim_2(TestSliceOp_decs_dim): - def config(self): self.input = np.random.random([3, 4, 5, 6]).astype("float32") self.starts = [1, 0, 2] @@ -140,7 +135,6 @@ def config(self): class TestSliceOp_decs_dim_3(TestSliceOp_decs_dim): - def config(self): self.input = np.random.random([3, 4, 5, 6]).astype("float32") self.starts = [-1, 0, 2] @@ -152,7 +146,6 @@ def config(self): class TestSliceOp_decs_dim_4(TestSliceOp_decs_dim): - def 
config(self): self.input = np.random.random([3, 4, 5, 7]).astype("float32") self.starts = [0, 1, 2, 3] @@ -164,7 +157,6 @@ def config(self): class TestSliceOp_decs_dim_5(TestSliceOp_decs_dim): - def config(self): self.input = np.random.random([3, 4, 5, 6]).astype("float32") self.starts = [-1] @@ -176,7 +168,6 @@ def config(self): class TestSliceOp_decs_dim_6(TestSliceOp_decs_dim): - def config(self): self.input = np.random.random([3, 4, 5, 6]).astype("float32") self.starts = [0, 1, 2, 3] @@ -190,7 +181,6 @@ def config(self): # Situation 2: starts(list, have tensor), ends(list, no tensor) # without attr(decrease) class TestSliceOp_starts_ListTensor(OpTest): - def setUp(self): self.op_type = "slice" self.set_mlu() @@ -198,8 +188,9 @@ def setUp(self): starts_tensor = [] for index, ele in enumerate(self.starts): - starts_tensor.append(("x" + str(index), np.ones( - (1)).astype('int64') * ele)) + starts_tensor.append( + ("x" + str(index), np.ones((1)).astype('int64') * ele) + ) self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor} self.outputs = {'Out': self.out} @@ -207,7 +198,7 @@ def setUp(self): 'axes': self.axes, 'starts': self.starts_infer, 'ends': self.ends, - 'infer_flags': self.infer_flags + 'infer_flags': self.infer_flags, } def config(self): @@ -224,9 +215,9 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ['Input'], - 'Out', - max_relative_error=0.006) + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006 + ) def set_mlu(self): self.__class__.use_mlu = True @@ -236,7 +227,6 @@ def set_mlu(self): # Situation 2: starts(list, have tensor), ends(list, no tensor) # with attr(decrease) class TestSliceOp_decs_dim_starts_ListTensor(OpTest): - def setUp(self): self.op_type = "slice" self.set_mlu() @@ -244,8 +234,9 @@ def setUp(self): starts_tensor = [] for index, ele in enumerate(self.starts): - starts_tensor.append(("x" + str(index), np.ones( - (1)).astype('int32') * ele)) + starts_tensor.append( + ("x" + str(index), np.ones((1)).astype('int32') * ele) + ) self.inputs = {'Input': self.input, 'StartsTensorList': starts_tensor} @@ -273,9 +264,9 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ['Input'], - 'Out', - max_relative_error=0.006) + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006 + ) def set_mlu(self): self.__class__.use_mlu = True @@ -283,8 +274,8 @@ def set_mlu(self): class TestSliceOp_decs_dim_5_starts_ListTensor( - TestSliceOp_decs_dim_starts_ListTensor): - + TestSliceOp_decs_dim_starts_ListTensor +): def config(self): self.input = np.random.random([3, 4, 5, 6]).astype("float32") self.starts = [-1] @@ -300,7 +291,6 @@ def config(self): # Situation 3: starts(tensor), ends(list, no tensor) # with attr(decrease) class TestSliceOp_decs_dim_starts_OneTensor(OpTest): - def setUp(self): self.op_type = "slice" self.__class__.use_mlu = True @@ -308,7 +298,7 @@ def setUp(self): self.config() self.inputs = { 'Input': self.input, - "StartsTensor": np.array(self.starts, dtype="int32") + "StartsTensor": np.array(self.starts, dtype="int32"), } self.outputs = {'Out': self.out} self.attrs = { @@ -332,15 +322,14 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ['Input'], - 'Out', - max_relative_error=0.006) + 
self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006 + ) # Situation 4: starts(tensor), ends(tensor) # without attr(decrease) class TestSliceOp_starts_OneTensor_ends_OneTensor(OpTest): - def setUp(self): self.op_type = "slice" self.__class__.use_mlu = True @@ -350,14 +339,14 @@ def setUp(self): self.inputs = { 'Input': self.input, "StartsTensor": np.array(self.starts, dtype="int64"), - "EndsTensor": np.array(self.ends, dtype="int32") + "EndsTensor": np.array(self.ends, dtype="int32"), } self.outputs = {'Out': self.out} self.attrs = { 'axes': self.axes, #'starts': self.starts, #'ends': self.ends_infer, - 'infer_flags': self.infer_flags + 'infer_flags': self.infer_flags, } def config(self): @@ -372,15 +361,14 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ['Input'], - 'Out', - max_relative_error=0.006) + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006 + ) # Situation 5: starts(tensor), ends(tensor) # with attr(decrease) class TestSliceOp_decs_dim_starts_and_ends_OneTensor(OpTest): - def setUp(self): self.op_type = "slice" self.__class__.use_mlu = True @@ -389,7 +377,7 @@ def setUp(self): self.inputs = { 'Input': self.input, "StartsTensor": np.array(self.starts, dtype="int32"), - "EndsTensor": np.array(self.ends, dtype="int32") + "EndsTensor": np.array(self.ends, dtype="int32"), } self.outputs = {'Out': self.out} self.attrs = { @@ -413,15 +401,14 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ['Input'], - 'Out', - max_relative_error=0.006) + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006 + ) # Situation 6: starts(tensor), ends(list, have tensor) # without attr(decrease) class TestSliceOp_starts_OneTensor_ends_ListTensor(OpTest): - def setUp(self): self.op_type = "slice" self.__class__.use_mlu = True @@ -430,20 +417,21 @@ def setUp(self): ends_tensor = [] for index, ele in enumerate(self.ends): - ends_tensor.append(("y" + str(index), np.ones( - (1)).astype('int32') * ele)) + ends_tensor.append( + ("y" + str(index), np.ones((1)).astype('int32') * ele) + ) self.inputs = { 'Input': self.input, "StartsTensor": np.array(self.starts, dtype="int32"), - 'EndsTensorList': ends_tensor + 'EndsTensorList': ends_tensor, } self.outputs = {'Out': self.out} self.attrs = { 'axes': self.axes, #'starts': self.starts, 'ends': self.ends_infer, - 'infer_flags': self.infer_flags + 'infer_flags': self.infer_flags, } def config(self): @@ -460,14 +448,13 @@ def test_check_output(self): self.check_output_with_place(self.place) def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ['Input'], - 'Out', - max_relative_error=0.006) + self.check_grad_with_place( + self.place, ['Input'], 'Out', max_relative_error=0.006 + ) # Test float16 class TestFP16(OpTest): - def setUp(self): self.op_type = "slice" self.__class__.use_mlu = True @@ -479,7 +466,7 @@ def setUp(self): 'axes': self.axes, 'starts': self.starts, 'ends': self.ends, - 'infer_flags': self.infer_flags + 'infer_flags': self.infer_flags, } def config(self): @@ -495,13 +482,12 @@ def test_check_output(self): self.check_output_with_place(self.place, atol=1e-5) def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ['Input'], - 'Out', - max_relative_error=0.006) + self.check_grad_with_place( + self.place, ['Input'], 'Out', 
max_relative_error=0.006 + ) class TestFP16_2(OpTest): - def setUp(self): self.op_type = "slice" self.__class__.use_mlu = True @@ -513,7 +499,7 @@ def setUp(self): 'axes': self.axes, 'starts': self.starts, 'ends': self.ends, - 'infer_flags': self.infer_flags + 'infer_flags': self.infer_flags, } def config(self): @@ -529,24 +515,28 @@ def test_check_output(self): self.check_output_with_place(self.place, atol=1e-5) def test_check_grad_normal(self): - self.check_grad_with_place(self.place, ['Input'], - 'Out', - max_relative_error=0.006, - numeric_grad_delta=0.5) + self.check_grad_with_place( + self.place, + ['Input'], + 'Out', + max_relative_error=0.006, + numeric_grad_delta=0.5, + ) class TestSliceApiWithTensor(unittest.TestCase): - def test_starts_ends_is_tensor(self): with paddle.fluid.dygraph.guard(): a = paddle.rand(shape=[4, 5, 6], dtype='float32') axes = [0, 1, 2] starts = [-3, 0, 2] ends = [3, 2, 4] - a_1 = paddle.slice(a, - axes=axes, - starts=paddle.to_tensor(starts, dtype='int32'), - ends=paddle.to_tensor(ends, dtype='int32')) + a_1 = paddle.slice( + a, + axes=axes, + starts=paddle.to_tensor(starts, dtype='int32'), + ends=paddle.to_tensor(ends, dtype='int32'), + ) a_2 = paddle.slice(a, axes=axes, starts=starts, ends=ends) np.testing.assert_allclose(a_1.numpy(), a_2.numpy()) @@ -569,24 +559,22 @@ def test_bool_tensor(self): class TestImperativeVarBaseGetItem(unittest.TestCase): - def test_getitem_with_long(self): with fluid.dygraph.guard(): data = np.random.random((2, 80, 16128)).astype('float32') var = fluid.dygraph.to_variable(data) - sliced = var[:, 10:, :var.shape[1]] # var.shape[1] is 80L here + sliced = var[:, 10:, : var.shape[1]] # var.shape[1] is 80L here self.assertEqual(sliced.shape, [2, 70, 80]) - sliced = var[:, var.shape[0]:, var.shape[0]:var.shape[1]] + sliced = var[:, var.shape[0] :, var.shape[0] : var.shape[1]] self.assertEqual(sliced.shape, [2, 78, 78]) def test_getitem_with_float(self): - def test_float_in_slice_item(): with fluid.dygraph.guard(): data = np.random.random((2, 80, 16128)).astype('float32') var = fluid.dygraph.to_variable(data) - sliced = var[:, 1.1:, :var.shape[1]] + sliced = var[:, 1.1:, : var.shape[1]] self.assertRaises(Exception, test_float_in_slice_item) @@ -600,15 +588,6 @@ def test_float_in_index(): class TestInferShape(unittest.TestCase): - - def test(self): - x = paddle.ones(shape=[3, 4, 5]) - x.desc.set_shape([3, -1, 5]) - self.assertEqual(x.shape, (3, -1, 5)) - - out0 = paddle.slice(x, axes=[1], starts=[0], ends=[3]) - self.assertEqual(out0.shape, (3, 3, 5)) - def test_axis_less_than_zero(self): # Using paddle.disable_static will make other unittests fail. 
@@ -616,13 +595,18 @@ def test_axis_less_than_zero(self): x_arr = np.arange(0, 24, dtype=np.float32).reshape([2, 3, 4]) x = paddle.to_tensor(x_arr) - pp_slice = paddle.slice(x, [ - 100, - ], [0], [1]) + pp_slice = paddle.slice( + x, + [ + 100, + ], + [0], + [1], + ) np_slice = x_arr[:, :, 0:1] np.testing.assert_allclose(pp_slice, np_slice) - pp_slice = paddle.slice(x, (-100, ), [0], [1]) + pp_slice = paddle.slice(x, (-100,), [0], [1]) np_slice = x_arr[0:1] np.testing.assert_allclose(pp_slice, np_slice) @@ -630,9 +614,11 @@ def test_axis_less_than_zero(self): x = paddle.to_tensor(np.reshape(x_arr, (0, 0, 0))) starts = paddle.to_tensor( - np.reshape(np.array([], dtype=np.int32), (0, ))) + np.reshape(np.array([], dtype=np.int32), (0,)) + ) ends = paddle.to_tensor( - np.reshape(np.array([], dtype=np.int32), (0, ))) + np.reshape(np.array([], dtype=np.int32), (0,)) + ) with self.assertRaises(ValueError): paddle.slice(x, [-1000000], starts, ends) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py index 25dbbbd028e6ed..8ef5c5dc5df910 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_softmax_with_cross_entropy_op_mlu.py @@ -30,7 +30,6 @@ class TestSoftmaxWithCrossEntropyOp(OpTest): - def set_mlu(self): self.__class__.use_mlu = True @@ -53,8 +52,10 @@ def setUp(self): self.initParams() logits = getattr( - self, "logits", - np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)) + self, + "logits", + np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype), + ) softmax = np.apply_along_axis(stable_softmax, self.axis, logits) if self.soft_label: @@ -65,8 +66,9 @@ def setUp(self): self.shape[self.axis] = 1 labels = np.random.randint(0, axis_dim, self.shape, dtype="int64") - loss = cross_entropy(softmax, labels, self.soft_label, self.axis, - self.ignore_index) + loss = cross_entropy( + softmax, labels, self.soft_label, self.axis, self.ignore_index + ) one_hot_label = np.eye(axis_dim)[labels.reshape(-1)] @@ -74,7 +76,7 @@ def setUp(self): self.outputs = { "Backprop": (softmax - one_hot_label).astype(self.dtype), "Softmax": softmax.astype(self.dtype), - "Loss": loss.astype(self.dtype) + "Loss": loss.astype(self.dtype), } self.attrs = { "numeric_stable_mode": self.numeric_stable_mode, @@ -92,14 +94,16 @@ def test_check_grad(self): if self.dtype == np.float16: return # fp32 has low precision, cpu and mlu both need to relax the max_relative_error if using fp32 - self.check_grad_with_place(self.place, ['Logits'], - 'Loss', - numeric_grad_delta=0.001, - max_relative_error=0.5) + self.check_grad_with_place( + self.place, + ['Logits'], + 'Loss', + numeric_grad_delta=0.001, + max_relative_error=0.5, + ) class TestPowNet(unittest.TestCase): - def _test(self, run_mlu=True): main_prog = paddle.static.Program() startup_prog = paddle.static.Program() @@ -114,9 +118,9 @@ def _test(self, run_mlu=True): with paddle.static.program_guard(main_prog, startup_prog): a = paddle.static.data(name="a", shape=[32, 32], dtype='float32') b = paddle.static.data(name="b", shape=[32, 32], dtype='float32') - label = paddle.static.data(name="label", - shape=[32, 1], - dtype='int64') + label = paddle.static.data( + name="label", shape=[32, 1], dtype='int64' + ) sum = paddle.add(a, b) z = paddle.pow(sum, 2.0) @@ -140,16 +144,17 @@ def _test(self, run_mlu=True): print("Start run on {}".format(place)) for epoch in 
range(100): - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "label": label_np - }, - fetch_list=[prediction, loss]) + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, "b": b_np, "label": label_np}, + fetch_list=[prediction, loss], + ) if epoch % 10 == 0: - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) + print( + "Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res + ) + ) return pred_res, loss_res @@ -157,7 +162,7 @@ def test_mlu(self): cpu_pred, cpu_loss = self._test(False) mlu_pred, mlu_loss = self._test(True) - np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=1e-5) + np.testing.assert_allclose(mlu_pred, cpu_pred, rtol=2e-5) np.testing.assert_allclose(mlu_loss, cpu_loss) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py index 3b8dd2c1922faa..a2b59048462861 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_base_mlu.py @@ -44,17 +44,19 @@ class TestSyncBatchNormRunnerBase(object): - - def get_model(self, - main, - startup, - place, - layout, - seed, - sync_bn=False, - only_forward=False): + def get_model( + self, + main, + startup, + place, + layout, + seed, + sync_bn=False, + only_forward=False, + ): raise NotImplementedError( - "get model should be implemented by child class.") + "get model should be implemented by child class." + ) def wait_server_ready(self, endpoints): assert not isinstance(endpoints, string_types) @@ -63,13 +65,15 @@ def wait_server_ready(self, endpoints): not_ready_endpoints = [] for ep in endpoints: ip_port = ep.split(":") - with closing(socket.socket(socket.AF_INET, - socket.SOCK_STREAM)) as sock: + with closing( + socket.socket(socket.AF_INET, socket.SOCK_STREAM) + ) as sock: sock.settimeout(2) sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) if hasattr(socket, 'SO_REUSEPORT'): - sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEPORT, - 1) + sock.setsockopt( + socket.SOL_SOCKET, socket.SO_REUSEPORT, 1 + ) result = sock.connect_ex((ip_port[0], int(ip_port[1]))) if result != 0: @@ -77,39 +81,47 @@ def wait_server_ready(self, endpoints): not_ready_endpoints.append(ep) if not all_ok: sys.stderr.write("server not ready, wait 3 sec to retry...\n") - sys.stderr.write("not ready endpoints:" + - str(not_ready_endpoints) + "\n") + sys.stderr.write( + "not ready endpoints:" + str(not_ready_endpoints) + "\n" + ) sys.stderr.flush() time.sleep(3) else: break - def initCommunicator(self, program, rank, nranks, wait_port, - current_endpoint, endpoints): + def initCommunicator( + self, program, rank, nranks, wait_port, current_endpoint, endpoints + ): other_endpoints = endpoints[:] other_endpoints.remove(current_endpoint) if rank == 0 and wait_port: self.wait_server_ready(other_endpoints) block = program.global_block() - cncl_id_var = block.create_var(name=nameGen.generate('cncl_id'), - persistable=True, - type=core.VarDesc.VarType.RAW) - block.append_op(type='c_gen_cncl_id', - inputs={}, - outputs={'Out': cncl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints - }) - block.append_op(type='c_comm_init', - inputs={'X': cncl_id_var}, - outputs={}, - attrs={ - 'nranks': nranks, - 'rank': rank, - 'ring_id': self.global_ring_id - }) + cncl_id_var = block.create_var( + 
name=nameGen.generate('cncl_id'), + persistable=True, + type=core.VarDesc.VarType.RAW, + ) + block.append_op( + type='c_gen_cncl_id', + inputs={}, + outputs={'Out': cncl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + }, + ) + block.append_op( + type='c_comm_init', + inputs={'X': cncl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': rank, + 'ring_id': self.global_ring_id, + }, + ) def run_trainer(self, args): device_id = int(os.getenv("FLAGS_selected_mlus", "0")) @@ -127,8 +139,8 @@ def run_trainer(self, args): self._compare(args, place, layout, True) # Test FP16 - @TODO - self.dtype = np.float16 - self.atol = 1e-2 + self.bn_dtype = np.float16 + self.atol = 3e-3 # Test training for place in places: @@ -142,24 +154,30 @@ def run_trainer(self, args): sys.stdout.buffer.write( pickle.dumps( - 'training, inference, fp32, fp16, NCHW, NHWC all passed')) + 'training, inference, fp32, fp16, NCHW, NHWC all passed' + ) + ) def _compare(self, args, place, layout, only_forward): scope = core.Scope() np.random.seed(SEED) - data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2 + data = np.random.random(size=self.dshape).astype(self.dtype) * 4.0 - 2 sys.stderr.write("data: " + str(data) + "\n") - data = create_or_get_tensor(scope, "input", - OpTest.np_dtype_to_fluid_dtype(data), place) + data = create_or_get_tensor( + scope, "input", OpTest.np_dtype_to_fluid_dtype(data), place + ) - bn_fetches = self._cal_single_card(args, data, place, layout, - only_forward) + bn_fetches = self._cal_single_card( + args, data, place, layout, only_forward + ) fetch_names, sync_bn_fetches = self._cal_multiple_cards( - args, data, place, layout, only_forward) + args, data, place, layout, only_forward + ) - sys.stderr.write("len(sync_bn_fetches): " + str(len(sync_bn_fetches)) + - "\n") + sys.stderr.write( + "len(sync_bn_fetches): " + str(len(sync_bn_fetches)) + "\n" + ) for i in six.moves.xrange(0, len(sync_bn_fetches)): sys.stderr.write("i: " + str(i) + "\n") sys.stderr.write("fetch_names[i]): " + fetch_names[i] + "\n") @@ -167,13 +185,14 @@ def _compare(self, args, place, layout, only_forward): bn_val = bn_fetches[i] sync_bn_val = sync_bn_fetches[i] if sync_bn_val.shape != bn_val.shape: - sync_bn_val = sync_bn_val[:bn_val.shape[0]] + sync_bn_val = sync_bn_val[: bn_val.shape[0]] # i = 0 if fetch_names[i] == 'reduce_sum_0.tmp_0': # sys.stderr.write("skip reduce_sum_0.tmp_0 (Out of reduce_sum op)" + "\n") - sys.stderr.write("reduce_sum_0.tmp_0 (Out of reduce_sum op)" + - "\n") + sys.stderr.write( + "reduce_sum_0.tmp_0 (Out of reduce_sum op)" + "\n" + ) sys.stderr.write("bn_val: " + str(bn_val) + "\n") sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") @@ -201,7 +220,8 @@ def _compare(self, args, place, layout, only_forward): if fetch_names[i] == 'batch_norm_0.tmp_2': # sys.stderr.write("skip batch_norm_0.tmp_2 (ReserveSpace of batch_norm)" + "\n") sys.stderr.write( - "batch_norm_0.tmp_2 (ReserveSpace of batch_norm)" + "\n") + "batch_norm_0.tmp_2 (ReserveSpace of batch_norm)" + "\n" + ) sys.stderr.write("bn_val: " + str(bn_val) + "\n") sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") @@ -234,8 +254,9 @@ def _compare(self, args, place, layout, only_forward): # i = 8 if fetch_names[i] == 'batch_norm_0.tmp_1': - sys.stderr.write("skip batch_norm_0.tmp_1 (SavedVariance)" + - "\n") + sys.stderr.write( + "skip batch_norm_0.tmp_1 (SavedVariance)" + "\n" + ) sys.stderr.write("bn_val: " + str(bn_val) + "\n") 
sys.stderr.write("sync_bn_val: " + str(sync_bn_val) + "\n") @@ -281,10 +302,16 @@ def _compare(self, args, place, layout, only_forward): if fetch_names[i] == 'conv2d_0.tmp_0@GRAD': atol = 1e-2 - assert np.allclose( - bn_val, sync_bn_val, atol=atol), "Output (" + fetch_names[ - i] + ") has diff. \n" + "\nBN " + str( - bn_val) + "\n" + "Sync BN " + str(sync_bn_val) + assert np.allclose(bn_val, sync_bn_val, atol=atol), ( + "Output (" + + fetch_names[i] + + ") has diff. \n" + + "\nBN " + + str(bn_val) + + "\n" + + "Sync BN " + + str(sync_bn_val) + ) def _cal_single_card(self, args, data, place, layout, only_forward): # Single-MLU, N = 32 per MLU @@ -294,23 +321,31 @@ def _cal_single_card(self, args, data, place, layout, only_forward): startup_prog.global_seed(SEED) paddle.seed(SEED) - outs = self.get_model(train_prog, startup_prog, place, layout, SEED, - False, only_forward) + outs = self.get_model( + train_prog, startup_prog, place, layout, SEED, False, only_forward + ) exe = fluid.Executor(place) exe.run(startup_prog) fetch_names = [v.name for v in outs] + [ - 'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias' + 'bn_moving_mean', + 'bn_moving_variance', + 'bn_scale', + 'bn_bias', ] if not only_forward: others = [ - 'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD', - 'bn_bias@GRAD', 'batch_norm_0.tmp_3@GRAD', 'conv2d_0.tmp_0@GRAD' + 'batch_norm_0.tmp_0', + 'batch_norm_0.tmp_1', + 'bn_scale@GRAD', + 'bn_bias@GRAD', + 'batch_norm_0.tmp_3@GRAD', + 'conv2d_0.tmp_0@GRAD', ] fetch_names += others - bn_fetches = exe.run(program=train_prog, - feed={'input': data}, - fetch_list=fetch_names) + bn_fetches = exe.run( + program=train_prog, feed={'input': data}, fetch_list=fetch_names + ) return bn_fetches @@ -331,8 +366,9 @@ def _cal_multiple_cards(self, args, data, place, layout, only_forward): current_endpoint = args["currentendpoint"] nranks = 2 - self.initCommunicator(startup_prog, rank, nranks, True, - current_endpoint, endpoints) + self.initCommunicator( + startup_prog, rank, nranks, True, current_endpoint, endpoints + ) # sys.stderr.write("after init, startup_prog: " + # startup_prog.to_string(True) + "\n") train_prog.global_seed(SEED) @@ -342,8 +378,9 @@ def _cal_multiple_cards(self, args, data, place, layout, only_forward): paddle.seed(SEED) self.rank = rank - outs = self.get_model(train_prog, startup_prog, place, layout, SEED, - True, only_forward) + outs = self.get_model( + train_prog, startup_prog, place, layout, SEED, True, only_forward + ) # sys.stderr.write("after get_model, train_prog: " + # train_prog.to_string(True) + "\n") # sys.stderr.write("after get_model, startup_prog: " + @@ -366,17 +403,24 @@ def _cal_multiple_cards(self, args, data, place, layout, only_forward): exe = fluid.Executor(place) exe.run(startup_prog) fetch_names = [v.name for v in outs] + [ - 'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias' + 'bn_moving_mean', + 'bn_moving_variance', + 'bn_scale', + 'bn_bias', ] if not only_forward: others = [ - 'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD', - 'bn_bias@GRAD', 'batch_norm_0.tmp_3@GRAD', 'conv2d_0.tmp_0@GRAD' + 'batch_norm_0.tmp_0', + 'batch_norm_0.tmp_1', + 'bn_scale@GRAD', + 'bn_bias@GRAD', + 'batch_norm_0.tmp_3@GRAD', + 'conv2d_0.tmp_0@GRAD', ] fetch_names += others - sync_bn_fetches = exe.run(program=train_prog, - feed={'input': data}, - fetch_list=fetch_names) + sync_bn_fetches = exe.run( + program=train_prog, feed={'input': data}, fetch_list=fetch_names + ) return fetch_names, sync_bn_fetches @@ -399,19 +443,20 @@ 
def runtime_main(test_class, col_type, sub_type): class TestDistBase(unittest.TestCase): - def setUp(self): self._port_set = set() self._trainers = 2 self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( - self._find_free_port(), self._find_free_port()) + self._find_free_port(), + self._find_free_port(), + ) self._python_interp = sys.executable def _find_free_port(self): - def __free_port(): - with closing(socket.socket(socket.AF_INET, - socket.SOCK_STREAM)) as s: + with closing( + socket.socket(socket.AF_INET, socket.SOCK_STREAM) + ) as s: s.bind(('', 0)) return s.getsockname()[1] @@ -440,7 +485,7 @@ def _run_cluster(self, model_file, envs): "PADDLE_TRAINER_ENDPOINTS": self._ps_endpoints, "PADDLE_CURRENT_ENDPOINT": w1_ep, } - #update environment + # update environment env0.update(envs) env1.update(envs) @@ -451,15 +496,19 @@ def _run_cluster(self, model_file, envs): tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w") print("tr0_cmd: {}, env: {}\n".format(tr0_cmd, env0)) print("tr1_cmd: {}, env: {}\n".format(tr1_cmd, env1)) - tr0_proc = subprocess.Popen(tr0_cmd.strip().split(), - stdout=subprocess.PIPE, - stderr=tr0_pipe, - env=env0) - - tr1_proc = subprocess.Popen(tr0_cmd.strip().split(), - stdout=subprocess.PIPE, - stderr=tr1_pipe, - env=env1) + tr0_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr0_pipe, + env=env0, + ) + + tr1_proc = subprocess.Popen( + tr0_cmd.strip().split(), + stdout=subprocess.PIPE, + stderr=tr1_pipe, + env=env1, + ) tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() @@ -473,14 +522,16 @@ def _run_cluster(self, model_file, envs): sys.stderr.write('trainer 0 stderr file: %s\n' % f.read()) with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f: sys.stderr.write('trainer 1 stderr file: %s\n' % f.read()) - return pickle.loads(tr0_out), pickle.loads( - tr1_out), tr0_proc.pid, tr1_proc.pid - - def check_with_place(self, - model_file, - col_type, - check_error_log=False, - need_envs={}): + return ( + pickle.loads(tr0_out), + pickle.loads(tr1_out), + tr0_proc.pid, + tr1_proc.pid, + ) + + def check_with_place( + self, model_file, col_type, check_error_log=False, need_envs={} + ): required_envs = { "FLAGS_fraction_of_gpu_memory_to_use": "0.15", "FLAGS_eager_delete_tensor_gb": "0.0", @@ -491,7 +542,7 @@ def check_with_place(self, "FLAGS_call_stack_level": "2", "GLOG_v": "3", "PADDLE_WITH_GLOO": '0', - "BACKEND": "cncl" + "BACKEND": "cncl", } required_envs.update(need_envs) if check_error_log: @@ -499,8 +550,11 @@ def check_with_place(self, required_envs["GLOG_logtostderr"] = "1" required_envs["GLOO_LOG_LEVEL"] = "TRACE" tr0_out, tr1_out, pid0, pid1 = self._run_cluster( - model_file, required_envs) + model_file, required_envs + ) self.assertEqual( - tr0_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed') + tr0_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed' + ) self.assertEqual( - tr1_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed') + tr1_out, 'training, inference, fp32, fp16, NCHW, NHWC all passed' + ) diff --git a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py index f524e47b54a92a..925eec94daccf8 100644 --- a/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py +++ b/python/paddle/fluid/tests/unittests/mlu/test_sync_batch_norm_op_mlu_baseline.py @@ -29,14 +29,17 @@ class TestSyncBatchNormOp(TestDistBase): - def 
_setup_config(self): pass def test_identity(self, col_type="identity"): - self.check_with_place("sync_batch_norm_op_mlu.py", - col_type, - check_error_log=True) + envs = {"CNCL_MEM_POOL_MULTI_CLIQUE_ENABLE": "1"} + self.check_with_place( + "sync_batch_norm_op_mlu.py", + col_type, + check_error_log=True, + need_envs=envs, + ) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/mlu/test_yolo_box_op_mlu.py b/python/paddle/fluid/tests/unittests/mlu/test_yolo_box_op_mlu.py new file mode 100644 index 00000000000000..d4bdf87607681f --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mlu/test_yolo_box_op_mlu.py @@ -0,0 +1,299 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +import sys + +sys.path.append("..") +import unittest +import numpy as np +from op_test import OpTest +import paddle +from paddle.fluid import core +import paddle.fluid as fluid +from paddle.fluid.op import Operator +from paddle.fluid.executor import Executor +from paddle.fluid.framework import _test_eager_guard + +paddle.enable_static() + + +def sigmoid(x): + return 1.0 / (1.0 + np.exp(((-1.0) * x))) + + +def YoloBox(x, img_size, attrs): + (n, c, h, w) = x.shape + anchors = attrs['anchors'] + an_num = int((len(anchors) // 2)) + class_num = attrs['class_num'] + conf_thresh = attrs['conf_thresh'] + downsample = attrs['downsample_ratio'] + clip_bbox = attrs['clip_bbox'] + scale_x_y = attrs['scale_x_y'] + iou_aware = attrs['iou_aware'] + iou_aware_factor = attrs['iou_aware_factor'] + bias_x_y = (-0.5) * (scale_x_y - 1.0) + input_h = downsample * h + input_w = downsample * w + if iou_aware: + ioup = x[:, :an_num, :, :] + ioup = np.expand_dims(ioup, axis=(-1)) + x = x[:, an_num:, :, :] + x = x.reshape((n, an_num, (5 + class_num), h, w)).transpose((0, 1, 3, 4, 2)) + pred_box = x[:, :, :, :, :4].copy() + grid_x = np.tile(np.arange(w).reshape((1, w)), (h, 1)) + grid_y = np.tile(np.arange(h).reshape((h, 1)), (1, w)) + pred_box[:, :, :, :, 0] = ( + (grid_x + (sigmoid(pred_box[:, :, :, :, 0]) * scale_x_y)) + bias_x_y + ) / w + pred_box[:, :, :, :, 1] = ( + (grid_y + (sigmoid(pred_box[:, :, :, :, 1]) * scale_x_y)) + bias_x_y + ) / h + anchors = [ + (anchors[i], anchors[(i + 1)]) for i in range(0, len(anchors), 2) + ] + anchors_s = np.array( + [((an_w / input_w), (an_h / input_h)) for (an_w, an_h) in anchors] + ) + anchor_w = anchors_s[:, 0:1].reshape((1, an_num, 1, 1)) + anchor_h = anchors_s[:, 1:2].reshape((1, an_num, 1, 1)) + pred_box[:, :, :, :, 2] = np.exp(pred_box[:, :, :, :, 2]) * anchor_w + pred_box[:, :, :, :, 3] = np.exp(pred_box[:, :, :, :, 3]) * anchor_h + if iou_aware: + pred_conf = (sigmoid(x[:, :, :, :, 4:5]) ** (1 - iou_aware_factor)) * ( + sigmoid(ioup) ** iou_aware_factor + ) + else: + pred_conf = sigmoid(x[:, :, :, :, 4:5]) + pred_conf[(pred_conf < conf_thresh)] = 0.0 + pred_score = sigmoid(x[:, :, :, :, 5:]) * pred_conf + pred_box = pred_box * (pred_conf > 0.0).astype('float32') + pred_box = 
pred_box.reshape((n, (-1), 4)) + (pred_box[:, :, :2], pred_box[:, :, 2:4]) = ( + (pred_box[:, :, :2] - (pred_box[:, :, 2:4] / 2.0)), + (pred_box[:, :, :2] + (pred_box[:, :, 2:4] / 2.0)), + ) + pred_box[:, :, 0] = pred_box[:, :, 0] * img_size[:, 1][:, np.newaxis] + pred_box[:, :, 1] = pred_box[:, :, 1] * img_size[:, 0][:, np.newaxis] + pred_box[:, :, 2] = pred_box[:, :, 2] * img_size[:, 1][:, np.newaxis] + pred_box[:, :, 3] = pred_box[:, :, 3] * img_size[:, 0][:, np.newaxis] + if clip_bbox: + for i in range(len(pred_box)): + pred_box[i, :, 0] = np.clip(pred_box[i, :, 0], 0, np.inf) + pred_box[i, :, 1] = np.clip(pred_box[i, :, 1], 0, np.inf) + pred_box[i, :, 2] = np.clip( + pred_box[i, :, 2], (-np.inf), (img_size[(i, 1)] - 1) + ) + pred_box[i, :, 3] = np.clip( + pred_box[i, :, 3], (-np.inf), (img_size[(i, 0)] - 1) + ) + return (pred_box, pred_score.reshape((n, (-1), class_num))) + + +class TestYoloBoxOp(OpTest): + def setUp(self): + self.initTestCase() + self.op_type = 'yolo_box' + self.place = paddle.device.MLUPlace(0) + self.__class__.use_mlu = True + self.__class__.no_need_check_grad = True + self.python_api = paddle.vision.ops.yolo_box + x = np.random.random(self.x_shape).astype('float32') + img_size = np.random.randint(10, 20, self.imgsize_shape).astype('int32') + self.attrs = { + 'anchors': self.anchors, + 'class_num': self.class_num, + 'conf_thresh': self.conf_thresh, + 'downsample_ratio': self.downsample, + 'clip_bbox': self.clip_bbox, + 'scale_x_y': self.scale_x_y, + 'iou_aware': self.iou_aware, + 'iou_aware_factor': self.iou_aware_factor, + } + self.inputs = {'X': x, 'ImgSize': img_size} + (boxes, scores) = YoloBox(x, img_size, self.attrs) + self.outputs = {'Boxes': boxes, 'Scores': scores} + + def test_check_output(self): + self.check_output_with_place(self.place, check_eager=False, atol=1e-5) + + def initTestCase(self): + self.anchors = [10, 13, 16, 30, 33, 23] + an_num = int((len(self.anchors) // 2)) + self.batch_size = 32 + self.class_num = 2 + self.conf_thresh = 0.5 + self.downsample = 32 + self.clip_bbox = True + self.x_shape = ( + self.batch_size, + (an_num * (5 + self.class_num)), + 13, + 13, + ) + self.imgsize_shape = (self.batch_size, 2) + self.scale_x_y = 1.0 + self.iou_aware = False + self.iou_aware_factor = 0.5 + + +class TestYoloBoxOpNoClipBbox(TestYoloBoxOp): + def initTestCase(self): + self.anchors = [10, 13, 16, 30, 33, 23] + an_num = int((len(self.anchors) // 2)) + self.batch_size = 32 + self.class_num = 2 + self.conf_thresh = 0.5 + self.downsample = 32 + self.clip_bbox = False + self.x_shape = ( + self.batch_size, + (an_num * (5 + self.class_num)), + 13, + 13, + ) + self.imgsize_shape = (self.batch_size, 2) + self.scale_x_y = 1.0 + self.iou_aware = False + self.iou_aware_factor = 0.5 + + +class TestYoloBoxOpScaleXY(TestYoloBoxOp): + def initTestCase(self): + self.anchors = [10, 13, 16, 30, 33, 23] + an_num = int((len(self.anchors) // 2)) + self.batch_size = 32 + self.class_num = 2 + self.conf_thresh = 0.5 + self.downsample = 32 + self.clip_bbox = True + self.x_shape = ( + self.batch_size, + (an_num * (5 + self.class_num)), + 13, + 13, + ) + self.imgsize_shape = (self.batch_size, 2) + self.scale_x_y = 1.2 + self.iou_aware = False + self.iou_aware_factor = 0.5 + + +class TestYoloBoxOpIoUAware(TestYoloBoxOp): + def initTestCase(self): + self.anchors = [10, 13, 16, 30, 33, 23] + an_num = int((len(self.anchors) // 2)) + self.batch_size = 32 + self.class_num = 2 + self.conf_thresh = 0.5 + self.downsample = 32 + self.clip_bbox = True + self.x_shape = ( + 
self.batch_size, + (an_num * (6 + self.class_num)), + 13, + 13, + ) + self.imgsize_shape = (self.batch_size, 2) + self.scale_x_y = 1.0 + self.iou_aware = True + self.iou_aware_factor = 0.5 + + +class TestYoloBoxDygraph(unittest.TestCase): + def test_dygraph(self): + paddle.disable_static() + img_size = np.ones((2, 2)).astype('int32') + img_size = paddle.to_tensor(img_size) + x1 = np.random.random([2, 14, 8, 8]).astype('float32') + x1 = paddle.to_tensor(x1) + (boxes, scores) = paddle.vision.ops.yolo_box( + x1, + img_size=img_size, + anchors=[10, 13, 16, 30], + class_num=2, + conf_thresh=0.01, + downsample_ratio=8, + clip_bbox=True, + scale_x_y=1.0, + ) + assert (boxes is not None) and (scores is not None) + x2 = np.random.random([2, 16, 8, 8]).astype('float32') + x2 = paddle.to_tensor(x2) + (boxes, scores) = paddle.vision.ops.yolo_box( + x2, + img_size=img_size, + anchors=[10, 13, 16, 30], + class_num=2, + conf_thresh=0.01, + downsample_ratio=8, + clip_bbox=True, + scale_x_y=1.0, + iou_aware=True, + iou_aware_factor=0.5, + ) + paddle.enable_static() + + +class TestYoloBoxStatic(unittest.TestCase): + def test_static(self): + x1 = paddle.static.data('x1', [2, 14, 8, 8], 'float32') + img_size = paddle.static.data('img_size', [2, 2], 'int32') + (boxes, scores) = paddle.vision.ops.yolo_box( + x1, + img_size=img_size, + anchors=[10, 13, 16, 30], + class_num=2, + conf_thresh=0.01, + downsample_ratio=8, + clip_bbox=True, + scale_x_y=1.0, + ) + assert (boxes is not None) and (scores is not None) + x2 = paddle.static.data('x2', [2, 16, 8, 8], 'float32') + (boxes, scores) = paddle.vision.ops.yolo_box( + x2, + img_size=img_size, + anchors=[10, 13, 16, 30], + class_num=2, + conf_thresh=0.01, + downsample_ratio=8, + clip_bbox=True, + scale_x_y=1.0, + iou_aware=True, + iou_aware_factor=0.5, + ) + assert (boxes is not None) and (scores is not None) + + +class TestYoloBoxOpHW(TestYoloBoxOp): + def initTestCase(self): + self.anchors = [10, 13, 16, 30, 33, 23] + an_num = int((len(self.anchors) // 2)) + self.batch_size = 32 + self.class_num = 2 + self.conf_thresh = 0.5 + self.downsample = 32 + self.clip_bbox = False + self.x_shape = (self.batch_size, (an_num * (5 + self.class_num)), 13, 9) + self.imgsize_shape = (self.batch_size, 2) + self.scale_x_y = 1.0 + self.iou_aware = False + self.iou_aware_factor = 0.5 + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py index e34da4c45a7c36..0fdc7ac0218991 100644 --- a/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py +++ b/python/paddle/fluid/tests/unittests/static_model_parallel_fused_attention.py @@ -36,7 +36,7 @@ def get_param_attr(weight, bias): DTYPE = "float32" MODEL_PARALLEL_SIZE = 2 n_head = 2 * MODEL_PARALLEL_SIZE -d_key = 4 +d_key = 2 hidden = n_head * d_key diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py index eb2ea8c56ac31b..239705b57ed1f5 100644 --- a/python/paddle/fluid/tests/unittests/test_adam_op.py +++ b/python/paddle/fluid/tests/unittests/test_adam_op.py @@ -25,10 +25,8 @@ class TestAdamOp1(OpTest): - def setUp(self): - '''Test Adam Op with supplied attributes - ''' + '''Test Adam Op with supplied attributes''' self.op_type = "adam" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -50,20 
+48,19 @@ def setUp(self): 'Moment2': moment2, 'LearningRate': np.array([learning_rate]).astype("float32"), 'Beta1Pow': np.array([beta1_pow]).astype("float32"), - 'Beta2Pow': np.array([beta2_pow]).astype("float32") + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), } self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} - param_out, moment1_out, \ - moment2_out = adam_step(self.inputs, self.attrs) + param_out, moment1_out, moment2_out = adam_step(self.inputs, self.attrs) self.outputs = { 'Moment1Out': moment1_out, 'Moment2Out': moment2_out, 'ParamOut': param_out, 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, - 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2, } def test_check_output(self): @@ -71,13 +68,11 @@ def test_check_output(self): class TestAdamOp2(OpTest): - def set_shape(self): self.shape = (102, 105) def setUp(self): - '''Test Adam Op with supplied attributes - ''' + '''Test Adam Op with supplied attributes''' self.op_type = "adam" self.set_shape() param = np.random.uniform(-1, 1, self.shape).astype("float32") @@ -100,20 +95,19 @@ def setUp(self): 'Moment2': moment2, 'LearningRate': np.array([learning_rate]).astype("float32"), 'Beta1Pow': np.array([beta1_pow]).astype("float32"), - 'Beta2Pow': np.array([beta2_pow]).astype("float32") + 'Beta2Pow': np.array([beta2_pow]).astype("float32"), } attributes = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2} - param_out, moment1_out, \ - moment2_out = adam_step(self.inputs, attributes) + param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes) self.outputs = { 'Moment1Out': moment1_out, 'Moment2Out': moment2_out, 'ParamOut': param_out, 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, - 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2, } def test_check_output(self): @@ -121,16 +115,13 @@ def test_check_output(self): class TestAdamOnlyTailOp(TestAdamOp2): - def set_shape(self): - self.shape = (3) + self.shape = 3 class TestAdamOpMultipleSteps(OpTest): - def setUp(self): - '''Test Adam Operator with supplied attributes - ''' + '''Test Adam Operator with supplied attributes''' self.op_type = "adam" self.num_steps = 10 @@ -154,19 +145,20 @@ def setUp(self): 'Moment2': moment2, 'LearningRate': np.array([learning_rate]).astype("float32"), 'Beta1Pow': np.array([self.beta1_pow]).astype("float32"), - 'Beta2Pow': np.array([self.beta2_pow]).astype("float32") + 'Beta2Pow': np.array([self.beta2_pow]).astype("float32"), } self.attrs = { 'epsilon': epsilon, 'beta1': self.beta1, - 'beta2': self.beta2 + 'beta2': self.beta2, } def test_check_output(self): for _ in range(self.num_steps): - param_out, moment1_out, \ - moment2_out = adam_step(self.inputs, self.attrs) + param_out, moment1_out, moment2_out = adam_step( + self.inputs, self.attrs + ) beta1_pow_out = self.inputs['Beta1Pow'] * self.beta1 beta2_pow_out = self.inputs['Beta2Pow'] * self.beta2 @@ -175,7 +167,7 @@ def test_check_output(self): 'Moment2Out': moment2_out, 'ParamOut': param_out, 'Beta1PowOut': beta1_pow_out, - 'Beta2PowOut': beta2_pow_out + 'Beta2PowOut': beta2_pow_out, } # Verify output for this step @@ -191,8 +183,9 @@ def test_check_output(self): self.inputs['Beta2Pow'] = beta2_pow_out # Randomize gradient for next step - self.inputs['Grad'] = np.random.uniform( - -1, 1, (102, 105)).astype("float32") + self.inputs['Grad'] = np.random.uniform(-1, 1, (102, 
105)).astype( + "float32" + ) def test_api_eager_dygraph(self): with _test_eager_guard(): @@ -272,8 +265,9 @@ def adamw_step(inputs, attributes): return param_out, moment1_out, moment2_out -def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad, - lazy_mode): +def adam_step_sparse( + inputs, attributes, height, rows, row_numel, np_grad, lazy_mode +): ''' Simulate one step of the adam optimizer :param inputs: dict of inputs @@ -298,13 +292,16 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad, param_out = np.zeros(shape=[height, row_numel]) def update_row(row_id, update_value): - moment1_out[row_id] = beta1 * moment1[row_id] + (1 - - beta1) * update_value - moment2_out[row_id] = beta2 * moment2[row_id] + ( - 1 - beta2) * np.square(update_value) + moment1_out[row_id] = ( + beta1 * moment1[row_id] + (1 - beta1) * update_value + ) + moment2_out[row_id] = beta2 * moment2[row_id] + (1 - beta2) * np.square( + update_value + ) lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow) param_out[row_id] = param[row_id] - lr_t * ( - moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon)) + moment1_out[row_id] / (np.sqrt(moment2_out[row_id]) + epsilon) + ) if lazy_mode: for idx, row_id in enumerate(rows): @@ -320,7 +317,6 @@ def update_row(row_id, update_value): class TestSparseAdamOp(unittest.TestCase): - def setup(self, scope, place, lazy_mode): beta1 = 0.78 beta2 = 0.836 @@ -339,14 +335,14 @@ def setup(self, scope, place, lazy_mode): "Moment2": np.full((height, row_numel), 5.0).astype("float32"), 'Beta1Pow': beta1_pow, 'Beta2Pow': beta2_pow, - "LearningRate": np.full((1), 2.0).astype("float32") + "LearningRate": np.full((1), 2.0).astype("float32"), } self.init_output = np.full((height, row_numel), 0.0).astype("float32") self.attrs = { 'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2, - 'min_row_size_to_use_multithread': 2 + 'min_row_size_to_use_multithread': 2, } grad_selected_rows = scope.var('Grad').get_selected_rows() @@ -361,15 +357,21 @@ def setup(self, scope, place, lazy_mode): self.sparse_inputs = ["Grad"] - param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs, - height, rows, row_numel, - np_array, lazy_mode) + param_out, mom1, mom2 = adam_step_sparse( + self.dense_inputs, + self.attrs, + height, + rows, + row_numel, + np_array, + lazy_mode, + ) self.outputs = { "ParamOut": param_out, "Moment1Out": mom1, "Moment2Out": mom2, 'Beta1PowOut': beta1_pow * beta1, - 'Beta2PowOut': beta2_pow * beta2 + 'Beta2PowOut': beta2_pow * beta2, } def check_with_place(self, place, lazy_mode): @@ -414,10 +416,8 @@ def test_sparse_adam(self): class TestAdamOpBetaVariable(OpTest): - def setUp(self): - '''Test Adam Op with beta as Variable - ''' + '''Test Adam Op with beta as Variable''' self.op_type = "adam" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -446,15 +446,14 @@ def setUp(self): attributes = {'epsilon': epsilon} - param_out, moment1_out, \ - moment2_out = adam_step(self.inputs, attributes) + param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes) self.outputs = { 'Moment1Out': moment1_out, 'Moment2Out': moment2_out, 'ParamOut': param_out, 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, - 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2, } def test_check_output(self): @@ -462,10 +461,8 @@ def test_check_output(self): class 
TestAdamOpBetaEpsilonVariable(OpTest): - def setUp(self): - '''Test Adam Op with beta/epsilon as Variable - ''' + '''Test Adam Op with beta/epsilon as Variable''' self.op_type = "adam" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -495,15 +492,14 @@ def setUp(self): attributes = {'epsilon': epsilon} - param_out, moment1_out, \ - moment2_out = adam_step(self.inputs, attributes) + param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes) self.outputs = { 'Moment1Out': moment1_out, 'Moment2Out': moment2_out, 'ParamOut': param_out, 'Beta1PowOut': np.array([beta1_pow]).astype("float32") * beta1, - 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2 + 'Beta2PowOut': np.array([beta2_pow]).astype("float32") * beta2, } def test_check_output(self): @@ -511,10 +507,8 @@ def test_check_output(self): class TestAdamOpWithGlobalBetaPow(OpTest): - def setUp(self): - '''Test Adam Op with global_beta_pow - ''' + '''Test Adam Op with global_beta_pow''' self.op_type = "adam" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -544,8 +538,7 @@ def setUp(self): attributes = {'epsilon': epsilon} - param_out, moment1_out, \ - moment2_out = adam_step(self.inputs, attributes) + param_out, moment1_out, moment2_out = adam_step(self.inputs, attributes) self.attrs = {'use_global_beta_pow': True} @@ -555,7 +548,7 @@ def setUp(self): 'Moment2Out': moment2_out, 'ParamOut': param_out, 'Beta1PowOut': np.array([]), - 'Beta2PowOut': np.array([]) + 'Beta2PowOut': np.array([]), } def test_check_output(self): @@ -563,10 +556,8 @@ def test_check_output(self): class TestAdamOpWithSkipUpdate(OpTest): - def setUp(self): - '''Test Adam Op with global_beta_pow - ''' + '''Test Adam Op with global_beta_pow''' self.op_type = "adam" param = np.random.uniform(-1, 1, (102, 105)).astype("float32") grad = np.random.uniform(-1, 1, (102, 105)).astype("float32") @@ -613,7 +604,6 @@ def test_check_output(self): class TestAdamOpV2(unittest.TestCase): - def test_adam_op(self): place = fluid.CPUPlace() shape = [2, 3, 8, 8] @@ -626,20 +616,20 @@ def test_adam_op(self): conv = fluid.layers.conv2d(data, 8, 3) loss = fluid.layers.reduce_mean(conv) - beta1 = fluid.layers.create_global_var(shape=[1], - value=0.85, - dtype='float32', - persistable=True) - beta2 = fluid.layers.create_global_var(shape=[1], - value=0.95, - dtype='float32', - persistable=True) + beta1 = fluid.layers.create_global_var( + shape=[1], value=0.85, dtype='float32', persistable=True + ) + beta2 = fluid.layers.create_global_var( + shape=[1], value=0.95, dtype='float32', persistable=True + ) betas = [beta1, beta2] - opt = paddle.optimizer.Adam(learning_rate=1e-5, - beta1=beta1, - beta2=beta2, - weight_decay=0.01, - epsilon=1e-8) + opt = paddle.optimizer.Adam( + learning_rate=1e-5, + beta1=beta1, + beta2=beta2, + weight_decay=0.01, + epsilon=1e-8, + ) opt.minimize(loss) exe.run(startup) @@ -653,8 +643,9 @@ def test_adam_op_dygraph(self): a = fluid.dygraph.to_variable(value) linear = fluid.Linear(13, 5, dtype="float32") - adam = paddle.optimizer.Adam(learning_rate=0.01, - parameters=linear.parameters()) + adam = paddle.optimizer.Adam( + learning_rate=0.01, parameters=linear.parameters() + ) out = linear(a) out.backward() adam.step() @@ -670,26 +661,29 @@ def test_adam_op_with_state_dict(self): state_dict = adam.state_dict() adam.set_state_dict(state_dict) - #learning_rate is LRScheduler + # learning_rate is 
LRScheduler learning_rate = paddle.optimizer.lr.CosineAnnealingDecay( - learning_rate=0.1, T_max=10) + learning_rate=0.1, T_max=10 + ) adam = paddle.optimizer.Adam( learning_rate=learning_rate, weight_decay=fluid.regularizer.L2Decay(0.001), - parameters=emb.parameters()) + parameters=emb.parameters(), + ) lr = adam.get_lr() state_dict = adam.state_dict() adam.set_state_dict(state_dict) - #leanrning_rate is Tensor + # learning_rate is Tensor with self.assertRaises(TypeError): learning_rate = np.array([0.01]).astype("float32") learning_rate = paddle.to_tensor(learning_rate) - adam = paddle.optimizer.Adam(learning_rate=learning_rate, - parameters=emb.parameters()) + adam = paddle.optimizer.Adam( + learning_rate=learning_rate, parameters=emb.parameters() + ) params = adam.get_opti_var_name_list() - assert (params is not None) + assert params is not None paddle.enable_static() def test_adam_with_grad_clip(self): @@ -698,9 +692,9 @@ def test_adam_with_grad_clip(self): a = fluid.dygraph.to_variable(value) linear = fluid.Linear(13, 5, dtype="float32") clip = fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) - adam = paddle.optimizer.Adam(0.1, - parameters=linear.parameters(), - grad_clip=clip) + adam = paddle.optimizer.Adam( + 0.1, parameters=linear.parameters(), grad_clip=clip + ) out = linear(a) out.backward() adam.step() @@ -715,11 +709,11 @@ def test_adam_op_with_set_lr(self): lr = 0.01 adam.set_lr(lr) cur_lr = adam.get_lr() - assert (lr == cur_lr) + assert lr == cur_lr with self.assertRaises(TypeError): - lr_var = paddle.fluid.layers.create_global_var(shape=[1], - value=lr, - dtype='float32') + lr_var = paddle.fluid.layers.create_global_var( + shape=[1], value=lr, dtype='float32' + ) adam.set_lr(lr_var) paddle.enable_static() @@ -727,17 +721,17 @@ def test_adam_op_invalid_input(self): paddle.disable_static() linear = paddle.nn.Linear(10, 10) with self.assertRaises(ValueError): - adam = paddle.optimizer.Adam(0.1, - beta1=-1, - parameters=linear.parameters()) + adam = paddle.optimizer.Adam( + 0.1, beta1=-1, parameters=linear.parameters() + ) with self.assertRaises(ValueError): - adam = paddle.optimizer.Adam(0.1, - beta2=-1, - parameters=linear.parameters()) + adam = paddle.optimizer.Adam( + 0.1, beta2=-1, parameters=linear.parameters() + ) with self.assertRaises(ValueError): - adam = paddle.optimizer.Adam(0.1, - epsilon=-1, - parameters=linear.parameters()) + adam = paddle.optimizer.Adam( + 0.1, epsilon=-1, parameters=linear.parameters() + ) paddle.enable_static() def test_adam_op_with_sparse_input_and_weight_decay(self): @@ -746,9 +740,9 @@ def test_adam_op_with_sparse_input_and_weight_decay(self): x_data = np.arange(0, 10).reshape((10, 1)).astype(np.int64) x = paddle.to_tensor(x_data, stop_gradient=False) emb = paddle.nn.Embedding(10, 10, sparse=True) - adam = paddle.optimizer.Adam(0.001, - parameters=emb.parameters(), - weight_decay=0.01) + adam = paddle.optimizer.Adam( + 0.001, parameters=emb.parameters(), weight_decay=0.01 + ) with self.assertRaises(RuntimeError): out = emb(x) @@ -766,13 +760,14 @@ def test_api_eager_dygraph(self): class TestAdamOptimizer(unittest.TestCase): - - def _test(self, - place, - use_tensor=True, - use_fluid_api=True, - use_global_beta_pow=False, - flatten_param_grads=False): + def _test( + self, + place, + use_tensor=True, + use_fluid_api=True, + use_global_beta_pow=False, + flatten_param_grads=False, + ): paddle.enable_static() main_prog = paddle.static.Program() startup_prog = paddle.static.Program() @@ -786,29 +781,30 @@ def _test(self, weight_attr1 =
paddle.ParamAttr( name="weight1", initializer=fluid.initializer.Constant(value=1.0), - trainable=True) + trainable=True, + ) weight_attr2 = paddle.ParamAttr( name="weight2", initializer=fluid.initializer.Constant(value=2.0), - trainable=True) + trainable=True, + ) clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0) with paddle.static.program_guard(main_prog, startup_prog): with paddle.utils.unique_name.guard(): a = paddle.static.data(name="a", shape=[2, 2], dtype='float32') b = paddle.static.data(name="b", shape=[2, 2], dtype='float32') - label = paddle.static.data(name="label", - shape=[2, 1], - dtype='int64') + label = paddle.static.data( + name="label", shape=[2, 1], dtype='int64' + ) sum = paddle.add(a, b) z = paddle.pow(sum, 2.0) fc_1 = fluid.layers.fc(input=z, size=2, param_attr=weight_attr1) - prediction = fluid.layers.fc(input=fc_1, - size=2, - param_attr=weight_attr2, - act='softmax') + prediction = fluid.layers.fc( + input=fc_1, size=2, param_attr=weight_attr2, act='softmax' + ) cost = fluid.layers.cross_entropy(input=prediction, label=label) loss = fluid.layers.reduce_mean(cost) @@ -821,19 +817,22 @@ def _test(self, value=float(beta1_init), dtype='float32', persistable=True, - name="beta1") + name="beta1", + ) beta2 = fluid.layers.create_global_var( shape=[1], value=float(beta2_init), dtype='float32', persistable=True, - name="beta2") + name="beta2", + ) epsilon = fluid.layers.create_global_var( shape=[1], value=float(epsilon_init), dtype='float32', persistable=True, - name="epsilon") + name="epsilon", + ) if use_fluid_api: adam = fluid.optimizer.Adam( learning_rate=0.01, @@ -843,13 +842,16 @@ def _test(self, use_global_beta_pow=use_global_beta_pow, flatten_param_grads=flatten_param_grads, align_size=256, - grad_clip=clip) + grad_clip=clip, + ) else: - adam = paddle.optimizer.Adam(learning_rate=0.01, - beta1=beta1, - beta2=beta2, - epsilon=epsilon, - grad_clip=clip) + adam = paddle.optimizer.Adam( + learning_rate=0.01, + beta1=beta1, + beta2=beta2, + epsilon=epsilon, + grad_clip=clip, + ) else: if use_fluid_api: adam = fluid.optimizer.Adam( @@ -860,13 +862,16 @@ def _test(self, use_global_beta_pow=use_global_beta_pow, flatten_param_grads=flatten_param_grads, align_size=256, - grad_clip=clip) + grad_clip=clip, + ) else: - adam = fluid.optimizer.Adam(learning_rate=0.01, - beta1=beta1_init, - beta2=beta2_init, - epsilon=epsilon_init, - grad_clip=clip) + adam = fluid.optimizer.Adam( + learning_rate=0.01, + beta1=beta1_init, + beta2=beta2_init, + epsilon=epsilon_init, + grad_clip=clip, + ) adam.minimize(loss) @@ -877,15 +882,16 @@ def _test(self, print("Start run on {}".format(place)) for epoch in range(10): - pred_res, loss_res = exe.run(main_prog, - feed={ - "a": a_np, - "b": b_np, - "label": label_np - }, - fetch_list=[prediction, loss]) - print("Epoch {} | Prediction[0]: {}, Loss: {}".format( - epoch, pred_res[0], loss_res)) + pred_res, loss_res = exe.run( + main_prog, + feed={"a": a_np, "b": b_np, "label": label_np}, + fetch_list=[prediction, loss], + ) + print( + "Epoch {} | Prediction[0]: {}, Loss: {}".format( + epoch, pred_res[0], loss_res + ) + ) paddle.disable_static() return pred_res, loss_res @@ -897,10 +903,13 @@ def _test_with_place(self, place): for use_fluid_api in [True, False]: for use_global_beta_pow in [True, False]: for flatten_param_grads in [True, False]: - pred, loss = self._test(place, use_tensor, - use_fluid_api, - use_global_beta_pow, - flatten_param_grads) + pred, loss = self._test( + place, + use_tensor, + use_fluid_api, + use_global_beta_pow, + 
flatten_param_grads, + ) preds.append(pred) losses.append(loss) for pred in preds: @@ -922,21 +931,22 @@ def test_adam_flatten_param_grads_with_regularizer(self): name="weight1", initializer=fluid.initializer.Constant(value=1.0), regularizer=fluid.regularizer.L1DecayRegularizer( - regularization_coeff=0.1), - trainable=True) + regularization_coeff=0.1 + ), + trainable=True, + ) with fluid.program_guard(main): x = fluid.data(name='x', shape=[None, 13], dtype='float32') y = fluid.data(name='y', shape=[None, 1], dtype='float32') - y_predict = fluid.layers.fc(input=x, - size=1, - act=None, - param_attr=weight_attr) + y_predict = fluid.layers.fc( + input=x, size=1, act=None, param_attr=weight_attr + ) cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - adam = fluid.optimizer.AdamOptimizer(0.01, - flatten_param_grads=True, - align_size=256) + adam = fluid.optimizer.AdamOptimizer( + 0.01, flatten_param_grads=True, align_size=256 + ) adam.minimize(avg_cost) paddle.disable_static() @@ -959,13 +969,16 @@ def test_adam_exception(self): adam = fluid.optimizer.Adam(use_global_beta_pow=True) adam.minimize(loss) self.assertRaises(Exception, adam._get_global_accumulator, 'tmp') - adam._add_global_accumulator('tmp', - type=core.VarDesc.VarType.LOD_TENSOR) + adam._add_global_accumulator( + 'tmp', type=core.VarDesc.VarType.LOD_TENSOR + ) adam._get_global_accumulator('tmp') - self.assertRaises(Exception, - adam._add_global_accumulator, - adam._beta1_pow_acc_str, - type=core.VarDesc.VarType.LOD_TENSOR) + self.assertRaises( + Exception, + adam._add_global_accumulator, + adam._beta1_pow_acc_str, + type=core.VarDesc.VarType.LOD_TENSOR, + ) paddle.disable_static() def test_adam_save_load(self): @@ -976,12 +989,14 @@ def test_adam_save_load(self): state_dict = linear.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") - scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, - warmup_steps=100, - verbose=True) - adam = paddle.fluid.optimizer.Adam(learning_rate=scheduler, - parameter_list=linear.parameters(), - use_global_beta_pow=True) + scheduler = paddle.optimizer.lr.NoamDecay( + d_model=0.01, warmup_steps=100, verbose=True + ) + adam = paddle.fluid.optimizer.Adam( + learning_rate=scheduler, + parameter_list=linear.parameters(), + use_global_beta_pow=True, + ) adam.minimize(b) state_dict = adam.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") @@ -1002,13 +1017,14 @@ def get_opt(dtype, shape): state_dict = linear.state_dict() fluid.save_dygraph(state_dict, "paddle_dy") - scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, - warmup_steps=100, - verbose=True) + scheduler = paddle.optimizer.lr.NoamDecay( + d_model=0.01, warmup_steps=100, verbose=True + ) adam = paddle.fluid.optimizer.Adam( learning_rate=scheduler, parameter_list=linear.parameters(), - use_global_beta_pow=True) + use_global_beta_pow=True, + ) adam.minimize(b) return adam @@ -1023,14 +1039,14 @@ def get_opt(dtype, shape): self.assertRaises(AssertionError, adam2.set_state_dict, opt_state_dict) adam3 = get_opt('float32', [10, 10]) # shape not match - opt_state_dict['beta1_pow_acc_0'] = np.array([0.9, 0.9], - dtype='float32') + opt_state_dict['beta1_pow_acc_0'] = np.array( + [0.9, 0.9], dtype='float32' + ) self.assertRaises(AssertionError, adam3.set_state_dict, opt_state_dict) paddle.enable_static() class TestAdamOpV2Group(TestAdamOpV2): - def test_adam_op(self): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") @@ -1038,16 +1054,19 @@ def test_adam_op(self): linear_1 
= paddle.nn.Linear(13, 5) linear_2 = paddle.nn.Linear(5, 3) # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate=0.01, - parameters=[{ - 'params': linear_1.parameters() - }, { - 'params': linear_2.parameters(), - 'weight_decay': 0.001, - 'beta1': 0.1, - 'beta2': 0.99 - }], - weight_decay=0.1) + adam = paddle.optimizer.Adam( + learning_rate=0.01, + parameters=[ + {'params': linear_1.parameters()}, + { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99, + }, + ], + weight_decay=0.1, + ) out = linear_1(a) out = linear_2(out) out.backward() @@ -1056,13 +1075,14 @@ def test_adam_op(self): class TestMultiTensorAdam(unittest.TestCase): - - def _adam_optimize_dygraph(self, - place, - use_param_attr=False, - use_param_group=False, - use_amp=False, - use_multi_tensor=False): + def _adam_optimize_dygraph( + self, + place, + use_param_attr=False, + use_param_group=False, + use_amp=False, + use_multi_tensor=False, + ): paddle.disable_static() paddle.seed(10) paddle.set_device(place) @@ -1072,29 +1092,40 @@ def _adam_optimize_dygraph(self, weight_attr = paddle.ParamAttr( learning_rate=0.5, regularizer=paddle.regularizer.L2Decay(1.0), - trainable=True) + trainable=True, + ) if use_param_attr: model = paddle.nn.Linear(5, 5, weight_attr) else: model = paddle.nn.Linear(5, 5) if not use_param_group: - optimizer = paddle.optimizer.Adam(parameters=model.parameters(), - use_multi_tensor=use_multi_tensor, - multi_precision=use_amp) + optimizer = paddle.optimizer.Adam( + parameters=model.parameters(), + use_multi_tensor=use_multi_tensor, + multi_precision=use_amp, + ) else: - optimizer = paddle.optimizer.Adam(parameters=[{ - 'params': - model.parameters(), - 'weight_decay': - 0.001, - 'beta1': - 0.1, - 'beta2': - 0.99 - }], - use_multi_tensor=use_multi_tensor, - multi_precision=use_amp) + parameters = list(model.parameters()) + param_num = len(parameters) + optimizer = paddle.optimizer.Adam( + parameters=[ + { + 'params': parameters[: int(param_num / 2)], + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99, + }, + { + 'params': parameters[int(param_num / 2) :], + 'weight_decay': 0.001, + 'beta1': 0.1, + 'beta2': 0.99, + }, + ], + use_multi_tensor=use_multi_tensor, + multi_precision=use_amp, + ) for idx in range(2): if place == 'gpu' and use_amp == True: @@ -1118,10 +1149,9 @@ def _adam_optimize_dygraph(self, return output, model.parameters() - def _adam_optimize_static(self, - place, - use_amp=False, - use_multi_tensor=False): + def _adam_optimize_static( + self, place, use_amp=False, use_multi_tensor=False + ): paddle.enable_static() paddle.seed(10) np.random.seed(10) @@ -1130,24 +1160,26 @@ def _adam_optimize_static(self, exe = paddle.static.Executor(place=place) train_program = paddle.static.Program() startup_program = paddle.static.Program() - optimizer = paddle.optimizer.Adam(multi_precision=use_amp, - use_multi_tensor=use_multi_tensor) + optimizer = paddle.optimizer.Adam( + multi_precision=use_amp, use_multi_tensor=use_multi_tensor + ) if use_amp: optimizer = paddle.static.amp.decorate( optimizer, init_loss_scaling=128.0, use_dynamic_loss_scaling=True, use_pure_fp16=True, - use_fp16_guard=False) + use_fp16_guard=False, + ) with paddle.static.program_guard(train_program, startup_program): if use_amp: - data = paddle.static.data(shape=[2, 2], - name='X', - dtype='float16') + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16' + ) else: - data = paddle.static.data(shape=[2, 2], - name='X', - 
dtype='float32') + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32' + ) hidden = paddle.static.nn.fc(x=data, size=10) loss = paddle.mean(hidden) optimizer.minimize(loss) @@ -1159,9 +1191,9 @@ def _adam_optimize_static(self, x = np.random.random(size=(2, 2)).astype('float32') out = [] for idx in range(5): - loss_data, = exe.run(train_program, - feed={"X": x}, - fetch_list=[loss.name]) + (loss_data,) = exe.run( + train_program, feed={"X": x}, fetch_list=[loss.name] + ) out.append(loss_data) return out @@ -1174,49 +1206,59 @@ def _get_places(self): def _check_with_place_amp(self, place, use_amp): # test dygraph mode output_dygraph1, params_dygraph1 = self._adam_optimize_dygraph( - place=place, use_amp=use_amp, use_multi_tensor=True) + place=place, use_amp=use_amp, use_multi_tensor=True + ) output_dygraph2, params_dygraph2 = self._adam_optimize_dygraph( - place=place, use_amp=use_amp, use_multi_tensor=False) + place=place, use_amp=use_amp, use_multi_tensor=False + ) np.testing.assert_allclose(output_dygraph1, output_dygraph2, rtol=1e-05) for idx in range(len(params_dygraph1)): - np.testing.assert_allclose(params_dygraph1[idx], - params_dygraph2[idx], - rtol=1e-05) + np.testing.assert_allclose( + params_dygraph1[idx], params_dygraph2[idx], rtol=1e-05 + ) # test static mode - output_static1 = self._adam_optimize_static(place=place, - use_amp=use_amp, - use_multi_tensor=True) - output_static2 = self._adam_optimize_static(place=place, - use_amp=use_amp, - use_multi_tensor=False) + output_static1 = self._adam_optimize_static( + place=place, use_amp=use_amp, use_multi_tensor=True + ) + output_static2 = self._adam_optimize_static( + place=place, use_amp=use_amp, use_multi_tensor=False + ) for idx in range(len(output_static1)): - np.testing.assert_allclose(output_static1[idx], - output_static2[idx], - rtol=1e-05) + np.testing.assert_allclose( + output_static1[idx], output_static2[idx], rtol=1e-05 + ) def _check_with_param_arrt(self, place, use_amp): - output1, params1 = self._adam_optimize_dygraph(place=place, - use_amp=use_amp, - use_param_attr=True, - use_multi_tensor=True) - output2, params2 = self._adam_optimize_dygraph(place=place, - use_amp=use_amp, - use_param_attr=True, - use_multi_tensor=False) + output1, params1 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_attr=True, + use_multi_tensor=True, + ) + output2, params2 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_attr=True, + use_multi_tensor=False, + ) np.testing.assert_allclose(output1, output2, rtol=1e-05) for idx in range(len(params1)): np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05) def _check_with_param_group(self, place, use_amp): - output1, params1 = self._adam_optimize_dygraph(place=place, - use_amp=use_amp, - use_param_group=True, - use_multi_tensor=True) - output2, params2 = self._adam_optimize_dygraph(place=place, - use_amp=use_amp, - use_param_group=True, - use_multi_tensor=False) + output1, params1 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_group=True, + use_multi_tensor=True, + ) + output2, params2 = self._adam_optimize_dygraph( + place=place, + use_amp=use_amp, + use_param_group=True, + use_multi_tensor=False, + ) np.testing.assert_allclose(output1, output2, rtol=1e-05) for idx in range(len(params1)): diff --git a/python/paddle/fluid/tests/unittests/test_assign_op.py b/python/paddle/fluid/tests/unittests/test_assign_op.py index fa902542f162a0..9721cca5bf9c68 100644 --- 
a/python/paddle/fluid/tests/unittests/test_assign_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_op.py @@ -24,6 +24,9 @@ from paddle.fluid import compiler, Program, program_guard from paddle.fluid.backward import append_backward import paddle.fluid.framework as framework +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers class TestAssignOp(op_test.OpTest): @@ -258,5 +261,79 @@ def test_type_error(self): paddle.disable_static() +class TestAssignDoubleGradCheck(unittest.TestCase): + + def assign_wrapper(self, x): + return paddle.fluid.layers.assign(x[0]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not include -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [3, 4, 5], False, dtype) + data.persistable = True + out = paddle.fluid.layers.assign(data) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.assign_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestAssignTripleGradCheck(unittest.TestCase): + + def assign_wrapper(self, x): + return paddle.fluid.layers.assign(x[0]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not include -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [3, 4, 5], False, dtype) + data.persistable = True + out = paddle.fluid.layers.assign(data) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.assign_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py deleted file mode 100644 index f4a02679b32206..00000000000000 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_api.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.
- -from __future__ import print_function - -import unittest -import paddle -import paddle.fluid as fluid -import paddle.nn as nn -import paddle.distributed as dist -from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context -from paddle.distributed.auto_parallel.process_mesh import ProcessMesh - -paddle.enable_static() - -process_mesh1 = [0, 1, 2, 3] -process_mesh2 = [[0, 1, 2], [3, 4, 5]] - - -class SimpleNet(nn.Layer): - - def __init__(self, vocab_size=128, hidden_size=4): - super(SimpleNet, self).__init__() - self.word_embeddings = nn.Embedding(vocab_size, hidden_size) - self.dense1 = nn.Linear(hidden_size, hidden_size) - self.dense2 = nn.Linear(hidden_size, hidden_size // 2) - - def forward(self, x, y): - # Test shard_tensor interface with dist_attr arg - x = dist.shard_tensor(x, - dist_attr={ - "process_mesh": process_mesh1, - "dims_mapping": [0, -1] - }) - emb_out = self.word_embeddings(x) - # Test shard_tensor interface with no dist_attr arg - y = dist.shard_tensor(y) - linear1 = self.dense1(y) - out = self.dense2(linear1) - - return x, y - - -class TestAutoParallelAPI(unittest.TestCase): - - def test_api(self): - dist_context = get_default_distributed_context() - - net = SimpleNet() - data1 = fluid.layers.fill_constant(shape=[2, 4], value=1, dtype="int64") - data2 = fluid.layers.fill_constant(shape=[2, 4], - value=2, - dtype="float32") - data3 = fluid.layers.fill_constant(shape=[2, 4], - value=4, - dtype="float32") - - x, y = net.forward(data1, data2) - - dist_x = dist_context.get_dist_tensor_for_program(x) - self.assertEqual(dist_x.dist_attr.process_mesh.processes, process_mesh1) - self.assertEqual(dist_x.dist_attr.dims_mapping, [0, -1]) - self.assertEqual(dist_x.dist_attr.shard_sizes, None) - self.assertEqual(dist_x.dist_attr.device_placement, None) - self.assertTrue(dist_x.dist_attr.is_annotated("process_mesh")) - self.assertTrue(dist_x.dist_attr.is_annotated("dims_mapping")) - self.assertFalse(dist_x.dist_attr.is_annotated("shard_sizes")) - self.assertFalse(dist_x.dist_attr.is_annotated("device_placement")) - - dist_y = dist_context.get_dist_tensor_for_program(y) - self.assertEqual(dist_y.dist_attr.process_mesh, None) - self.assertEqual(dist_y.dist_attr.dims_mapping, [-1, -1]) - self.assertEqual(dist_y.dist_attr.shard_sizes, None) - self.assertEqual(dist_y.dist_attr.device_placement, None) - self.assertFalse(dist_y.dist_attr.is_annotated("process_mesh")) - self.assertFalse(dist_y.dist_attr.is_annotated("dims_mapping")) - self.assertFalse(dist_y.dist_attr.is_annotated("shard_sizes")) - self.assertFalse(dist_y.dist_attr.is_annotated("device_placement")) - - # Test shard_op interface with dist_attr - dims_mapping1 = [0, 1] - dims_mapping2 = [-1, 0] - dist_add = dist.shard_op(paddle.add, - dist_attr={ - data2: { - "process_mesh": process_mesh2, - "dims_mapping": dims_mapping1 - }, - data3: { - "dims_mapping": dims_mapping2 - } - }) - results = dist_add(data2, data3) - ops = paddle.static.default_main_program().block(0).ops - last_op = ops[-1] - - dist_op = dist_context.get_dist_op_for_program(last_op) - self.assertEqual(dist_op.dist_attr.process_mesh, - ProcessMesh(process_mesh2)) - self.assertEqual(dist_op.dist_attr.impl_type, "default") - self.assertEqual(dist_op.dist_attr.impl_idx, 0) - self.assertTrue(dist_op.dist_attr.is_annotated("process_mesh")) - - data2_dist_attr = dist_op.dist_attr.get_input_dist_attr(data2.name) - self.assertEqual(data2_dist_attr.process_mesh, - dist_op.dist_attr.process_mesh) - 
self.assertEqual(data2_dist_attr.dims_mapping, dims_mapping1) - self.assertEqual(data2_dist_attr.shard_sizes, None) - self.assertEqual(data2_dist_attr.device_placement, None) - self.assertTrue(data2_dist_attr.is_annotated("process_mesh")) - self.assertTrue(data2_dist_attr.is_annotated("dims_mapping")) - self.assertFalse(data2_dist_attr.is_annotated("shard_sizes")) - self.assertFalse(data2_dist_attr.is_annotated("device_placement")) - - data3_dist_attr = dist_op.dist_attr.get_input_dist_attr(data3.name) - self.assertEqual(data3_dist_attr.process_mesh, - dist_op.dist_attr.process_mesh) - self.assertEqual(data3_dist_attr.dims_mapping, dims_mapping2) - self.assertEqual(data3_dist_attr.shard_sizes, None) - self.assertEqual(data3_dist_attr.device_placement, None) - self.assertTrue(data3_dist_attr.is_annotated("process_mesh")) - self.assertTrue(data3_dist_attr.is_annotated("dims_mapping")) - self.assertFalse(data3_dist_attr.is_annotated("shard_sizes")) - self.assertFalse(data3_dist_attr.is_annotated("device_placement")) - - # Test shard_op interface with dist_attr - dist_add = dist.shard_op(paddle.add) - results = dist_add(data2, data3) - ops = paddle.static.default_main_program().block(0).ops - last_op = ops[-1] - dist_op = dist_context.get_dist_op_for_program(last_op) - self.assertEqual(dist_op.dist_attr.process_mesh, None) - self.assertEqual(dist_op.dist_attr.impl_type, "default") - self.assertEqual(dist_op.dist_attr.impl_idx, 0) - self.assertFalse(dist_op.dist_attr.is_annotated("process_mesh")) - - data2_dist_attr = dist_op.dist_attr.get_input_dist_attr(data2.name) - self.assertEqual(data2_dist_attr.process_mesh, - dist_op.dist_attr.process_mesh) - self.assertEqual(data2_dist_attr.dims_mapping, [-1, -1]) - self.assertEqual(data2_dist_attr.shard_sizes, None) - self.assertEqual(data2_dist_attr.device_placement, None) - self.assertFalse(data2_dist_attr.is_annotated("process_mesh")) - self.assertFalse(data2_dist_attr.is_annotated("dims_mapping")) - self.assertFalse(data2_dist_attr.is_annotated("shard_sizes")) - self.assertFalse(data2_dist_attr.is_annotated("device_placement")) - - data3_dist_attr = dist_op.dist_attr.get_input_dist_attr(data3.name) - self.assertEqual(data3_dist_attr.process_mesh, - dist_op.dist_attr.process_mesh) - self.assertEqual(data3_dist_attr.dims_mapping, [-1, -1]) - self.assertEqual(data3_dist_attr.shard_sizes, None) - self.assertEqual(data3_dist_attr.device_placement, None) - self.assertFalse(data3_dist_attr.is_annotated("process_mesh")) - self.assertFalse(data3_dist_attr.is_annotated("dims_mapping")) - self.assertFalse(data3_dist_attr.is_annotated("shard_sizes")) - self.assertFalse(data3_dist_attr.is_annotated("device_placement")) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py index 393d79557a927a..91eccbf022b2d4 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion.py @@ -26,7 +26,7 @@ import paddle.tensor as tensor from paddle.fluid import layers from paddle.nn.layer.transformer import _convert_param_attr_to_list -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program from paddle.distributed.auto_parallel.utils import 
print_program_with_dist_attr @@ -66,39 +66,13 @@ def __init__(self, self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") def forward(self, input): - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) - elif _global_parallel_strategy == "pp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh2, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) out = self.norm(input) out = self.linear0(out) @@ -119,18 +93,10 @@ def mlp_pretrain_forward(train_program, start_program): shape=[batch_size, sequence_len, hidden_size], dtype='float32') - if _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -146,7 +112,8 @@ def test_mlp_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["dp"]) train_program = static.Program() start_program = static.Program() dist_context = DistributedContext() @@ -161,7 +128,8 @@ def test_mlp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["mp"]) train_program = static.Program() start_program = static.Program() @@ -177,8 +145,9 @@ def test_mlp_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) train_program = static.Program() start_program = static.Program() @@ -286,18 +255,10 @@ def __init__(self, bias_attr=bias_attr) def forward(self, input): - if _global_parallel_strategy == "dp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None, 
None]) q = self.q_proj(input) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) @@ -306,38 +267,16 @@ def forward(self, input): k = self.k_proj(input) v = self.v_proj(input) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) @@ -369,18 +308,10 @@ def forward(self, input): # project to output out = self.out_proj(out) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) return out @@ -411,7 +342,8 @@ def test_attn_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["dp"]) train_program = static.Program() start_program = static.Program() dist_context = DistributedContext() @@ -420,15 +352,14 @@ def test_attn_dp(self): completer = Completer(dist_context) complete_train_program = completer.complete_forward_annotation( train_program) - # print_program_with_dist_attr(complete_train_program, - # dist_context) self.assertTrue(dist_context.validate_dist_attr_for_program()) def test_attn_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["mp"]) train_program = static.Program() start_program = static.Program() @@ -444,8 +375,9 @@ def test_attn_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) train_program = static.Program() start_program = static.Program() @@ -542,34 +474,18 @@ def __init__(self, self.dropout3 = nn.Dropout(self.dropout_ratio, mode="upscale_in_train") def forward(self, input_ids, position_ids): - if _global_parallel_strategy == "dp": - 
auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None]) input_embeddings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) embeddings = input_embeddings + position_embeddings embeddings = self.dropout1(embeddings) @@ -585,38 +501,16 @@ def forward(self, input_ids, position_ids): k = self.k_proj(target) v = self.v_proj(target) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) @@ -649,18 +543,10 @@ def forward(self, input_ids, position_ids): # project to output out = self.out_proj(out) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) # Add residual residual = embeddings + self.dropout2(out) @@ -673,28 +559,13 @@ def forward(self, input_ids, position_ids): out2 = F.gelu(out1, approximate=True) out3 = self.linear1(out2) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.linear0.weight, - dist_attr={ - 
"process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) # Add residual final = residual + self.dropout3(out3) @@ -732,7 +603,8 @@ def test_decoder_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["dp"]) train_program = static.Program() start_program = static.Program() dist_context = DistributedContext() @@ -747,7 +619,8 @@ def test_decoder_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["mp"]) train_program = static.Program() start_program = static.Program() @@ -763,8 +636,9 @@ def test_decoder_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) train_program = static.Program() start_program = static.Program() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py index ab110c929f5c54..a0a9ed54b17c0b 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_completion_gpt.py @@ -30,7 +30,7 @@ from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer from paddle.distributed.fleet import fleet import paddle.static as static -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr @@ -116,18 +116,10 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): """ q = self.q_proj(query) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) @@ -158,34 +150,15 @@ def compute_kv(self, key, value): to construct cache for inference. 
""" k = self.k_proj(key) - - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - v = self.v_proj(value) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: + auto.shard_tensor(self.k_proj.weight, + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) @@ -265,18 +238,10 @@ def forward(self, # project to output out = self.out_proj(out) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) outs = [out] if self.need_weights: @@ -439,31 +404,13 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): if self.normalize_before: tgt = self.norm2(tgt) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - - if _global_parallel_strategy == "mp": + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) # tgt = self.dropout2( # self.linear2(F.gelu( @@ -523,18 +470,10 @@ def forward(self, input_ids, position_ids=None): input_embedings = self.word_embeddings(input_ids) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) position_embeddings = self.position_embeddings(position_ids) embeddings = input_embedings + position_embeddings @@ -757,18 +696,10 @@ def gpt_pretrain_forward(train_program, start_program): shape=[batch_size, sequence_len], dtype='float64') - if _global_parallel_strategy == "dp": - 
auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None]) gpt = GPTModel(vocab_size=32768, hidden_size=1024, @@ -801,7 +732,8 @@ def test_gpt_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["dp"]) train_program = static.Program() start_program = static.Program() @@ -817,7 +749,8 @@ def test_gpt_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["mp"]) train_program = static.Program() start_program = static.Program() @@ -833,8 +766,9 @@ def test_gpt_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) train_program = static.Program() start_program = static.Program() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py index bb8642d569e424..e16179ebdabff2 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_cost_model.py @@ -22,7 +22,7 @@ import paddle.static as static import paddle.nn.functional as F import paddle.utils as utils -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed import fleet @@ -35,8 +35,8 @@ paddle.enable_static() _global_parallel_strategy = "dp_mp_pp" -PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]]) -PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]]) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], dim_names=["x", "y"]) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], dim_names=["x", "y"]) NUM_RANKS = 8 STAGE_0_CNT = 5 STAGE_1_CNT = 10 @@ -73,16 +73,8 @@ def __init__(self, def forward(self, input): if self.is_distributed: - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, None]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, ["y", None]) out = self.norm(input) out = self.linear0(out) @@ -135,16 +127,8 @@ def mlp_forward(train_program, start_program, is_distributed=True): dtype='float32') if is_distributed: - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, PP_MESH_0, ["x", None]) + auto.shard_tensor(label, PP_MESH_1, ["x", None]) mlp = 
MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py index ca69535049c3bc..d2cf3508dd3145 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_dist_tensor.py @@ -17,7 +17,7 @@ import paddle from paddle.fluid import core -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed import fleet from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer @@ -71,7 +71,7 @@ class TestDistributedTensor(unittest.TestCase): def test_new_local_tensor(self): test_auto_parallel_reshard._global_process_mesh = auto.ProcessMesh( - mesh=[0, 1]) + mesh=[0, 1], dim_names=["x"]) test_auto_parallel_reshard._global_parallel_strategy = "dp" train_program = paddle.static.Program() startup_program = paddle.static.Program() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py index 36923212fdfa16..fb44b5caa96bee 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_mapper.py @@ -36,7 +36,7 @@ from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.parallelizer import AutoParallelizer from paddle.distributed.auto_parallel.dist_context import DistributedContext @@ -414,37 +414,25 @@ def __init__(self, def forward(self, input): if _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh[0], - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh[0], - "dims_mapping": [1, -1] - }) - auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh[1], - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear3.weight, - dist_attr={ - "process_mesh": _global_process_mesh[1], - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh[0], + [None, "y"]) + + auto.shard_tensor(self.linear1.weight, _global_process_mesh[0], + ["y", None]) + + auto.shard_tensor(self.linear2.weight, _global_process_mesh[1], + [None, "y"]) + + auto.shard_tensor(self.linear3.weight, _global_process_mesh[1], + ["y", None]) out = self.norm(input) out = self.linear0(out) out = F.gelu(out, approximate=True) out = self.linear1(out) - auto.shard_tensor(out, - dist_attr={ - "process_mesh": _global_process_mesh[1], - "dims_mapping": [0, -1] - }) + auto.shard_tensor(out, _global_process_mesh[1], ["x", None]) + out = self.linear2(out) out = F.gelu(out, approximate=True) out = self.linear3(out) @@ -464,11 +452,7 @@ def mlp_forward(train_program, start_program): dtype='float32') if _global_parallel_strategy == "dp_mp_pp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh[0], - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, _global_process_mesh[0], ["x", None]) mlp = 
MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, initializer_range=0.02) @@ -548,7 +532,10 @@ def test_mapper_dp_mp_pp(self): global _global_num_stages _global_num_stages = 2 global _global_process_mesh - _global_process_mesh = [[[0, 1], [2, 3]], [[4, 5], [6, 7]]] + _global_process_mesh = [ + auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]), + auto.ProcessMesh([[4, 5], [6, 7]], dim_names=["x", "y"]) + ] processes = [0, 1, 2, 3, 4, 5, 6, 7] dist_programs = {} diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py index 80135b62885311..389b4c92b6ce93 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner.py @@ -27,7 +27,7 @@ import paddle.tensor as tensor from paddle.fluid import layers from paddle.nn.layer.transformer import _convert_param_attr_to_list -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr @@ -276,39 +276,20 @@ def __init__(self, self.dropout = nn.Dropout(dropout_ratio, mode="upscale_in_train") def forward(self, input): - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) else: auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, None]) auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, None]) out = self.norm(input) out = self.linear0(out) @@ -329,18 +310,10 @@ def mlp_pretrain_forward(train_program, start_program): shape=[batch_size, sequence_len, hidden_size], dtype='float32') - if _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -356,7 +329,8 @@ def test_mlp_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], 
+ dim_names=["dp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( mlp_pretrain_forward) @@ -391,7 +365,8 @@ def test_mlp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["mp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( mlp_pretrain_forward) @@ -453,8 +428,9 @@ def test_mlp_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( mlp_pretrain_forward) @@ -558,18 +534,10 @@ def __init__(self, bias_attr=bias_attr) def forward(self, input): - if _global_parallel_strategy == "dp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None, None]) q = self.q_proj(input) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) @@ -578,38 +546,16 @@ def forward(self, input): k = self.k_proj(input) v = self.v_proj(input) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) @@ -641,18 +587,11 @@ def forward(self, input): # project to output out = self.out_proj(out) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) return out @@ -683,7 +622,8 @@ def test_attn_dp(self): global _global_parallel_strategy 
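# An illustrative, self-contained sketch (not taken verbatim from any one hunk) of the
# annotation style these tests migrate to: ProcessMesh now takes dim_names, and
# shard_tensor takes process_mesh/shard_spec keywords instead of a dist_attr dict, so
# the old dims_mapping [-1, 1] on a "dp"/"mp" mesh becomes shard_spec=[None, "mp"].
# The variable `w` below is only a placeholder weight created inside a static program.
import paddle
import paddle.static as static
from paddle.distributed.fleet import auto

paddle.enable_static()
mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], [4, 5, 6, 7]], dim_names=["dp", "mp"])
with static.program_guard(static.Program(), static.Program()):
    w = static.data(name="w", shape=[1024, 4096], dtype="float32")
    auto.shard_tensor(w, process_mesh=mesh, shard_spec=[None, "mp"])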
_global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["dp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( attn_pretrain_forward) @@ -717,7 +657,8 @@ def test_attn_mp(self): global _global_parallel_strategy _global_parallel_strategy = "mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1, 2, 3], + dim_names=["mp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( attn_pretrain_forward) @@ -783,8 +724,9 @@ def test_attn_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( attn_pretrain_forward) @@ -930,34 +872,18 @@ def __init__(self, self.dropout3 = nn.Dropout(self.dropout_ratio, mode="upscale_in_train") def forward(self, input_ids, position_ids): - if _global_parallel_strategy == "dp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None]) input_embeddings = self.word_embeddings(input_ids) position_embeddings = self.position_embeddings(position_ids) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) embeddings = input_embeddings + position_embeddings embeddings = self.dropout1(embeddings) @@ -973,38 +899,16 @@ def forward(self, input_ids, position_ids): k = self.k_proj(target) v = self.v_proj(target) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": 
[-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) @@ -1037,24 +941,14 @@ def forward(self, input_ids, position_ids): # project to output out = self.out_proj(out) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) else: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, None]) # Add residual residual = embeddings + self.dropout2(out) @@ -1067,28 +961,13 @@ def forward(self, input_ids, position_ids): out2 = F.gelu(out1, approximate=True) out3 = self.linear1(out2) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) # Add residual final = residual + self.dropout3(out3) @@ -1126,8 +1005,9 @@ def test_decoder_dp_mp(self): global _global_parallel_strategy _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( decoder_pretrain_forward) @@ -1208,8 +1088,9 @@ def test_decoder_noparallel(self): global _global_parallel_strategy _global_parallel_strategy = "None" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["x", "y"]) serial_main_prog, serial_startup_prog, dist_main_prog, dist_startup_prog, dist_context = get_programs( decoder_pretrain_forward) diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py index 00ba2151fcba51..b7e9d295c2abb0 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_partitioner_gpt.py @@ -30,7 +30,7 @@ from paddle.fluid.initializer import Normal, Constant, NumpyArrayInitializer from paddle.distributed import fleet import paddle.static as static -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from 
paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.utils import check_distributed_attr_for_program from paddle.distributed.auto_parallel.utils import print_program_with_dist_attr @@ -163,18 +163,10 @@ def _prepare_qkv(self, query, key, value, use_cache=False, cache=None): """ q = self.q_proj(query) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.q_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) q = tensor.transpose(x=q, perm=[0, 2, 1, 3]) @@ -205,34 +197,15 @@ def compute_kv(self, key, value): to construct cache for inference. """ k = self.k_proj(key) - - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.k_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - v = self.v_proj(value) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: + auto.shard_tensor(self.k_proj.weight, + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.v_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) k = tensor.transpose(x=k, perm=[0, 2, 1, 3]) @@ -312,18 +285,10 @@ def forward(self, # project to output out = self.out_proj(out) - if _global_parallel_strategy == "mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.out_proj.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) outs = [out] if self.need_weights: @@ -486,31 +451,13 @@ def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None): if self.normalize_before: tgt = self.norm2(tgt) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 0] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, 1] - }) - - if _global_parallel_strategy == "mp": + process_mesh=_global_process_mesh, + shard_spec=[None, "mp"]) auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": - auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - 
}) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) # tgt = self.dropout2( # self.linear2(F.gelu( @@ -570,18 +517,10 @@ def forward(self, input_ids, position_ids=None): input_embedings = self.word_embeddings(input_ids) - if _global_parallel_strategy == "mp": - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["mp", "dp_mp"]: auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [1, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["mp", None]) position_embeddings = self.position_embeddings(position_ids) embeddings = input_embedings + position_embeddings @@ -804,18 +743,10 @@ def gpt_pretrain_forward(train_program, startup_program): shape=[batch_size, sequence_len], dtype='float64') - if _global_parallel_strategy == "dp": - auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) - elif _global_parallel_strategy == "dp_mp": + if _global_parallel_strategy in ["dp", "dp_mp"]: auto.shard_tensor(input_ids, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + process_mesh=_global_process_mesh, + shard_spec=["dp", None]) gpt = GPTModel(vocab_size=32768, hidden_size=768, @@ -863,8 +794,9 @@ def test_gpt_dp_mp(self): _global_parallel_strategy = "dp_mp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh( - mesh=[[0, 1, 2, 3], [4, 5, 6, 7]]) + _global_process_mesh = auto.ProcessMesh(mesh=[[0, 1, 2, 3], + [4, 5, 6, 7]], + dim_names=["dp", "mp"]) train_program = static.Program() startup_program = static.Program() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py index 51926286acc151..ef08eda65338cd 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard.py @@ -21,7 +21,7 @@ import paddle.static as static import paddle.nn.functional as F import paddle.utils as utils -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed import fleet @@ -63,27 +63,13 @@ def __init__(self, def forward(self, input): if _global_parallel_strategy == "pp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, None]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, [None, None]) else: - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh, + [None, None]) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + [None, None]) out = self.norm(input) out = self.linear0(out) @@ -107,28 +93,12 @@ def mlp_forward(train_program, start_program): dtype='float32') if _global_parallel_strategy == 
"pp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, PP_MESH_0, [None, None]) + auto.shard_tensor(label, PP_MESH_1, [None, None]) elif _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, _global_process_mesh, ["x", None]) else: - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, _global_process_mesh, [None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -296,11 +266,11 @@ def test_mlp_pp(self): global _global_parallel_strategy _global_parallel_strategy = "pp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], dim_names=["x"]) global PP_MESH_0 - PP_MESH_0 = auto.ProcessMesh(mesh=[0]) + PP_MESH_0 = auto.ProcessMesh(mesh=[0], dim_names=["x"]) global PP_MESH_1 - PP_MESH_1 = auto.ProcessMesh(mesh=[1]) + PP_MESH_1 = auto.ProcessMesh(mesh=[1], dim_names=["x"]) train_program = paddle.static.Program() startup_program = paddle.static.Program() @@ -325,11 +295,11 @@ def test_mlp_pp_diff_process_mesh(self): global _global_parallel_strategy _global_parallel_strategy = "pp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], dim_names=["x"]) global PP_MESH_0 - PP_MESH_0 = auto.ProcessMesh(mesh=[0]) + PP_MESH_0 = auto.ProcessMesh(mesh=[0], dim_names=["x"]) global PP_MESH_1 - PP_MESH_1 = auto.ProcessMesh(mesh=[1]) + PP_MESH_1 = auto.ProcessMesh(mesh=[1], dim_names=["x"]) train_program = paddle.static.Program() startup_program = paddle.static.Program() @@ -352,7 +322,7 @@ def test_mlp_dp(self): global _global_parallel_strategy _global_parallel_strategy = "dp" global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0, 1]) + _global_process_mesh = auto.ProcessMesh(mesh=[0, 1], dim_names=["x"]) train_program = paddle.static.Program() startup_program = paddle.static.Program() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py index 33396f283ec0ee..77bb415bcc7d77 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_dpmppp.py @@ -21,7 +21,7 @@ import paddle.static as static import paddle.nn.functional as F import paddle.utils as utils -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed import fleet @@ -34,9 +34,10 @@ paddle.enable_static() _global_parallel_strategy = "dp_mp_pp" -_global_process_mesh = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]]) -PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]]) -PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]]) +_global_process_mesh = auto.ProcessMesh([[[0, 1], [4, 5]], [[2, 3], [6, 7]]], + dim_names=["x", "y", "z"]) +PP_MESH_0 = auto.ProcessMesh([[0, 1], [4, 5]], dim_names=["x", "y"]) +PP_MESH_1 = auto.ProcessMesh([[2, 3], [6, 7]], dim_names=["x", 
"y"]) class MLPLayer(nn.Layer): @@ -63,16 +64,8 @@ def __init__(self, self.norm = nn.LayerNorm(d_model, epsilon=1e-5) def forward(self, input): - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, 1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, "y"]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, ["y", None]) out = self.norm(input) out = self.linear0(out) @@ -80,11 +73,7 @@ def forward(self, input): out = self.linear1(out) param = paddle.fluid.layers.create_parameter([1024, 4096], paddle.float32) - auto.shard_tensor(param, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, 1] - }) + auto.shard_tensor(param, PP_MESH_1, [None, "y"]) out = paddle.fluid.layers.mul(out, param) return out @@ -103,16 +92,8 @@ def mlp_forward(train_program, start_program): shape=[batch_size, 1], dtype='float32') - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, PP_MESH_0, ["x", None]) + auto.shard_tensor(label, PP_MESH_1, ["x", None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py index d5de1c12873319..4ba3fe3dbd5b1d 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_mppp.py @@ -21,7 +21,7 @@ import paddle.static as static import paddle.nn.functional as F import paddle.utils as utils -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.completion import Completer from paddle.distributed.auto_parallel.dist_context import DistributedContext from paddle.distributed import fleet @@ -34,9 +34,9 @@ paddle.enable_static() _global_parallel_strategy = "mp_pp" -_global_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]]) -PP_MESH_0 = auto.ProcessMesh([0, 1]) -PP_MESH_1 = auto.ProcessMesh([2, 3]) +_global_process_mesh = auto.ProcessMesh([[0, 1], [2, 3]], dim_names=["x", "y"]) +PP_MESH_0 = auto.ProcessMesh([0, 1], dim_names=["x"]) +PP_MESH_1 = auto.ProcessMesh([2, 3], dim_names=["x"]) class MLPLayer(nn.Layer): @@ -73,35 +73,15 @@ def __init__(self, bias_attr=bias_attr) def forward(self, input): - auto.shard_tensor(self.word_embeddings.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, 0] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [0, -1] - }) - auto.shard_tensor(self.linear2.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(self.word_embeddings.weight, PP_MESH_0, ["x", None]) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, "x"]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, ["x", None]) + auto.shard_tensor(self.linear2.weight, PP_MESH_1, ["x", None]) w_out = self.word_embeddings(input) out = self.linear0(w_out) param = paddle.fluid.layers.create_parameter([4096, 4096], paddle.float32) - auto.shard_tensor(param, - dist_attr={ - 
"process_mesh": PP_MESH_0, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(param, PP_MESH_0, ["x", None]) out = paddle.fluid.layers.mul(out, param) gelu_out = F.gelu(out, approximate=True) out = self.linear1(gelu_out) @@ -122,16 +102,8 @@ def mlp_forward(train_program, start_program): shape=[batch_size, 1], dtype='float32') - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, PP_MESH_0, [None]) + auto.shard_tensor(label, PP_MESH_1, [None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -238,7 +210,6 @@ def test_mlp_mppp(self): resharder = Resharder(dist_main_prog, dist_startup_prog, rank_id, dist_context, dist_params_grads) resharder.reshard() - print_program_with_dist_attr(dist_main_prog, dist_context) # check send and recv result self.assertTrue(check_send_recv_result(dist_main_prog, rank_id)) @@ -249,32 +220,15 @@ def test_mlp_mppp(self): def test_allgather(self): train_program = paddle.static.Program() startup_program = paddle.static.Program() - process_mesh = auto.ProcessMesh(mesh=[0, 1]) + process_mesh = auto.ProcessMesh(mesh=[0, 1], dim_names=["x"]) with static.program_guard(train_program, startup_program): x = paddle.static.data(name="x", shape=[4, 4], dtype='float32') - x = auto.shard_tensor(x, - dist_attr={ - "process_mesh": process_mesh, - "dims_mapping": [0, -1] - }) - + x = auto.shard_tensor(x, process_mesh, ["x", None]) w = paddle.static.data(name="w", shape=[4, 4], dtype='float32') - w = auto.shard_tensor(w, - dist_attr={ - "process_mesh": process_mesh, - "dims_mapping": [-1, -1] - }) - - y = paddle.distributed.shard_op(paddle.matmul, - dist_attr={ - "process_mesh": process_mesh, - x: { - "dims_mapping": [-1, -1] - }, - w: { - "dims_mapping": [-1, -1] - } - })(x, w) + w = auto.shard_tensor(w, process_mesh, [None, None]) + + y = paddle.distributed.shard_op(paddle.matmul, process_mesh, + [[None, None], [None, None]])(x, w) rank_id = 0 dist_context = DistributedContext() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py index 64ff030f5b1e2a..75ec5ad6805b1f 100644 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_reshard_serial.py @@ -25,7 +25,7 @@ import paddle.static as static import paddle.nn.functional as F import paddle.utils as utils -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.dist_context import get_default_distributed_context from paddle.distributed import fleet from paddle.distributed.auto_parallel.partitioner import Partitioner @@ -62,27 +62,13 @@ def __init__(self, def forward(self, input): if _global_parallel_strategy == "pp": - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, PP_MESH_0, [None, None]) + auto.shard_tensor(self.linear1.weight, PP_MESH_1, [None, None]) else: - auto.shard_tensor(self.linear0.weight, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(self.linear1.weight, - dist_attr={ - 
"process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(self.linear0.weight, _global_process_mesh, + [None, None]) + auto.shard_tensor(self.linear1.weight, _global_process_mesh, + [None, None]) out = self.norm(input) out = self.linear0(out) @@ -106,28 +92,12 @@ def mlp_forward(train_program, start_program): dtype='float32') if _global_parallel_strategy == "pp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": PP_MESH_0, - "dims_mapping": [-1, -1] - }) - auto.shard_tensor(label, - dist_attr={ - "process_mesh": PP_MESH_1, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, PP_MESH_0, [None, None]) + auto.shard_tensor(label, PP_MESH_1, [None, None]) elif _global_parallel_strategy == "dp": - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [0, -1] - }) + auto.shard_tensor(input, _global_process_mesh, ["x", None]) else: - auto.shard_tensor(input, - dist_attr={ - "process_mesh": _global_process_mesh, - "dims_mapping": [-1, -1] - }) + auto.shard_tensor(input, _global_process_mesh, [None, None]) mlp = MLPLayer(hidden_size=hidden_size, intermediate_size=4 * hidden_size, @@ -196,7 +166,7 @@ def test_mlp_serial(self): global _global_parallel_strategy _global_parallel_strategy = None global _global_process_mesh - _global_process_mesh = auto.ProcessMesh(mesh=[0]) + _global_process_mesh = auto.ProcessMesh(mesh=[0], dim_names=["x"]) train_program = paddle.static.Program() startup_program = paddle.static.Program() diff --git a/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py b/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py index 5d6119d23f3210..3428576bbb6bad 100755 --- a/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py +++ b/python/paddle/fluid/tests/unittests/test_auto_parallel_searcher.py @@ -25,7 +25,7 @@ import paddle.nn.functional as F import paddle.utils as utils from paddle.distributed import fleet -import paddle.distributed.auto_parallel as auto +from paddle.distributed.fleet import auto from paddle.distributed.auto_parallel.cluster import Cluster from paddle.distributed.auto_parallel.utils import SerialProgramInfo from paddle.distributed.auto_parallel.planner import PlanSpace, PlanFilter diff --git a/python/paddle/fluid/tests/unittests/test_box_coder_op.py b/python/paddle/fluid/tests/unittests/test_box_coder_op.py index 18959d80b0e9ac..824d8155b7f288 100644 --- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py +++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py @@ -12,12 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function - import unittest import numpy as np -import sys -import math from op_test import OpTest import paddle import paddle.fluid.core as core @@ -286,5 +282,57 @@ def run(place): run(place) +class TestBoxCoderAPI(unittest.TestCase): + + def setUp(self): + np.random.seed(678) + self.prior_box_np = np.random.random((80, 4)).astype('float32') + self.prior_box_var_np = np.random.random((80, 4)).astype('float32') + self.target_box_np = np.random.random((20, 80, 4)).astype('float32') + + def test_dygraph_with_static(self): + paddle.enable_static() + prior_box = paddle.static.data(name='prior_box', + shape=[80, 4], + dtype='float32') + prior_box_var = paddle.static.data(name='prior_box_var', + shape=[80, 4], + dtype='float32') + target_box = paddle.static.data(name='target_box', + shape=[20, 80, 4], + dtype='float32') + + boxes = paddle.vision.ops.box_coder(prior_box=prior_box, + prior_box_var=prior_box_var, + target_box=target_box, + code_type="decode_center_size", + box_normalized=False) + + exe = paddle.static.Executor() + boxes_np = exe.run(paddle.static.default_main_program(), + feed={ + 'prior_box': self.prior_box_np, + 'prior_box_var': self.prior_box_var_np, + 'target_box': self.target_box_np, + }, + fetch_list=[boxes]) + + paddle.disable_static() + prior_box_dy = paddle.to_tensor(self.prior_box_np) + prior_box_var_dy = paddle.to_tensor(self.prior_box_var_np) + target_box_dy = paddle.to_tensor(self.target_box_np) + + boxes_dy = paddle.vision.ops.box_coder(prior_box=prior_box_dy, + prior_box_var=prior_box_var_dy, + target_box=target_box_dy, + code_type="decode_center_size", + box_normalized=False) + boxes_dy_np = boxes_dy.numpy() + + np.testing.assert_allclose(boxes_np[0], boxes_dy_np) + paddle.enable_static() + + if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cast_op.py b/python/paddle/fluid/tests/unittests/test_cast_op.py index cd67440990cb85..a3a6805a868ee5 100644 --- a/python/paddle/fluid/tests/unittests/test_cast_op.py +++ b/python/paddle/fluid/tests/unittests/test_cast_op.py @@ -23,6 +23,9 @@ from paddle.fluid import compiler, Program, program_guard from op_test import OpTest, convert_uint16_to_float, convert_float_to_uint16 from paddle.fluid.framework import _test_eager_guard +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers class TestCastOpFp32ToFp64(OpTest): @@ -137,6 +140,80 @@ def test_eager(self): self.assertTrue(x.gradient().dtype == np.float16) +class TestCastDoubleGradCheck(unittest.TestCase): + + def cast_wrapper(self, x): + return paddle.cast(x[0], 'float64') + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
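# The lines that follow drive Paddle's gradient_checker: `eps` is the finite-difference
# step used to build the numeric reference gradients, the input is marked persistable so
# the checker can feed and fetch it across executor runs, and double_grad_check compares
# the analytic second-order gradients of paddle.cast against that numeric reference.
# FLAGS_retain_grad_for_all_tensor is switched on just before the dygraph variant
# (double_grad_check_for_dygraph), which needs intermediate gradients to be retained.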
+ eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3, 4], False, dtype) + data.persistable = True + out = paddle.cast(data, 'float64') + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.cast_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestCastTripleGradCheck(unittest.TestCase): + + def cast_wrapper(self, x): + return paddle.cast(x[0], 'float64') + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3, 4], False, dtype) + data.persistable = True + out = paddle.cast(data, 'float64') + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.cast_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index 21c9b172e98225..7d1a237951110c 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -29,6 +29,7 @@ import paddle.fluid as fluid import paddle.fluid.unique_name as nameGen from paddle.fluid import core +from paddle_bfloat import bfloat16 def create_bool_test_data(shape=None, seed=None): @@ -82,6 +83,9 @@ def create_test_data(shape=None, dtype=None, seed=None): assert shape, "Shape should be specified" if dtype == "float32" or dtype == "float16" or dtype == "float64": return create_float_test_data(shape=shape, dtype=dtype, seed=seed) + elif dtype == "bfloat16": + # since numpy does not support bfloat16 yet, use `paddle_bfloat` to replace + return create_float_test_data(shape=shape, dtype=bfloat16, seed=seed) elif dtype == "bool": return create_bool_test_data(shape=shape, seed=seed) elif dtype == "int32" or dtype == "int64" or dtype == "int8" or dtype == "uint8": @@ -174,6 +178,15 @@ def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() + # NOTE: this is a hack to get int format nccl version, like 2134 + # if current platform is not linux, version number will be 0 + nccl_version_str = subprocess.check_output( + r"ldconfig -v | grep 'libnccl.so' | tail -n1 | sed -r 's/^.*\.so\.//'", + stderr=subprocess.DEVNULL, + shell=True).decode('utf-8') + self._nccl_version = int("".join( + nccl_version_str.split("."))) if nccl_version_str else 0 + def tearDown(self): self.temp_dir.cleanup() @@ -312,6 +325,10 @@ def check_with_place(self, model_file, required_envs) input1 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid0) input2 = create_test_data(shape=(10, 1000), dtype=dtype, seed=pid1) + 
# cast bfloat16 to float32 for numeric comparison + if dtype == "bfloat16": + input1 = input1.astype("float32") + input2 = input2.astype("float32") if col_type == "allgather": need_result = np.vstack((input1, input2)) tr_out0 = np.vstack((tr0_out[0], tr0_out[1])) @@ -328,7 +345,13 @@ def check_with_place(self, np.testing.assert_allclose(tr1_out[0], need_result, rtol=1e-05) elif col_type == "reduce": need_result = input1 + input2 - np.testing.assert_allclose(tr0_out[0], need_result, rtol=1e-05) + # bfloat16 precision loss comes from truncating the last 16 bits of float32, + # which sums (\sum_{i=-23}^{-8}2^{i}) to about 0.0078 + if dtype == "bfloat16": + rtol = 8e-03 + else: + rtol = 1e-05 + np.testing.assert_allclose(tr0_out[0], need_result, rtol=rtol) elif col_type == "scatter": need_result = input2 need_result1 = need_result[0:need_result.shape[0] // 2] @@ -339,18 +362,28 @@ def check_with_place(self, need_result = input1 + input2 need_result1 = need_result[0:need_result.shape[0] // 2] need_result2 = need_result[need_result.shape[0] // 2:] - np.testing.assert_allclose(tr0_out[0], need_result1, rtol=1e-05) - np.testing.assert_allclose(tr1_out[0], need_result2, rtol=1e-05) + if dtype == "bfloat16": + rtol = 8e-03 + else: + rtol = 1e-05 + np.testing.assert_allclose(tr0_out[0], need_result1, rtol=rtol) + np.testing.assert_allclose(tr1_out[0], need_result2, rtol=rtol) elif col_type == "allreduce": need_result = input1 + input2 + if dtype == "bfloat16": + rtol = 8e-03 + atol = 8e-03 + else: + rtol = 1e-05 + atol = 1e-05 np.testing.assert_allclose(tr0_out[0], need_result, - rtol=1e-05, - atol=1e-05) + rtol=rtol, + atol=atol) np.testing.assert_allclose(tr1_out[0], need_result, - rtol=1e-05, - atol=1e-05) + rtol=rtol, + atol=atol) elif col_type == "parallel_embedding": result_data = tr0_out[0] np.random.seed(2020) diff --git a/python/paddle/fluid/tests/unittests/test_communicator_geo.py b/python/paddle/fluid/tests/unittests/test_communicator_geo.py index f7593f8bb31fe4..df7d04be3198d6 100644 --- a/python/paddle/fluid/tests/unittests/test_communicator_geo.py +++ b/python/paddle/fluid/tests/unittests/test_communicator_geo.py @@ -28,7 +28,7 @@ import paddle.distributed.fleet.base.role_maker as role_maker import paddle.distributed.fleet as fleet -from paddle.distributed.utils import find_free_ports +from paddle.distributed.utils.launch_utils import find_free_ports paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_concat_op.py b/python/paddle/fluid/tests/unittests/test_concat_op.py index 0bf3d6230d84f3..10c7410708341b 100644 --- a/python/paddle/fluid/tests/unittests/test_concat_op.py +++ b/python/paddle/fluid/tests/unittests/test_concat_op.py @@ -21,6 +21,9 @@ from paddle.fluid import compiler, Program, program_guard, core from paddle.fluid.framework import _test_eager_guard import paddle +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers class TestConcatOp(OpTest): @@ -451,5 +454,83 @@ def _run_static_mode(self, use_fluid_api): res[0], np.concatenate([self.x] * self.iter_num, axis=self.axis)) +class TestConcatDoubleGradCheck(unittest.TestCase): + + def concat_wrapper(self, x): + return paddle.concat(x) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
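# The concat checks below mirror the cast ones above, but exercise an op with two
# inputs: both data1 and data2 are declared persistable and passed to the checker via
# x_init=[data1_arr, data2_arr], and concat_wrapper forwards the input list straight
# to paddle.concat so the dygraph variant re-runs the same computation.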
+ eps = 0.005 + dtype = np.float32 + + data1 = layers.data('data1', [2, 3], False, dtype) + data1.persistable = True + data2 = layers.data('data2', [2, 3], False, dtype) + data2.persistable = True + out = paddle.concat([data1, data2]) + data1_arr = np.random.uniform(-1, 1, data1.shape).astype(dtype) + data2_arr = np.random.uniform(-1, 1, data2.shape).astype(dtype) + gradient_checker.double_grad_check([data1, data2], + out, + x_init=[data1_arr, data2_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph( + self.concat_wrapper, [data1, data2], + out, + x_init=[data1_arr, data2_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestConcatTripleGradCheck(unittest.TestCase): + + def concat_wrapper(self, x): + return paddle.concat(x, 1) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data1 = layers.data('data1', [2, 3, 4], False, dtype) + data1.persistable = True + data2 = layers.data('data2', [2, 3, 4], False, dtype) + data2.persistable = True + out = paddle.concat([data1, data2], 1) + data1_arr = np.random.uniform(-1, 1, data1.shape).astype(dtype) + data2_arr = np.random.uniform(-1, 1, data2.shape).astype(dtype) + gradient_checker.double_grad_check([data1, data2], + out, + x_init=[data1_arr, data2_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph( + self.concat_wrapper, [data1, data2], + out, + x_init=[data1_arr, data2_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cond.py b/python/paddle/fluid/tests/unittests/test_cond.py index 86031db0f96d5d..c42de0a0b20eac 100644 --- a/python/paddle/fluid/tests/unittests/test_cond.py +++ b/python/paddle/fluid/tests/unittests/test_cond.py @@ -31,7 +31,6 @@ class TestCondInputOutput(unittest.TestCase): - def test_return_single_var(self): """ pseudocode: @@ -59,13 +58,16 @@ def false_func(): out = layers.cond(pred, true_func, false_func) # out is one tensor - place = fluid.CUDAPlace( - 0) if core.is_compiled_with_cuda() else fluid.CPUPlace() + place = ( + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() + else fluid.CPUPlace() + ) exe = fluid.Executor(place) - ret, = exe.run(main_program, fetch_list=[out.name]) - np.testing.assert_allclose(np.asarray(ret), - np.full((3, 2), -1, np.int32), - rtol=1e-05) + (ret,) = exe.run(main_program, fetch_list=[out.name]) + np.testing.assert_allclose( + np.asarray(ret), np.full((3, 2), -1, np.int32), rtol=1e-05 + ) def test_return_var_tuple(self): """ @@ -80,18 +82,14 @@ def test_return_var_tuple(self): paddle.enable_static() def true_func(): - return layers.fill_constant(shape=[1, 2], dtype='int32', - value=1), layers.fill_constant( - shape=[2, 3], - dtype='bool', - value=True) + return layers.fill_constant( + shape=[1, 2], dtype='int32', value=1 + ), layers.fill_constant(shape=[2, 3], dtype='bool', value=True) def false_func(): - return layers.fill_constant(shape=[3, 4], dtype='float32', - value=3), 
layers.fill_constant( - shape=[4, 5], - dtype='int64', - value=2) + return layers.fill_constant( + shape=[3, 4], dtype='float32', value=3 + ), layers.fill_constant(shape=[4, 5], dtype='int64', value=2) main_program = Program() startup_program = Program() @@ -100,16 +98,19 @@ def false_func(): out = layers.cond(pred, true_func, false_func) # out is a tuple containing 2 tensors - place = fluid.CUDAPlace( - 0) if core.is_compiled_with_cuda() else fluid.CPUPlace() + place = ( + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() + else fluid.CPUPlace() + ) exe = fluid.Executor(place) ret = exe.run(main_program, fetch_list=out) - np.testing.assert_allclose(np.asarray(ret[0]), - np.full((1, 2), 1, np.int32), - rtol=1e-05) - np.testing.assert_allclose(np.asarray(ret[1]), - np.full((2, 3), True, bool), - rtol=1e-05) + np.testing.assert_allclose( + np.asarray(ret[0]), np.full((1, 2), 1, np.int32), rtol=1e-05 + ) + np.testing.assert_allclose( + np.asarray(ret[1]), np.full((2, 3), True, bool), rtol=1e-05 + ) def test_pass_and_modify_var(self): """ @@ -137,20 +138,28 @@ def false_func(a, i): with program_guard(main_program, startup_program): a = layers.fill_constant(shape=[3, 2, 1], dtype='int32', value=7) i = fluid.data(name="i", shape=[1], dtype='int32') - pred = ((i % 2) == 0) - a = layers.cond(pred, lambda: true_func(a, i), - lambda: false_func(a, i)) - place = fluid.CUDAPlace( - 0) if core.is_compiled_with_cuda() else fluid.CPUPlace() + pred = (i % 2) == 0 + a = layers.cond( + pred, lambda: true_func(a, i), lambda: false_func(a, i) + ) + place = ( + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() + else fluid.CPUPlace() + ) exe = fluid.Executor(place) for feed_i in range(5): expected_a = 7 * (feed_i + 1) if feed_i % 2 == 0 else 8 - feed_i - ret, = exe.run(main_program, - feed={'i': np.full((1), feed_i, np.int32)}, - fetch_list=[a]) - np.testing.assert_allclose(np.asarray(ret), - np.full((3, 2, 1), expected_a, np.int32), - rtol=1e-05) + (ret,) = exe.run( + main_program, + feed={'i': np.full((1), feed_i, np.int32)}, + fetch_list=[a], + ) + np.testing.assert_allclose( + np.asarray(ret), + np.full((3, 2, 1), expected_a, np.int32), + rtol=1e-05, + ) def test_return_none(self): """ @@ -174,12 +183,15 @@ def false_func(): startup_program = Program() with program_guard(main_program, startup_program): i = fluid.data(name="i", shape=[1], dtype='int32') - pred = ((i % 2) == 0) + pred = (i % 2) == 0 out1 = layers.cond(pred, true_func, false_func) out2 = layers.cond(pred, None, false_func) out3 = layers.cond(pred, true_func, None) - place = fluid.CUDAPlace( - 0) if core.is_compiled_with_cuda() else fluid.CPUPlace() + place = ( + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() + else fluid.CPUPlace() + ) exe = fluid.Executor(place) for feed_i in range(5): # Test that output is None is runnable @@ -202,17 +214,15 @@ def func_return_one_tensor(): return layers.fill_constant(shape=[2, 7], dtype='int32', value=3) def func_return_two_tensors(): - return layers.fill_constant(shape=[3, 1], dtype='int32', - value=7), layers.fill_constant( - shape=[3, 1], - dtype='int32', - value=8) + return layers.fill_constant( + shape=[3, 1], dtype='int32', value=7 + ), layers.fill_constant(shape=[3, 1], dtype='int32', value=8) main_program = Program() startup_program = Program() with program_guard(main_program, startup_program): i = fluid.data(name="i", shape=[1], dtype='int32') - pred = ((i % 2) == 0) + pred = (i % 2) == 0 with self.assertRaises(TypeError): out = layers.cond(pred, i, func_return_one_tensor) @@ -220,47 
+230,57 @@ def func_return_two_tensors(): out = layers.cond(pred, func_return_one_tensor, np.asarray([3])) with self.assertRaises(Exception) as e: - out = layers.cond(pred, func_return_none, - func_return_one_tensor) + out = layers.cond( + pred, func_return_none, func_return_one_tensor + ) self.assertTrue( - "Incompatible return values of true_fn and false_fn in cond" in - str(e.exception)) + "Incompatible return values of true_fn and false_fn in cond" + in str(e.exception) + ) with self.assertRaises(Exception) as e: - out = layers.cond(pred, func_return_two_tensors, - func_return_none) + out = layers.cond( + pred, func_return_two_tensors, func_return_none + ) self.assertTrue( - "Incompatible return values of true_fn and false_fn in cond" in - str(e.exception)) + "Incompatible return values of true_fn and false_fn in cond" + in str(e.exception) + ) with self.assertRaises(Exception) as e: - out = layers.cond(pred, func_return_one_tensor, - func_return_two_tensors) + out = layers.cond( + pred, func_return_one_tensor, func_return_two_tensors + ) self.assertTrue( "true fn returns 1 vars, but false fn returns 2 vars, which is not equals" - in str(e.exception)) + in str(e.exception) + ) def test_extremely_simple_net_with_op_in_condition(self): paddle.enable_static() main_program = fluid.Program() startup_program = fluid.Program() with fluid.program_guard(main_program, startup_program): - a = fluid.layers.fill_constant(shape=[1], - dtype='float32', - value=1.23) + a = fluid.layers.fill_constant( + shape=[1], dtype='float32', value=1.23 + ) a.stop_gradient = False - b = fluid.layers.fill_constant(shape=[1], - dtype='float32', - value=1.25) + b = fluid.layers.fill_constant( + shape=[1], dtype='float32', value=1.25 + ) b.stop_gradient = False out = layers.cond(a - b < -1.0, lambda: a, lambda: b) append_backward(out) - place = fluid.CUDAPlace( - 0) if core.is_compiled_with_cuda() else fluid.CPUPlace() + place = ( + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() + else fluid.CPUPlace() + ) exe = fluid.Executor(place) - ret = exe.run(main_program, - fetch_list=[out, b, a.grad_name, b.grad_name]) + ret = exe.run( + main_program, fetch_list=[out, b, a.grad_name, b.grad_name] + ) # Note: fill_constant has loss of precision, you have to assertEqual # with values doens't lose precision in float-point number. 
self.assertEqual(ret[0][0], ret[1][0]) @@ -269,7 +289,6 @@ def test_extremely_simple_net_with_op_in_condition(self): class TestCondNestedControlFlow(unittest.TestCase): - def test_cond_inside_cond(self): """ pseudocode: @@ -277,7 +296,7 @@ def test_cond_inside_cond(self): a = 2 * i if i < 5: if i >= 3: - return a + a + return a + a else: return a - a else: @@ -290,25 +309,37 @@ def test_cond_inside_cond(self): paddle.enable_static() def less_than_branch(i, a): - return layers.cond(i >= 3.0, lambda: layers.elementwise_add(a, a), - lambda: layers.elementwise_sub(a, a)) + return layers.cond( + i >= 3.0, + lambda: layers.elementwise_add(a, a), + lambda: layers.elementwise_sub(a, a), + ) def greater_equal_branch(i, a): - return layers.cond(i < 8.0, lambda: layers.elementwise_mul(a, a), - lambda: layers.elementwise_div(a, a)) + return layers.cond( + i < 8.0, + lambda: layers.elementwise_mul(a, a), + lambda: layers.elementwise_div(a, a), + ) main_program = Program() startup_program = Program() with program_guard(main_program, startup_program): i = fluid.data(name="i", shape=[1], dtype='float32') a = 2.0 * i - out = layers.cond(i < 5.0, lambda: less_than_branch(i, a), - lambda: greater_equal_branch(i, a)) + out = layers.cond( + i < 5.0, + lambda: less_than_branch(i, a), + lambda: greater_equal_branch(i, a), + ) mean = paddle.mean(out) append_backward(mean) - place = fluid.CUDAPlace( - 0) if core.is_compiled_with_cuda() else fluid.CPUPlace() + place = ( + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() + else fluid.CPUPlace() + ) exe = fluid.Executor(place) for feed_i in range(0, 10): expected_a = 2.0 * feed_i @@ -318,9 +349,11 @@ def greater_equal_branch(i, a): else: expected_ret = expected_a * expected_a if feed_i < 8 else 1.0 expected_a_grad = 2.0 * expected_a if feed_i < 8 else 0.0 - ret = exe.run(main_program, - feed={'i': np.full((1), feed_i, np.float32)}, - fetch_list=[out.name, a.grad_name]) + ret = exe.run( + main_program, + feed={'i': np.full((1), feed_i, np.float32)}, + fetch_list=[out.name, a.grad_name], + ) self.assertEqual(ret[0][0], expected_ret) self.assertEqual(ret[1][0], expected_a_grad) @@ -330,24 +363,34 @@ def test_cond_op_in_condition(self): startup_program = fluid.Program() with fluid.program_guard(main_program, startup_program): - a = fluid.layers.fill_constant(shape=[1], - dtype='float32', - value=1.23) + a = fluid.layers.fill_constant( + shape=[1], dtype='float32', value=1.23 + ) a.stop_gradient = False - b = fluid.layers.fill_constant(shape=[1], - dtype='float32', - value=1.24) + b = fluid.layers.fill_constant( + shape=[1], dtype='float32', value=1.24 + ) b.stop_gradient = False out = fluid.layers.cond( - a < b, lambda: fluid.layers.cond( - a - b < -1.0, lambda: fluid.layers.elementwise_add(a, b), - lambda: fluid.layers.elementwise_mul(a, b)), lambda: - fluid.layers.cond(a == b, lambda: fluid.layers.elementwise_sub( - a, b), lambda: fluid.layers.elementwise_pow(a, b))) + a < b, + lambda: fluid.layers.cond( + a - b < -1.0, + lambda: fluid.layers.elementwise_add(a, b), + lambda: fluid.layers.elementwise_mul(a, b), + ), + lambda: fluid.layers.cond( + a == b, + lambda: fluid.layers.elementwise_sub(a, b), + lambda: fluid.layers.elementwise_pow(a, b), + ), + ) append_backward(out) - place = fluid.CUDAPlace( - 0) if core.is_compiled_with_cuda() else fluid.CPUPlace() + place = ( + fluid.CUDAPlace(0) + if core.is_compiled_with_cuda() + else fluid.CPUPlace() + ) exe = fluid.Executor(place) ret = exe.run(main_program, fetch_list=[out, a.grad_name, b.grad_name]) # Note: 
fill_constant has loss of precision, so we assertAlmostEqual. @@ -357,7 +400,6 @@ def test_cond_op_in_condition(self): class TestCondBackward(unittest.TestCase): - def backward_value_helper(self, cond_func, use_cuda, use_parallel_exe): """ Helper function that compares calculated backward value is close to dy/dx @@ -381,70 +423,76 @@ def backward_value_helper(self, cond_func, use_cuda, use_parallel_exe): num_devices = 1 if use_parallel_exe: os.environ['CPU_NUM'] = str(2) - exe = fluid.ParallelExecutor(use_cuda=use_cuda, - main_program=main_program, - loss_name=loss.name) + exe = fluid.ParallelExecutor( + use_cuda=use_cuda, + main_program=main_program, + loss_name=loss.name, + ) num_devices = exe.device_count delta = 0.005 for feed_i in range(0, 10): feed_img = np.random.random(size=[1, 9]).astype(np.float32) - feed_label = np.random.randint(low=0, - high=10, - size=[1, 1], - dtype=np.int64) + feed_label = np.random.randint( + low=0, high=10, size=[1, 1], dtype=np.int64 + ) if use_parallel_exe: img_grad, loss_value = exe.run( feed={ 'i': np.full((num_devices), feed_i, np.int32), 'image': np.repeat(feed_img, num_devices, axis=0), - 'label': np.repeat(feed_label, num_devices, axis=0) + 'label': np.repeat(feed_label, num_devices, axis=0), }, - fetch_list=[img.grad_name, loss.name]) + fetch_list=[img.grad_name, loss.name], + ) else: img_grad, loss_value = exe.run( main_program, feed={ 'i': np.full((1), feed_i, np.int32), 'image': feed_img, - 'label': feed_label + 'label': feed_label, }, - fetch_list=[img.grad_name, loss.name]) + fetch_list=[img.grad_name, loss.name], + ) numerical_grad = np.zeros(shape=[num_devices, 9], dtype=np.float32) feed_img_delta = np.copy(feed_img) for j in range(9): feed_img_delta[0][j] = feed_img[0][j] + delta if use_parallel_exe: - loss_delta = exe.run(feed={ - 'i': - np.full((num_devices), feed_i, np.int32), - 'image': - np.repeat(feed_img_delta, num_devices, axis=0), - 'label': - np.repeat(feed_label, num_devices, axis=0) - }, - fetch_list=[loss.name]) - multi_device_grad = (loss_delta[0] - - loss_value[0]) / delta / num_devices + loss_delta = exe.run( + feed={ + 'i': np.full((num_devices), feed_i, np.int32), + 'image': np.repeat( + feed_img_delta, num_devices, axis=0 + ), + 'label': np.repeat(feed_label, num_devices, axis=0), + }, + fetch_list=[loss.name], + ) + multi_device_grad = ( + (loss_delta[0] - loss_value[0]) / delta / num_devices + ) for d in range(num_devices): numerical_grad[d][j] = multi_device_grad[d] else: - loss_delta = exe.run(main_program, - feed={ - 'i': np.full((1), feed_i, - np.int32), - 'image': feed_img_delta, - 'label': feed_label - }, - fetch_list=[loss.name]) - numerical_grad[0][j] = (loss_delta[0] - - loss_value[0]) / delta + loss_delta = exe.run( + main_program, + feed={ + 'i': np.full((1), feed_i, np.int32), + 'image': feed_img_delta, + 'label': feed_label, + }, + fetch_list=[loss.name], + ) + numerical_grad[0][j] = ( + loss_delta[0] - loss_value[0] + ) / delta feed_img_delta[0][j] = feed_img[0][j] - np.testing.assert_allclose(img_grad, - numerical_grad, - rtol=0.05, - atol=0.05) + np.testing.assert_allclose( + img_grad, numerical_grad, rtol=0.05, atol=0.05 + ) def add_optimizer_helper(self, cond_func, use_cuda, use_parallel_exe): """ @@ -465,43 +513,49 @@ def add_optimizer_helper(self, cond_func, use_cuda, use_parallel_exe): exe.run(startup_program) if use_parallel_exe: os.environ['CPU_NUM'] = str(2) - exe = fluid.ParallelExecutor(use_cuda=use_cuda, - main_program=main_program, - loss_name=loss.name) + exe = fluid.ParallelExecutor( + 
use_cuda=use_cuda, + main_program=main_program, + loss_name=loss.name, + ) num_devices = exe.device_count for feed_i in range(0, 10): feed_img = np.random.random(size=[16, 784]).astype(np.float32) - feed_label = np.random.randint(low=0, - high=10, - size=[16, 1], - dtype=np.int64) + feed_label = np.random.randint( + low=0, high=10, size=[16, 1], dtype=np.int64 + ) if use_parallel_exe: - exe.run(feed={ - 'i': np.full((num_devices), feed_i, np.int32), - 'image': np.repeat(feed_img, num_devices, axis=0), - 'label': np.repeat(feed_label, num_devices, axis=0) - }, - fetch_list=[loss.name]) + exe.run( + feed={ + 'i': np.full((num_devices), feed_i, np.int32), + 'image': np.repeat(feed_img, num_devices, axis=0), + 'label': np.repeat(feed_label, num_devices, axis=0), + }, + fetch_list=[loss.name], + ) else: - exe.run(main_program, - feed={ - 'i': np.full((1), feed_i, np.int32), - 'image': feed_img, - 'label': feed_label - }, - fetch_list=[loss]) + exe.run( + main_program, + feed={ + 'i': np.full((1), feed_i, np.int32), + 'image': feed_img, + 'label': feed_label, + }, + fetch_list=[loss], + ) def test_cond_backward(self): paddle.enable_static() def cond_func(i, img, label): - predicate = ((i % 2) == 0) + predicate = (i % 2) == 0 return layers.cond( predicate, lambda: simple_fc_net_with_inputs(img, label, class_num=10), - lambda: batchnorm_fc_with_inputs(img, label, class_num=10)) + lambda: batchnorm_fc_with_inputs(img, label, class_num=10), + ) for use_parallel_exe in [False, True]: if use_parallel_exe and os.name == "nt": @@ -510,10 +564,12 @@ def cond_func(i, img, label): ) continue - self.backward_value_helper(cond_func, core.is_compiled_with_cuda(), - use_parallel_exe) - self.add_optimizer_helper(cond_func, core.is_compiled_with_cuda(), - use_parallel_exe) + self.backward_value_helper( + cond_func, core.is_compiled_with_cuda(), use_parallel_exe + ) + self.add_optimizer_helper( + cond_func, core.is_compiled_with_cuda(), use_parallel_exe + ) def test_half_nested_cond_backward(self): paddle.enable_static() @@ -522,15 +578,18 @@ def branch(i, img, label): return layers.cond( (i % 2) == 0, lambda: simple_fc_net_with_inputs(img, label, class_num=10), - lambda: batchnorm_fc_with_inputs(img, label, class_num=10)) + lambda: batchnorm_fc_with_inputs(img, label, class_num=10), + ) def cond_func_simple_net_at_true(i, img, label): - return layers.cond(i < 5, lambda: branch(i, img, label), - lambda: paddle.mean(img)) + return layers.cond( + i < 5, lambda: branch(i, img, label), lambda: paddle.mean(img) + ) def cond_func_simple_net_at_false(i, img, label): - return layers.cond(i < 5, lambda: paddle.mean(img), - lambda: branch(i, img, label)) + return layers.cond( + i < 5, lambda: paddle.mean(img), lambda: branch(i, img, label) + ) for use_parallel_exe in [False, True]: if use_parallel_exe and os.name == "nt": @@ -539,35 +598,47 @@ def cond_func_simple_net_at_false(i, img, label): ) continue - self.backward_value_helper(cond_func_simple_net_at_true, - core.is_compiled_with_cuda(), - use_parallel_exe) - self.add_optimizer_helper(cond_func_simple_net_at_true, - core.is_compiled_with_cuda(), - use_parallel_exe) - self.backward_value_helper(cond_func_simple_net_at_false, - core.is_compiled_with_cuda(), - use_parallel_exe) - self.add_optimizer_helper(cond_func_simple_net_at_false, - core.is_compiled_with_cuda(), - use_parallel_exe) + self.backward_value_helper( + cond_func_simple_net_at_true, + core.is_compiled_with_cuda(), + use_parallel_exe, + ) + self.add_optimizer_helper( + cond_func_simple_net_at_true, + 
core.is_compiled_with_cuda(), + use_parallel_exe, + ) + self.backward_value_helper( + cond_func_simple_net_at_false, + core.is_compiled_with_cuda(), + use_parallel_exe, + ) + self.add_optimizer_helper( + cond_func_simple_net_at_false, + core.is_compiled_with_cuda(), + use_parallel_exe, + ) def test_nested_cond_backward(self): paddle.enable_static() def branch(i, img, label, mod_two): if mod_two: - predicate = ((i % 2) == 0) + predicate = (i % 2) == 0 else: - predicate = ((i % 2) != 0) + predicate = (i % 2) != 0 return layers.cond( predicate, lambda: simple_fc_net_with_inputs(img, label, class_num=10), - lambda: batchnorm_fc_with_inputs(img, label, class_num=10)) + lambda: batchnorm_fc_with_inputs(img, label, class_num=10), + ) def cond_func(i, img, label): - return layers.cond(i < 5, lambda: branch(i, img, label, True), - lambda: branch(i, img, label, False)) + return layers.cond( + i < 5, + lambda: branch(i, img, label, True), + lambda: branch(i, img, label, False), + ) for use_parallel_exe in [False, True]: if use_parallel_exe and os.name == "nt": @@ -575,14 +646,15 @@ def cond_func(i, img, label): "Skip use_parallel_exe=True in Windows because of flaky test when using PE under old Windows machine" ) continue - self.backward_value_helper(cond_func, core.is_compiled_with_cuda(), - use_parallel_exe) - self.add_optimizer_helper(cond_func, core.is_compiled_with_cuda(), - use_parallel_exe) + self.backward_value_helper( + cond_func, core.is_compiled_with_cuda(), use_parallel_exe + ) + self.add_optimizer_helper( + cond_func, core.is_compiled_with_cuda(), use_parallel_exe + ) class TestCondWithError(unittest.TestCase): - def test_input_type_error(self): paddle.enable_static() main_program = framework.Program() @@ -606,5 +678,44 @@ def func(): layers.cond(pred, func, func, set()) +class TestCondWithDict(unittest.TestCase): + def test_input_with_dict(self): + paddle.enable_static() + main_program = framework.Program() + startup_program = framework.Program() + with framework.program_guard(main_program, startup_program): + + def true_func(): + return { + '1': paddle.full(shape=[3, 2], dtype='int32', fill_value=1), + '2': paddle.full( + shape=[2, 3], dtype='bool', fill_value=True + ), + } + + def false_func(): + return { + '1': paddle.full( + shape=[3, 4], dtype='float32', fill_value=3 + ), + '2': paddle.full(shape=[4, 5], dtype='int64', fill_value=2), + } + + x = paddle.full(shape=[1], dtype='float32', fill_value=0.1) + y = paddle.full(shape=[1], dtype='float32', fill_value=0.23) + pred = paddle.less_than(x=x, y=y, name=None) + ret = paddle.static.nn.cond(pred, true_func, false_func) + self.assertEqual( + ret['1'].shape, + (3, -1), + f"The shape is not correct, expects (3, -1) but gets {ret['1'].shape}.", + ) + self.assertEqual( + ret['2'].shape, + (-1, -1), + f"The shape is not correct, expects (-1, -1) but gets {ret['2'].shape}.", + ) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cumsum_op.py b/python/paddle/fluid/tests/unittests/test_cumsum_op.py index 2f66cd80ddef6a..cfef7ddcf4e9c7 100644 --- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py +++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py @@ -24,6 +24,9 @@ import paddle.fluid as fluid from paddle.fluid import compiler, Program, program_guard import paddle.inference as paddle_infer +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers class TestCumsumOp(unittest.TestCase): @@ -353,6 +356,9 @@ def test_static_and_infer(self): 
relu_out = paddle.nn.functional.relu(linear_out) axis = paddle.full([1], 2, dtype='int64') out = paddle.cumsum(relu_out, axis=axis) + loss = paddle.mean(out) + sgd = paddle.optimizer.SGD(learning_rate=0.) + sgd.minimize(paddle.mean(out)) exe = paddle.static.Executor(self.place) exe.run(starup_prog) @@ -380,5 +386,79 @@ def test_static_and_infer(self): np.testing.assert_allclose(static_out[0], infer_out) +class TestCumsumDoubleGradCheck(unittest.TestCase): + + def cumsum_wrapper(self, x): + return paddle.cumsum(x[0], 0) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float64 + + data = layers.data('data', [3, 4], False, dtype) + data.persistable = True + out = paddle.cumsum(data, 0) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.cumsum_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestCumsumTripleGradCheck(unittest.TestCase): + + def cumsum_wrapper(self, x): + return paddle.cumsum(x[0], 0) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3], False, dtype) + data.persistable = True + out = paddle.cumsum(data, 0) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.cumsum_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index cf3dcd00a5d418..bf9f73e80d16f7 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -627,7 +627,8 @@ def run_trainer_with_spawn(self, args): np.random.seed(seed) random.seed(seed) # get trainer id - args.trainer_id = paddle.distributed.get_rank() + paddle.distributed.parallel._get_global_parallel_env() + args.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) # 3. 
init parallel env if args.update_method in ["nccl2", "gloo"]: @@ -666,7 +667,8 @@ def run_use_fleet_api_trainer(self, args): np.random.seed(seed) random.seed(seed) # get trainer id - args.trainer_id = paddle.distributed.get_rank() + paddle.distributed.parallel._get_global_parallel_env() + args.trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0")) # set strategy strategy = fleet.DistributedStrategy() diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index eb696420fe03b3..e3ef0fd61c5ac5 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -1112,16 +1112,18 @@ def test_backward_upscale_train_2_eager(self): class TestDropOutWithProbTensor(unittest.TestCase): def setUp(self): - shapes = [[10, 10], [10, 10, 10], [10, 10, 10, 10]] - self.inputs = [ - np.random.random(shape).astype("float32") for shape in shapes - ] + self.init_info() + self.input = np.random.random(self.shape).astype("float32") self.place = paddle.CUDAPlace( 0) if paddle.is_compiled_with_cuda() else paddle.CPUPlace() + def init_info(self): + self.shape = [10, 10] + self.api = paddle.nn.functional.dropout + def api_case(self, x): p = paddle.assign([0.5]) - out = paddle.nn.functional.dropout(x=x, p=p, training=True) + out = self.api(x=x, p=p, training=True) return out def run_static(self, x): @@ -1131,6 +1133,8 @@ def run_static(self, x): with program_guard(main_program): input = paddle.static.data(shape=x.shape, name='x', dtype='float32') out = self.api_case(input) + sgd = paddle.optimizer.SGD(learning_rate=0.1) + sgd.minimize(paddle.mean(out)) exe = paddle.static.Executor(self.place) res = exe.run(feed={'x': x}, fetch_list=[out]) @@ -1144,10 +1148,23 @@ def run_dygraph(self, x): return out def test_p_tensor(self): - for x in self.inputs: - static_res = self.run_static(x) - dygraph_res = self.run_dygraph(x) - np.testing.assert_array_equal(static_res, dygraph_res) + static_res = self.run_static(self.input) + dygraph_res = self.run_dygraph(self.input) + np.testing.assert_array_equal(static_res, dygraph_res) + + +class TestDropOut2DWithProbTensor(TestDropOutWithProbTensor): + + def init_info(self): + self.shape = [2, 3, 10, 10] + self.api = paddle.nn.functional.dropout2d + + +class TestDropOut3DWithProbTensor(TestDropOutWithProbTensor): + + def init_info(self): + self.shape = [2, 3, 8, 8, 8] + self.api = paddle.nn.functional.dropout3d class TestRandomValue(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py index a468de7a28260f..09f3ae5d169a9d 100644 --- a/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_recurrent_op.py @@ -693,5 +693,45 @@ def check_forward(self): np.testing.assert_allclose(pd_output, py_output, rtol=0.01) +class RecurrentNet(paddle.nn.Layer): + + def __init__(self): + super(RecurrentNet, self).__init__() + self.cell = paddle.nn.SimpleRNNCell(16, 32) + self.rnn = paddle.nn.RNN(self.cell) + + def forward(self, inputs, prev_h): + outputs, final_states = self.rnn(inputs, prev_h) + return outputs, final_states + + +class TestDy2StRecurrentOpBackward(unittest.TestCase): + + def setUp(self): + paddle.disable_static() + paddle.seed(100) + + def tearDown(self): + paddle.enable_static() + + def test_recurrent_backward(self): + net = RecurrentNet() + inputs = 
paddle.rand((4, 23, 16)) + inputs.stop_gradient = False + prev_h = paddle.randn((4, 32)) + prev_h.stop_gradient = False + + outputs, final_states = net(inputs, prev_h) + outputs.backward() + dy_grad = inputs.gradient() + inputs.clear_gradient() + + net = paddle.jit.to_static(net) + outputs, final_states = net(inputs, prev_h) + outputs.backward() + st_grad = inputs.gradient() + np.testing.assert_allclose(dy_grad, st_grad) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py index f107fec1c4e4eb..f2791e55d5188f 100755 --- a/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_as_v2_op.py @@ -83,6 +83,24 @@ def setUp(self): self.outputs = {'Out': output} +class TestExpandAsOpRank5(TestExpandAsBasic): + no_need_check_grad = True + + def setUp(self): + self.op_type = "expand_as_v2" + self.python_api = paddle.expand_as + x = np.random.rand(1, 1, 7, 16).astype("int64") + target_tensor = np.random.rand(4, 6, 7, 16).astype("float64") + self.inputs = {'X': x, "Y": target_tensor} + self.attrs = {'target_shape': target_tensor.shape} + bcast_dims = [4, 6, 1, 1] + output = np.tile(self.inputs['X'], bcast_dims) + self.outputs = {'Out': output} + + def test_check_grad(self): + pass + + class TestExpandAsV2Error(unittest.TestCase): def test_errors(self): diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py index 6fc6fc8f7eb6b4..82fb8284fe7859 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py @@ -18,9 +18,12 @@ import numpy as np from op_test import OpTest import paddle.fluid as fluid -from paddle.fluid import compiler, Program, program_guard +from paddle.fluid import compiler, Program, program_guard, core import paddle from paddle.fluid.framework import _test_eager_guard +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers # Situation 1: shape is a list(without tensor) @@ -284,6 +287,80 @@ def test_expand_times_is_tensor(self): egr_expand_1.numpy()) +class TestExpandDoubleGradCheck(unittest.TestCase): + + def expand_wrapper(self, x): + return paddle.expand(x[0], [2, 3]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3], False, dtype) + data.persistable = True + out = paddle.expand(data, [2, 3]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.expand_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestExpandTripleGradCheck(unittest.TestCase): + + def expand_wrapper(self, x): + return paddle.expand(x[0], [2, 3]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
+ eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3], False, dtype) + data.persistable = True + out = paddle.expand(data, [2, 3]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.expand_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py new file mode 100644 index 00000000000000..680ce306742422 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_cond_interceptor.py @@ -0,0 +1,216 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np + +import paddle +import paddle.fluid.core as core +from paddle.distributed.fleet.fleet_executor_utils import TaskNode + +paddle.enable_static() + + +def cond(i, ten, data): + return i < ten + + +def body(i, ten, data): + i = i + 1 + data = data + 1 + return [i, ten, data] + + +num_micro_batches = 4 + + +def batch_generator_creator(): + def __reader__(): + for i in range(num_micro_batches): + data = np.full(shape=[1, 1], fill_value=i, dtype=np.float32) + yield data + + return __reader__ + + +class TestFleetExecutor(unittest.TestCase): + def test_cond_interceptor(self): + main_program = paddle.static.Program() + startup_program = paddle.static.Program() + with paddle.static.program_guard(main_program, startup_program): + i = paddle.full( + shape=[1], fill_value=0, dtype='int64' + ) # loop counter + ten = paddle.full( + shape=[1], fill_value=10, dtype='int64' + ) # loop length + data = paddle.static.data(name='x', shape=[1]) + + loader = paddle.fluid.io.DataLoader.from_generator( + feed_list=[data], capacity=num_micro_batches * 4, iterable=False + ) + loader.set_batch_generator( + batch_generator_creator(), paddle.CUDAPlace(0) + ) + + paddle.static.nn.while_loop(cond, body, [i, ten, data]) + + program_a = paddle.static.Program() + program_b = paddle.static.Program() + + for var_name in main_program.block(0).vars: + if var_name != "_generated_var_0": + var = main_program.block(0).var(var_name) + if ( + var_name == "create_py_reader_0" + or var_name == "double_buffer_0" + ): + program_a.block(0).create_var( + name=var_name, + persistable=var.persistable, + ) + else: + program_a.block(0).create_var( + name=var_name, + shape=var.shape, + dtype=var.dtype, + stop_gradient=var.stop_gradient, + ) + program_b.block(0).create_var( + name=var_name, + shape=var.shape, + 
dtype=var.dtype, + stop_gradient=var.stop_gradient, + ) + + for op in main_program.block(0).ops: + if op.type != "while": + program_a.block(0).append_op( + type=op.type, + inputs=op.desc.inputs(), + outputs=op.desc.outputs(), + attrs=op.all_attrs(), + ) + + for var_name in main_program.block(1).vars: + var = main_program.block(1).var(var_name) + program_b.block(0).create_var( + name=var_name, + shape=var.shape, + dtype=var.dtype, + stop_gradient=var.stop_gradient, + ) + + for op in main_program.block(1).ops: + program_b.block(0).append_op( + type=op.type, + inputs=op.desc.inputs(), + outputs=op.desc.outputs(), + attrs=op.all_attrs(), + ) + + cond_var_name = "tmp_0" + + task_a = TaskNode( + 0, + num_micro_batches, + node_type="Start", + task_id=0, + program=program_a, + lazy_initialize=True, + ) + task_b = TaskNode( + 0, + num_micro_batches, + node_type="Cond", + task_id=1, + program=paddle.static.Program(), + cond_var_name=cond_var_name, + lazy_initialize=True, + ) + task_c = TaskNode( + 0, + num_micro_batches, + node_type="Compute", + task_id=2, + program=program_b, + lazy_initialize=True, + ) + task_d = TaskNode( + 0, + num_micro_batches, + node_type="Compute", + task_id=3, + program=paddle.static.Program(), + vars_list={'x': 'float32', 'tmp_1': 'int64'}, + lazy_initialize=True, + ) + task_e = TaskNode( + 0, + num_micro_batches, + node_type="Compute", + task_id=4, + program=paddle.static.Program(), + lazy_initialize=True, + ) + + infinite_buff_size = -1 + task_a.add_downstream_task(task_b.task_id(), 2) + task_b.add_upstream_task(task_a.task_id(), 2) + task_b.add_downstream_task(task_c.task_id(), infinite_buff_size) + task_c.add_upstream_task(task_b.task_id(), infinite_buff_size) + task_c.add_downstream_task(task_d.task_id(), 2) + task_d.add_upstream_task(task_c.task_id(), 2) + task_d.add_downstream_task( + task_b.task_id(), infinite_buff_size, core.DependType.LOOP + ) + task_b.add_upstream_task( + task_d.task_id(), infinite_buff_size, core.DependType.LOOP + ) + task_b.add_downstream_task( + task_e.task_id(), infinite_buff_size, core.DependType.STOP_LOOP + ) + task_e.add_upstream_task( + task_b.task_id(), infinite_buff_size, core.DependType.STOP_LOOP + ) + + main_program._pipeline_opt = { + "fleet_opt": { + 'tasks': [task_a, task_b, task_c, task_d, task_e], + 'task_id_to_rank': { + task_a.task_id(): 0, + task_b.task_id(): 0, + task_c.task_id(): 0, + task_d.task_id(): 0, + task_e.task_id(): 0, + }, + 'num_micro_batches': num_micro_batches, + 'inference_generation': True, + 'fetch_var': ['x'], + }, + } + + place = paddle.CUDAPlace(0) + exe = paddle.static.Executor(place) + loader.start() + res = exe.run(main_program) + ref_res = np.full([1], 10, dtype="float32") + for data in res: + np.testing.assert_allclose(data, ref_res, rtol=1e-05) + ref_res = ref_res + 1 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_task_node.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_task_node.py index 07ecf85c3db2ec..1d6b426bde146a 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor_task_node.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_task_node.py @@ -21,26 +21,33 @@ class TestFleetExecutorTaskNode(unittest.TestCase): - def test_task_node(self): program = paddle.static.Program() - task_node_0 = core.TaskNode(program.desc, 0, 1, 1) + task_node_0 = core.TaskNode(program.desc, 0, 0, 1) task_node_1 = core.TaskNode(program.desc, 0, 1, 1) - task_node_2 = core.TaskNode(program.desc, 0, 1, 1) + 
task_node_2 = core.TaskNode(program.desc, 0, 2, 1) self.assertEqual(task_node_0.task_id(), 0) self.assertEqual(task_node_1.task_id(), 1) self.assertEqual(task_node_2.task_id(), 2) self.assertTrue( - task_node_0.add_downstream_task(task_node_1.task_id(), 1)) - self.assertTrue(task_node_1.add_upstream_task(task_node_0.task_id(), 1)) + task_node_0.add_downstream_task( + task_node_1.task_id(), 1, core.DependType.NORMAL + ) + ) + self.assertTrue( + task_node_1.add_upstream_task( + task_node_0.task_id(), 1, core.DependType.NORMAL + ) + ) def test_lazy_task_node(self): program = paddle.static.Program() - task = TaskNode(program=program, - rank=0, - max_run_times=1, - max_slot_times=1, - lazy_initialize=True) + task = TaskNode( + program=program, + rank=0, + max_run_times=1, + lazy_initialize=True, + ) task_node = task.task_node() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py b/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py index f80f998c047dc3..398ba59539aa37 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_executor_with_task_nodes.py @@ -22,17 +22,16 @@ class TestFleetExecutor(unittest.TestCase): - def run_fleet_executor(self, place, x_data, y_data): exe = paddle.static.Executor(place) empty_program = paddle.static.Program() with fluid.program_guard(empty_program, empty_program): - x = fluid.layers.data(name='x', - shape=x_data.shape, - dtype=x_data.dtype) - y = fluid.layers.data(name='y', - shape=y_data.shape, - dtype=y_data.dtype) + x = fluid.layers.data( + name='x', shape=x_data.shape, dtype=x_data.dtype + ) + y = fluid.layers.data( + name='y', shape=y_data.shape, dtype=y_data.dtype + ) z = x + y a = 2 * x + 3 * y loss = paddle.mean(a) @@ -41,11 +40,13 @@ def run_fleet_executor(self, place, x_data, y_data): steps_per_pass = 10 bd = [steps_per_pass * p for p in passes] lr = [base_lr * (0.1**i) for i in range(len(bd) + 1)] - lr_val = paddle.optimizer.lr.PiecewiseDecay(boundaries=bd, - values=lr) + lr_val = paddle.optimizer.lr.PiecewiseDecay( + boundaries=bd, values=lr + ) opt = paddle.optimizer.AdamW( learning_rate=lr_val, - grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0)) + grad_clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0), + ) opt.minimize(loss) # TODO: section_program will be removed in the future task_node = TaskNode( @@ -54,23 +55,20 @@ def run_fleet_executor(self, place, x_data, y_data): rank=0, node_type="Compute", max_run_times=1, - max_slot_times=1, - lazy_initialize=True) + lazy_initialize=True, + ) empty_program._pipeline_opt = { "fleet_opt": { 'tasks': [task_node], - 'task_id_to_rank': { - task_node.task_id(): 0 - } + 'task_id_to_rank': {task_node.task_id(): 0}, }, - "section_program": empty_program + "section_program": empty_program, } - res = exe.run(empty_program, - feed={ - 'x': x_data, - 'y': y_data - }, - fetch_list=[z.name, a.name]) + res = exe.run( + empty_program, + feed={'x': x_data, 'y': y_data}, + fetch_list=[z.name, a.name], + ) return res def test_executor_on_single_device(self): diff --git a/python/paddle/fluid/tests/unittests/test_flip.py b/python/paddle/fluid/tests/unittests/test_flip.py index a933595be87839..774978f753935e 100644 --- a/python/paddle/fluid/tests/unittests/test_flip.py +++ b/python/paddle/fluid/tests/unittests/test_flip.py @@ -21,6 +21,9 @@ import paddle.fluid.core as core from paddle.fluid import Program, program_guard from op_test import OpTest +import 
gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers class TestFlipOp_API(unittest.TestCase): @@ -137,6 +140,80 @@ def init_test_case(self): self.axis = [-1] +class TestFlipDoubleGradCheck(unittest.TestCase): + + def flip_wrapper(self, x): + return paddle.flip(x[0], [0, 1]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [3, 2, 2], False, dtype) + data.persistable = True + out = paddle.flip(data, [0, 1]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.flip_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestFlipTripleGradCheck(unittest.TestCase): + + def flip_wrapper(self, x): + return paddle.flip(x[0], [0, 1]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [3, 2, 2], False, dtype) + data.persistable = True + out = paddle.flip(data, [0, 1]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.flip_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fold_op.py b/python/paddle/fluid/tests/unittests/test_fold_op.py index 0dec09ddf625dc..960ef5f4f99844 100644 --- a/python/paddle/fluid/tests/unittests/test_fold_op.py +++ b/python/paddle/fluid/tests/unittests/test_fold_op.py @@ -14,6 +14,7 @@ from __future__ import print_function + import math import numpy as np import unittest @@ -45,34 +46,64 @@ def init_data(self): def calc_fold(self): output_shape = [0] * 4 output_shape[0] = self.batch_size - output_shape[1] = int(self.input_channels / - (self.kernel_sizes[0] * self.kernel_sizes[1])) + output_shape[1] = int( + self.input_channels / (self.kernel_sizes[0] * self.kernel_sizes[1]) + ) output_shape[2] = self.output_sizes[0] output_shape[3] = self.output_sizes[1] dkernel_h = self.dilations[0] * (self.kernel_sizes[0] - 1) + 1 dkernel_w = self.dilations[1] * (self.kernel_sizes[1] - 1) + 1 - col_height = int((self.output_sizes[0] + self.paddings[0] + - self.paddings[2] - dkernel_h) / self.strides[0]) + 1 - col_width = int((self.output_sizes[1] + self.paddings[1] + - self.paddings[3] - dkernel_w) / self.strides[1]) + 1 + col_height = ( + int( + ( + self.output_sizes[0] + + self.paddings[0] + + self.paddings[2] + - dkernel_h + ) + / self.strides[0] + ) + + 1 + ) + col_width = ( + int( + ( + self.output_sizes[1] + + self.paddings[1] + + self.paddings[3] + - dkernel_w + ) + / self.strides[1] + ) + + 1 + ) output = 
np.zeros(output_shape).astype(np.float64) ############ calculate output ############## for b in range(output_shape[0]): for c in range(self.input_channels): w_offset = int(c % self.kernel_sizes[1]) h_offset = int( - (c / self.kernel_sizes[1]) % self.kernel_sizes[0]) + (c / self.kernel_sizes[1]) % self.kernel_sizes[0] + ) c_out = int(c / self.kernel_sizes[0] / self.kernel_sizes[1]) for h in range(col_height): - h_out = int(h * self.strides[0] - self.paddings[0] + - h_offset * self.dilations[0]) + h_out = int( + h * self.strides[0] + - self.paddings[0] + + h_offset * self.dilations[0] + ) for w in range(col_width): - w_out = int(w * self.strides[1] - self.paddings[1] + - w_offset * self.dilations[1]) + w_out = int( + w * self.strides[1] + - self.paddings[1] + + w_offset * self.dilations[1] + ) if (h_out >= 0 and h_out < self.output_sizes[0]) and ( - w_out >= 0 and w_out < self.output_sizes[1]): - output[b, c_out, h_out, - w_out] += self.x[b, c, w + col_width * h] + w_out >= 0 and w_out < self.output_sizes[1] + ): + output[b, c_out, h_out, w_out] += self.x[ + b, c, w + col_width * h + ] self.outputs = output @@ -85,7 +116,7 @@ def set_data(self): 'paddings': self.paddings, 'dilations': self.dilations, 'strides': self.strides, - 'output_sizes': self.output_sizes + 'output_sizes': self.output_sizes, } self.outputs = {'Y': self.outputs} @@ -101,9 +132,23 @@ def test_check_grad(self): self.check_grad(['X'], 'Y', check_eager=True) +class TestFoldshape(TestFoldOp): + def init_data(self): + self.batch_size = 8 + self.input_channels = 3 * 3 * 3 + self.length = 6 + self.kernel_sizes = [3, 3] + self.strides = [1, 1] + self.paddings = [0, 0, 0, 0] + self.dilations = [1, 1] + self.output_sizes = [4, 5] + input_shape = [self.batch_size, self.input_channels, self.length] + self.x = np.random.rand(*input_shape).astype(np.float64) + + class TestFoldAPI(TestFoldOp): - #This is for test on paddle.nn.Fold + # This is for test on paddle.nn.Fold def setUp(self): self.op_type = 'fold' @@ -120,19 +165,19 @@ def test_api(self): m = paddle.nn.Fold(**self.attrs) m.eval() result = m(input) - np.testing.assert_allclose(result.numpy(), - self.outputs['Y'], - rtol=1e-05) + np.testing.assert_allclose( + result.numpy(), self.outputs['Y'], rtol=1e-05 + ) def test_info(self): str(paddle.nn.Fold(**self.attrs)) class TestFoldOpError(unittest.TestCase): - def test_errors(self): from paddle.nn.functional import fold from paddle.fluid.framework import Program, program_guard + with program_guard(Program(), Program()): def test_input_shape(): @@ -148,59 +193,67 @@ def test_kernel_shape(): def test_padding_shape(): # padding_size must be 2 or 4 x = paddle.randn(shape=[2, 6, 6], dtype="float32") - out = fold(x, - output_sizes=[2, 3], - kernel_sizes=[2, 2], - paddings=[2, 2, 3]) + out = fold( + x, + output_sizes=[2, 3], + kernel_sizes=[2, 2], + paddings=[2, 2, 3], + ) def test_dilations_shape(): # dialtions_size must be 2 x = paddle.randn(shape=[2, 6, 6], dtype="float32") - out = fold(x, - output_sizes=[2, 3], - kernel_sizes=[2, 2], - dilations=[2, 2, 3]) + out = fold( + x, + output_sizes=[2, 3], + kernel_sizes=[2, 2], + dilations=[2, 2, 3], + ) def test_strides_shape(): # strids_size must be 2 x = paddle.randn(shape=[2, 6, 6], dtype="float32") - out = fold(x, - output_sizes=[2, 3], - kernel_sizes=[2, 2], - strides=[2, 2, 3]) + out = fold( + x, + output_sizes=[2, 3], + kernel_sizes=[2, 2], + strides=[2, 2, 3], + ) def test_output_size(): # im_h * im_w must be L x = paddle.randn(shape=[2, 6, 6], dtype="float32") - out = fold(x, - 
output_sizes=[6, 6], - kernel_sizes=[2, 2], - strides=[1, 1]) + out = fold( + x, output_sizes=[6, 6], kernel_sizes=[2, 2], strides=[1, 1] + ) def test_output_size_2(): # out_size must GT 1 x = paddle.randn(shape=[2, 6, 6], dtype="float32") - out = fold(x, - output_sizes=[0.1, 0.2], - kernel_sizes=[2, 2], - strides=[1, 1]) + out = fold( + x, + output_sizes=[0.1, 0.2], + kernel_sizes=[2, 2], + strides=[1, 1], + ) def test_block_h_w(): # test_block_h_w GT 0 x = paddle.randn(shape=[2, 1, 1], dtype="float32") - out = fold(x, - output_sizes=[1, 1], - kernel_sizes=[2, 2], - strides=1) + out = fold( + x, output_sizes=[1, 1], kernel_sizes=[2, 2], strides=1 + ) def test_GT_0(): x = paddle.randn(shape=[2, 1, 1], dtype="float32") - out = fold(x, - output_sizes=[0, 0], - kernel_sizes=[0, 0], - dilations=0, - paddings=[0, 0], - strides=0) + out = fold( + x, + output_sizes=[0, 0], + kernel_sizes=[0, 0], + dilations=0, + paddings=[0, 0], + strides=0, + ) self.assertRaises(AssertionError, test_input_shape) self.assertRaises(AssertionError, test_kernel_shape) diff --git a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py index f911d614ee49bf..18c28144d6e491 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_fused_gate_attention_op.py @@ -30,10 +30,10 @@ from paddle.fluid import core -@unittest.skipIf(not core.is_compiled_with_cuda(), - "Paddle is not compiled with CUDA") +@unittest.skipIf( + not core.is_compiled_with_cuda(), "Paddle is not compiled with CUDA" +) class TestFusedGateAttentionOp(OpTest): - def setUp(self): self.__class__.op_type = "fused_gate_attention" # use autograd to check grad in this unittest. 
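# A standalone numpy sketch (toy sizes assumed) of the projection the reference
# path below builds with paddle.einsum: a [batch, msa, res, q_dim] query times a
# [q_dim, heads, head_dim] weight yields per-head queries, scaled by
# 1/sqrt(head_dim) as in scaled dot-product attention.
import numpy as np
batch, msa_len, res_len, q_dim, num_heads, head_dim = 2, 3, 4, 8, 2, 4
query = np.random.rand(batch, msa_len, res_len, q_dim).astype(np.float32)
q_weight = np.random.rand(q_dim, num_heads, head_dim).astype(np.float32)
c = head_dim ** -0.5
q = np.einsum('nbqa,ahc->nbqhc', query, q_weight) * c
assert q.shape == (batch, msa_len, res_len, num_heads, head_dim)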
@@ -57,7 +57,6 @@ def config(self): self.bias_attr = True def generate_input_data(self): - def _random(shape): if self.dtype == "bfloat16": data = np.random.random(shape).astype("float32") @@ -67,7 +66,8 @@ def _random(shape): np.random.seed(123) self.query = _random( - (self.batch_size, self.msa_len, self.res_len, self.q_dim)) + (self.batch_size, self.msa_len, self.res_len, self.q_dim) + ) self.q_weight = _random((self.q_dim, self.num_heads, self.head_dim)) self.k_weight = _random((self.kv_dim, self.num_heads, self.head_dim)) self.v_weight = _random((self.kv_dim, self.num_heads, self.head_dim)) @@ -80,15 +80,18 @@ def _random(shape): self.qkv_weight = np.stack([q_weight_t, k_weight_t, v_weight_t]) else: self.key = _random( - (self.batch_size, self.msa_len, self.m_size, self.kv_dim)) + (self.batch_size, self.msa_len, self.m_size, self.kv_dim) + ) self.qkv_weight = None self.attn_mask = _random( - (self.batch_size, self.msa_len, 1, 1, self.m_size)) + (self.batch_size, self.msa_len, 1, 1, self.m_size) + ) if self.bias_attr: self.nonbatched_bias = _random( - (self.batch_size, 1, self.num_heads, self.res_len, self.m_size)) + (self.batch_size, 1, self.num_heads, self.res_len, self.m_size) + ) if self.has_gating: self.gating_w = _random((self.q_dim, self.num_heads, self.head_dim)) @@ -98,12 +101,17 @@ def _random(shape): self.output_b = _random((self.out_dim)) self.dout = _random( - (self.batch_size, self.msa_len, self.res_len, self.q_dim)) + (self.batch_size, self.msa_len, self.res_len, self.q_dim) + ) def collect_outputs(self, query, key, softmax_out, fmha_out, gate_out, out): outputs = [ - softmax_out, fmha_out, gate_out if self.has_gating else None, out, - query.grad, None if self.merge_qkv else key.grad + softmax_out, + fmha_out, + gate_out if self.has_gating else None, + out, + query.grad, + None if self.merge_qkv else key.grad, ] return outputs @@ -111,14 +119,17 @@ def get_reference_out(self): paddle.disable_static(place=paddle.CUDAPlace(0)) query = paddle.to_tensor(self.query, stop_gradient=False) - key = query if self.merge_qkv else paddle.to_tensor(self.key, - stop_gradient=False) + key = ( + query + if self.merge_qkv + else paddle.to_tensor(self.key, stop_gradient=False) + ) q_weight = paddle.to_tensor(self.q_weight, stop_gradient=False) k_weight = paddle.to_tensor(self.k_weight, stop_gradient=False) v_weight = paddle.to_tensor(self.v_weight, stop_gradient=False) src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) - c = self.head_dim**(-0.5) + c = self.head_dim ** (-0.5) # [batch_size, msa_len, res_len, q_dim], [q_dim, num_heads, head_dim] # -> [batch_size, msa_len, res_len, num_heads, head_dim] q = paddle.einsum('nbqa,ahc->nbqhc', query, q_weight) * c @@ -136,8 +147,9 @@ def get_reference_out(self): # -> [batch_size, msa_len, num_heads, res_len, m_size] logits = logits + src_mask if self.bias_attr: - nonbatched_bias = paddle.to_tensor(self.nonbatched_bias, - stop_gradient=False) + nonbatched_bias = paddle.to_tensor( + self.nonbatched_bias, stop_gradient=False + ) # [batch_size, msa_len, num_heads, res_len, m_size], [batch_size, 1, num_heads, res_len, m_size] # -> [batch_size, msa_len, num_heads, res_len, m_size] logits = logits + nonbatched_bias @@ -159,14 +171,22 @@ def get_reference_out(self): # gate_values = paddle.einsum('nbqc,chv->nbqhv', query, # gating_w) + gating_b gating_w_2d = paddle.reshape( - gating_w, shape=[self.q_dim, self.num_heads * self.head_dim]) + gating_w, shape=[self.q_dim, self.num_heads * self.head_dim] + ) gate_values_4d = paddle.matmul(query, 
gating_w_2d) - gate_values = paddle.reshape( - gate_values_4d, - shape=[ - self.batch_size, self.msa_len, self.res_len, self.num_heads, - self.head_dim - ]) + gating_b + gate_values = ( + paddle.reshape( + gate_values_4d, + shape=[ + self.batch_size, + self.msa_len, + self.res_len, + self.num_heads, + self.head_dim, + ], + ) + + gating_b + ) gate_values = nn.functional.sigmoid(gate_values) gate_out = fmha_out * gate_values else: @@ -183,20 +203,32 @@ def get_reference_out(self): gate_out, shape=[ self.batch_size * self.msa_len * self.res_len, - self.num_heads * self.head_dim - ]) + self.num_heads * self.head_dim, + ], + ) output_w_2d = paddle.reshape( - output_w, shape=[self.num_heads * self.head_dim, self.out_dim]) + output_w, shape=[self.num_heads * self.head_dim, self.out_dim] + ) out_2d = paddle.matmul(gate_out_2d, output_w_2d) - out = paddle.reshape( - out_2d, - shape=[self.batch_size, self.msa_len, self.res_len, self.out_dim - ]) + output_b - - paddle.autograd.backward([out], [paddle.to_tensor(self.dout)], - retain_graph=True) - return self.collect_outputs(query, key, softmax_out, fmha_out, gate_out, - out) + out = ( + paddle.reshape( + out_2d, + shape=[ + self.batch_size, + self.msa_len, + self.res_len, + self.out_dim, + ], + ) + + output_b + ) + + paddle.autograd.backward( + [out], [paddle.to_tensor(self.dout)], retain_graph=True + ) + return self.collect_outputs( + query, key, softmax_out, fmha_out, gate_out, out + ) def get_fused_gate_attention_out(self): paddle.disable_static(place=paddle.CUDAPlace(0)) @@ -218,8 +250,9 @@ def get_fused_gate_attention_out(self): src_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) if self.bias_attr: - nonbatched_bias = paddle.to_tensor(self.nonbatched_bias, - stop_gradient=False) + nonbatched_bias = paddle.to_tensor( + self.nonbatched_bias, stop_gradient=False + ) else: nonbatched_bias = None if self.has_gating: @@ -232,18 +265,42 @@ def get_fused_gate_attention_out(self): output_w = paddle.to_tensor(self.output_w, stop_gradient=False) output_b = paddle.to_tensor(self.output_b, stop_gradient=False) - _, _, _, _, softmax_out, fmha_out, gate_out, out = _legacy_C_ops.fused_gate_attention( - query, key, q_weight, k_weight, v_weight, qkv_weight, - nonbatched_bias, src_mask, gating_w, gating_b, output_w, output_b, - 'has_gating', self.has_gating, 'merge_qkv', self.merge_qkv) - - paddle.autograd.backward([out], [paddle.to_tensor(self.dout)], - retain_graph=True) - return self.collect_outputs(query, key, softmax_out, fmha_out, gate_out, - out) + ( + _, + _, + _, + _, + softmax_out, + fmha_out, + gate_out, + out, + ) = _legacy_C_ops.fused_gate_attention( + query, + key, + q_weight, + k_weight, + v_weight, + qkv_weight, + nonbatched_bias, + src_mask, + gating_w, + gating_b, + output_w, + output_b, + 'has_gating', + self.has_gating, + 'merge_qkv', + self.merge_qkv, + ) + + paddle.autograd.backward( + [out], [paddle.to_tensor(self.dout)], retain_graph=True + ) + return self.collect_outputs( + query, key, softmax_out, fmha_out, gate_out, out + ) def check(self, ref, out, atol, rtol, check_equal, name): - def _convert(value): if self.dtype == "bfloat16": return convert_uint16_to_float(value) @@ -252,19 +309,25 @@ def _convert(value): if check_equal: self.assertTrue( np.equal(_convert(ref), _convert(out)).all(), - "Checking < {} > failed!".format(name)) + "Checking < {} > failed!".format(name), + ) else: np.testing.assert_allclose( _convert(ref), _convert(out), atol=atol, rtol=rtol, - err_msg="Checking < {} > failed!".format(name)) + 
err_msg="Checking < {} > failed!".format(name), + ) def check_output_and_grad(self, atol, rtol): output_names = [ - "softmax_out", "fmha_out", "gate_out", "out", "query_grad", - "key_grad" + "softmax_out", + "fmha_out", + "gate_out", + "out", + "query_grad", + "key_grad", ] outputs_ref = self.get_reference_out() outputs_fused = self.get_fused_gate_attention_out() @@ -280,22 +343,26 @@ def check_output_and_grad(self, atol, rtol): # that in fused ops, check_equal is set to False and we use allclose # to check the correctness. check_equal = False - self.check(ref_res.numpy(), fused_res.numpy(), atol, rtol, - check_equal, output_names[i]) + self.check( + ref_res.numpy(), + fused_res.numpy(), + atol, + rtol, + check_equal, + output_names[i], + ) def test_output_and_grad(self): self.check_output_and_grad(atol=1e-5, rtol=1e-6) class TestMergeQKVLargeBatchSizeCase(TestFusedGateAttentionOp): - def config(self): super().config() self.batch_size = 2 class TestSeparatedQKVCase(TestFusedGateAttentionOp): - def config(self): self.dtype = "float32" self.has_gating = False @@ -312,7 +379,6 @@ def config(self): class TestMergeQKVNoBiasGatingCase(TestFusedGateAttentionOp): - def config(self): super().config() self.has_gating = False @@ -320,7 +386,6 @@ def config(self): class TestMergeQKVFp16Case(TestFusedGateAttentionOp): - def config(self): super().config() self.dtype = "float16" @@ -332,18 +397,18 @@ def test_output_and_grad(self): class TestMergeQKVLargeBatchSizeFp16Case(TestMergeQKVFp16Case): - def config(self): super().config() self.batch_size = 2 @unittest.skipIf( - not core.is_compiled_with_cuda() or get_cuda_version() < 11000, - "core is not compiled with CUDA and cuda version need larger than or equal to 11.3" + not core.is_compiled_with_cuda() + or get_cuda_version() < 11000 + or paddle.device.cuda.get_device_capability()[0] < 8, + "core is not compiled with CUDA and cuda version need larger than or equal to 11.3", ) class TestMergeQKVBF16Case(TestFusedGateAttentionOp): - def config(self): super().config() self.dtype = "bfloat16" @@ -353,7 +418,6 @@ def test_output_and_grad(self): class TestMergeQKVLargeBatchSizeBF16Case(TestMergeQKVBF16Case): - def config(self): super().config() self.batch_size = 2 diff --git a/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_int8_op.py b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_int8_op.py new file mode 100644 index 00000000000000..00f25b4570c0bd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fused_multi_transformer_int8_op.py @@ -0,0 +1,792 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np + +import paddle +import paddle.nn as nn +import paddle.fluid.core as core +import paddle.nn.functional as F +import paddle.incubate.nn.functional as incubate_f +from paddle.nn.layer.norm import LayerNorm +from paddle.nn.layer.common import Linear, Dropout +from paddle.nn.layer.transformer import _convert_attention_mask +from paddle import tensor +from paddle.fluid import layers +import unittest +from op_test import OpTest +from paddle.fluid.framework import default_main_program +from paddle.fluid.dygraph.layers import Layer +from paddle.fluid.layer_helper import LayerHelper +from paddle.nn.initializer import Constant +from paddle.fluid.data_feeder import check_variable_and_dtype, check_dtype +from paddle.fluid.framework import _non_static_mode, default_main_program +from paddle import _legacy_C_ops + +default_main_program().random_seed = 42 +np.random.seed(0) + + +def fused_multi_transformer_int8( + x, + ln_scales, + ln_biases, + qkv_weights, + qkv_biases, + linear_weights, + linear_biases, + ffn_ln_scales, + ffn_ln_biases, + ffn1_weights, + ffn1_biases, + ffn2_weights, + ffn2_biases, + pre_layer_norm=True, + epsilon=1e-05, + cache_kvs=None, + time_step=None, + attn_mask=None, + dropout_rate=0.0, + activation="gelu", + training=False, + mode='upscale_in_train', + trans_qkvw=True, + ring_id=-1, + name=None, + qkv_out_scales=None, + out_linear_out_scales=None, + ffn1_out_scales=None, + ffn2_out_scales=None, + num_head=0, + dim_head=0, + dim_ffn=0, + qkv_in_scale=[], + out_linear_in_scale=[], + ffn1_in_scale=[], + ffn2_in_scale=[], +): + mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + + cache_kv_out, final_out = _legacy_C_ops.fused_multi_transformer_int8( + x, ln_scales, ln_biases, qkv_weights, qkv_biases, cache_kvs, time_step, + attn_mask, linear_weights, linear_biases, ffn_ln_scales, ffn_ln_biases, + ffn1_weights, ffn1_biases, ffn2_weights, ffn2_biases, qkv_out_scales, + out_linear_out_scales, ffn1_out_scales, ffn2_out_scales, cache_kvs, + 'num_head', num_head, 'dim_head', dim_head, 'dim_ffn', dim_ffn, + 'qkv_in_scale', qkv_in_scale, 'out_linear_in_scale', + out_linear_in_scale, 'ffn1_in_scale', ffn1_in_scale, 'ffn2_in_scale', + ffn2_in_scale, 'pre_layer_norm', pre_layer_norm, 'epsilon', epsilon, + 'dropout_rate', dropout_rate, 'is_test', not training, + 'dropout_implementation', mode, 'act_method', activation, 'trans_qkvw', + trans_qkvw, 'ring_id', ring_id) + if cache_kvs is not None: + return final_out, cache_kv_out + return final_out + + +class TestFusedMultiTransformerInt8Op(unittest.TestCase): + + def setUp(self): + self.config() + self.generate_input_data() + + self.rtol = 1e-5 + # FIXME(wangxi): Because there is a problem with the test precision + # on A100, atol is temporarily set to 1e-2, and it will be + # changed back after the precision problem is solved. + self.atol = 1e-2 + # make sure local development precision + if "V100" in paddle.device.cuda.get_device_name(): + self.atol = 1e-4 + if self.x_type is np.float16: + self.atol = 1e-1 + + paddle.set_default_dtype(self.x_type) + self.__class__.op_type = "fused_multi_transformer_int8" + # use autograd to check grad in this unittest. 
+ self.__class__.no_need_check_grad = True + + paddle.set_default_dtype(np.float32) + self.norm = LayerNorm(self.embed_dim, + weight_attr=False, + bias_attr=False) + self.ffn_norm = LayerNorm(self.embed_dim, + weight_attr=False, + bias_attr=False) + + paddle.set_default_dtype(self.x_type) + self.dropout = Dropout(self.dropout_prob, mode="upscale_in_train") + self.activation = getattr(F, self.act_method) + + def config(self): + # for debug + self.debug = False + + self.x_type = np.float32 + self.attn_mask_type = np.float64 + #self.attn_mask_type = np.bool + self.pre_layer_norm = True + self.has_attn_mask = True + + # has_cache_kv, gen_cache_kv, stage + # False, False, not generation + # True, True, generation context stage + # True, False, generation decoder stage + self.has_cache_kv = False + self.gen_cache_kv = False + + self.training = False + + self.layers = 3 + self.batch_size = 1 + self.query_length = 1 + self.cache_length = 1 + self.head_dim = 64 + self.num_heads = 16 + self.embed_dim = self.head_dim * self.num_heads + + self.dropout_prob = 0.0 + self.attn_dropout_prob = 0.0 + self.act_method = 'gelu' + self.weight_attr = None + self.bias_attr = None + self.kdim, self.vdim = self.embed_dim, self.embed_dim + self.key_length, self.value_length = self.query_length, self.query_length + + def generate_input_data(self): + self.query = np.random.rand(self.batch_size, self.query_length, + self.embed_dim).astype(self.x_type) + q_weight = np.random.randint(-64, 64, [self.embed_dim, self.embed_dim], + np.int32).astype('float64') + k_weight = np.random.randint(-64, 64, [self.kdim, self.embed_dim], + np.int32).astype('float64') + v_weight = np.random.randint(-64, 64, [self.vdim, self.embed_dim], + np.int32).astype('float64') + + self.q_weight_tensor = paddle.to_tensor(q_weight) + self.k_weight_tensor = paddle.to_tensor(k_weight) + self.v_weight_tensor = paddle.to_tensor(v_weight) + + out_weight = np.random.randint(-64, 64, + [self.embed_dim, self.embed_dim], + np.int32).astype('float64') + ffn1_weight = np.random.randint(-64, 64, + [self.embed_dim, 4 * self.embed_dim], + np.int32).astype('float64') + ffn2_weight = np.random.randint(-64, 64, + [4 * self.embed_dim, self.embed_dim], + np.int32).astype('float64') + + self.out_weight_tensor = paddle.to_tensor(out_weight) + self.ffn1_weight_tensor = paddle.to_tensor(ffn1_weight) + self.ffn2_weight_tensor = paddle.to_tensor(ffn2_weight) + + q_proj_bias = np.random.rand(self.embed_dim).astype(self.x_type) + k_proj_bias = np.random.rand(self.embed_dim).astype(self.x_type) + v_proj_bias = np.random.rand(self.embed_dim).astype(self.x_type) + + self.q_proj_bias_tensor = paddle.to_tensor(q_proj_bias) + self.k_proj_bias_tensor = paddle.to_tensor(k_proj_bias) + self.v_proj_bias_tensor = paddle.to_tensor(v_proj_bias) + + out_linear_proj_bias = np.random.rand(self.embed_dim).astype( + self.x_type) + ffn1_proj_bias = np.random.rand(4 * self.embed_dim).astype(self.x_type) + ffn2_proj_bias = np.random.rand(self.embed_dim).astype(self.x_type) + + self.out_linear_proj_bias_tensor = paddle.to_tensor( + out_linear_proj_bias) + self.ffn1_proj_bias_tensor = paddle.to_tensor(ffn1_proj_bias) + self.ffn2_proj_bias_tensor = paddle.to_tensor(ffn2_proj_bias) + + out_seq_len = self.key_length + + self.qkv_in_scales = [] + self.qkv_out_scales = [] + self.out_linear_in_scales = [] + self.out_linear_out_scales = [] + self.ffn1_in_scales = [] + self.ffn1_out_scales = [] + self.ffn2_in_scales = [] + self.ffn2_out_scales = [] + + if self.has_cache_kv: + self.cache_kv = 
np.random.rand(2, self.batch_size, self.num_heads, + self.cache_length, + self.head_dim).astype(self.x_type) + + if self.gen_cache_kv: + self.cache_kv[:] = 0 + else: + out_seq_len += self.cache_length + else: + self.cache_kv = None + + if self.has_attn_mask: + # [B, n_head, seq_len, out_seq_len] + self.attn_mask = np.ones( + (self.batch_size, 1, self.query_length, out_seq_len), + dtype=self.attn_mask_type) + if self.attn_mask_type == np.int64: + self.attn_mask = np.tril(self.attn_mask) + elif self.attn_mask_type == np.float64: + if self.has_cache_kv and not self.gen_cache_kv: + # NOTE: decoder stage, -1(out_seq_len) should no mask + self.attn_mask[:, :, :, -2] = 0.0 + self.attn_mask = (self.attn_mask - 1.0) * 1e4 + else: + self.attn_mask = (np.tril(self.attn_mask) - 1.0) * 1e4 + elif self.attn_mask_type == np.bool_: + if self.has_cache_kv and not self.gen_cache_kv: + self.attn_mask[:, :, :, -2] = 0 + else: + self.attn_mask = np.tril(self.attn_mask) + else: + raise ValueError( + "'attn_mask_type' should be 'int64' or 'float64'.") + else: + self.attn_mask = None + + def fake_quant(self, input, scale): + quant_value = 127.0 * (1.0 / scale) * paddle.cast(input, 'float32') + quant_value = paddle.round(quant_value) + + # No need to clip here because scale is the max value + + return paddle.cast(quant_value, 'float64') + + def GetBaselineOut(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + tensor_query = paddle.to_tensor(self.query, stop_gradient=False) + + cache_kvs = [] + cache_kv = None + if self.has_cache_kv: + cache_kv = paddle.to_tensor(self.cache_kv, stop_gradient=False) + + if self.has_attn_mask: + attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False) + else: + attn_mask = None + for i in range(self.layers): + residual = tensor_query + ln1_out = tensor_query + if self.pre_layer_norm: + ln1_out = self.norm(tensor_query) + max_v = paddle.max(paddle.abs(paddle.cast(ln1_out, 'float32')))[0] + # self.qkv_in_scales.append(127.0 / max_v) + self.qkv_in_scales.append(max_v) + self.qkv_out_scales.append(127.0 * 127.0) + # print('qkv_in_scales ', i, self.qkv_in_scales[i]) + # print('qkv_out_scales ', i, self.qkv_out_scales[i]) + + # quant ln1_out + ln1_out = self.fake_quant(ln1_out, self.qkv_in_scales[i]) + + q = paddle.nn.functional.linear(ln1_out, self.q_weight_tensor) + # de quant + q = paddle.cast( + paddle.cast(q, 'float32') * self.qkv_in_scales[i] / + self.qkv_out_scales[i], self.x_type) + + q = q + self.q_proj_bias_tensor + q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim]) + q_out = tensor.transpose(x=q, perm=[0, 2, 1, 3]) + + k = paddle.nn.functional.linear(ln1_out, self.k_weight_tensor) + k = paddle.cast( + paddle.cast(k, 'float32') * self.qkv_in_scales[i] / + self.qkv_out_scales[i], self.x_type) + k = k + self.k_proj_bias_tensor + v = paddle.nn.functional.linear(ln1_out, self.v_weight_tensor) + v = paddle.cast( + paddle.cast(v, 'float32') * self.qkv_in_scales[i] / + self.qkv_out_scales[i], self.x_type) + v = v + self.v_proj_bias_tensor + + k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim]) + k_out = tensor.transpose(x=k, perm=[0, 2, 1, 3]) + v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim]) + v_out = tensor.transpose(x=v, perm=[0, 2, 1, 3]) + + if self.has_cache_kv: + # [1, B, n_head, cache_seq_len, head_dim] + cache_k, cache_v = paddle.split(cache_kv, 2) + cache_k = paddle.squeeze(cache_k, axis=0) + cache_v = paddle.squeeze(cache_v, axis=0) + # [B, n_head, cache_seq_len + seq_len, head_dim] + # out_seq_len = 
cache_seq_len + seq_len + if self.debug: + print('q out is') + print(q_out[0, 0, :, :]) + print('cache k out seq=128') + print(k_out[0, 0, :, :]) + if self.gen_cache_kv: + cache_kvs.append((k_out, v_out)) + else: + k_out = paddle.concat([cache_k, k_out], axis=-2) + v_out = paddle.concat([cache_v, v_out], axis=-2) + + # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim] + # --> [B, n_head, seq_len, out_seq_len] + qk_out = layers.matmul(x=q_out, + y=k_out, + transpose_y=True, + alpha=self.head_dim**-0.5) + + if self.debug: + print('qk out is') + print(qk_out[0][0][0]) + + if attn_mask is not None: + attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype) + attn_mask_out = qk_out + attn_mask + if self.debug: + print('attn mask out is') + print(attn_mask_out[0][0][0]) + softmax_out = F.softmax(attn_mask_out) + else: + softmax_out = F.softmax(qk_out) + + if self.debug: + print('softmax out is') + print(softmax_out[0][0][0]) + if self.dropout_prob: + dropout_out = F.dropout(softmax_out, + self.dropout_prob, + training=self.training, + mode="upscale_in_train") + # [B, n_head, seq_len, out_seq_len] * [B, n_head, out_seq_len, head_dim] + # --> [B, n_head, seq_len, head_dim] + qktv_out = tensor.matmul(dropout_out, v_out) + else: + qktv_out = tensor.matmul(softmax_out, v_out) + + fmha_out = tensor.transpose(qktv_out, perm=[0, 2, 1, 3]) + if self.debug: + print('fmha out is') + print(fmha_out[0][0][0]) + out_linear_in = tensor.reshape( + x=fmha_out, shape=[0, 0, fmha_out.shape[2] * fmha_out.shape[3]]) + + max_v = paddle.max(paddle.abs(paddle.cast(out_linear_in, + 'float32')))[0] + # self.out_linear_in_scales.append(127.0 / max_v) + + self.out_linear_in_scales.append(max_v) + self.out_linear_out_scales.append((127.0 * 127.0)) + out_linear_in = self.fake_quant(out_linear_in, + self.out_linear_in_scales[i]) + + out = paddle.nn.functional.linear(out_linear_in, + self.out_weight_tensor) + + out = paddle.cast( + paddle.cast(out, 'float32') * self.out_linear_in_scales[i] / + self.out_linear_out_scales[i], self.x_type) + + out = out + self.out_linear_proj_bias_tensor + + residual_out = residual + self.dropout(out) + if not self.pre_layer_norm: + attn_out = self.norm(residual_out) + else: + attn_out = residual_out + + ffn_ln_out = attn_out + if self.pre_layer_norm: + ffn_ln_out = self.ffn_norm(attn_out) + + max_v = paddle.max(paddle.abs(paddle.cast(ffn_ln_out, + 'float32')))[0] + self.ffn1_in_scales.append(max_v) + self.ffn1_out_scales.append((127.0 * 127.0)) + ffn_ln_out = self.fake_quant(ffn_ln_out, self.ffn1_in_scales[i]) + + ffn1_out = paddle.nn.functional.linear(ffn_ln_out, + self.ffn1_weight_tensor) + + ffn1_out = paddle.cast( + paddle.cast(ffn1_out, 'float32') * self.ffn1_in_scales[i] / + self.ffn1_out_scales[i], self.x_type) + + ffn1_out = ffn1_out + self.ffn1_proj_bias_tensor + ffn1_out = self.dropout(self.activation(ffn1_out)) + + max_v = paddle.max(paddle.abs(paddle.cast(ffn1_out, 'float32')))[0] + # self.ffn2_in_scales.append(127.0 / max_v) + self.ffn2_in_scales.append(max_v) + self.ffn2_out_scales.append((127.0 * 127.0)) + # print('ffn2_in_scales ', i, self.ffn2_in_scales[i]) + ffn1_out = self.fake_quant(ffn1_out, self.ffn2_in_scales[i]) + + ffn2_out = paddle.nn.functional.linear(ffn1_out, + self.ffn2_weight_tensor) + + ffn2_out = paddle.cast( + paddle.cast(ffn2_out, 'float32') * self.ffn2_in_scales[i] / + self.ffn2_out_scales[i], self.x_type) + ffn2_out = ffn2_out + self.ffn2_proj_bias_tensor + + residual_out = attn_out + self.dropout(ffn2_out) + # print("residual ", 
attn_out) + # print("residual_out ", residual_out) + final_out = residual_out + if not self.pre_layer_norm: + final_out = self.ffn_norm(residual_out) + + tensor_query = final_out + + if self.has_cache_kv and self.gen_cache_kv: + return final_out, cache_kvs + return final_out + + def GetFusedMultiTransformerOut(self): + paddle.disable_static(place=paddle.CUDAPlace(0)) + + ln_scale = paddle.ones([self.embed_dim], 'float32') + ln_bias = paddle.zeros([self.embed_dim], 'float32') + ffn_ln_scale = ln_scale + ffn_ln_bias = ln_bias + + q_proj_weight = self.q_weight_tensor.numpy().transpose((1, 0)) + k_proj_weight = self.k_weight_tensor.numpy().transpose((1, 0)) + v_proj_weight = self.v_weight_tensor.numpy().transpose((1, 0)) + qkv_weight = np.concatenate( + (q_proj_weight, k_proj_weight, v_proj_weight)) + qkv_weight = qkv_weight.reshape( + (3, self.num_heads, self.head_dim, self.embed_dim)) + + qkv_weight_tensor = paddle.to_tensor(qkv_weight) + qkv_weight_tensor = paddle.cast(qkv_weight_tensor, 'int8') + + out_weight_tensor = paddle.cast( + paddle.to_tensor(self.out_weight_tensor.numpy().transpose((1, 0))), + 'int8') + ffn1_weight_tensor = paddle.cast( + paddle.to_tensor(self.ffn1_weight_tensor.numpy().transpose((1, 0))), + 'int8') + ffn2_weight_tensor = paddle.cast( + paddle.to_tensor(self.ffn2_weight_tensor.numpy().transpose((1, 0))), + 'int8') + + qkv_bias = np.concatenate( + (self.q_proj_bias_tensor.numpy(), self.k_proj_bias_tensor.numpy(), + self.v_proj_bias_tensor.numpy())) + qkv_bias = qkv_bias.reshape((3, self.num_heads, self.head_dim)) + qkv_bias_tensor = paddle.to_tensor(qkv_bias) + + x = paddle.to_tensor(self.query, stop_gradient=True) + cache_kvs, cache_kv = None, None + time_step = None + if self.has_cache_kv: + cache_kvs = [] + + max_seq_length = (self.cache_length + 128) // 128 * 128 + cache_kv = np.zeros([ + 2, self.batch_size, self.num_heads, max_seq_length, + self.head_dim + ], + dtype=self.x_type) + + elems = 4 + if self.x_type is np.float16: + elems = 8 + + assert self.head_dim % elems == 0 + v_elems = self.head_dim // elems + + # [B, num_head, 128, head_dim] + # cache_k_tmp = self.cache_kv[0, :] + # [B, num_head, 128, head_dim / 4, 4] + cache_k_tmp = self.cache_kv[0].reshape([ + self.batch_size, self.num_heads, self.cache_length, v_elems, + elems + ]) + # [B, num_head, head_dim / 4, 128, 4] + cache_k_tmp = cache_k_tmp.transpose([0, 1, 3, 2, 4]) + + cache_kv[0, :].reshape([ + self.batch_size, self.num_heads, v_elems, max_seq_length, elems + ])[:, :, :, :self.cache_length, :] = cache_k_tmp + + cache_kv[1, :, :, :self.cache_length, :] = self.cache_kv[1] + if self.gen_cache_kv: + assert self.query_length == self.cache_length + cache_kv[:] = 0 + else: + time_step = paddle.to_tensor([self.cache_length], + dtype='int32', + place=paddle.CPUPlace()) + if self.has_attn_mask: + attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=True) + else: + attn_mask = None + epsilon = 1e-05 + ln2_epsilon = 1e-05 + + if attn_mask is not None and self.attn_mask_type != np.bool_: + attn_mask = _convert_attention_mask(attn_mask, x.dtype) + + qkv_weights, qkv_biases = [], [] + out_weights, out_biases = [], [] + ln_scales, ln_biases = [], [] + ffn1_weights, ffn1_biases = [], [] + ffn2_weights, ffn2_biases = [], [] + ffn_ln_scales, ffn_ln_biases = [], [] + qkv_in_scale = [] + out_linear_in_scale = [] + ffn1_in_scale = [] + ffn2_in_scale = [] + + qkv_out_scales_tensor = paddle.ones([self.layers, 3 * self.embed_dim], + 'float32') + out_linear_out_scales_tensor = paddle.ones( + [self.layers, 
self.embed_dim], 'float32') + ffn1_out_scales_tensor = paddle.ones([self.layers, 4 * self.embed_dim], + 'float32') + ffn2_out_scales_tensor = paddle.ones([self.layers, self.embed_dim], + 'float32') + + for i in range(self.layers): + qkv_weights.append(qkv_weight_tensor) + qkv_biases.append(qkv_bias_tensor) + out_weights.append(out_weight_tensor) + out_biases.append(self.out_linear_proj_bias_tensor) + ln_scales.append(ln_scale) + ln_biases.append(ln_bias) + ffn1_weights.append(ffn1_weight_tensor) + ffn1_biases.append(self.ffn1_proj_bias_tensor) + ffn2_weights.append(ffn2_weight_tensor) + ffn2_biases.append(self.ffn2_proj_bias_tensor) + ffn_ln_scales.append(ffn_ln_scale) + ffn_ln_biases.append(ffn_ln_bias) + qkv_in_scale.append(self.qkv_in_scales[i]) + out_linear_in_scale.append(self.out_linear_in_scales[i]) + ffn1_in_scale.append(self.ffn1_in_scales[i]) + ffn2_in_scale.append(self.ffn2_in_scales[i]) + + qkv_out_scales_tensor[i, :] *= self.qkv_out_scales[i] + out_linear_out_scales_tensor[i, :] *= self.out_linear_out_scales[i] + ffn1_out_scales_tensor[i, :] *= self.ffn1_out_scales[i] + ffn2_out_scales_tensor[i, :] *= self.ffn2_out_scales[i] + + if self.has_cache_kv: + cache_kvs.append(paddle.to_tensor(cache_kv, stop_gradient=True)) + + final_out = fused_multi_transformer_int8( + x, + ln_scales, + ln_biases, + qkv_weights, + qkv_biases, + out_weights, + out_biases, + ffn_ln_scales, + ffn_ln_biases, + ffn1_weights, + ffn1_biases, + ffn2_weights, + ffn2_biases, + pre_layer_norm=self.pre_layer_norm, + epsilon=epsilon, + cache_kvs=cache_kvs, + time_step=time_step, + attn_mask=attn_mask, + dropout_rate=self.dropout_prob, + training=self.training, + mode='upscale_in_train', + trans_qkvw=True, + ring_id=-1, + name=None, + qkv_out_scales=qkv_out_scales_tensor, + out_linear_out_scales=out_linear_out_scales_tensor, + ffn1_out_scales=ffn1_out_scales_tensor, + ffn2_out_scales=ffn2_out_scales_tensor, + num_head=self.num_heads, + dim_head=self.head_dim, + dim_ffn=4 * self.embed_dim, + qkv_in_scale=qkv_in_scale, + out_linear_in_scale=out_linear_in_scale, + ffn1_in_scale=ffn1_in_scale, + ffn2_in_scale=ffn2_in_scale) + + if self.has_cache_kv: + return final_out[0], final_out[1] + + return final_out + + def test_fused_multi_transformer_op(self): + final_out_ref = self.GetBaselineOut() + final_out = self.GetFusedMultiTransformerOut() + if self.has_cache_kv: + final_out, cache_kv_out = final_out + s = cache_kv_out[0].shape + bsz = s[1] + num_head = s[2] + max_seq_len = s[3] + head_dim = s[4] + elems = 8 if self.x_type is np.float16 else 4 + v_elems = head_dim // elems + + if self.debug: + print("cache_k out timestep=128") + print(cache_kv_out[0].reshape( + [2, bsz, num_head, v_elems, max_seq_len, + elems])[0, 0, 0, :, self.cache_length, :]) + + print("cache_v out timestep=128") + print(cache_kv_out[0][1, 0, 0, self.cache_length, :]) + + if self.gen_cache_kv: + final_out_ref, cache_kvs = final_out_ref + for i in range(self.layers): + cache_k_ref = cache_kvs[i][0] + cache_v_ref = cache_kvs[i][1] + + cache_k = cache_kv_out[i][0, :] + cache_k = cache_k.reshape( + [bsz, num_head, v_elems, max_seq_len, elems]) + cache_k = cache_k[:, :, :, :self.cache_length, :] + cache_k = cache_k.transpose([0, 1, 3, 2, 4]) + cache_k = cache_k.reshape( + [bsz, num_head, self.cache_length, head_dim]) + + cache_v = cache_kv_out[i][1, :, :, :self.cache_length, :] + + np.testing.assert_allclose(cache_k_ref, + cache_k, + rtol=self.rtol, + atol=self.atol) + np.testing.assert_allclose(cache_v_ref, + cache_v, + rtol=self.rtol, + 
atol=self.atol) + if i == 0: + break + + np.testing.assert_allclose(final_out_ref, + final_out, + rtol=self.rtol, + atol=self.atol) + + +class TestFusedMultiTransformerInt8OpFp16(TestFusedMultiTransformerInt8Op): + + def config(self): + super().config() + self.x_type = np.float16 + self.layers = 3 # odd layers + + +class TestFusedMultiTransformerInt8OpCacheKV(TestFusedMultiTransformerInt8Op): + + def config(self): + super().config() + super().generate_input_data() + self.has_cache_kv = True + self.query_length = 1 + self.key_length, self.value_length = 1, 1 + self.layers = 3 # odd layers + + +class TestFusedMultiTransformerInt8OpCacheKVFp16(TestFusedMultiTransformerInt8Op + ): + + def config(self): + super().config() + self.has_cache_kv = True + self.query_length = 1 + self.key_length, self.value_length = 1, 1 + self.x_type = np.float16 + + +class TestFusedMultiTransformerInt8OpGenCacheKV(TestFusedMultiTransformerInt8Op + ): + + def config(self): + super().config() + self.has_cache_kv = True + self.gen_cache_kv = True + + +class TestFusedMultiTransformerInt8OpGenCacheKVFp16( + TestFusedMultiTransformerInt8Op): + + def config(self): + super().config() + self.has_cache_kv = True + self.gen_cache_kv = True + self.x_type = np.float16 + self.layers = 3 # odd layers + + +class TestFusedMultiTransformerInt8OpPostLayerNormFp16( + TestFusedMultiTransformerInt8Op): + + def config(self): + super().config() + self.x_type = np.float16 + self.layers = 3 # odd layers + self.pre_layer_norm = False + + +class TestFusedMultiTransformerInt8OpCacheKVPostLayerNorm( + TestFusedMultiTransformerInt8Op): + + def config(self): + super().config() + self.has_cache_kv = True + self.query_length = 1 + self.key_length, self.value_length = 1, 1 + self.layers = 3 # odd layers + self.pre_layer_norm = False + + +class TestFusedMultiTransformerInt8OpCacheKVPostLayerNormFp16( + TestFusedMultiTransformerInt8Op): + + def config(self): + super().config() + self.has_cache_kv = True + self.query_length = 1 + self.key_length, self.value_length = 1, 1 + self.x_type = np.float16 + self.pre_layer_norm = False + + +class TestFusedMultiTransformerInt8OpGenCacheKVPostLayerNorm( + TestFusedMultiTransformerInt8Op): + + def config(self): + super().config() + self.has_cache_kv = True + self.gen_cache_kv = True + self.pre_layer_norm = False + + +class TestFusedMultiTransformerInt8OpGenCacheKVPostLayerNormFp16( + TestFusedMultiTransformerInt8Op): + + def config(self): + super().config() + self.has_cache_kv = True + self.gen_cache_kv = True + self.x_type = np.float16 + self.layers = 3 # odd layers + self.pre_layer_norm = False + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gather_tree_op.py b/python/paddle/fluid/tests/unittests/test_gather_tree_op.py index f3a5acc048404d..242ed7e4a745d5 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_tree_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_tree_op.py @@ -125,6 +125,25 @@ def test_type_parents(): fluid.layers.gather_tree(ids, bad_parents) self.assertRaises(TypeError, test_type_parents) + + def test_ids_ndim(): + bad_ids = fluid.layers.data(name='bad_test_ids', + shape=[5, 2], + dtype='int64', + append_batch_size=False) + paddle.nn.functional.gather_tree(bad_ids, parents) + + self.assertRaises(ValueError, test_ids_ndim) + + def test_parents_ndim(): + bad_parents = fluid.layers.data(name='bad_test_parents', + shape=[5, 2], + dtype='int64', + append_batch_size=False) + paddle.nn.functional.gather_tree(ids, bad_parents) + 
+ self.assertRaises(ValueError, test_parents_ndim) + paddle.disable_static() diff --git a/python/paddle/fluid/tests/unittests/test_iinfo_and_finfo.py b/python/paddle/fluid/tests/unittests/test_iinfo_and_finfo.py new file mode 100644 index 00000000000000..9debbccdb3d7ed --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_iinfo_and_finfo.py @@ -0,0 +1,45 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import unittest +import numpy as np + + +class TestIInfoAndFInfoAPI(unittest.TestCase): + + def test_invalid_input(self): + for dtype in [ + paddle.float16, paddle.float32, paddle.float64, paddle.bfloat16, + paddle.complex64, paddle.complex128, paddle.bool + ]: + with self.assertRaises(ValueError): + _ = paddle.iinfo(dtype) + + def test_iinfo(self): + for paddle_dtype, np_dtype in [(paddle.int64, np.int64), + (paddle.int32, np.int32), + (paddle.int16, np.int16), + (paddle.int8, np.int8), + (paddle.uint8, np.uint8)]: + xinfo = paddle.iinfo(paddle_dtype) + xninfo = np.iinfo(np_dtype) + self.assertEqual(xinfo.bits, xninfo.bits) + self.assertEqual(xinfo.max, xninfo.max) + self.assertEqual(xinfo.min, xninfo.min) + self.assertEqual(xinfo.dtype, xninfo.dtype) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_initializer.py b/python/paddle/fluid/tests/unittests/test_initializer.py index 5ab219df5ec8b8..01b684b09abe78 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer.py +++ b/python/paddle/fluid/tests/unittests/test_initializer.py @@ -245,7 +245,7 @@ def test_normal_initializer(self, dtype="float32"): name="param", initializer=initializer.NormalInitializer( 2.3, 1.9, 123)) - num_ops = 2 if (dtype == "float16" or dtype == "uint16") else 1 + num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') @@ -390,7 +390,6 @@ def test_xavier_initializer_fp16(self): """Test the Xavier initializer with float16 """ block = self.test_xavier_initializer_supplied_arguments("float16") - self.assertTrue(check_cast_op(block.ops[1])) def test_xavier_initializer_bf16(self): """Test the Xavier initializer with bfloat16 @@ -400,7 +399,6 @@ def test_xavier_initializer_bf16(self): self.assertEqual(len(block_uniform.ops), 1) block_gaussian = self.test_xavier_initializer_supplied_arguments( "uint16", False) - self.assertTrue(check_cast_op(block_gaussian.ops[1])) class TestMSRAInitializer(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_initializer_nn.py b/python/paddle/fluid/tests/unittests/test_initializer_nn.py index 0f4a2e7a67c6bb..9e02eb3e7701b0 100644 --- a/python/paddle/fluid/tests/unittests/test_initializer_nn.py +++ b/python/paddle/fluid/tests/unittests/test_initializer_nn.py @@ -398,7 +398,7 @@ def test_normal_initializer(self, dtype="float32"): lod_level=0, name="param", initializer=initializer.Normal(2.3, 1.9)) - num_ops = 2 if dtype in ["float16", "uint16"] else 1 + 
num_ops = 1 self.assertEqual(len(block.ops), num_ops) init_op = block.ops[0] self.assertEqual(init_op.type, 'gaussian_random') diff --git a/python/paddle/fluid/tests/unittests/test_jit_layer.py b/python/paddle/fluid/tests/unittests/test_jit_layer.py index bc5658127b2da3..5a03e0ac3b80ee 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_layer.py +++ b/python/paddle/fluid/tests/unittests/test_jit_layer.py @@ -18,7 +18,7 @@ import tempfile import numpy as np from paddle.static import InputSpec -from paddle.fluid.framework import _enable_legacy_dygraph +from paddle.fluid.framework import _dygraph_place_guard from paddle.jit.layer import Layer from paddle.fluid.dygraph.dygraph_to_static.program_translator import ProgramTranslator @@ -51,9 +51,14 @@ def infer(self, input): class TestMultiLoad(unittest.TestCase): - def test_multi_load(self): + def setUp(self): self.temp_dir = tempfile.TemporaryDirectory() + def tearDown(self): + self.temp_dir.cleanup() + + def test_multi_load(self): + x = paddle.full([2, 4], 2) model = Net() program_translator = ProgramTranslator() @@ -74,8 +79,41 @@ def test_multi_load(self): np.testing.assert_allclose(forward_out1, forward_out2[0], rtol=1e-05) np.testing.assert_allclose(infer_out1, infer_out2[0], rtol=1e-05) + +class SaveLinear(paddle.nn.Layer): + + def __init__(self): + super().__init__() + self.linear = paddle.nn.Linear(80, 80) + + @paddle.jit.to_static( + input_spec=[InputSpec(shape=[None, 80], dtype='float32')]) + def forward(self, x): + out = self.linear(x) + return out + + +class TestMKLOutput(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): self.temp_dir.cleanup() + def test_mkl_output(self): + with _dygraph_place_guard(place=paddle.CPUPlace()): + net = SaveLinear() + model_path = os.path.join(self.temp_dir.name, 'save_linear') + paddle.jit.save(net, model_path, combine_params=True) + + layer = Layer() + layer.load(model_path, paddle.CPUPlace()) + x = paddle.ones([498, 80]) + out = layer.forward(x) + out = paddle.unsqueeze(out[0], 0) + np.testing.assert_equal(out.shape, [1, 498, 80]) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 507083755c061a..da89cbf33c3ff1 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -1740,6 +1740,40 @@ def test_jit_save_incompatible_input_sepc(self): shutil.rmtree(save_dir) +class NotJitForward(paddle.nn.Layer): + + def __init__(self): + super(NotJitForward, self).__init__() + + def forward(self, x, y): + return x + y + + +class TestNotJitForward(unittest.TestCase): + + def setUp(self): + self.temp_dir = tempfile.TemporaryDirectory() + + def tearDown(self): + self.temp_dir.cleanup() + + def test_jit_not_save_forward(self): + layer = NotJitForward() + + save_dir = os.path.join(self.temp_dir.name, "jit_not_save_forward") + path = save_dir + "/model" + + paddle.jit.save(layer=layer, path=path, skip_forward=True) + + self.assertTrue(not os.path.exists(path + ".pdmodel")) + self.assertTrue(not os.path.exists(path + ".pdparam")) + + with self.assertRaises(ValueError): + paddle.jit.load(path=path) + + shutil.rmtree(save_dir) + + if __name__ == '__main__': with fluid.framework._test_eager_guard(): unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_launch_coverage.py b/python/paddle/fluid/tests/unittests/test_launch_coverage.py 
index e4c35a63471853..125b56ec3e6f04 100644 --- a/python/paddle/fluid/tests/unittests/test_launch_coverage.py +++ b/python/paddle/fluid/tests/unittests/test_launch_coverage.py @@ -23,7 +23,7 @@ import paddle.fluid as fluid from argparse import ArgumentParser, REMAINDER -from paddle.distributed.utils import _print_arguments, get_gpus, get_cluster_from_args +from paddle.distributed.utils.launch_utils import _print_arguments, get_gpus, get_cluster_from_args from paddle.distributed.fleet.launch_utils import find_free_ports diff --git a/python/paddle/fluid/tests/unittests/test_layout_autotune.py b/python/paddle/fluid/tests/unittests/test_layout_autotune.py index b502f405bd77a9..497bcd89e176a0 100644 --- a/python/paddle/fluid/tests/unittests/test_layout_autotune.py +++ b/python/paddle/fluid/tests/unittests/test_layout_autotune.py @@ -46,6 +46,13 @@ def forward(self, image): class LayoutAutoTune(unittest.TestCase): + def test_config(self): + paddle.fluid.core.enable_layout_autotune() + if self.use_autoune(): + self.assertEqual(paddle.fluid.core.use_layout_autotune(), True) + paddle.fluid.core.disable_layout_autotune() + self.assertEqual(paddle.fluid.core.use_layout_autotune(), False) + def setUp(self): self.use_autoune() @@ -86,18 +93,9 @@ def train(self, data_format): return conv_out, predict def test_enable_autotune(self): - if self.use_autoune(): - conv_out, predict = self.train(data_format="NCHW") - if paddle.fluid.core.use_layout_autotune(): - self.assertEqual(conv_out.shape, [1, 14, 14, 8]) - self.assertEqual(predict.shape, [1, 2]) - else: - self.assertEqual(conv_out.shape, [1, 8, 14, 14]) - self.assertEqual(predict.shape, [1, 2]) - else: - conv_out, predict = self.train(data_format="NCHW") - self.assertEqual(conv_out.shape, [1, 8, 14, 14]) - self.assertEqual(predict.shape, [1, 2]) + conv_out, predict = self.train(data_format="NCHW") + self.assertEqual(conv_out.shape, [1, 8, 14, 14]) + self.assertEqual(predict.shape, [1, 2]) def test_transpose_op_transposer(self): conv = paddle.nn.Conv2D(3, 8, (3, 3)) @@ -117,12 +115,8 @@ def test_transpose_op_transposer(self): scaled.backward() scaler.minimize(optimizer, scaled) - if paddle.fluid.core.use_layout_autotune(): - self.assertEqual(conv_out.shape, [1, 14, 12, 8]) - self.assertEqual(out.shape, [1, 12, 8, 14]) - else: - self.assertEqual(conv_out.shape, [1, 8, 14, 12]) - self.assertEqual(out.shape, [1, 12, 8, 14]) + self.assertEqual(conv_out.shape, [1, 8, 14, 12]) + self.assertEqual(out.shape, [1, 12, 8, 14]) def test_flatten_op_transposer(self): conv = paddle.nn.Conv2D(3, 8, (3, 3)) @@ -136,12 +130,8 @@ def test_flatten_op_transposer(self): # because it flatten the C and H dimensions. 
out = flatten(conv_out) - if paddle.fluid.core.use_layout_autotune(): - self.assertEqual(conv_out.shape, [1, 14, 12, 8]) - self.assertEqual(out.shape, [1, 112, 12]) - else: - self.assertEqual(conv_out.shape, [1, 8, 14, 12]) - self.assertEqual(out.shape, [1, 112, 12]) + self.assertEqual(conv_out.shape, [1, 8, 14, 12]) + self.assertEqual(out.shape, [1, 112, 12]) def test_argmax_op_transposer_keep_dims(self): conv = paddle.nn.Conv2D(3, 8, (3, 3)) @@ -150,41 +140,8 @@ def test_argmax_op_transposer_keep_dims(self): conv_out = conv(data) # conv_out.shape = [1, 14, 12, 8] with NHWC out = paddle.argmax(conv_out, axis=1, keepdim=True) - if paddle.fluid.core.use_layout_autotune(): - self.assertEqual(conv_out.shape, [1, 14, 12, 8]) - self.assertEqual(out.shape, [1, 14, 12, 1]) - else: - self.assertEqual(conv_out.shape, [1, 8, 14, 12]) - self.assertEqual(out.shape, [1, 1, 14, 12]) - - def test_argmax_op_transposer_ff(self): - conv = paddle.nn.Conv2D(3, 8, (3, 3)) - data = paddle.rand([1, 3, 16, 14]) - with paddle.amp.auto_cast(level="O2"): - conv_out = conv(data) - # conv_out.shape = [1, 14, 12, 8] with NHWC - out = paddle.argmax(conv_out) - if paddle.fluid.core.use_layout_autotune(): - self.assertEqual(conv_out.shape, [1, 14, 12, 8]) - self.assertEqual(out.shape, [1]) - else: - self.assertEqual(conv_out.shape, [1, 8, 14, 12]) - self.assertEqual(out.shape, [1]) - - def test_argmax_op_transposer_t(self): - conv = paddle.nn.Conv2D(3, 8, (3, 3)) - data = paddle.rand([1, 3, 16, 14]) - with paddle.amp.auto_cast(level="O2"): - conv_out = conv(data) - # conv_out.shape = [1, 14, 12, 8] with NHWC - out = paddle.argmax(conv_out) - - if paddle.fluid.core.use_layout_autotune(): - self.assertEqual(conv_out.shape, [1, 14, 12, 8]) - self.assertEqual(out.shape, [1]) - else: - self.assertEqual(conv_out.shape, [1, 8, 14, 12]) - self.assertEqual(out.shape, [1]) + self.assertEqual(conv_out.shape, [1, 8, 14, 12]) + self.assertEqual(out.shape, [1, 1, 14, 12]) def test_concat_op_transposer(self): in1 = paddle.rand([1, 8, 14, 12]) @@ -195,12 +152,8 @@ def test_concat_op_transposer(self): # conv_out.shape = [1, 14, 12, 8] with NHWC out = paddle.concat(x=[conv_out, in1], axis=0) - if paddle.fluid.core.use_layout_autotune(): - self.assertEqual(conv_out.shape, [1, 14, 12, 8]) - self.assertEqual(out.shape, [2, 8, 14, 12]) - else: - self.assertEqual(conv_out.shape, [1, 8, 14, 12]) - self.assertEqual(out.shape, [2, 8, 14, 12]) + self.assertEqual(conv_out.shape, [1, 8, 14, 12]) + self.assertEqual(out.shape, [2, 8, 14, 12]) def test_concat_op_no_transposer(self): conv = paddle.nn.Conv2D(3, 8, (3, 3)) @@ -212,12 +165,8 @@ def test_concat_op_no_transposer(self): # conv_out.shape = [1, 14, 12, 8] with NHWC out = paddle.concat(x=[conv_out1, conv_out2], axis=0) - if paddle.fluid.core.use_layout_autotune(): - self.assertEqual(conv_out1.shape, [1, 14, 12, 8]) - self.assertEqual(out.shape, [2, 14, 12, 8]) - else: - self.assertEqual(conv_out1.shape, [1, 8, 14, 12]) - self.assertEqual(out.shape, [2, 8, 14, 12]) + self.assertEqual(conv_out1.shape, [1, 8, 14, 12]) + self.assertEqual(out.shape, [2, 8, 14, 12]) class TestAutoTuneAPI(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_mean_op.py b/python/paddle/fluid/tests/unittests/test_mean_op.py index a4ed6dc3b575cf..206e984fefb686 100644 --- a/python/paddle/fluid/tests/unittests/test_mean_op.py +++ b/python/paddle/fluid/tests/unittests/test_mean_op.py @@ -23,6 +23,9 @@ from paddle.fluid import Program, program_guard from paddle.fluid.framework import _test_eager_guard from 
test_sum_op import TestReduceOPTensorAxisBase +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers np.random.seed(10) @@ -433,6 +436,80 @@ def init_data(self): ] +class TestMeanDoubleGradCheck(unittest.TestCase): + + def mean_wrapper(self, x): + return paddle.mean(x[0]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [3, 4, 5], False, dtype) + data.persistable = True + out = paddle.mean(data) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.mean_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestMeanTripleGradCheck(unittest.TestCase): + + def mean_wrapper(self, x): + return paddle.mean(x[0]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [3, 4, 5], False, dtype) + data.persistable = True + out = paddle.mean(data) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.mean_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py index 7d037798588530..614d53de5ed040 100644 --- a/python/paddle/fluid/tests/unittests/test_momentum_op.py +++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py @@ -25,14 +25,16 @@ from paddle.fluid.framework import _test_eager_guard -def calculate_momentum_by_numpy(param, - grad, - mu, - velocity, - use_nesterov, - learning_rate, - regularization_method=None, - regularization_coeff=1.0): +def calculate_momentum_by_numpy( + param, + grad, + mu, + velocity, + use_nesterov, + learning_rate, + regularization_method=None, + regularization_coeff=1.0, +): if regularization_method == "l2_decay": grad = grad + regularization_coeff * param @@ -44,8 +46,9 @@ def calculate_momentum_by_numpy(param, else: velocity_out = mu * velocity + grad if use_nesterov: - param_out = param - grad * learning_rate - \ - velocity_out * mu * learning_rate + param_out = ( + param - grad * learning_rate - velocity_out * mu * learning_rate + ) else: param_out = param - learning_rate * velocity_out @@ -53,7 +56,6 @@ def calculate_momentum_by_numpy(param, class TestMomentumOp1(OpTest): - def setUp(self): self.op_type = "momentum" self.dtype = np.float32 @@ -70,7 +72,7 @@ def setUp(self): 'Param': param, 'Grad': grad, 'Velocity': velocity, - 'LearningRate': learning_rate + 'LearningRate': learning_rate, } self.attrs = {'mu': mu} @@ -81,7 
+83,8 @@ def setUp(self): mu=mu, velocity=velocity, use_nesterov=use_nesterov, - learning_rate=learning_rate) + learning_rate=learning_rate, + ) self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} @@ -93,7 +96,6 @@ def test_check_output(self): class TestMomentumOpFp16(TestMomentumOp1): - def init_dtype(self): self.dtype = np.float16 @@ -102,8 +104,7 @@ def test_check_output(self): class TestMomentumOp2(OpTest): - '''Test Momentum with default values for attributes - ''' + '''Test Momentum with default values for attributes''' def setUp(self): self.op_type = "momentum" @@ -119,7 +120,7 @@ def setUp(self): 'Param': param, 'Grad': grad, 'Velocity': velocity, - 'LearningRate': learning_rate + 'LearningRate': learning_rate, } self.attrs = {'mu': mu, 'use_nesterov': use_nesterov} @@ -130,7 +131,8 @@ def setUp(self): mu=mu, velocity=velocity, use_nesterov=use_nesterov, - learning_rate=learning_rate) + learning_rate=learning_rate, + ) self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} @@ -138,10 +140,10 @@ def test_check_output(self): self.check_output() -@unittest.skipIf(not core.is_compiled_with_cuda(), - "core is not compiled with CUDA") +@unittest.skipIf( + not core.is_compiled_with_cuda(), "core is not compiled with CUDA" +) class TestLarsMomentumOpWithMP(OpTest): - def setUp(self): self.config() self.op_type = "lars_momentum" @@ -168,11 +170,16 @@ def setUp(self): fp32_grad = grad.astype("float32") pnorm = np.sqrt(np.square(master_param).sum()) gnorm = np.sqrt(np.square(fp32_grad).sum()) - local_lr = learning_rate * lars_coeff * pnorm / ( - gnorm + lars_weight_decay * pnorm) + local_lr = ( + learning_rate + * lars_coeff + * pnorm + / (gnorm + lars_weight_decay * pnorm) + ) fp32_grad = fp32_grad * rescale_grad velocity_out = mu * velocity + local_lr * ( - fp32_grad + lars_weight_decay * master_param) + fp32_grad + lars_weight_decay * master_param + ) p_new = master_param - velocity_out param_out = p_new.astype("float16") master_param_out = p_new @@ -185,7 +192,8 @@ def setUp(self): param_outs.append(("SubParam_out_" + str(i), param_out)) master_params.append(("SubMasterParam_" + str(i), master_param)) master_param_outs.append( - ("SubMasterParamOut_" + str(i), master_param_out)) + ("SubMasterParamOut_" + str(i), master_param_out) + ) self.inputs = { 'Param': params, @@ -200,13 +208,13 @@ def setUp(self): 'lars_coeff': lars_coeff, 'lars_weight_decay': [lars_weight_decay], 'multi_precision': True, - 'rescale_grad': rescale_grad + 'rescale_grad': rescale_grad, } self.outputs = { 'ParamOut': param_outs, 'VelocityOut': velocity_outs, - 'MasterParamOut': master_param_outs + 'MasterParamOut': master_param_outs, } def test_check_output(self): @@ -221,7 +229,6 @@ def config(self): class TestLarsMomentumOp(OpTest): - def setUp(self): self.config() self.op_type = "lars_momentum" @@ -242,10 +249,15 @@ def setUp(self): learning_rate = np.array([0.001]).astype("float32") pnorm = np.sqrt(np.square(param).sum()) gnorm = np.sqrt(np.square(grad).sum()) - local_lr = learning_rate * lars_coeff * pnorm / ( - gnorm + lars_weight_decay * param) + local_lr = ( + learning_rate + * lars_coeff + * pnorm + / (gnorm + lars_weight_decay * param) + ) velocity_out = mu * velocity + local_lr * ( - grad + lars_weight_decay * param) + grad + lars_weight_decay * param + ) param_out = param - velocity_out params.append(("SubParam_" + str(i), param)) @@ -259,13 +271,13 @@ def setUp(self): 'Param': params, 'Grad': grads, 'Velocity': velocitys, - 'LearningRate': learning_rates + 'LearningRate': 
learning_rates, } self.attrs = { 'mu': mu, 'lars_coeff': lars_coeff, - 'lars_weight_decay': [lars_weight_decay] + 'lars_weight_decay': [lars_weight_decay], } self.outputs = {'ParamOut': param_outs, 'VelocityOut': velocity_outs} @@ -278,7 +290,6 @@ def config(self): class TestSparseMomentumOp(unittest.TestCase): - def setUp(self): self.use_nesterov = False self.regularization_method = "" @@ -317,8 +328,9 @@ def check_with_place(self, place): velocity_np_array = np.ones((height, row_numel)).astype("float32") velocity.set(velocity_np_array, place) velocity_out = scope.var('VelocityOut').get_tensor() - velocity_out_np_array = np.full((height, row_numel), - 0.0).astype("float32") + velocity_out_np_array = np.full((height, row_numel), 0.0).astype( + "float32" + ) velocity_out.set(velocity_out_np_array, place) # create and initialize LearningRate Variable @@ -327,17 +339,19 @@ def check_with_place(self, place): lr.set(lr_array, place) # create and run operator - op = Operator("momentum", - Param='Param', - Grad='Grad', - Velocity='Velocity', - ParamOut='ParamOut', - VelocityOut='VelocityOut', - LearningRate='LearningRate', - mu=mu, - use_nesterov=use_nesterov, - regularization_method=regularization_method, - regularization_coeff=regularization_coeff) + op = Operator( + "momentum", + Param='Param', + Grad='Grad', + Velocity='Velocity', + ParamOut='ParamOut', + VelocityOut='VelocityOut', + LearningRate='LearningRate', + mu=mu, + use_nesterov=use_nesterov, + regularization_method=regularization_method, + regularization_coeff=regularization_coeff, + ) op.run(scope, place) # get and compare result @@ -360,7 +374,8 @@ def check_with_place(self, place): use_nesterov=use_nesterov, learning_rate=lr_array, regularization_method=regularization_method, - regularization_coeff=regularization_coeff) + regularization_coeff=regularization_coeff, + ) self.assertTrue((_velocity_out == velocity_out_np_array).all()) self.assertTrue((_param_out == param_out_np_array).all()) @@ -377,13 +392,11 @@ def test_sparse_momentum(self): class TestSparseMomentumOp2(TestSparseMomentumOp): - def init_kernel(self): self.use_nesterov = True class TestSparseMomentumOpWithMultiPrecision(unittest.TestCase): - def setUp(self): self.init_args() self.regularization_method = "" @@ -427,8 +440,9 @@ def check_with_place(self, place): velocity_np_array = np.ones((height, row_numel)).astype("float32") velocity.set(velocity_np_array, place) velocity_out = scope.var('VelocityOut').get_tensor() - velocity_out_np_array = np.full((height, row_numel), - 0.0).astype("float32") + velocity_out_np_array = np.full((height, row_numel), 0.0).astype( + "float32" + ) velocity_out.set(velocity_out_np_array, place) # create and initialize LearningRate Variable @@ -437,21 +451,23 @@ def check_with_place(self, place): lr.set(lr_array, place) # create and run operator - op = Operator("momentum", - Param='Param', - Grad='Grad', - Velocity='Velocity', - MasterParam='MasterParam', - ParamOut='ParamOut', - VelocityOut='VelocityOut', - MasterParamOut='MasterParamOut', - LearningRate='LearningRate', - mu=mu, - use_nesterov=use_nesterov, - regularization_method=regularization_method, - regularization_coeff=regularization_coeff, - multi_precision=True, - rescale_grad=1.0) + op = Operator( + "momentum", + Param='Param', + Grad='Grad', + Velocity='Velocity', + MasterParam='MasterParam', + ParamOut='ParamOut', + VelocityOut='VelocityOut', + MasterParamOut='MasterParamOut', + LearningRate='LearningRate', + mu=mu, + use_nesterov=use_nesterov, + 
regularization_method=regularization_method, + regularization_coeff=regularization_coeff, + multi_precision=True, + rescale_grad=1.0, + ) op.run(scope, place) # get and compare result @@ -472,7 +488,8 @@ def check_with_place(self, place): use_nesterov=use_nesterov, learning_rate=lr_array, regularization_method=regularization_method, - regularization_coeff=regularization_coeff) + regularization_coeff=regularization_coeff, + ) self.assertTrue((_velocity_out == velocity_out_np_array).all()) self.assertTrue((_param_out == param_out_np_array).all()) @@ -486,23 +503,22 @@ def test_sparse_momentum(self): class TestSparseMomentumOpWithMultiPrecision2( - TestSparseMomentumOpWithMultiPrecision): - + TestSparseMomentumOpWithMultiPrecision +): def init_args(self): self.use_nesterov = True class TestMomentumV2(unittest.TestCase): - def test_momentum_dygraph(self): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") a = paddle.to_tensor(value) linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Momentum(learning_rate=0.01, - momentum=0.9, - parameters=linear.parameters()) + adam = paddle.optimizer.Momentum( + learning_rate=0.01, momentum=0.9, parameters=linear.parameters() + ) out = linear(a) out.backward() adam.step() @@ -519,13 +535,15 @@ def test_momentum(self): cost = fluid.layers.square_error_cost(input=y_predict, label=y) avg_cost = paddle.mean(cost) - rms_optimizer = paddle.optimizer.Momentum(learning_rate=0.1, - momentum=0.9) + rms_optimizer = paddle.optimizer.Momentum( + learning_rate=0.1, momentum=0.9 + ) rms_optimizer.minimize(avg_cost) fetch_list = [avg_cost] - train_reader = paddle.batch(paddle.dataset.uci_housing.train(), - batch_size=1) + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1 + ) feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -533,9 +551,9 @@ def test_momentum(self): exe.run(main, feed=feeder.feed(data), fetch_list=fetch_list) def test_raise_error(self): - self.assertRaises(ValueError, - paddle.optimizer.Momentum, - learning_rate=None) + self.assertRaises( + ValueError, paddle.optimizer.Momentum, learning_rate=None + ) self.assertRaises(ValueError, paddle.optimizer.Momentum, momentum=None) def test_api_eager_dygraph(self): @@ -545,7 +563,6 @@ def test_api_eager_dygraph(self): class TestMomentumOpWithDecay(OpTest): - def setUp(self): self.op_type = "momentum" self.dtype = np.float32 @@ -567,14 +584,14 @@ def setUp(self): 'Param': param, 'Grad': grad, 'Velocity': velocity, - 'LearningRate': learning_rate + 'LearningRate': learning_rate, } self.attrs = { 'mu': mu, 'use_nesterov': use_nesterov, 'regularization_method': regularization_method, - 'regularization_coeff': regularization_coeff + 'regularization_coeff': regularization_coeff, } grad = grad + regularization_coeff * param @@ -585,7 +602,8 @@ def setUp(self): mu=mu, velocity=velocity, use_nesterov=use_nesterov, - learning_rate=learning_rate) + learning_rate=learning_rate, + ) self.outputs = {'ParamOut': param_out, 'VelocityOut': velocity_out} @@ -598,7 +616,6 @@ def test_check_output(self): class TestMomentumOpWithDecayFP16(TestMomentumOpWithDecay): - def init_config(self): self.dtype = np.float16 @@ -608,13 +625,11 @@ def test_check_output(self): class TestMomentumOpWithDecay2(TestMomentumOpWithDecay): - def init_config(self): self.use_nesterov = False class TestSparseMomentumOpWithDecay(TestSparseMomentumOp): - def setUp(self): 
self.use_nesterov = False self.regularization_method = 'l2_decay' @@ -622,13 +637,11 @@ def setUp(self): class TestSparseMomentumOpWithDecay2(TestSparseMomentumOpWithDecay): - def init_kernel(self): self.use_nesterov = True class TestMomentumOpWithDecayAPI(unittest.TestCase): - def _test_momentum_dygraph_common(self, regularization): paddle.disable_static() inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") @@ -641,13 +654,16 @@ def _test_momentum_dygraph_common(self, regularization): learning_rate=0.01, momentum=0.9, parameter_list=linear.parameters(), - regularization=regularization) + regularization=regularization, + ) momentum.minimize(loss) def test_momentum_dygraph_1(self): self._test_momentum_dygraph_common( regularization=paddle.fluid.regularizer.L2Decay( - regularization_coeff=0.1)) + regularization_coeff=0.1 + ) + ) def test_momentum_static(self): paddle.enable_static() @@ -661,12 +677,14 @@ def test_momentum_static(self): avg_cost = paddle.mean(cost) momentum_optimizer = paddle.fluid.contrib.optimizer.Momentum( - learning_rate=0.1, momentum=0.9) + learning_rate=0.1, momentum=0.9 + ) momentum_optimizer.minimize(avg_cost) fetch_list = [avg_cost] - train_reader = paddle.batch(paddle.dataset.uci_housing.train(), - batch_size=1) + train_reader = paddle.batch( + paddle.dataset.uci_housing.train(), batch_size=1 + ) feeder = fluid.DataFeeder(place=place, feed_list=[x, y]) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) @@ -675,23 +693,23 @@ def test_momentum_static(self): class TestFusedMomentumWithDecayAPI(unittest.TestCase): - def get_program(self, weight_attr, bias_attr=False): main_program = paddle.static.Program() startup_program = paddle.static.Program() - with paddle.static.program_guard(main_program=main_program, - startup_program=startup_program): + with paddle.static.program_guard( + main_program=main_program, startup_program=startup_program + ): x = paddle.static.data(name='x', shape=[10, 10]) - linear = paddle.nn.Linear(10, - 10, - weight_attr=weight_attr, - bias_attr=bias_attr) + linear = paddle.nn.Linear( + 10, 10, weight_attr=weight_attr, bias_attr=bias_attr + ) out = linear(x) loss = paddle.mean(out) optimizer = paddle.optimizer.Momentum( learning_rate=0.01, momentum=0.9, - weight_decay=paddle.regularizer.L2Decay(0.5)) + weight_decay=paddle.regularizer.L2Decay(0.5), + ) optimizer.minimize(loss) return main_program @@ -700,7 +718,8 @@ def test_param_has_l2decay(self): weight_attr = paddle.ParamAttr( name="weight", initializer=paddle.nn.initializer.Constant(value=0.5), - regularizer=paddle.regularizer.L2Decay(0.1)) + regularizer=paddle.regularizer.L2Decay(0.1), + ) program = self.get_program(weight_attr, bias_attr=False) ops = program.global_block().ops @@ -715,11 +734,13 @@ def test_param_has_l1decay(self): weight_attr = paddle.ParamAttr( name="weight", initializer=paddle.nn.initializer.Constant(value=0.5), - regularizer=paddle.regularizer.L1Decay(0.1)) + regularizer=paddle.regularizer.L1Decay(0.1), + ) bias_attr = paddle.ParamAttr( name="bias", - initializer=paddle.nn.initializer.Constant(value=0.), - regularizer=None) + initializer=paddle.nn.initializer.Constant(value=0.0), + regularizer=None, + ) program = self.get_program(weight_attr, bias_attr) ops = program.global_block().ops @@ -734,8 +755,9 @@ def test_param_has_l1decay(self): self.assertEqual(ops[-1].attr('regularization_coeff'), 0) if 'bias' in ops[-2].input('Param'): self.assertEqual(ops[-2].attr('regularization_method'), 'l2_decay') - 
self.assertEqual(ops[-2].attr('regularization_coeff'), - np.float32(0.5)) + self.assertEqual( + ops[-2].attr('regularization_coeff'), np.float32(0.5) + ) def test_param_has_no_regularizer(self): paddle.enable_static() @@ -749,11 +771,11 @@ def test_param_has_no_regularizer(self): class TestMomentumOpVsMomentumOpWithDecayAPI(unittest.TestCase): - def __update_params(self, momentum, linear): for i in range(10): - inp = paddle.full(shape=[2, 2], fill_value=i, - dtype='float32').astype("float32") + inp = paddle.full( + shape=[2, 2], fill_value=i, dtype='float32' + ).astype("float32") inp = paddle.to_tensor(inp) out = linear(inp) loss = paddle.mean(out) @@ -768,32 +790,39 @@ def __test_vs(self, place=fluid.CPUPlace()): 2, 2, weight_attr=paddle.nn.initializer.Constant(value=2.0), - bias_attr=paddle.nn.initializer.Constant(value=2.0)) + bias_attr=paddle.nn.initializer.Constant(value=2.0), + ) momentum_old = paddle.fluid.optimizer.Momentum( learning_rate=0.01, momentum=0.9, parameter_list=linear_old.parameters(), regularization=paddle.fluid.regularizer.L2Decay( - regularization_coeff=0.1)) + regularization_coeff=0.1 + ), + ) self.__update_params(momentum=momentum_old, linear=linear_old) linear_new = paddle.nn.Linear( 2, 2, weight_attr=paddle.nn.initializer.Constant(value=2.0), - bias_attr=paddle.nn.initializer.Constant(value=2.0)) + bias_attr=paddle.nn.initializer.Constant(value=2.0), + ) momentum_new = paddle.fluid.contrib.optimizer.Momentum( learning_rate=0.01, momentum=0.9, parameter_list=linear_new.parameters(), regularization=paddle.fluid.regularizer.L2Decay( - regularization_coeff=0.1)) + regularization_coeff=0.1 + ), + ) self.__update_params(momentum=momentum_new, linear=linear_new) self.assertEqual( (linear_old.weight.numpy() == linear_new.weight.numpy()).all(), True, - 'the param weight updated by two Momentum optimizers should equal') + 'the param weight updated by two Momentum optimizers should equal', + ) def test_vs(self, place=fluid.CPUPlace()): places = [fluid.CPUPlace()] @@ -805,7 +834,6 @@ def test_vs(self, place=fluid.CPUPlace()): class TestMomentumV2Group(TestMomentumV2): - def test_momentum_dygraph(self): paddle.disable_static() value = np.arange(26).reshape(2, 13).astype("float32") @@ -813,22 +841,20 @@ def test_momentum_dygraph(self): linear_1 = paddle.nn.Linear(13, 5) linear_2 = paddle.nn.Linear(5, 3) # This can be any optimizer supported by dygraph. 
- adam = paddle.optimizer.Momentum(learning_rate=0.01, - parameters=[{ - 'params': - linear_1.parameters() - }, { - 'params': - linear_2.parameters(), - 'weight_decay': - 0.001, - 'learning_rate': - 0.1, - 'momentum': - 0.99 - }], - weight_decay=0.1, - momentum=0.9) + adam = paddle.optimizer.Momentum( + learning_rate=0.01, + parameters=[ + {'params': linear_1.parameters()}, + { + 'params': linear_2.parameters(), + 'weight_decay': 0.001, + 'learning_rate': 0.1, + 'momentum': 0.99, + }, + ], + weight_decay=0.1, + momentum=0.9, + ) out = linear_1(a) out = linear_2(out) out.backward() @@ -837,13 +863,14 @@ def test_momentum_dygraph(self): class TestMultiTensorMomentumDygraph(unittest.TestCase): - - def _momentum_optimize_dygraph(self, - place, - use_param_attr=False, - use_param_group=False, - use_amp=False, - use_multi_tensor=False): + def _momentum_optimize_dygraph( + self, + place, + use_param_attr=False, + use_param_group=False, + use_amp=False, + use_multi_tensor=False, + ): paddle.disable_static() paddle.seed(10) paddle.set_device(place) @@ -851,7 +878,8 @@ def _momentum_optimize_dygraph(self, weight_attr = paddle.ParamAttr( learning_rate=0.5, regularizer=paddle.regularizer.L2Decay(1.0), - trainable=True) + trainable=True, + ) if use_param_attr: model = paddle.nn.Linear(5, 5, weight_attr) else: @@ -860,17 +888,29 @@ def _momentum_optimize_dygraph(self, optimizer = paddle.optimizer.Momentum( parameters=model.parameters(), use_multi_tensor=use_multi_tensor, - multi_precision=use_amp) + multi_precision=use_amp, + ) else: + parameters = list(model.parameters()) + n = len(parameters) optimizer = paddle.optimizer.Momentum( - parameters=[{ - 'params': model.parameters(), - 'weight_decay': 0.001, - 'learning_rate': 0.1, - 'momentum': 0.99 - }], + parameters=[ + { + 'params': parameters[: int(n / 2)], + 'weight_decay': 0.001, + 'learning_rate': 0.1, + 'momentum': 0.99, + }, + { + 'params': parameters[int(n / 2) :], + 'weight_decay': 0.001, + 'learning_rate': 0.1, + 'momentum': 0.99, + }, + ], use_multi_tensor=use_multi_tensor, - multi_precision=use_amp) + multi_precision=use_amp, + ) for idx in range(5): if place == 'gpu' and use_amp == True: model = paddle.amp.decorate(models=model, level='O2') @@ -900,9 +940,11 @@ def _get_places(self): def _check_with_place_amp(self, place, use_amp): output1, params1 = self._momentum_optimize_dygraph( - place=place, use_amp=use_amp, use_multi_tensor=True) + place=place, use_amp=use_amp, use_multi_tensor=True + ) output2, params2 = self._momentum_optimize_dygraph( - place=place, use_amp=use_amp, use_multi_tensor=False) + place=place, use_amp=use_amp, use_multi_tensor=False + ) np.testing.assert_allclose(output1, output2, rtol=1e-05) for idx in range(len(params1)): @@ -913,12 +955,14 @@ def _check_with_param_arrt(self, place, use_amp): place=place, use_amp=use_amp, use_param_attr=True, - use_multi_tensor=True) + use_multi_tensor=True, + ) output2, params2 = self._momentum_optimize_dygraph( place=place, use_amp=use_amp, use_param_attr=True, - use_multi_tensor=False) + use_multi_tensor=False, + ) np.testing.assert_allclose(output1, output2, rtol=1e-05) for idx in range(len(params1)): np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05) @@ -928,12 +972,14 @@ def _check_with_param_group(self, place, use_amp): place=place, use_amp=use_amp, use_param_group=True, - use_multi_tensor=True) + use_multi_tensor=True, + ) output2, params2 = self._momentum_optimize_dygraph( place=place, use_amp=use_amp, use_param_group=True, - use_multi_tensor=False) + 
use_multi_tensor=False, + ) np.testing.assert_allclose(output1, output2, rtol=1e-05) for idx in range(len(params1)): np.testing.assert_allclose(params1[idx], params2[idx], rtol=1e-05) @@ -952,11 +998,9 @@ def test_api_eager_dygraph(self): class TestMultiTensorMomentumStatic(unittest.TestCase): - - def _momentum_optimize_static(self, - place, - use_amp=False, - use_multi_tensor=False): + def _momentum_optimize_static( + self, place, use_amp=False, use_multi_tensor=False + ): paddle.enable_static() paddle.seed(10) np.random.seed(10) @@ -965,24 +1009,26 @@ def _momentum_optimize_static(self, exe = paddle.static.Executor(place=place) train_program = paddle.static.Program() startup_program = paddle.static.Program() - optimizer = paddle.optimizer.Momentum(multi_precision=use_amp, - use_multi_tensor=use_multi_tensor) + optimizer = paddle.optimizer.Momentum( + multi_precision=use_amp, use_multi_tensor=use_multi_tensor + ) if use_amp: optimizer = paddle.static.amp.decorate( optimizer, init_loss_scaling=128.0, use_dynamic_loss_scaling=True, use_pure_fp16=True, - use_fp16_guard=False) + use_fp16_guard=False, + ) with paddle.static.program_guard(train_program, startup_program): if use_amp: - data = paddle.static.data(shape=[2, 2], - name='X', - dtype='float16') + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float16' + ) else: - data = paddle.static.data(shape=[2, 2], - name='X', - dtype='float32') + data = paddle.static.data( + shape=[2, 2], name='X', dtype='float32' + ) hidden = paddle.static.nn.fc(x=data, size=10) loss = paddle.mean(hidden) optimizer.minimize(loss) @@ -994,9 +1040,9 @@ def _momentum_optimize_static(self, x = numpy.random.random(size=(2, 2)).astype('float32') out = [] for idx in range(5): - loss_data, = exe.run(train_program, - feed={"X": x}, - fetch_list=[loss.name]) + (loss_data,) = exe.run( + train_program, feed={"X": x}, fetch_list=[loss.name] + ) out.append(loss_data) return out @@ -1007,12 +1053,12 @@ def _get_places(self): return places def _check_with_place_amp(self, place, use_amp): - output1 = self._momentum_optimize_static(place=place, - use_amp=use_amp, - use_multi_tensor=True) - output2 = self._momentum_optimize_static(place=place, - use_amp=use_amp, - use_multi_tensor=False) + output1 = self._momentum_optimize_static( + place=place, use_amp=use_amp, use_multi_tensor=True + ) + output2 = self._momentum_optimize_static( + place=place, use_amp=use_amp, use_multi_tensor=False + ) for idx in range(len(output1)): np.testing.assert_allclose(output1[idx], output2[idx], rtol=1e-05) diff --git a/python/paddle/fluid/tests/unittests/test_nms_op.py b/python/paddle/fluid/tests/unittests/test_nms_op.py index a81a46e1140e8d..cbd24d4ddf22e9 100755 --- a/python/paddle/fluid/tests/unittests/test_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_nms_op.py @@ -65,7 +65,7 @@ def nms(boxes, nms_threshold): else: continue - return selected_indices + return selected_indices[:cnt] class TestNMSOp(OpTest): diff --git a/python/paddle/fluid/tests/unittests/test_operator_desc.py b/python/paddle/fluid/tests/unittests/test_operator_desc.py index 3c0871cfc82655..7654ae214b09b7 100644 --- a/python/paddle/fluid/tests/unittests/test_operator_desc.py +++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py @@ -77,9 +77,7 @@ def test_op_desc_creation(self): set(mul_op.attr_names), set([ "x_num_col_dims", "y_num_col_dims", "op_role", "op_role_var", - "use_mkldnn", "scale_x", "scale_y", "scale_out", - "force_fp32_output", "op_namescope", "op_callstack", - "op_device", 
"with_quant_attr" + "op_namescope", "op_callstack", "op_device", "with_quant_attr" ])) self.assertEqual(mul_op.has_attr("x_num_col_dims"), True) self.assertEqual(mul_op.attr_type("x_num_col_dims"), core.AttrType.INT) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py index 1e8aae7226a7e8..29b0b16de38c0b 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py @@ -22,7 +22,7 @@ import os import subprocess -from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc +from paddle.distributed.utils.launch_utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc from paddle.fluid.framework import _test_eager_guard diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_cpuonly.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_cpuonly.py index 725d5249f594bf..4713f6619b93d7 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_cpuonly.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel_cpuonly.py @@ -22,7 +22,7 @@ import os import subprocess -from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc +from paddle.distributed.utils.launch_utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc def get_cluster_from_args(selected_gpus): diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py index 7b24360531228f..6b67f301878a1f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py @@ -20,19 +20,22 @@ class TestResnetGPU(TestResnetBase): - def test_seresnext_with_learning_rate_decay(self): # NOTE(zcd): This test is compare the result of use parallel_executor # and executor, and the result of drop_out op and batch_norm op in # this two executor have diff, so the two ops should be removed # from the model. 
- check_func = partial(self.check_network_convergence, - optimizer=seresnext_net.optimizer, - use_parallel_executor=False) - self._compare_result_with_origin_model(check_func, - use_device=DeviceType.CUDA, - delta2=1e-5, - compare_separately=False) + check_func = partial( + self.check_network_convergence, + optimizer=seresnext_net.optimizer, + use_parallel_executor=False, + ) + self._compare_result_with_origin_model( + check_func, + use_device=DeviceType.CUDA, + delta2=1e-3, + compare_separately=False, + ) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py b/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py index 8773e8d47ed3c0..10243a0faa9445 100644 --- a/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py +++ b/python/paddle/fluid/tests/unittests/test_pipeline_parallel.py @@ -26,5 +26,11 @@ def test_pipeline_parallel(self): self.run_mnist_2gpu('hybrid_parallel_pp_alexnet.py') +class TestModelParallelWithRecompute(TestMultipleGpus): + + def test_model_parallel_with_recompute(self): + self.run_mnist_2gpu("dygraph_recompute_hybrid.py") + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_prior_box_op.py index 0b57e8d00f761e..0f04f6c5f511e6 100644 --- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py @@ -12,11 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from __future__ import print_function - import unittest import numpy as np -import sys import math from op_test import OpTest import paddle @@ -110,8 +107,6 @@ def init_test_params(self): self.flip = True self.set_min_max_aspect_ratios_order() self.real_aspect_ratios = [1, 2.0, 1.0 / 2.0, 3.0, 1.0 / 3.0] - self.aspect_ratios = np.array(self.aspect_ratios, - dtype=np.float64).flatten() self.variances = [0.1, 0.1, 0.2, 0.2] self.variances = np.array(self.variances, dtype=np.float64).flatten() @@ -220,6 +215,54 @@ def set_min_max_aspect_ratios_order(self): self.min_max_aspect_ratios_order = True +class TestPriorBoxAPI(unittest.TestCase): + + def setUp(self): + np.random.seed(678) + self.input_np = np.random.rand(2, 10, 32, 32).astype('float32') + self.image_np = np.random.rand(2, 10, 40, 40).astype('float32') + self.min_sizes = [2.0, 4.0] + + def test_dygraph_with_static(self): + paddle.enable_static() + input = paddle.static.data(name='input', + shape=[2, 10, 32, 32], + dtype='float32') + image = paddle.static.data(name='image', + shape=[2, 10, 40, 40], + dtype='float32') + + box, var = paddle.vision.ops.prior_box(input=input, + image=image, + min_sizes=self.min_sizes, + clip=True, + flip=True) + + exe = paddle.static.Executor() + box_np, var_np = exe.run(paddle.static.default_main_program(), + feed={ + 'input': self.input_np, + 'image': self.image_np, + }, + fetch_list=[box, var]) + + paddle.disable_static() + inputs_dy = paddle.to_tensor(self.input_np) + image_dy = paddle.to_tensor(self.image_np) + + box_dy, var_dy = paddle.vision.ops.prior_box(input=inputs_dy, + image=image_dy, + min_sizes=self.min_sizes, + clip=True, + flip=True) + box_dy_np = box_dy.numpy() + var_dy_np = var_dy.numpy() + + np.testing.assert_allclose(box_np, box_dy_np) + np.testing.assert_allclose(var_np, var_dy_np) + paddle.enable_static() + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py index e52b6462bfb54b..830ade004d3a6c 100644 --- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py +++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py @@ -95,12 +95,15 @@ def simple_fc_net(in_size, py_reader = fluid.layers.create_py_reader_by_data( capacity=queue_capacity, use_double_buffer=use_double_buffer, - feed_list=[in_data, label]) + feed_list=[in_data, label], + name=unique_name.generate('py_reader_name')) else: - py_reader = fluid.layers.py_reader(capacity=queue_capacity, - shapes=[in_data.shape, label.shape], - dtypes=['float32', 'int64'], - use_double_buffer=use_double_buffer) + py_reader = fluid.layers.py_reader( + capacity=queue_capacity, + shapes=[in_data.shape, label.shape], + dtypes=['float32', 'int64'], + name=unique_name.generate('py_reader_name'), + use_double_buffer=use_double_buffer) in_data, label = fluid.layers.read_file(py_reader) diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py index 7f09d9b70631dc..9896a7bccf6865 100644 --- a/python/paddle/fluid/tests/unittests/test_reverse_op.py +++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py @@ -21,6 +21,9 @@ import paddle import paddle.fluid as fluid from paddle.fluid import core +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers from paddle.fluid.framework import program_guard, Program from test_attribute_var import UnittestBase @@ -267,6 +270,80 @@ def call_func(self, x): return out +class TestReverseDoubleGradCheck(unittest.TestCase): + + def reverse_wrapper(self, x): + return fluid.layers.reverse(x[0], [0, 1]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float64 + + data = layers.data('data', [3, 4], False, dtype) + data.persistable = True + out = fluid.layers.reverse(data, [0, 1]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.reverse_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestReverseTripleGradCheck(unittest.TestCase): + + def reverse_wrapper(self, x): + return fluid.layers.reverse(x[0], [0, 1]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
+ eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3], False, dtype) + data.persistable = True + out = fluid.layers.reverse(data, [0, 1]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.reverse_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py index 0beee7b0a1ccbd..65876c7ceaef78 100644 --- a/python/paddle/fluid/tests/unittests/test_scale_op.py +++ b/python/paddle/fluid/tests/unittests/test_scale_op.py @@ -22,6 +22,9 @@ import paddle.fluid.core as core from paddle.fluid.op import Operator from paddle.static import Program, program_guard +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers class TestScaleOp(OpTest): @@ -244,5 +247,79 @@ def _executed_api(self, x, scale=1.0, bias=0.0): return x.scale_(scale, bias) +class TestScaleDoubleGradCheck(unittest.TestCase): + + def scale_wrapper(self, x): + return paddle.scale(x[0], scale=2.0) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3], False, dtype) + data.persistable = True + out = paddle.scale(data, 2.0) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.scale_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestScaleTripleGradCheck(unittest.TestCase): + + def scale_wrapper(self, x): + return paddle.scale(x[0], scale=2.0) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
+ eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3], False, dtype) + data.persistable = True + out = paddle.scale(data, 2.0) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.scale_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_set_value_op.py b/python/paddle/fluid/tests/unittests/test_set_value_op.py index 59ccff3973ff0b..fad47fc158cab2 100644 --- a/python/paddle/fluid/tests/unittests/test_set_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_set_value_op.py @@ -576,6 +576,28 @@ def set_dtype(self): create_test_value_int64(TestSetValueItemSlice4) +def create_test_value_fp16(parent): + + class TestValueInt(parent): + + def set_value(self): + self.value = 3.7 + + def set_dtype(self): + self.dtype = "float16" + + cls_name = "{0}_{1}".format(parent.__name__, "Valuefp16") + TestValueInt.__name__ = cls_name + globals()[cls_name] = TestValueInt + + +create_test_value_fp16(TestSetValueItemInt) +create_test_value_fp16(TestSetValueItemSlice) +create_test_value_fp16(TestSetValueItemSlice2) +create_test_value_fp16(TestSetValueItemSlice3) +create_test_value_fp16(TestSetValueItemSlice4) + + def create_test_value_fp32(parent): class TestValueInt(parent): @@ -1015,7 +1037,6 @@ def test_error(self): paddle.enable_static() with paddle.static.program_guard(self.program): self._value_type_error() - self._dtype_error() self._step_error() self._bool_list_error() self._bool_tensor_error() diff --git a/python/paddle/fluid/tests/unittests/test_sign_op.py b/python/paddle/fluid/tests/unittests/test_sign_op.py index 444675a4bb5c22..3eda8b286c8402 100644 --- a/python/paddle/fluid/tests/unittests/test_sign_op.py +++ b/python/paddle/fluid/tests/unittests/test_sign_op.py @@ -19,7 +19,11 @@ from op_test import OpTest import paddle import paddle.fluid as fluid +import paddle.fluid.core as core from paddle.fluid import Program, program_guard +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers class TestSignOp(OpTest): @@ -91,6 +95,80 @@ def test_static(self): paddle.sign(input4) +class TestSignDoubleGradCheck(unittest.TestCase): + + def sign_wrapper(self, x): + return paddle.sign(x[0]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
+ eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [1, 4], False, dtype) + data.persistable = True + out = paddle.sign(data) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.sign_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestSignTripleGradCheck(unittest.TestCase): + + def sign_wrapper(self, x): + return paddle.sign(x[0]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [1, 4], False, dtype) + data.persistable = True + out = paddle.sign(data) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.sign_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_slice_op.py b/python/paddle/fluid/tests/unittests/test_slice_op.py index d660518f04e4c8..bc10e4f4fdeb4f 100644 --- a/python/paddle/fluid/tests/unittests/test_slice_op.py +++ b/python/paddle/fluid/tests/unittests/test_slice_op.py @@ -22,6 +22,9 @@ import paddle.fluid.layers as layers import paddle from paddle.fluid.framework import _test_eager_guard, _enable_legacy_dygraph +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers paddle.enable_static() @@ -79,6 +82,33 @@ def config(self): self.out = self.input[-3:3, 0:100, :, 2:-1] +class TestSliceZerosShapeTensor(OpTest): + + def setUp(self): + self.op_type = "slice" + self.config() + self.inputs = {'Input': self.input} + self.outputs = {'Out': self.out} + self.attrs = { + 'axes': self.axes, + 'starts': self.starts, + 'ends': self.ends, + 'infer_flags': self.infer_flags, + 'use_mkldnn': True + } + + def config(self): + self.input = np.random.random([0, 0, 0]).astype("float32") + self.starts = [1] + self.ends = [2] + self.axes = [0] + self.infer_flags = [] + self.out = self.input[1:2] + + def test_check_output(self): + self.check_output_with_place(paddle.CPUPlace()) + + # 1.2 with attr(decrease) class TestSliceOp_decs_dim(OpTest): @@ -784,7 +814,7 @@ def test(self): self.assertEqual(x.shape, (3, -1, 5)) out0 = paddle.slice(x, axes=[1], starts=[0], ends=[3]) - self.assertEqual(out0.shape, (3, 3, 5)) + self.assertEqual(out0.shape, (3, -1, 5)) def test_axis_less_than_zero(self): # Using paddle.disable_static will make other unittests fail. 
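The *DoubleGradCheck / *TripleGradCheck cases added for reverse, scale and sign above, and for slice in the next hunk, all follow the same gradient_checker recipe. As a hedged, self-contained sketch of the underlying idea (finite-differencing the first derivative and comparing it with an analytic second derivative; f below is a stand-in scalar function, not one of the Paddle ops under test):

import numpy as np

def numeric_grad(f, x, eps=0.005):
    # central finite difference, same eps the new tests pass to gradient_checker
    return (f(x + eps) - f(x - eps)) / (2.0 * eps)

def check_double_grad(f, analytic_second_grad, x, eps=0.005, atol=1e-4):
    # approximate the second derivative by finite-differencing the first
    # derivative, then compare it with the analytic value -- loosely the same
    # idea gradient_checker.double_grad_check applies to whole programs
    numeric_second = numeric_grad(lambda t: numeric_grad(f, t, eps), x, eps)
    assert np.isclose(numeric_second, analytic_second_grad(x), atol=atol)

# stand-in example: f(x) = x**3, whose second derivative is 6*x
check_double_grad(lambda x: x**3, lambda x: 6.0 * x, x=0.7)

The triple-grad variants extend the same recipe one derivative further.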
@@ -840,6 +870,92 @@ def test_input_cuda_pinned_var(self): self.assertEqual(sliced.shape, [2, 70, 80]) +class TestSliceDoubleGradCheck(unittest.TestCase): + + def slice_wrapper(self, x): + return paddle.slice(x[0], + axes=[0, 1, 2], + starts=[-3, 0, 2], + ends=[3, 2, 4]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [4, 5, 6], False, dtype) + data.persistable = True + out = paddle.slice(data, + axes=[0, 1, 2], + starts=[-3, 0, 2], + ends=[3, 2, 4]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.slice_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestSliceTripleGradCheck(unittest.TestCase): + + def slice_wrapper(self, x): + return paddle.slice(x[0], + axes=[0, 1, 2], + starts=[-3, 0, 2], + ends=[3, 2, 4]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [4, 5, 6], False, dtype) + data.persistable = True + out = paddle.slice(data, + axes=[0, 1, 2], + starts=[-3, 0, 2], + ends=[3, 2, 4]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.slice_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_addmm_op.py b/python/paddle/fluid/tests/unittests/test_sparse_addmm_op.py index ffd81640d70f41..738177f6ccf54e 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_addmm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_addmm_op.py @@ -64,7 +64,7 @@ def check_result(self, input_shape, x_shape, y_shape, format): sp_x.stop_gradient = False sp_y = origin_y.detach() sp_y.stop_gradient = False - sp_out = paddle.incubate.sparse.addmm(sp_input, sp_x, sp_y, 3.0, 2.0) + sp_out = paddle.sparse.addmm(sp_input, sp_x, sp_y, 3.0, 2.0) np.testing.assert_allclose(sp_out.numpy(), dense_out.numpy(), diff --git a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py index 4337461d48d42a..7f27300e53e87a 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_attention_op.py @@ -93,14 +93,9 @@ def get_csr_value(mat, layout, nnz): return value -def ref_sparse_attention(q, - k, - v, - offset, - columns, - kp_mask=None, - attn_mask=None, - bsz=None): +def ref_sparse_attention( + q, k, v, offset, columns, kp_mask=None, attn_mask=None, bsz=None +): row, col, nnz = q.shape[0], q.shape[1], columns.shape[0] mat 
= np.zeros((row, row)) for cur_row in range(row): @@ -111,7 +106,7 @@ def ref_sparse_attention(q, mat[cur_row][cur_col] = 1 a = np.dot(q, k.T) * mat a_value = get_csr_value(a, mat, nnz) - scaling = float(col)**-0.5 + scaling = float(col) ** -0.5 a = scaling * a for i in range(row): for j in range(row): @@ -127,13 +122,9 @@ def ref_sparse_attention(q, return result, a_value, b_value -def ref_batch_sparse_attention(q, - k, - v, - offset, - columns, - kp_mask=None, - attn_mask=None): +def ref_batch_sparse_attention( + q, k, v, offset, columns, kp_mask=None, attn_mask=None +): batch_size, num_heads, row, col = q.shape nnz = columns.shape[2] result = np.zeros((batch_size, num_heads, row, col)) @@ -141,11 +132,16 @@ def ref_batch_sparse_attention(q, result_softmax = np.zeros((batch_size, num_heads, nnz)) for i in range(batch_size): for j in range(num_heads): - cur_q, cur_k, cur_v, = q[i][j], k[i][j], v[i][j] + cur_q, cur_k, cur_v, = ( + q[i][j], + k[i][j], + v[i][j], + ) cur_offset, cur_columns = offset[i][j], columns[i][j] if kp_mask is None and attn_mask is None: cur_result, cur_sdd, cur_softmax = ref_sparse_attention( - cur_q, cur_k, cur_v, cur_offset, cur_columns) + cur_q, cur_k, cur_v, cur_offset, cur_columns + ) else: cur_result, cur_sdd, cur_softmax = ref_sparse_attention( cur_q, @@ -155,7 +151,8 @@ def ref_batch_sparse_attention(q, cur_columns, kp_mask=kp_mask, attn_mask=attn_mask, - bsz=i) + bsz=i, + ) result[i][j] = cur_result result_sdd[i][j], result_softmax[i][j] = cur_sdd, cur_softmax return result, result_sdd, result_softmax @@ -193,10 +190,9 @@ def init_csr_format(batch_size, num_heads, rows, blocksize): @unittest.skipIf( not core.is_compiled_with_cuda() or get_cuda_version() < 11030, - "core is not compiled with CUDA and cuda version need larger than or equal to 11.3" + "core is not compiled with CUDA and cuda version need larger than or equal to 11.3", ) class TestSparseAttentionOp(OpTest): - def config(self): self.shape = (1, 1, 16, 16) self.blocksize = 4 @@ -212,8 +208,9 @@ def setUp(self): self.k = np.random.random(self.shape).astype(self.dtype) self.v = np.random.random(self.shape).astype(self.dtype) # init CSR tensor - offset, columns = init_csr_format(self.shape[0], self.shape[1], - self.shape[2], self.blocksize) + offset, columns = init_csr_format( + self.shape[0], self.shape[1], self.shape[2], self.blocksize + ) self.offset = offset.astype('int32') self.columns = columns.astype('int32') # init mask tensor @@ -234,10 +231,12 @@ def setUp(self): self.offset, self.columns, kp_mask=self.key_padding_mask, - attn_mask=self.attn_mask) + attn_mask=self.attn_mask, + ) else: result, result_sdd, result_softmax = ref_batch_sparse_attention( - self.q, self.k, self.v, self.offset, self.columns) + self.q, self.k, self.v, self.offset, self.columns + ) if self.use_mask == True: self.inputs = { @@ -260,7 +259,7 @@ def setUp(self): self.outputs = { 'Out': result.astype(self.dtype), 'SparseDotSdd': result_sdd.astype(self.dtype), - 'Softmax': result_softmax.astype(self.dtype) + 'Softmax': result_softmax.astype(self.dtype), } def test_check_output(self): @@ -273,7 +272,6 @@ def test_check_grad(self): class TestSparseAttentionOpFp32Test(TestSparseAttentionOp): - def config(self): self.shape = (1, 1, 8, 16) self.blocksize = 2 @@ -282,7 +280,6 @@ def config(self): class TestSparseAttentionOpShapeTest(TestSparseAttentionOp): - def config(self): self.shape = (2, 2, 32, 8) self.blocksize = 8 @@ -292,10 +289,9 @@ def config(self): @unittest.skipIf( not core.is_compiled_with_cuda() or 
get_cuda_version() < 11030, - "core is not compiled with CUDA and cuda version need larger than or equal to 11.3" + "core is not compiled with CUDA and cuda version need larger than or equal to 11.3", ) class TestSparseAttentionAPI(unittest.TestCase): - def setUp(self): self.place = paddle.CUDAPlace(0) self.shape = (1, 1, 8, 4) @@ -310,54 +306,62 @@ def test_static_graph(self): K = paddle.static.data(name="K", shape=self.shape, dtype=self.dtype) V = paddle.static.data(name="V", shape=self.shape, dtype=self.dtype) - batch_size, num_heads, rows = self.shape[0], self.shape[ - 1], self.shape[2] + batch_size, num_heads, rows = ( + self.shape[0], + self.shape[1], + self.shape[2], + ) block_num = rows / self.blocksize block_last = rows % self.blocksize - sparse_nnz_num = block_num * self.blocksize * self.blocksize + block_last * block_last + sparse_nnz_num = ( + block_num * self.blocksize * self.blocksize + + block_last * block_last + ) offset_shape = (batch_size, num_heads, rows + 1) columns_shape = (batch_size, num_heads, int(sparse_nnz_num)) - offset = paddle.static.data(name="Offset", - shape=offset_shape, - dtype="int32") - columns = paddle.static.data(name="Columns", - shape=columns_shape, - dtype="int32") + offset = paddle.static.data( + name="Offset", shape=offset_shape, dtype="int32" + ) + columns = paddle.static.data( + name="Columns", shape=columns_shape, dtype="int32" + ) key_padding_mask_shape = (self.shape[0], self.shape[2]) attn_mask_shape = (self.shape[2], self.shape[2]) if self.use_mask == True: key_padding_mask = paddle.static.data( name="KeyPaddingMask", shape=key_padding_mask_shape, - dtype=self.dtype) - attn_mask = paddle.static.data(name="AttnMask", - shape=attn_mask_shape, - dtype=self.dtype) - Out = F.sparse_attention(Q, - K, - V, - offset, - columns, - key_padding_mask=key_padding_mask, - attn_mask=attn_mask) + dtype=self.dtype, + ) + attn_mask = paddle.static.data( + name="AttnMask", shape=attn_mask_shape, dtype=self.dtype + ) + Out = F.sparse_attention( + Q, + K, + V, + offset, + columns, + key_padding_mask=key_padding_mask, + attn_mask=attn_mask, + ) else: Out = F.sparse_attention(Q, K, V, offset, columns) Q_np = np.random.random(self.shape).astype(self.dtype) K_np = np.random.random(self.shape).astype(self.dtype) V_np = np.random.random(self.shape).astype(self.dtype) - offset_np, columns_np = init_csr_format(self.shape[0], - self.shape[1], - self.shape[2], - self.blocksize) + offset_np, columns_np = init_csr_format( + self.shape[0], self.shape[1], self.shape[2], self.blocksize + ) offset_np = offset_np.astype('int32') columns_np = columns_np.astype('int32') # init mask tensor - key_padding_mask_np = np.random.randint(0, - 2, - size=key_padding_mask_shape) + key_padding_mask_np = np.random.randint( + 0, 2, size=key_padding_mask_shape + ) attn_mask_np = np.random.randint(0, 2, size=attn_mask_shape) key_padding_mask_np = init_mask(key_padding_mask_np) attn_mask_np = init_mask(attn_mask_np) @@ -366,16 +370,18 @@ def test_static_graph(self): exe = fluid.Executor(self.place) if self.use_mask == True: - fetches_result = exe.run(feed={ - "Q": Q_np, - "K": K_np, - "V": V_np, - "Offset": offset_np, - "Columns": columns_np, - 'KeyPaddingMask': key_padding_mask_np, - 'AttnMask': attn_mask_np - }, - fetch_list=[Out]) + fetches_result = exe.run( + feed={ + "Q": Q_np, + "K": K_np, + "V": V_np, + "Offset": offset_np, + "Columns": columns_np, + 'KeyPaddingMask': key_padding_mask_np, + 'AttnMask': attn_mask_np, + }, + fetch_list=[Out], + ) expected_result, __, __ = 
ref_batch_sparse_attention( Q_np, K_np, @@ -383,28 +389,32 @@ def test_static_graph(self): offset_np, columns_np, kp_mask=key_padding_mask_np, - attn_mask=attn_mask_np) + attn_mask=attn_mask_np, + ) else: - fetches_result = exe.run(feed={ - "Q": Q_np, - "K": K_np, - "V": V_np, - "Offset": offset_np, - "Columns": columns_np - }, - fetch_list=[Out]) + fetches_result = exe.run( + feed={ + "Q": Q_np, + "K": K_np, + "V": V_np, + "Offset": offset_np, + "Columns": columns_np, + }, + fetch_list=[Out], + ) expected_result, __, __ = ref_batch_sparse_attention( - Q_np, K_np, V_np, offset_np, columns_np) + Q_np, K_np, V_np, offset_np, columns_np + ) - np.testing.assert_allclose(fetches_result, - expected_result, - rtol=1e-05, - atol=1e-05) + np.testing.assert_allclose( + fetches_result[0], expected_result, rtol=1e-05, atol=1e-05 + ) def test_dygraph(self): paddle.disable_static() - offset, columns = init_csr_format(self.shape[0], self.shape[1], - self.shape[2], self.blocksize) + offset, columns = init_csr_format( + self.shape[0], self.shape[1], self.shape[2], self.blocksize + ) offset = offset.astype('int32') columns = columns.astype('int32') query = np.random.random(self.shape).astype(self.dtype) @@ -429,13 +439,15 @@ def test_dygraph(self): paddle_attn_mask = paddle.to_tensor(attn_mask, place=self.place) if self.use_mask == True: - paddle_result = F.sparse_attention(paddle_query, - paddle_key, - paddle_value, - paddle_offset, - paddle_colunmns, - key_padding_mask=paddle_kp_mask, - attn_mask=paddle_attn_mask) + paddle_result = F.sparse_attention( + paddle_query, + paddle_key, + paddle_value, + paddle_offset, + paddle_colunmns, + key_padding_mask=paddle_kp_mask, + attn_mask=paddle_attn_mask, + ) numpy_result, __, __ = ref_batch_sparse_attention( query, @@ -444,25 +456,29 @@ def test_dygraph(self): offset, columns, kp_mask=key_padding_mask, - attn_mask=attn_mask) + attn_mask=attn_mask, + ) numpy_result = numpy_result.astype(self.dtype) else: - paddle_result = F.sparse_attention(paddle_query, paddle_key, - paddle_value, paddle_offset, - paddle_colunmns) + paddle_result = F.sparse_attention( + paddle_query, + paddle_key, + paddle_value, + paddle_offset, + paddle_colunmns, + ) numpy_result, __, __ = ref_batch_sparse_attention( - query, key, value, offset, columns) + query, key, value, offset, columns + ) numpy_result = numpy_result.astype(self.dtype) - np.testing.assert_allclose(paddle_result.numpy(), - numpy_result, - rtol=1e-05, - atol=1e-05) + np.testing.assert_allclose( + paddle_result.numpy(), numpy_result, rtol=1e-05, atol=1e-05 + ) class TestSparseAttentionAPITestFloat(TestSparseAttentionAPI): - def setUp(self): self.place = paddle.CUDAPlace(0) self.shape = (2, 2, 8, 4) @@ -472,7 +488,6 @@ def setUp(self): class TestSparseAttentionAPITestShape1(TestSparseAttentionAPI): - def setUp(self): self.place = paddle.CUDAPlace(0) self.shape = (2, 2, 64, 32) @@ -482,7 +497,6 @@ def setUp(self): class TestSparseAttentionAPITestShape2(TestSparseAttentionAPI): - def setUp(self): self.place = paddle.CUDAPlace(0) self.shape = (2, 1, 64, 32) @@ -492,7 +506,6 @@ def setUp(self): class TestSparseAttentionAPITestShape3(TestSparseAttentionAPI): - def setUp(self): self.place = paddle.CUDAPlace(0) self.shape = (4, 4, 128, 32) @@ -502,7 +515,6 @@ def setUp(self): class TestSparseAttentionAPITestShape4(TestSparseAttentionAPI): - def setUp(self): self.place = paddle.CUDAPlace(0) self.shape = (3, 3, 35, 15) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py 
b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py index 4477998875246c..20db591db0bdf9 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_conv_op.py @@ -19,6 +19,7 @@ from paddle import _C_ops, _legacy_C_ops from paddle.fluid import core from paddle.fluid.framework import _test_eager_guard +import paddle.sparse as sparse class TestSparseConv(unittest.TestCase): @@ -43,17 +44,17 @@ def test_conv3d(self): correct_out_values = [[5], [11]] sparse_input = core.eager.sparse_coo_tensor(indices, values, dense_shape, False) - out = paddle.incubate.sparse.nn.functional.conv3d( - sparse_input, - dense_kernel, - bias=paddle.to_tensor(bias, dtype='float32'), - stride=strides, - padding=paddings, - dilation=dilations, - groups=1, - data_format="NDHWC") + out = paddle.sparse.nn.functional.conv3d(sparse_input, + dense_kernel, + bias=paddle.to_tensor( + bias, dtype='float32'), + stride=strides, + padding=paddings, + dilation=dilations, + groups=1, + data_format="NDHWC") out.backward(out) - out = paddle.incubate.sparse.coalesce(out) + out = paddle.sparse.coalesce(out) assert np.array_equal(correct_out_values, out.values().numpy()) def test_subm_conv3d(self): @@ -63,11 +64,14 @@ def test_subm_conv3d(self): indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') dense_shape = [1, 1, 3, 4, 1] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor( - indices, values, dense_shape, stop_gradient=True) + sparse_x = paddle.sparse.sparse_coo_tensor(indices, + values, + dense_shape, + stop_gradient=True) weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') - y = paddle.incubate.sparse.nn.functional.subm_conv3d( - sparse_x, weight, key='subm_conv') + y = paddle.sparse.nn.functional.subm_conv3d(sparse_x, + weight, + key='subm_conv') assert np.array_equal(sparse_x.indices().numpy(), y.indices().numpy()) @@ -81,17 +85,20 @@ def test_Conv3D(self): values = paddle.to_tensor(values, dtype='float32') dense_shape = [1, 1, 3, 4, 1] correct_out_values = [[4], [10]] - sparse_input = paddle.incubate.sparse.sparse_coo_tensor( + sparse_input = paddle.sparse.sparse_coo_tensor( indices, values, dense_shape, False) - sparse_conv3d = paddle.incubate.sparse.nn.Conv3D( - 1, 1, (1, 3, 3), data_format='NDHWC') + sparse_conv3d = paddle.sparse.nn.Conv3D(1, + 1, (1, 3, 3), + data_format='NDHWC') sparse_out = sparse_conv3d(sparse_input) #test errors with self.assertRaises(ValueError): #Currently, only support data_format='NDHWC' - conv3d = paddle.incubate.sparse.nn.SubmConv3D( - 1, 1, (1, 3, 3), data_format='NCDHW', key='subm_conv') + conv3d = paddle.sparse.nn.SubmConv3D(1, + 1, (1, 3, 3), + data_format='NCDHW', + key='subm_conv') def test_SubmConv3D(self): with _test_eager_guard(): @@ -101,11 +108,13 @@ def test_SubmConv3D(self): values = paddle.to_tensor(values, dtype='float32') dense_shape = [1, 1, 3, 4, 1] correct_out_values = [[4], [10]] - sparse_input = paddle.incubate.sparse.sparse_coo_tensor( + sparse_input = paddle.sparse.sparse_coo_tensor( indices, values, dense_shape, False) - subm_conv3d = paddle.incubate.sparse.nn.SubmConv3D( - 1, 1, (1, 3, 3), data_format='NDHWC', key='subm_conv') + subm_conv3d = paddle.sparse.nn.SubmConv3D(1, + 1, (1, 3, 3), + data_format='NDHWC', + key='subm_conv') # test extra_repr print(subm_conv3d.extra_repr()) @@ -116,8 +125,10 @@ def test_SubmConv3D(self): #test errors with self.assertRaises(ValueError): #Currently, only support data_format='NDHWC' - conv3d = 
paddle.incubate.sparse.nn.SubmConv3D( - 1, 1, (1, 3, 3), data_format='NCDHW', key='subm_conv') + conv3d = paddle.sparse.nn.SubmConv3D(1, + 1, (1, 3, 3), + data_format='NCDHW', + key='subm_conv') def test_Conv3D_bias(self): with _test_eager_guard(): @@ -127,10 +138,7 @@ def test_Conv3D_bias(self): sp_x = x.to_sparse_coo(4) conv3d = paddle.nn.Conv3D(3, 2, 3, data_format='NDHWC') - sp_conv3d = paddle.incubate.sparse.nn.Conv3D(3, - 2, - 3, - data_format='NDHWC') + sp_conv3d = paddle.sparse.nn.Conv3D(3, 2, 3, data_format='NDHWC') sp_conv3d.weight.set_value( paddle.to_tensor(conv3d.weight.numpy().transpose(2, 3, 4, 1, 0))) @@ -159,3 +167,66 @@ def test_Conv3D_bias(self): sp_conv3d.bias.grad.numpy(), atol=1e-5, rtol=1e-5) + + +class TestStatic(unittest.TestCase): + + def test(self): + paddle.enable_static() + indices = paddle.static.data(name='indices', + shape=[4, 4], + dtype='int32') + values = paddle.static.data(name='values', + shape=[4, 1], + dtype='float32') + dense_shape = [1, 1, 3, 4, 1] + sp_x = sparse.sparse_coo_tensor(indices, values, dense_shape) + + weight_shape = [1, 3, 3, 1, 1] + weight = paddle.static.data(name='weight', + shape=weight_shape, + dtype='float32') + bias_shape = [1] + bias = paddle.static.data(name='bias', + shape=bias_shape, + dtype='float32') + out = sparse.nn.functional.conv3d(sp_x, + weight, + bias, + stride=1, + padding=0, + dilation=1, + groups=1, + data_format="NDHWC") + sp_out = sparse.nn.functional.relu(out) + out_indices = sp_out.indices() + out_values = sp_out.values() + out = sp_out.to_dense() + + exe = paddle.static.Executor() + + indices_data = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] + values_data = [[1.0], [2.0], [3.0], [4.0]] + weight_data = np.array([[[[[1], [1], [1]], [[1], [1], [1]], + [[1], [1], [1]]]]]).astype('float32') + weight_data = weight_data.reshape(weight_shape) + bias_data = np.array([1]).astype('float32') + + fetch = exe.run(feed={ + 'indices': indices_data, + 'values': values_data, + 'weight': weight_data, + 'bias': bias_data + }, + fetch_list=[out, out_indices, out_values], + return_numpy=True) + correct_out = np.array([[[[[5.0], [11.0]]]]]).astype('float64') + correct_out_values = [[5.0], [11.0]] + assert np.array_equal(correct_out, fetch[0]) + assert np.array_equal(correct_out_values, fetch[2]) + assert out_indices.dtype == paddle.int32 + paddle.disable_static() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py b/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py index 149c4cfb22b993..279a688f6aeaf2 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_elementwise_op.py @@ -18,20 +18,20 @@ import numpy as np import paddle -from paddle.fluid.framework import _test_eager_guard +import paddle.sparse as sparse op_list = [__add__, __sub__, __mul__, __truediv__] def get_actual_res(x, y, op): if op == __add__: - res = paddle.incubate.sparse.add(x, y) + res = paddle.sparse.add(x, y) elif op == __sub__: - res = paddle.incubate.sparse.subtract(x, y) + res = paddle.sparse.subtract(x, y) elif op == __mul__: - res = paddle.incubate.sparse.multiply(x, y) + res = paddle.sparse.multiply(x, y) elif op == __truediv__: - res = paddle.incubate.sparse.divide(x, y) + res = paddle.sparse.divide(x, y) else: raise ValueError("unsupported op") return res @@ -134,6 +134,61 @@ def test_support_dtypes_coo(self): for op in op_list: self.func_test_coo(op) + def 
test_add_same_indices(self): + indices_data = [[0, 1], [0, 3]] + values1_data = [[1.0], [2.0]] + values2_data = [[1.0], [2.0]] + shape = [2, 4, 2] + + sp_a = sparse.sparse_coo_tensor(indices_data, + values1_data, + shape, + stop_gradient=False) + sp_b = sparse.sparse_coo_tensor(indices_data, + values2_data, + shape, + stop_gradient=False) + + values1 = paddle.to_tensor(values1_data, stop_gradient=False) + values2 = paddle.to_tensor(values2_data, stop_gradient=False) + + #c.values() = a.values() + b.values() + sp_c = sparse.add(sp_a, sp_b) + sp_c.backward() + ref_c = values1 + values2 + ref_c.backward() + np.testing.assert_allclose(sp_c.values().numpy(), ref_c.numpy()) + np.testing.assert_allclose(sp_a.grad.values().numpy(), + values1.grad.numpy()) + np.testing.assert_allclose(sp_b.grad.values().numpy(), + values2.grad.numpy()) + + def test_add_bias(self): + indices_data = [[0, 1], [0, 3]] + values_data = [[1.0, 1.0], [2.0, 2.0]] + shape = [2, 4, 2] + + sp_a = sparse.sparse_coo_tensor(indices_data, + values_data, + shape, + stop_gradient=False) + + bias_values = [1.0, 2.0] + + values1 = paddle.to_tensor(values_data, stop_gradient=False) + values2 = paddle.to_tensor(bias_values, stop_gradient=False) + values3 = paddle.to_tensor(bias_values, stop_gradient=False) + + #c.values() = a.values() + b + sp_c = sparse.add(sp_a, values2) + sp_c.backward() + ref_c = values1 + values3 + ref_c.backward() + np.testing.assert_allclose(sp_c.values().numpy(), ref_c.numpy()) + np.testing.assert_allclose(sp_a.grad.values().numpy(), + values1.grad.numpy()) + np.testing.assert_allclose(values2.grad.numpy(), values3.grad.numpy()) + if __name__ == "__main__": paddle.device.set_device('cpu') diff --git a/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py b/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py index 996ba3a01148e5..50e9218a27d355 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_fused_attention_op.py @@ -92,7 +92,7 @@ def test_dygraph(self): output = paddle.matmul(softmax, value) output.backward() - output_sp = paddle.incubate.sparse.nn.functional.attention( + output_sp = paddle.sparse.nn.functional.attention( query_sp, key_sp, value_sp, sp_mask, kp_mask, attn_mask) output_sp.backward() else: @@ -103,7 +103,7 @@ def test_dygraph(self): output = paddle.matmul(softmax, value) output.backward() - output_sp = paddle.incubate.sparse.nn.functional.attention( + output_sp = paddle.sparse.nn.functional.attention( query_sp, key_sp, value_sp, sp_mask) output_sp.backward() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_is_same_shape.py b/python/paddle/fluid/tests/unittests/test_sparse_is_same_shape.py new file mode 100644 index 00000000000000..1b41fddb6c5c6b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_is_same_shape.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
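The new file that starts here exercises paddle.sparse.is_same_shape across every dense/COO/CSR pairing; the assertions below all reduce to the fact that only the tensors' shapes are compared, never their storage format. A minimal hedged usage sketch (import path and sparse_dim taken from the test itself):

import paddle
from paddle.sparse.binary import is_same_shape

x = paddle.rand([2, 5, 8])
y = paddle.rand([2, 5, 8])
z = paddle.rand([3, 4])

# equal shapes compare True regardless of storage format
assert is_same_shape(x, y.to_sparse_csr())
assert is_same_shape(x.to_sparse_coo(2), y.to_sparse_csr())
# different shapes compare False, again regardless of format
assert not is_same_shape(x, z)
assert not is_same_shape(x.to_sparse_csr(), z.to_sparse_coo(2))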
+ +from __future__ import print_function +import unittest + +import paddle +from paddle.sparse.binary import is_same_shape + + +class TestSparseIsSameShapeAPI(unittest.TestCase): + """ + test paddle.sparse.is_same_shape + """ + + def setUp(self): + self.shapes = [[2, 5, 8], [3, 4]] + self.tensors = [ + paddle.rand(self.shapes[0]), + paddle.rand(self.shapes[0]), + paddle.rand(self.shapes[1]) + ] + self.sparse_dim = 2 + + def test_dense_dense(self): + self.assertTrue(is_same_shape(self.tensors[0], self.tensors[1])) + self.assertFalse(is_same_shape(self.tensors[0], self.tensors[2])) + self.assertFalse(is_same_shape(self.tensors[1], self.tensors[2])) + + def test_dense_csr(self): + self.assertTrue( + is_same_shape(self.tensors[0], self.tensors[1].to_sparse_csr())) + self.assertFalse( + is_same_shape(self.tensors[0], self.tensors[2].to_sparse_csr())) + self.assertFalse( + is_same_shape(self.tensors[1], self.tensors[2].to_sparse_csr())) + + def test_dense_coo(self): + self.assertTrue( + is_same_shape(self.tensors[0], + self.tensors[1].to_sparse_coo(self.sparse_dim))) + self.assertFalse( + is_same_shape(self.tensors[0], + self.tensors[2].to_sparse_coo(self.sparse_dim))) + self.assertFalse( + is_same_shape(self.tensors[1], + self.tensors[2].to_sparse_coo(self.sparse_dim))) + + def test_csr_dense(self): + self.assertTrue( + is_same_shape(self.tensors[0].to_sparse_csr(), self.tensors[1])) + self.assertFalse( + is_same_shape(self.tensors[0].to_sparse_csr(), self.tensors[2])) + self.assertFalse( + is_same_shape(self.tensors[1].to_sparse_csr(), self.tensors[2])) + + def test_csr_csr(self): + self.assertTrue( + is_same_shape(self.tensors[0].to_sparse_csr(), + self.tensors[1].to_sparse_csr())) + self.assertFalse( + is_same_shape(self.tensors[0].to_sparse_csr(), + self.tensors[2].to_sparse_csr())) + self.assertFalse( + is_same_shape(self.tensors[1].to_sparse_csr(), + self.tensors[2].to_sparse_csr())) + + def test_csr_coo(self): + self.assertTrue( + is_same_shape(self.tensors[0].to_sparse_csr(), + self.tensors[1].to_sparse_coo(self.sparse_dim))) + self.assertFalse( + is_same_shape(self.tensors[0].to_sparse_csr(), + self.tensors[2].to_sparse_coo(self.sparse_dim))) + self.assertFalse( + is_same_shape(self.tensors[1].to_sparse_csr(), + self.tensors[2].to_sparse_coo(self.sparse_dim))) + + def test_coo_dense(self): + self.assertTrue( + is_same_shape(self.tensors[0].to_sparse_coo(self.sparse_dim), + self.tensors[1])) + self.assertFalse( + is_same_shape(self.tensors[0].to_sparse_coo(self.sparse_dim), + self.tensors[2])) + self.assertFalse( + is_same_shape(self.tensors[1].to_sparse_coo(self.sparse_dim), + self.tensors[2])) + + def test_coo_csr(self): + self.assertTrue( + is_same_shape(self.tensors[0].to_sparse_coo(self.sparse_dim), + self.tensors[1].to_sparse_csr())) + self.assertFalse( + is_same_shape(self.tensors[0].to_sparse_coo(self.sparse_dim), + self.tensors[2].to_sparse_csr())) + self.assertFalse( + is_same_shape(self.tensors[1].to_sparse_coo(self.sparse_dim), + self.tensors[2].to_sparse_csr())) + + def test_coo_coo(self): + self.assertTrue( + is_same_shape(self.tensors[0].to_sparse_coo(self.sparse_dim), + self.tensors[1].to_sparse_coo(self.sparse_dim))) + self.assertFalse( + is_same_shape(self.tensors[0].to_sparse_coo(self.sparse_dim), + self.tensors[2].to_sparse_coo(self.sparse_dim))) + self.assertFalse( + is_same_shape(self.tensors[1].to_sparse_coo(self.sparse_dim), + self.tensors[2].to_sparse_coo(self.sparse_dim))) + + +if __name__ == "__main__": + unittest.main() diff --git 
a/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py b/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py index 47f334b1a3f341..374ef5b03ab649 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_matmul_op.py @@ -58,7 +58,7 @@ def check_result(self, x_shape, y_shape, format): sp_x.stop_gradient = False sp_y = origin_y.detach() sp_y.stop_gradient = False - sp_out = paddle.incubate.sparse.matmul(sp_x, sp_y) + sp_out = paddle.sparse.matmul(sp_x, sp_y) np.testing.assert_allclose(sp_out.numpy(), dense_out.numpy(), @@ -107,7 +107,7 @@ def test_masked_matmul_2d(self): x = paddle.to_tensor(np_x, stop_gradient=False) y = paddle.to_tensor(np_y, stop_gradient=False) mask = paddle.to_tensor(np.ones([10, 6]) * np_mask).to_sparse_csr() - out = paddle.incubate.sparse.masked_matmul(x, y, mask) + out = paddle.sparse.masked_matmul(x, y, mask) np.testing.assert_allclose(np_out.indptr, out.crows().numpy(), @@ -145,7 +145,7 @@ def test_masked_matmul_3d(self): sp_x.stop_gradient = False sp_y = origin_y.detach() sp_y.stop_gradient = False - sp_out = paddle.incubate.sparse.matmul(sp_x, sp_y) + sp_out = paddle.sparse.matmul(sp_x, sp_y) sp_out.backward() np.testing.assert_allclose(sp_out.numpy(), diff --git a/python/paddle/fluid/tests/unittests/test_sparse_model.py b/python/paddle/fluid/tests/unittests/test_sparse_model.py index c070614fc708bf..2f7c94614c4d28 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_model.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_model.py @@ -15,8 +15,7 @@ import unittest import numpy as np import paddle -from paddle.incubate import sparse -from paddle.incubate.sparse import nn +from paddle.sparse import nn from paddle.fluid.framework import _test_eager_guard @@ -26,10 +25,10 @@ def sparse(self, sp_x): indentity = sp_x out = nn.functional.relu(sp_x) values = out.values() + indentity.values() - out = sparse.sparse_coo_tensor(out.indices(), - values, - shape=out.shape, - stop_gradient=out.stop_gradient) + out = paddle.sparse.sparse_coo_tensor(out.indices(), + values, + shape=out.shape, + stop_gradient=out.stop_gradient) return out def dense(self, x): diff --git a/python/paddle/fluid/tests/unittests/test_sparse_mv_op.py b/python/paddle/fluid/tests/unittests/test_sparse_mv_op.py index 1631a2a7a59fe2..9922dd8aaf6a34 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_mv_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_mv_op.py @@ -61,7 +61,7 @@ def test_mv(self): sp_x.stop_gradient = False sp_vec = origin_vec.detach() sp_vec.stop_gradient = False - sp_out = paddle.incubate.sparse.mv(sp_x, sp_vec) + sp_out = paddle.sparse.mv(sp_x, sp_vec) sp_out.backward() np.testing.assert_allclose(sp_out.numpy(), @@ -99,7 +99,7 @@ def test_mv(self): sp_x.stop_gradient = False sp_vec = origin_vec.detach() sp_vec.stop_gradient = False - sp_out = paddle.incubate.sparse.mv(sp_x, sp_vec) + sp_out = paddle.sparse.mv(sp_x, sp_vec) sp_out.backward() np.testing.assert_allclose(sp_out.numpy(), diff --git a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py index d04cb020cffaed..90660e35fe3fca 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_norm_op.py @@ -16,19 +16,19 @@ import unittest import numpy as np import paddle -from paddle.incubate.sparse import nn +from paddle.sparse import nn +import paddle.sparse as sparse import paddle.fluid 
as fluid import copy class TestSparseBatchNorm(unittest.TestCase): - def test(self): fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) paddle.seed(0) channels = 4 shape = [2, 3, 6, 6, channels] - #there is no zero in dense_x + # there is no zero in dense_x dense_x = paddle.randn(shape) dense_x.stop_gradient = False @@ -40,7 +40,7 @@ def test(self): dense_x2 = copy.deepcopy(dense_x) dense_x2.stop_gradient = False sparse_x = dense_x2.to_sparse_coo(sparse_dim) - sparse_batch_norm = paddle.incubate.sparse.nn.BatchNorm(channels) + sparse_batch_norm = paddle.sparse.nn.BatchNorm(channels) # set same params sparse_batch_norm._mean.set_value(batch_norm._mean) sparse_batch_norm._variance.set_value(batch_norm._variance) @@ -48,17 +48,21 @@ def test(self): sparse_y = sparse_batch_norm(sparse_x) # compare the result with dense batch_norm - assert np.allclose(dense_y.flatten().numpy(), - sparse_y.values().flatten().numpy(), - atol=1e-5, - rtol=1e-5) + assert np.allclose( + dense_y.flatten().numpy(), + sparse_y.values().flatten().numpy(), + atol=1e-5, + rtol=1e-5, + ) # test backward sparse_y.backward(sparse_y) - assert np.allclose(dense_x.grad.flatten().numpy(), - sparse_x.grad.values().flatten().numpy(), - atol=1e-5, - rtol=1e-5) + assert np.allclose( + dense_x.grad.flatten().numpy(), + sparse_x.grad.values().flatten().numpy(), + atol=1e-5, + rtol=1e-5, + ) fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False}) def test_error_layout(self): @@ -66,8 +70,9 @@ def test_error_layout(self): shape = [2, 3, 6, 6, 3] x = paddle.randn(shape) sparse_x = x.to_sparse_coo(4) - sparse_batch_norm = paddle.incubate.sparse.nn.BatchNorm( - 3, data_format='NCDHW') + sparse_batch_norm = paddle.sparse.nn.BatchNorm( + 3, data_format='NCDHW' + ) sparse_batch_norm(sparse_x) def test2(self): @@ -76,7 +81,7 @@ def test2(self): x_data = paddle.randn((1, 6, 6, 6, channels)).astype('float32') dense_x = paddle.to_tensor(x_data) sparse_x = dense_x.to_sparse_coo(4) - batch_norm = paddle.incubate.sparse.nn.BatchNorm(channels) + batch_norm = paddle.sparse.nn.BatchNorm(channels) batch_norm_out = batch_norm(sparse_x) dense_bn = paddle.nn.BatchNorm1D(channels) dense_x = dense_x.reshape((-1, dense_x.shape[-1])) @@ -86,10 +91,10 @@ def test2(self): class TestSyncBatchNorm(unittest.TestCase): - def test_sync_batch_norm(self): - x = np.array([[[[0.3, 0.4], [0.3, 0.07]], - [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') + x = np.array( + [[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]] + ).astype('float32') x = paddle.to_tensor(x) sparse_x = x.to_sparse_coo(len(x.shape) - 1) @@ -100,23 +105,81 @@ def test_sync_batch_norm(self): dense_sync_bn = paddle.nn.SyncBatchNorm(2) x = x.reshape((-1, x.shape[-1])) dense_hidden = dense_sync_bn(x) - assert np.allclose(sparse_hidden.values().numpy(), - dense_hidden.numpy()) + assert np.allclose( + sparse_hidden.values().numpy(), dense_hidden.numpy() + ) def test_convert(self): - base_model = paddle.nn.Sequential(nn.Conv3D(3, 5, 3), nn.BatchNorm(5), - nn.BatchNorm(5)) + base_model = paddle.nn.Sequential( + nn.Conv3D(3, 5, 3), nn.BatchNorm(5), nn.BatchNorm(5) + ) model = paddle.nn.Sequential( - nn.Conv3D(3, 5, 3), nn.BatchNorm(5), - nn.BatchNorm(5, - weight_attr=fluid.ParamAttr(name='bn.scale'), - bias_attr=fluid.ParamAttr(name='bn.bias'))) + nn.Conv3D(3, 5, 3), + nn.BatchNorm(5), + nn.BatchNorm( + 5, + weight_attr=fluid.ParamAttr(name='bn.scale'), + bias_attr=fluid.ParamAttr(name='bn.bias'), + ), + ) model = nn.SyncBatchNorm.convert_sync_batchnorm(model) for idx, sublayer in 
enumerate(base_model.sublayers()): if isinstance(sublayer, nn.BatchNorm): self.assertEqual(isinstance(model[idx], nn.SyncBatchNorm), True) +class TestStatic(unittest.TestCase): + def test(self): + paddle.enable_static() + indices = paddle.static.data( + name='indices', shape=[4, 4], dtype='int32' + ) + values = paddle.static.data( + name='values', shape=[4, 1], dtype='float32' + ) + channels = 1 + dense_shape = [1, 1, 3, 4, channels] + sp_x = sparse.sparse_coo_tensor(indices, values, dense_shape) + + sparse_batch_norm = paddle.sparse.nn.BatchNorm(channels) + sp_y = sparse_batch_norm(sp_x) + out = sp_y.to_dense() + + exe = paddle.static.Executor() + indices_data = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] + values_data = np.array([[1.0], [2.0], [3.0], [4.0]]).astype('float32') + bias_data = np.array([1.0]).astype('float32') + weight_data = np.array([2.0]).astype('float32') + mean_data = np.array([1.0]).astype('float32') + variance_data = np.array([2.0]).astype('float32') + + fetch = exe.run( + feed={ + 'indices': indices_data, + 'values': values_data, + 'batch_norm_0.b_0': bias_data, + 'batch_norm_0.w_0': weight_data, + 'batch_norm_0.w_1': mean_data, + 'batch_norm_0.w_2': variance_data, + }, + fetch_list=[out], + return_numpy=True, + ) + correct_out = np.array( + [ + [ + [ + [[0.0], [-1.6832708], [0.0], [0.1055764]], + [[0.0], [0.0], [1.8944236], [0.0]], + [[0.0], [0.0], [0.0], [3.683271]], + ] + ] + ] + ).astype('float32') + np.testing.assert_allclose(correct_out, fetch[0], rtol=1e-5) + paddle.disable_static() + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py b/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py index 0e6d8ddef94444..fe3782974d355e 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_pooling_op.py @@ -48,7 +48,7 @@ def test(self): self.setUp() self.dense_x.stop_gradient = False sparse_x = self.dense_x.to_sparse_coo(4) - sparse_out = paddle.incubate.sparse.nn.functional.max_pool3d( + sparse_out = paddle.sparse.nn.functional.max_pool3d( sparse_x, self.kernel_sizes, stride=self.strides, @@ -109,8 +109,8 @@ def test(self): with _test_eager_guard(): dense_x = paddle.randn((2, 3, 6, 6, 3)) sparse_x = dense_x.to_sparse_coo(4) - max_pool3d = paddle.incubate.sparse.nn.MaxPool3D( - kernel_size=3, data_format='NDHWC') + max_pool3d = paddle.sparse.nn.MaxPool3D(kernel_size=3, + data_format='NDHWC') out = max_pool3d(sparse_x) out = out.to_dense() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_reshape_op.py b/python/paddle/fluid/tests/unittests/test_sparse_reshape_op.py new file mode 100644 index 00000000000000..01619f76dc9481 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_reshape_op.py @@ -0,0 +1,136 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import numpy as np +import unittest + + +class TestReshape(unittest.TestCase): + """ + Test the API paddle.sparse.reshape on some sparse tensors. + x: sparse, out: sparse + """ + + def check_result(self, x_shape, new_shape, format): + """ + x_shape: original shape + new_shape: new shape + format: "coo" or "csr" + Transform a sparse tensor with shape "x_shape" to + a sparse tensor with shape "new_shape". + Compare the output of paddle.reshape and the output of + paddle.sparse.reshape. + """ + mask = np.random.randint(0, 2, x_shape) + np_x = np.random.randint(-100, 100, x_shape) * mask + + # check cpu kernel + dense_x = paddle.to_tensor(np_x, place=paddle.CPUPlace()) + dense_x.stop_gradient = False + dense_out = paddle.reshape(dense_x, new_shape) + + if format == "coo": + sp_x = paddle.to_tensor(np_x, + place=paddle.CPUPlace()).to_sparse_coo( + len(x_shape)) + else: + sp_x = paddle.to_tensor(np_x, + place=paddle.CPUPlace()).to_sparse_csr() + sp_x.stop_gradient = False + sp_out = paddle.sparse.reshape(sp_x, new_shape) + + np.testing.assert_allclose(sp_out.to_dense().numpy(), + dense_out.numpy(), + rtol=1e-05) + + dense_out.backward() + sp_out.backward() + np.testing.assert_allclose(sp_x.grad.to_dense().numpy(), + dense_x.grad.numpy() * + np_x.astype('bool').astype('int'), + rtol=1e-05) + + # check gpu kernel + if paddle.device.is_compiled_with_cuda(): + dense_x = paddle.to_tensor(np_x, place=paddle.CUDAPlace(0)) + dense_x.stop_gradient = False + dense_out = paddle.reshape(dense_x, new_shape) + + if format == "coo": + sp_x = paddle.to_tensor( + np_x, place=paddle.CUDAPlace(0)).to_sparse_coo(len(x_shape)) + else: + sp_x = paddle.to_tensor( + np_x, place=paddle.CUDAPlace(0)).to_sparse_csr() + sp_x.stop_gradient = False + sp_out = paddle.sparse.reshape(sp_x, new_shape) + + np.testing.assert_allclose(sp_out.to_dense().numpy(), + dense_out.numpy(), + rtol=1e-05) + + dense_out.backward() + sp_out.backward() + np.testing.assert_allclose(sp_x.grad.to_dense().numpy(), + dense_x.grad.numpy() * + np_x.astype('bool').astype('int'), + rtol=1e-05) + + def test_reshape_2d(self): + self.check_result([2, 5], [ + 10, + ], 'coo') + self.check_result([12, 5], [15, 4], 'coo') + + self.check_result([10, 5], [2, 25], 'csr') + self.check_result([9, 8], [18, 4], 'csr') + + def test_reshape_3d(self): + self.check_result([6, 2, 3], [6, 2, 3], 'coo') + self.check_result([6, 2, 3], [2, 3, 3, 2], 'coo') + self.check_result([6, 2, 3], [1, 18, 2], 'coo') + self.check_result([6, 2, 3], [2, 9, 2], 'coo') + self.check_result([6, 2, 3], [2, 1, 18], 'coo') + self.check_result([6, 2, 3], [1, 2, 2, 3, 3], 'coo') + + self.check_result([6, 2, 3], [6, 2, 3], 'csr') + self.check_result([6, 2, 3], [6, 3, 2], 'csr') + self.check_result([6, 2, 3], [2, 6, 3], 'csr') + self.check_result([6, 2, 3], [3, 6, 2], 'csr') + self.check_result([6, 2, 3], [4, 9, 1], 'csr') + self.check_result([6, 2, 3], [12, 1, 3], 'csr') + + def test_reshape_nd(self): + self.check_result([8, 3, 4, 4, 5, 3], [24, 8, 10, 3], 'coo') + self.check_result([3, 4, 4, 5, 7], [1, 12, 2, 5, 14], 'coo') + + def test_reshape_with_zero_or_minus_one_in_new_shape(self): + self.check_result([6, 2, 3], [-1, 0, 3], 'coo') + self.check_result([6, 2, 3], [2, 3, 0, -1], 'coo') + self.check_result([6, 2, 3], [1, -1, 2], 'coo') + self.check_result([6, 2, 3], [-1, 9, 2], 'coo') + self.check_result([6, 2, 3], [2, -1, 18], 'coo') + self.check_result([6, 2, 3], [1, 0, 2, -1, 3], 'coo') + + self.check_result([6, 2, 3], [0, 0, -1], 'csr') + self.check_result([6, 2, 3], [-1, 3, 
2], 'csr') + self.check_result([6, 2, 3], [2, -1, 0], 'csr') + self.check_result([6, 2, 3], [-1, 6, 2], 'csr') + self.check_result([6, 2, 3], [-1, 9, 1], 'csr') + self.check_result([6, 2, 3], [-1, 1, 3], 'csr') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_softmax_op.py b/python/paddle/fluid/tests/unittests/test_sparse_softmax_op.py index 76751e0f87ec7a..c7adabcece4f2f 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_softmax_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_softmax_op.py @@ -48,7 +48,7 @@ def test_softmax2d(self): np_out = np.concatenate([np_out, x_exp / x_exp_sum]) csr = paddle.to_tensor(np_x, stop_gradient=False).to_sparse_csr() - m = paddle.incubate.sparse.nn.Softmax() + m = paddle.sparse.nn.Softmax() out = m(csr) np.testing.assert_allclose(out.crows().numpy(), np_csr.indptr, @@ -105,7 +105,7 @@ def test_softmax3d(self): np_out = np.concatenate([np_out, x_exp / x_exp_sum]) csr = paddle.to_tensor(np_x, stop_gradient=False).to_sparse_csr() - m = paddle.incubate.sparse.nn.Softmax() + m = paddle.sparse.nn.Softmax() out = m(csr) np.testing.assert_allclose(out.values().numpy(), np_out, rtol=1e-05) diff --git a/python/paddle/fluid/tests/unittests/test_sparse_transpose_op.py b/python/paddle/fluid/tests/unittests/test_sparse_transpose_op.py new file mode 100644 index 00000000000000..12d1fd9a8b770d --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_sparse_transpose_op.py @@ -0,0 +1,77 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import numpy as np +import unittest +from paddle.fluid.framework import _test_eager_guard + + +class TestTranspose(unittest.TestCase): + # x: sparse, out: sparse + def check_result(self, x_shape, dims, format): + with _test_eager_guard(): + mask = paddle.randint(0, 2, x_shape).astype("float32") + # "+ 1" to make sure that all zero elements in "origin_x" is caused by multiplying by "mask", + # or the backward checks may fail. 
+ origin_x = (paddle.rand(x_shape, dtype='float32') + 1) * mask + dense_x = origin_x.detach() + dense_x.stop_gradient = False + dense_out = paddle.transpose(dense_x, dims) + + if format == "coo": + sp_x = origin_x.detach().to_sparse_coo(len(x_shape)) + else: + sp_x = origin_x.detach().to_sparse_csr() + sp_x.stop_gradient = False + sp_out = paddle.sparse.transpose(sp_x, dims) + + np.testing.assert_allclose(sp_out.to_dense().numpy(), + dense_out.numpy(), + rtol=1e-05) + dense_out.backward() + sp_out.backward() + np.testing.assert_allclose(sp_x.grad.to_dense().numpy(), + (dense_x.grad * mask).numpy(), + rtol=1e-05) + + def test_transpose_2d(self): + self.check_result([2, 5], [0, 1], 'coo') + self.check_result([2, 5], [0, 1], 'csr') + self.check_result([2, 5], [1, 0], 'coo') + self.check_result([2, 5], [1, 0], 'csr') + + def test_transpose_3d(self): + self.check_result([6, 2, 3], [0, 1, 2], 'coo') + self.check_result([6, 2, 3], [0, 1, 2], 'csr') + self.check_result([6, 2, 3], [0, 2, 1], 'coo') + self.check_result([6, 2, 3], [0, 2, 1], 'csr') + self.check_result([6, 2, 3], [1, 0, 2], 'coo') + self.check_result([6, 2, 3], [1, 0, 2], 'csr') + self.check_result([6, 2, 3], [2, 0, 1], 'coo') + self.check_result([6, 2, 3], [2, 0, 1], 'csr') + self.check_result([6, 2, 3], [2, 1, 0], 'coo') + self.check_result([6, 2, 3], [2, 1, 0], 'csr') + self.check_result([6, 2, 3], [1, 2, 0], 'coo') + self.check_result([6, 2, 3], [1, 2, 0], 'csr') + + def test_transpose_nd(self): + self.check_result([8, 3, 4, 4, 5, 3], [5, 3, 4, 1, 0, 2], 'coo') + # Randint now only supports access to dimension 0 to 9. + self.check_result([2, 3, 4, 2, 3, 4, 2, 3, 4], + [2, 3, 4, 5, 6, 7, 8, 0, 1], 'coo') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py index 7abbaa84adb457..fc39ab973c009d 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_unary_op.py @@ -90,84 +90,79 @@ def compare_with_dense_two_attr(self, dense_func, sparse_func, attr1, self.check_result(dense_func, sparse_func, 'csr', attr1, attr2) def test_sparse_sin(self): - self.compare_with_dense(paddle.sin, paddle.incubate.sparse.sin) + self.compare_with_dense(paddle.sin, paddle.sparse.sin) def test_sparse_tan(self): - self.compare_with_dense(paddle.tan, paddle.incubate.sparse.tan) + self.compare_with_dense(paddle.tan, paddle.sparse.tan) def test_sparse_asin(self): - self.compare_with_dense(paddle.asin, paddle.incubate.sparse.asin) + self.compare_with_dense(paddle.asin, paddle.sparse.asin) def test_sparse_atan(self): - self.compare_with_dense(paddle.atan, paddle.incubate.sparse.atan) + self.compare_with_dense(paddle.atan, paddle.sparse.atan) def test_sparse_sinh(self): - self.compare_with_dense(paddle.sinh, paddle.incubate.sparse.sinh) + self.compare_with_dense(paddle.sinh, paddle.sparse.sinh) def test_sparse_tanh(self): - self.compare_with_dense(paddle.tanh, paddle.incubate.sparse.tanh) + self.compare_with_dense(paddle.tanh, paddle.sparse.tanh) def test_sparse_asinh(self): - self.compare_with_dense(paddle.asinh, paddle.incubate.sparse.asinh) + self.compare_with_dense(paddle.asinh, paddle.sparse.asinh) def test_sparse_atanh(self): - self.compare_with_dense(paddle.atanh, paddle.incubate.sparse.atanh) + self.compare_with_dense(paddle.atanh, paddle.sparse.atanh) def test_sparse_sqrt(self): - self.compare_with_dense(paddle.sqrt, paddle.incubate.sparse.sqrt) + 
self.compare_with_dense(paddle.sqrt, paddle.sparse.sqrt) def test_sparse_square(self): - self.compare_with_dense(paddle.square, paddle.incubate.sparse.square) + self.compare_with_dense(paddle.square, paddle.sparse.square) def test_sparse_log1p(self): - self.compare_with_dense(paddle.log1p, paddle.incubate.sparse.log1p) + self.compare_with_dense(paddle.log1p, paddle.sparse.log1p) def test_sparse_relu(self): - self.compare_with_dense(paddle.nn.ReLU(), - paddle.incubate.sparse.nn.ReLU()) + self.compare_with_dense(paddle.nn.ReLU(), paddle.sparse.nn.ReLU()) def test_sparse_relu6(self): - self.compare_with_dense(paddle.nn.ReLU6(), - paddle.incubate.sparse.nn.ReLU6()) + self.compare_with_dense(paddle.nn.ReLU6(), paddle.sparse.nn.ReLU6()) def test_sparse_leaky_relu(self): self.compare_with_dense(paddle.nn.LeakyReLU(0.1), - paddle.incubate.sparse.nn.LeakyReLU(0.1)) + paddle.sparse.nn.LeakyReLU(0.1)) def test_sparse_abs(self): - self.compare_with_dense(paddle.abs, paddle.incubate.sparse.abs) + self.compare_with_dense(paddle.abs, paddle.sparse.abs) def test_sparse_expm1(self): - self.compare_with_dense(paddle.expm1, paddle.incubate.sparse.expm1) + self.compare_with_dense(paddle.expm1, paddle.sparse.expm1) def test_sparse_deg2rad(self): - self.compare_with_dense(paddle.deg2rad, paddle.incubate.sparse.deg2rad) + self.compare_with_dense(paddle.deg2rad, paddle.sparse.deg2rad) def test_sparse_rad2deg(self): - self.compare_with_dense(paddle.rad2deg, paddle.incubate.sparse.rad2deg) + self.compare_with_dense(paddle.rad2deg, paddle.sparse.rad2deg) def test_sparse_neg(self): - self.compare_with_dense(paddle.neg, paddle.incubate.sparse.neg) + self.compare_with_dense(paddle.neg, paddle.sparse.neg) def test_sparse_pow(self): - self.compare_with_dense_one_attr(paddle.pow, paddle.incubate.sparse.pow, - 3) + self.compare_with_dense_one_attr(paddle.pow, paddle.sparse.pow, 3) def test_sparse_mul_scalar(self): self.compare_with_dense_one_attr(paddle.Tensor.__mul__, - paddle.incubate.sparse.multiply, 3) + paddle.sparse.multiply, 3) def test_sparse_div_scalar(self): self.compare_with_dense_one_attr(paddle.Tensor.__div__, - paddle.incubate.sparse.divide, 2) + paddle.sparse.divide, 2) def test_sparse_cast(self): - self.compare_with_dense_two_attr(paddle.cast, - paddle.incubate.sparse.cast, 'int32', - 'float32') - self.compare_with_dense_two_attr(paddle.cast, - paddle.incubate.sparse.cast, 'int32', - 'float64') + self.compare_with_dense_two_attr(paddle.cast, paddle.sparse.cast, + 'int32', 'float32') + self.compare_with_dense_two_attr(paddle.cast, paddle.sparse.cast, + 'int32', 'float64') if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py index 2287eca23c1728..8d6b51339ff595 100644 --- a/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py +++ b/python/paddle/fluid/tests/unittests/test_sparse_utils_op.py @@ -16,7 +16,6 @@ import unittest import numpy as np import paddle -from paddle.incubate import sparse import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.framework import _test_eager_guard @@ -33,10 +32,10 @@ def test_create_coo_by_tensor(self): dense_shape = [3, 4] dense_indices = paddle.to_tensor(indices) dense_elements = paddle.to_tensor(values, dtype='float32') - coo = paddle.incubate.sparse.sparse_coo_tensor(dense_indices, - dense_elements, - dense_shape, - stop_gradient=False) + coo = paddle.sparse.sparse_coo_tensor(dense_indices, + dense_elements, + dense_shape, + 
stop_gradient=False) # test the to_string.py assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -46,8 +45,7 @@ def test_create_coo_by_np(self): indices = [[0, 1, 2], [1, 2, 0]] values = [1.0, 2.0, 3.0] dense_shape = [3, 3] - coo = paddle.incubate.sparse.sparse_coo_tensor( - indices, values, dense_shape) + coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) assert np.array_equal(3, coo.nnz()) assert np.array_equal(indices, coo.indices().numpy()) assert np.array_equal(values, coo.values().numpy()) @@ -62,12 +60,11 @@ def test_create_csr_by_tensor(self): dense_cols = paddle.to_tensor(cols) dense_elements = paddle.to_tensor(values, dtype='float32') stop_gradient = False - csr = paddle.incubate.sparse.sparse_csr_tensor( - dense_crows, - dense_cols, - dense_elements, - dense_shape, - stop_gradient=stop_gradient) + csr = paddle.sparse.sparse_csr_tensor(dense_crows, + dense_cols, + dense_elements, + dense_shape, + stop_gradient=stop_gradient) def test_create_csr_by_np(self): with _test_eager_guard(): @@ -75,8 +72,8 @@ def test_create_csr_by_np(self): cols = [1, 3, 2, 0, 1] values = [1, 2, 3, 4, 5] dense_shape = [3, 4] - csr = paddle.incubate.sparse.sparse_csr_tensor( - crows, cols, values, dense_shape) + csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, + dense_shape) # test the to_string.py assert np.array_equal(5, csr.nnz()) assert np.array_equal(crows, csr.crows().numpy()) @@ -89,10 +86,10 @@ def test_place(self): indices = [[0, 1], [0, 1]] values = [1.0, 2.0] dense_shape = [2, 2] - coo = paddle.incubate.sparse.sparse_coo_tensor(indices, - values, - dense_shape, - place=place) + coo = paddle.sparse.sparse_coo_tensor(indices, + values, + dense_shape, + place=place) assert coo.place.is_cpu_place() assert coo.values().place.is_cpu_place() assert coo.indices().place.is_cpu_place() @@ -100,10 +97,10 @@ def test_place(self): crows = [0, 2, 3, 5] cols = [1, 3, 2, 0, 1] values = [1.0, 2.0, 3.0, 4.0, 5.0] - csr = paddle.incubate.sparse.sparse_csr_tensor(crows, - cols, - values, [3, 5], - place=place) + csr = paddle.sparse.sparse_csr_tensor(crows, + cols, + values, [3, 5], + place=place) assert csr.place.is_cpu_place() assert csr.crows().place.is_cpu_place() assert csr.cols().place.is_cpu_place() @@ -116,19 +113,19 @@ def test_dtype(self): dense_shape = [2, 2] indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') - coo = paddle.incubate.sparse.sparse_coo_tensor(indices, - values, - dense_shape, - dtype='float64') + coo = paddle.sparse.sparse_coo_tensor(indices, + values, + dense_shape, + dtype='float64') assert coo.dtype == paddle.float64 crows = [0, 2, 3, 5] cols = [1, 3, 2, 0, 1] values = [1.0, 2.0, 3.0, 4.0, 5.0] - csr = paddle.incubate.sparse.sparse_csr_tensor(crows, - cols, - values, [3, 5], - dtype='float16') + csr = paddle.sparse.sparse_csr_tensor(crows, + cols, + values, [3, 5], + dtype='float16') assert csr.dtype == paddle.float16 def test_create_coo_no_shape(self): @@ -137,7 +134,7 @@ def test_create_coo_no_shape(self): values = [1.0, 2.0] indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') - coo = paddle.incubate.sparse.sparse_coo_tensor(indices, values) + coo = paddle.sparse.sparse_coo_tensor(indices, values) assert [2, 2] == coo.shape @@ -155,7 +152,7 @@ def test_to_sparse_coo(self): #test to_sparse_coo_grad backward out_grad_indices = [[0, 1], [0, 1]] out_grad_values = [2.0, 3.0] - out_grad = 
paddle.incubate.sparse.sparse_coo_tensor( + out_grad = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(out_grad_indices), paddle.to_tensor(out_grad_values), shape=out.shape, @@ -171,7 +168,7 @@ def test_coo_to_dense(self): values = [1.0, 2.0, 3.0, 4.0, 5.0] indices_dtypes = ['int32', 'int64'] for indices_dtype in indices_dtypes: - sparse_x = paddle.incubate.sparse.sparse_coo_tensor( + sparse_x = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(indices, dtype=indices_dtype), paddle.to_tensor(values), shape=[3, 4], @@ -187,7 +184,7 @@ def test_coo_to_dense(self): sparse_x.grad.values().numpy()) paddle.device.set_device("cpu") - sparse_x_cpu = paddle.incubate.sparse.sparse_coo_tensor( + sparse_x_cpu = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(indices, dtype=indices_dtype), paddle.to_tensor(values), shape=[3, 4], @@ -218,7 +215,7 @@ def test_coo_values_grad(self): with _test_eager_guard(): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [1.0, 2.0, 3.0, 4.0, 5.0] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor( + sparse_x = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(indices), paddle.to_tensor(values), shape=[3, 4], @@ -231,7 +228,7 @@ def test_coo_values_grad(self): indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]] values = [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0], [4.0, 4.0], [5.0, 5.0]] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor( + sparse_x = paddle.sparse.sparse_coo_tensor( paddle.to_tensor(indices), paddle.to_tensor(values), shape=[3, 4, 2], @@ -256,13 +253,13 @@ def test_sparse_coo_tensor_grad(self): values = paddle.to_tensor(values, dtype='float32', stop_gradient=False) - sparse_x = paddle.incubate.sparse.sparse_coo_tensor( + sparse_x = paddle.sparse.sparse_coo_tensor( indices, values, shape=[2, 2], stop_gradient=False) grad_indices = [[0, 1], [1, 1]] grad_values = [2, 3] grad_indices = paddle.to_tensor(grad_indices, dtype='int32') grad_values = paddle.to_tensor(grad_values, dtype='float32') - sparse_out_grad = paddle.incubate.sparse.sparse_coo_tensor( + sparse_out_grad = paddle.sparse.sparse_coo_tensor( grad_indices, grad_values, shape=[2, 2]) sparse_x.backward(sparse_out_grad) correct_values_grad = [0, 3] @@ -274,11 +271,11 @@ def test_sparse_coo_tensor_grad(self): values = paddle.to_tensor(values, dtype='float32', stop_gradient=False) - sparse_x = paddle.incubate.sparse.sparse_coo_tensor( + sparse_x = paddle.sparse.sparse_coo_tensor( indices, values, shape=[2, 2, 2], stop_gradient=False) grad_values = [[2, 2], [3, 3]] grad_values = paddle.to_tensor(grad_values, dtype='float32') - sparse_out_grad = paddle.incubate.sparse.sparse_coo_tensor( + sparse_out_grad = paddle.sparse.sparse_coo_tensor( grad_indices, grad_values, shape=[2, 2, 2]) sparse_x.backward(sparse_out_grad) correct_values_grad = [[0, 0], [3, 3]] @@ -296,9 +293,8 @@ def test_sparse_coo_tensor_sorted(self): values = [1.0, 2.0, 3.0] indices = paddle.to_tensor(indices, dtype='int32') values = paddle.to_tensor(values, dtype='float32') - sparse_x = paddle.incubate.sparse.sparse_coo_tensor( - indices, values) - sparse_x = paddle.incubate.sparse.coalesce(sparse_x) + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + sparse_x = paddle.sparse.coalesce(sparse_x) indices_sorted = [[0, 1], [1, 0]] values_sorted = [5.0, 1.0] assert np.array_equal(indices_sorted, @@ -309,9 +305,8 @@ def test_sparse_coo_tensor_sorted(self): # test the non-zero values is a vector values = [[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]] values = paddle.to_tensor(values, dtype='float32') - sparse_x = 
paddle.incubate.sparse.sparse_coo_tensor( - indices, values) - sparse_x = paddle.incubate.sparse.coalesce(sparse_x) + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) + sparse_x = paddle.sparse.coalesce(sparse_x) values_sorted = [[5.0, 5.0], [1.0, 1.0]] assert np.array_equal(indices_sorted, sparse_x.indices().numpy()) @@ -365,8 +360,9 @@ def test_small_shape(self): values = [1, 2] # 1. the shape too small dense_shape = [2, 2] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor( - indices, values, shape=dense_shape) + sparse_x = paddle.sparse.sparse_coo_tensor(indices, + values, + shape=dense_shape) def test_same_nnz(self): with _test_eager_guard(): @@ -374,8 +370,7 @@ def test_same_nnz(self): # 2. test the nnz of indices must same as nnz of values indices = [[1, 2], [1, 0]] values = [1, 2, 3] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor( - indices, values) + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) def test_same_dimensions(self): with _test_eager_guard(): @@ -383,17 +378,16 @@ def test_same_dimensions(self): indices = [[1, 2], [1, 0]] values = [1, 2, 3] shape = [2, 3, 4] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, - values, - shape=shape) + sparse_x = paddle.sparse.sparse_coo_tensor(indices, + values, + shape=shape) def test_indices_dtype(self): with _test_eager_guard(): with self.assertRaises(TypeError): indices = [[1.0, 2.0], [0, 1]] values = [1, 2] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor( - indices, values) + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values) class TestCsrError(unittest.TestCase): @@ -405,7 +399,7 @@ def test_dimension1(self): cols = [0, 1, 2] values = [1, 2, 3] shape = [3] - sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + sparse_x = paddle.sparse.sparse_csr_tensor( crows, cols, values, shape) def test_dimension2(self): @@ -415,7 +409,7 @@ def test_dimension2(self): cols = [0, 1, 2] values = [1, 2, 3] shape = [3, 3, 3, 3] - sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + sparse_x = paddle.sparse.sparse_csr_tensor( crows, cols, values, shape) def test_same_shape1(self): @@ -425,7 +419,7 @@ def test_same_shape1(self): cols = [0, 1, 2, 3] values = [1, 2, 3] shape = [3, 4] - sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + sparse_x = paddle.sparse.sparse_csr_tensor( crows, cols, values, shape) def test_same_shape2(self): @@ -435,7 +429,7 @@ def test_same_shape2(self): cols = [0, 1, 2, 3] values = [1, 2, 3, 4] shape = [3, 4] - sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + sparse_x = paddle.sparse.sparse_csr_tensor( crows, cols, values, shape) def test_same_shape3(self): @@ -445,7 +439,7 @@ def test_same_shape3(self): cols = [0, 1, 2, 3, 0, 1, 2] values = [1, 2, 3, 4, 0, 1, 2] shape = [2, 3, 4] - sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + sparse_x = paddle.sparse.sparse_csr_tensor( crows, cols, values, shape) def test_crows_first_value(self): @@ -455,7 +449,7 @@ def test_crows_first_value(self): cols = [0, 1, 2] values = [1, 2, 3] shape = [3, 4] - sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + sparse_x = paddle.sparse.sparse_csr_tensor( crows, cols, values, shape) def test_dtype(self): @@ -465,7 +459,7 @@ def test_dtype(self): cols = [0, 1, 2] values = [1, 2, 3] shape = [3] - sparse_x = paddle.incubate.sparse.sparse_csr_tensor( + sparse_x = paddle.sparse.sparse_csr_tensor( crows, cols, values, shape) diff --git a/python/paddle/fluid/tests/unittests/test_split_op.py b/python/paddle/fluid/tests/unittests/test_split_op.py index 
c31169feedbdd5..37ea0d429ca672 100644 --- a/python/paddle/fluid/tests/unittests/test_split_op.py +++ b/python/paddle/fluid/tests/unittests/test_split_op.py @@ -441,6 +441,21 @@ def test_out(self): np.testing.assert_allclose(ex_out, re, rtol=1e-05) +class API_TestSplit6(unittest.TestCase): + + def test_out(self): + with fluid.program_guard(fluid.Program(), fluid.Program()): + data = fluid.layers.data('data', shape=[-1, 10], dtype='float64') + x0, x1 = paddle.split(data, num_or_sections=[1, 1], axis=0) + place = fluid.CPUPlace() + exe = fluid.Executor(place) + input1 = np.random.random([2, 10]).astype('float64') + r0, r1 = exe.run(feed={"data": input1}, fetch_list=[x0, x1]) + ex_x0, ex_x1 = np.split(input1, (1, ), axis=0) + np.testing.assert_allclose(ex_x0, r0, rtol=1e-05) + np.testing.assert_allclose(ex_x1, r1, rtol=1e-05) + + class API_TestDygraphFluidSplit(unittest.TestCase): def test_out1(self): diff --git a/python/paddle/fluid/tests/unittests/test_squeeze_op.py b/python/paddle/fluid/tests/unittests/test_squeeze_op.py index d64cfcaa8d216c..9de95259328860 100755 --- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py @@ -22,6 +22,9 @@ from paddle.fluid import compiler, Program, program_guard from op_test import OpTest, convert_float_to_uint16 import paddle.fluid.core as core +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers paddle.enable_static() @@ -224,5 +227,79 @@ def executed_api(self): self.squeeze = paddle.squeeze_ +class TestSqueezeDoubleGradCheck(unittest.TestCase): + + def squeeze_wrapper(self, x): + return paddle.squeeze(x[0]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3], False, dtype) + data.persistable = True + out = paddle.squeeze(data) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.squeeze_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestSqueezeTripleGradCheck(unittest.TestCase): + + def squeeze_wrapper(self, x): + return paddle.squeeze(x[0]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
+ eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3], False, dtype) + data.persistable = True + out = paddle.squeeze(data) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.squeeze_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index b143af2ac50c34..dc6d8673216c08 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -30,6 +30,9 @@ from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import _test_eager_guard import paddle.inference as paddle_infer +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers class TestSumOp(OpTest): @@ -540,6 +543,9 @@ def test_static_and_infer(self): linear = paddle.nn.Linear(x.shape[-1], 5) linear_out = linear(x) out = self.pd_api(linear_out, axis, keepdim=self.keepdim) + + sgd = paddle.optimizer.SGD(learning_rate=0.) + sgd.minimize(paddle.mean(out)) exe = paddle.static.Executor(self.place) exe.run(starup_prog) static_out = exe.run(feed={'x': self.x.numpy().astype('float32')}, @@ -580,6 +586,158 @@ def init_data(self): ] +class TestAddNDoubleGradCheck(unittest.TestCase): + + def add_n_wrapper(self, x): + return paddle.add_n(x) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data1 = layers.data('data1', [3, 4, 5], False, dtype) + data1.persistable = True + data2 = layers.data('data2', [3, 4, 5], False, dtype) + data2.persistable = True + out = paddle.add_n([data1, data2]) + data1_arr = np.random.uniform(-1, 1, data1.shape).astype(dtype) + data2_arr = np.random.uniform(-1, 1, data1.shape).astype(dtype) + + gradient_checker.double_grad_check([data1, data2], + out, + x_init=[data1_arr, data2_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph( + self.add_n_wrapper, [data1, data2], + out, + x_init=[data1_arr, data2_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestAddNTripleGradCheck(unittest.TestCase): + + def add_n_wrapper(self, x): + return paddle.add_n(x) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
+ eps = 0.005 + dtype = np.float32 + + data1 = layers.data('data1', [3, 4, 5], False, dtype) + data1.persistable = True + data2 = layers.data('data2', [3, 4, 5], False, dtype) + data2.persistable = True + out = paddle.add_n([data1, data2]) + data1_arr = np.random.uniform(-1, 1, data1.shape).astype(dtype) + data2_arr = np.random.uniform(-1, 1, data1.shape).astype(dtype) + + gradient_checker.triple_grad_check([data1, data2], + out, + x_init=[data1_arr, data2_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph( + self.add_n_wrapper, [data1, data2], + out, + x_init=[data1_arr, data2_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestSumDoubleGradCheck(unittest.TestCase): + + def sum_wrapper(self, x): + return paddle.sum(x[0], axis=1, keepdim=True) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 4], False, dtype) + data.persistable = True + out = paddle.sum(data, axis=1, keepdim=True) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.sum_wrapper, [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestSumTripleGradCheck(unittest.TestCase): + + def sum_wrapper(self, x): + return paddle.sum(x[0], axis=1, keepdim=True) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
+ eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 4], False, dtype) + data.persistable = True + out = paddle.sum(data, axis=1, keepdim=True) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.sum_wrapper, [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tile_op.py b/python/paddle/fluid/tests/unittests/test_tile_op.py index c1c6820d9c17e0..9f694ab3319f30 100644 --- a/python/paddle/fluid/tests/unittests/test_tile_op.py +++ b/python/paddle/fluid/tests/unittests/test_tile_op.py @@ -19,7 +19,10 @@ from op_test import OpTest import paddle import paddle.fluid as fluid -from paddle.fluid import compiler, Program, program_guard +from paddle.fluid import compiler, Program, program_guard, core +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers #Situation 1: repeat_times is a list (without tensor) @@ -263,6 +266,80 @@ def test_api(self): assert np.array_equal(out_3.numpy(), np.tile(np_x, (2, 3))) +class TestTileDoubleGradCheck(unittest.TestCase): + + def tile_wrapper(self, x): + return paddle.tile(x[0], [2, 1]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [1, 2], False, dtype) + data.persistable = True + out = paddle.tile(data, [2, 1]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.tile_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestTileTripleGradCheck(unittest.TestCase): + + def tile_wrapper(self, x): + return paddle.tile(x[0], [2, 1]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
+ eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [1, 2], False, dtype) + data.persistable = True + out = paddle.tile(data, [2, 1]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.tile_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_transpose_op.py b/python/paddle/fluid/tests/unittests/test_transpose_op.py index 7f1794c39fcad1..9ae50dfd170113 100644 --- a/python/paddle/fluid/tests/unittests/test_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py @@ -16,11 +16,14 @@ import unittest import numpy as np -from op_test import OpTest, convert_float_to_uint16 +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid from paddle.fluid import Program, program_guard import paddle.fluid.core as core +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers paddle.enable_static() @@ -526,6 +529,80 @@ def test_error(self): paddle.moveaxis(x, [2, 1], [10, 3]) +class TestTransposeDoubleGradCheck(unittest.TestCase): + + def transpose_wrapper(self, x): + return paddle.transpose(x[0], [1, 0, 2]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3, 4], False, dtype) + data.persistable = True + out = paddle.transpose(data, [1, 0, 2]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.transpose_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestTransposeTripleGradCheck(unittest.TestCase): + + def transpose_wrapper(self, x): + return paddle.transpose(x[0], [1, 0, 2]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
+ eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3, 4], False, dtype) + data.persistable = True + out = paddle.transpose(data, [1, 0, 2]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.transpose_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 21248a1577167f..52e6795cf21ffc 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -587,8 +587,17 @@ def test_default_fp64(): out = paddle.tensor.random.uniform([2, 3]) self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP64) + def test_dygraph_fp16(): + if not paddle.is_compiled_with_cuda(): + paddle.enable_static() + return + paddle.set_device('gpu') + out = paddle.uniform([2, 3], dtype=paddle.float16) + self.assertEqual(out.dtype, fluid.core.VarDesc.VarType.FP16) + test_default_fp64() test_default_fp32() + test_dygraph_fp16() paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_unpool_op.py b/python/paddle/fluid/tests/unittests/test_unpool_op.py index 42522ae6cb88b6..9d94e5acbfea7e 100644 --- a/python/paddle/fluid/tests/unittests/test_unpool_op.py +++ b/python/paddle/fluid/tests/unittests/test_unpool_op.py @@ -436,6 +436,35 @@ def test_dygraph(self): np.testing.assert_array_equal(unpool_out.shape, [1, 3, 7, 7]) +class TestZOutputSizeTensor3(unittest.TestCase): + + def setUp(self): + paddle.disable_static() + + def tearDown(self): + paddle.enable_static() + + def test_dygraph(self): + x = paddle.randn([1, 3, 6, 6]) + pool_out, indices = F.max_pool2d(x, + kernel_size=2, + stride=2, + padding=0, + return_mask=True) + output_size = [ + paddle.assign([1]), + paddle.assign([1]), + paddle.assign([7]), + paddle.assign([7]) + ] + unpool_out = F.max_unpool2d(pool_out, + indices, + kernel_size=2, + padding=0, + output_size=output_size) + np.testing.assert_array_equal(unpool_out.shape, [1, 3, 7, 7]) + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py index aeb0dac695cbbe..29f5e37cd0cec2 100755 --- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py +++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py @@ -21,6 +21,9 @@ import paddle.fluid as fluid from op_test import OpTest, convert_float_to_uint16 import paddle.fluid.core as core +import gradient_checker +from decorator_helper import prog_scope +import paddle.fluid.layers as layers paddle.enable_static() @@ -309,5 +312,79 @@ def executed_api(self): self.unsqueeze = paddle.unsqueeze_ +class TestUnsqueezeDoubleGradCheck(unittest.TestCase): + + def unsqueeze_wrapper(self, x): + return paddle.unsqueeze(x[0], [0, 2]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. 
+ eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3, 4], False, dtype) + data.persistable = True + out = paddle.unsqueeze(data, [0, 2]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.double_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.double_grad_check_for_dygraph(self.unsqueeze_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + +class TestUnsqueezeTripleGradCheck(unittest.TestCase): + + def unsqueeze_wrapper(self, x): + return paddle.unsqueeze(x[0], [0, 2]) + + @prog_scope() + def func(self, place): + # the shape of input variable should be clearly specified, not inlcude -1. + eps = 0.005 + dtype = np.float32 + + data = layers.data('data', [2, 3, 4], False, dtype) + data.persistable = True + out = paddle.unsqueeze(data, [0, 2]) + data_arr = np.random.uniform(-1, 1, data.shape).astype(dtype) + + gradient_checker.triple_grad_check([data], + out, + x_init=[data_arr], + place=place, + eps=eps) + fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True}) + gradient_checker.triple_grad_check_for_dygraph(self.unsqueeze_wrapper, + [data], + out, + x_init=[data_arr], + place=place) + + def test_grad(self): + paddle.enable_static() + places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(fluid.CUDAPlace(0)) + for p in places: + self.func(p) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_version.py b/python/paddle/fluid/tests/unittests/test_version.py index d31288866861ad..55b5228882bffe 100644 --- a/python/paddle/fluid/tests/unittests/test_version.py +++ b/python/paddle/fluid/tests/unittests/test_version.py @@ -12,16 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import print_function - -import unittest import re +import unittest import paddle.version as fluid_version class VersionTest(unittest.TestCase): - def setUp(self): self._major_regex = "[0-9]+" self._minor_regex = "[0-9]+" @@ -37,15 +34,20 @@ def test_check_output(self): # check version format if fluid_version.istaged: - self.assertEqual(fluid_version.major, 0) - self.assertEqual(fluid_version.minor, 0) - self.assertEqual(fluid_version.patch, "0") - self.assertEqual(fluid_version.rc, 0) - self.assertEqual(fluid_version.full_version, "0.0.0") - else: self.assertTrue(re.match(self._major_regex, fluid_version.major)) self.assertTrue(re.match(self._minor_regex, fluid_version.minor)) self.assertTrue(re.match(self._patch_regex, fluid_version.patch)) self.assertTrue(re.match(self._rc_regex, fluid_version.rc)) self.assertTrue( - re.match(self._version_regex, fluid_version.full_version)) + re.match(self._version_regex, fluid_version.full_version) + ) + else: + self.assertEqual(fluid_version.major, "0") + self.assertEqual(fluid_version.minor, "0") + self.assertEqual(fluid_version.patch, "0") + self.assertEqual(fluid_version.rc, "0") + self.assertEqual(fluid_version.full_version, "0.0.0") + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_grad_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_grad_op_xpu.py new file mode 100644 index 00000000000000..5d56f37d89733a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_grad_op_xpu.py @@ -0,0 +1,125 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function +import sys + +sys.path.append("..") + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + + +def get_outputs(DOut, X, Y): + DX = np.dot(DOut, Y.T) + DY = np.dot(X.T, DOut) + DBias = np.sum(DOut, axis=0) + + return DX, DY, DBias + + +class XPUTestFuseGemmGradOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'fused_gemm_epilogue_grad' + self.use_dynamic_create_class = False + + class TestFuseGemmEpilogueGradOpDXYBias1(XPUOpTest): + + def setUp(self): + paddle.enable_static() + self.op_type = "fused_gemm_epilogue_grad" + self.__class__.no_need_check_grad = True + + self.dtype = self.in_type + self.init_data() + + def init_data(self): + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + DX, DY, DBias = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DX': DX, 'DY': DY, 'DBias': DBias} + + def test_check_output(self): + self.atol = 1e-4 + if self.dtype == np.float16: + self.atol = 1e-3 + self.check_output_with_place(core.XPUPlace(0), atol=self.atol) + + class TestFuseGemmEpilogueGradOpDXYBias2(XPUOpTest): + + def init_data(self): + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + _, DY, DBias = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DY': DY, 'DBias': DBias} + + class TestFuseGemmEpilogueGradOpDXYBias3(XPUOpTest): + + def init_data(self): + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + _, DY, _ = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DY': DY} + + class TestFuseGemmEpilogueGradOpDXYBias4(XPUOpTest): + + def init_data(self): + self.inputs = { + 'DOut': np.random.random((8, 128)).astype(self.dtype) - 0.5, + 'X': np.random.random((8, 4)).astype(self.dtype) - 0.5, + 'Y': np.random.random((4, 128)).astype(self.dtype) - 0.5 + } + + self.attrs = {"activation": 'none'} + + DX, DY, _ = get_outputs(self.inputs['DOut'], self.inputs['X'], + self.inputs['Y']) + self.outputs = {'DX': DX, 'DY': DY} + + +support_types = get_xpu_op_support_types('fused_gemm_epilogue_grad') +for stype in support_types: + create_test_class(globals(), XPUTestFuseGemmGradOp, stype) + +if __name__ == "__main__": + paddle.enable_static() + np.random.seed(0) + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py new file mode 100644 index 00000000000000..2e1d5848e6c6c7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_fused_gemm_epilogue_op_xpu.py @@ -0,0 +1,293 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2022 NVIDIA Corporation. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function +import sys + +sys.path.append("..") +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +from op_test_xpu import XPUOpTest +from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper + + +def gelu(x): + y_ref = 0.5 * x * ( + 1.0 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * np.power(x, 3)))) + return y_ref.astype(x.dtype) + + +def relu(x): + mask = x > 0 + return x * mask + + +def get_output(X, Y, bias, act): + out = np.dot(X, Y) + bias + if act == 'relu': + return relu(out) + elif act == 'gelu': + return gelu(out) + else: + return out + + +def matmul(x, y, bias, trans_x, trans_y): + x = np.array(x) + if trans_x: + x = np.ascontiguousarray(np.transpose(x)) + if trans_y: + y = np.ascontiguousarray(np.transpose(y)) + z = np.matmul(x, y) + if bias is None: + return z + else: + return z + bias + + +def matmul_grad(x, y, bias, dz, trans_x, trans_y): + if trans_x: + if trans_y: + dx = matmul(y, dz, None, True, True) + dy = matmul(dz, x, None, True, True) + else: + dx = matmul(y, dz, None, False, True) + dy = matmul(x, dz, None, False, False) + else: + if trans_y: + dx = matmul(dz, y, None, False, False) + dy = matmul(dz, x, None, True, False) + else: + dx = matmul(dz, y, None, False, True) + dy = matmul(x, dz, None, True, False) + if bias is None: + dbias = None + else: + dbias = np.sum(dz, axis=0, keepdims=False) + return dx, dy, dbias + + +class XPUTestFuseGemmOp(XPUOpTestWrapper): + + def __init__(self): + self.op_name = 'fused_gemm_epilogue' + self.use_dynamic_create_class = False + + class TestFuseGemmBase(XPUOpTest): + + def setUp(self): + self.__class__.no_need_check_grad = True + self.op_type = "fused_gemm_epilogue" + self.init_dtype_type() + self.init_datas_shape_and_attrs() + self.inputs = { + 'X': np.random.random(self.x_shape).astype(self.dtype) - 0.5, + 'Y': np.random.random(self.y_shape).astype(self.dtype) - 0.5, + 'Bias': + np.random.random(self.bias_shape).astype(self.dtype) - 0.5 + } + + if self.trans_x == True: + numpy_input_x = self.inputs['X'].reshape( + (self.x_shape[0], -1)).T + else: + numpy_input_x = self.inputs['X'].reshape((-1, self.x_shape[-1])) + + if self.trans_y == True: + numpy_input_y = self.inputs['Y'].T + else: + numpy_input_y = self.inputs['Y'] + + self.outputs = { + 'Out': + get_output(numpy_input_x, numpy_input_y, self.inputs['Bias'], + self.activation).reshape(self.out_shape) + } + self.attrs = { + "activation": self.activation, + "trans_y": self.trans_y, + "trans_x": self.trans_x + } + + def init_dtype_type(self): + self.dtype = self.in_type + self.atol = 1e-4 + if self.dtype == np.float16: + self.atol = 1e-3 + + def init_datas_shape_and_attrs(self): + self.x_shape = [8, 4] + self.y_shape = [4, 128] + self.bias_shape = [ + 128, + ] + self.out_shape = [8, 128] + self.activation = "relu" + self.trans_y = False + self.trans_x = False + + def test_check_output(self): + 
self.check_output_with_place(core.XPUPlace(0), atol=self.atol) + + class TestFuseGemmEpilogueOp1(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [4, 8] + self.y_shape = [4, 128] + self.bias_shape = [ + 128, + ] + self.out_shape = [8, 128] + self.activation = "relu" + self.trans_y = False + self.trans_x = True + + class TestFuseGemmEpilogueOp2(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [8, 4] + self.y_shape = [128, 4] + self.bias_shape = [ + 128, + ] + self.out_shape = [8, 128] + self.activation = "relu" + self.trans_y = True + self.trans_x = False + + class TestFuseGemmEpilogueOp3(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [4, 8] + self.y_shape = [128, 4] + self.bias_shape = [ + 128, + ] + self.out_shape = [8, 128] + self.activation = "relu" + self.trans_y = True + self.trans_x = True + + class TestFuseGemmEpilogueOp4(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [2, 2, 8, 4] + self.y_shape = [4, 128] + self.bias_shape = [ + 128, + ] + self.out_shape = [2, 2, 8, 128] + self.activation = "relu" + self.trans_y = False + self.trans_x = False + + class TestFuseGemmEpilogueOp5(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [4, 2, 2, 8] + self.y_shape = [4, 128] + self.bias_shape = [ + 128, + ] + self.out_shape = [2, 2, 8, 128] + self.activation = "relu" + self.trans_y = False + self.trans_x = True + + class TestFuseGemmEpilogueOp6(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [8, 4] + self.y_shape = [4, 128] + self.bias_shape = [ + 128, + ] + self.out_shape = [8, 128] + self.activation = "gelu" + self.trans_y = False + self.trans_x = False + + class TestFuseGemmEpilogueOp7(TestFuseGemmBase): + + def init_datas_shape_and_attrs(self): + self.x_shape = [8, 4] + self.y_shape = [4, 128] + self.bias_shape = [ + 128, + ] + self.out_shape = [8, 128] + self.activation = "none" + self.trans_y = False + self.trans_x = False + + +class TestEagerFusedGemmEpilogue(unittest.TestCase): + + def setUp(self): + paddle.set_device('xpu') + + def test_case_act(self): + paddle.disable_static() + x_np = np.random.random((8, 4)).astype(np.float32) - 0.5 + y_np = np.random.random((4, 128)).astype(np.float32) - 0.5 + bias_np = np.random.random((128, )).astype(np.float32) - 0.5 + x = paddle.to_tensor(x_np) + y = paddle.to_tensor(y_np) + bias = paddle.to_tensor(bias_np) + x.stop_gradient = False + y.stop_gradient = False + + out1 = core.ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False, + 'trans_y', False, 'activation', + 'none') + out2 = core.ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False, + 'trans_y', False, 'activation', + 'relu') + out3 = core.ops.fused_gemm_epilogue(x, y, bias, 'trans_x', False, + 'trans_y', False, 'activation', + 'gelu') + + out_np1 = get_output(x_np, y_np, bias_np, 'none') + out_np2 = get_output(x_np, y_np, bias_np, 'relu') + out_np3 = get_output(x_np, y_np, bias_np, 'gelu') + + np.testing.assert_allclose(out1, out_np1, atol=1e-04) + np.testing.assert_allclose(out2, out_np2, atol=1e-04) + np.testing.assert_allclose(out3, out_np3, atol=1e-03) + + out_grad_np1 = np.random.randint(low=-20, high=20, + size=out_np1.shape).astype(np.float32) + paddle.autograd.backward(out1, + grad_tensors=[paddle.to_tensor(out_grad_np1)]) + + x_grad_np, y_grad_np, bias_grad_np = matmul_grad( + x_np, y_np, bias_np, out_grad_np1, False, False) + np.testing.assert_allclose(x.grad.numpy(), x_grad_np, atol=1e-02) + 
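For readers who want to sanity-check the reference gradients used in the assertions here, the following standalone NumPy sketch (independent of Paddle; the shapes are made up for illustration) shows why `matmul_grad` returns `dx = dz @ y.T`, `dy = x.T @ dz` and `dbias = dz.sum(axis=0)` in the non-transposed case, using a finite-difference probe on the bias term:

.. code-block:: python

    import numpy as np

    # Hypothetical shapes, float64 so the finite-difference check is clean.
    x = np.random.rand(8, 4)
    y = np.random.rand(4, 16)
    bias = np.random.rand(16)
    dz = np.random.rand(8, 16)          # upstream gradient of z = x @ y + bias

    dx = dz @ y.T                       # same shape as x
    dy = x.T @ dz                       # same shape as y
    dbias = dz.sum(axis=0)              # same shape as bias

    # Probe d(loss)/d(bias[0]) numerically, where loss = sum(z * dz).
    eps = 1e-6
    e0 = np.eye(16)[0]
    loss = lambda b: np.sum((x @ y + b) * dz)
    fd = (loss(bias + eps * e0) - loss(bias - eps * e0)) / (2 * eps)
    assert np.isclose(fd, dbias[0])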
self.assertEqual(y_grad_np.shape, y_np.shape) + np.testing.assert_allclose(y.grad.numpy(), y_grad_np, atol=1e-03) + + paddle.enable_static() + + +support_types = get_xpu_op_support_types('fused_gemm_epilogue') +for stype in support_types: + create_test_class(globals(), XPUTestFuseGemmOp, stype) + +if __name__ == "__main__": + paddle.enable_static() + np.random.seed(0) + unittest.main() diff --git a/python/paddle/fluid/variable_index.py b/python/paddle/fluid/variable_index.py index a0a778759b0300..1a13e8304194c8 100644 --- a/python/paddle/fluid/variable_index.py +++ b/python/paddle/fluid/variable_index.py @@ -730,10 +730,13 @@ def _setitem_impl_(var, item, value): elif dtype == core.VarDesc.VarType.INT64: value_name = "int64_values" values = [int(v) for v in value.flat] + elif dtype == core.VarDesc.VarType.FP16: + value_name = "fp16_values" + values = [float(v) for v in value.flat] else: raise TypeError( "When assign a numpy.ndarray, integer or float to a paddle.Tensor, " - "the data type of the paddle.Tensor must be bool, float32, int32 or int64, but " + "the data type of the paddle.Tensor must be bool, float32, int32, int64 or float16, but " "received %s." % convert_dtype(dtype)) attrs[value_name] = values attrs["shape"] = shape diff --git a/python/paddle/framework/dtype.py b/python/paddle/framework/dtype.py index 56a95f48b5f9b8..6abc8e6e1aa9a7 100644 --- a/python/paddle/framework/dtype.py +++ b/python/paddle/framework/dtype.py @@ -13,6 +13,7 @@ # limitations under the License. from ..fluid.core import VarDesc +from ..fluid.core import iinfo as core_iinfo dtype = VarDesc.VarType dtype.__qualname__ = "dtype" @@ -34,4 +35,37 @@ bool = VarDesc.VarType.BOOL -__all__ = [] + +def iinfo(dtype): + """ + + paddle.iinfo is a function that returns an object that represents the numerical properties of + an integer paddle.dtype. + This is similar to `numpy.iinfo `_. + + Args: + dtype(paddle.dtype): One of paddle.uint8, paddle.int8, paddle.int16, paddle.int32, and paddle.int64. + + Returns: + An iinfo object, which has the following 4 attributes: + + - min: int, The smallest representable integer number. + - max: int, The largest representable integer number. + - bits: int, The number of bits occupied by the type. + - dtype: str, The string name of the argument dtype. + + Examples: + .. code-block:: python + + import paddle + + iinfo_uint8 = paddle.iinfo(paddle.uint8) + print(iinfo_uint8) + # paddle.iinfo(min=0, max=255, bits=8, dtype=uint8) + print(iinfo_uint8.min) # 0 + print(iinfo_uint8.max) # 255 + print(iinfo_uint8.bits) # 8 + print(iinfo_uint8.dtype) # uint8 + + """ + return core_iinfo(dtype) diff --git a/python/paddle/framework/framework.py b/python/paddle/framework/framework.py index 41fd0c0703bbce..819c08a7f3ac97 100644 --- a/python/paddle/framework/framework.py +++ b/python/paddle/framework/framework.py @@ -46,11 +46,16 @@ def set_default_dtype(d): else: raise TypeError( "set_default_dtype only supports [float16, float32, float64] " - ", but received %s" % d.__name__) + ", but received %s" % d.__name__ + ) else: if d in [ - 'float16', 'float32', 'float64', u'float16', u'float32', - u'float64' + 'float16', + 'float32', + 'float64', + u'float16', + u'float32', + u'float64', ]: # this code is a little bit dangerous, since error could happen # when casting no-ascii code to str in python2. 
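The accepted values and the failure mode of ``set_default_dtype`` follow directly from the branches above; a short illustrative session (only the session itself is hypothetical):

.. code-block:: python

    import paddle

    paddle.set_default_dtype('float16')
    print(paddle.get_default_dtype())      # 'float16'

    # Anything outside float16/float32/float64 raises the TypeError shown above.
    try:
        paddle.set_default_dtype('int32')
    except TypeError as e:
        print(e)

    paddle.set_default_dtype('float32')    # restore the usual default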
@@ -61,7 +66,8 @@ def set_default_dtype(d): else: raise TypeError( "set_default_dtype only supports [float16, float32, float64] " - ", but received %s" % str(d)) + ", but received %s" % str(d) + ) LayerHelperBase.set_default_dtype(d) @@ -73,7 +79,7 @@ def get_default_dtype(): Args: None. Returns: - The default dtype. + String, this global dtype only supports float16, float32, float64. Examples: .. code-block:: python @@ -94,7 +100,7 @@ def set_grad_enabled(mode): Examples: .. code-block:: python - + import paddle x = paddle.ones([3, 2]) x.stop_gradient = False @@ -127,9 +133,9 @@ def is_grad_enabled(): Examples: .. code-block:: python - + import paddle - + # Dygraph gradient calculation mode is enabled by default. paddle.is_grad_enabled() # True diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 09f3c512401918..aa2375ca72fa60 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -26,14 +26,35 @@ # deprecated module import from paddle import fluid from paddle.fluid import core -from paddle.fluid.io import _unpack_saved_dict, _pack_loaded_dict, _pickle_loads_mac +from paddle.fluid.io import ( + _unpack_saved_dict, + _pack_loaded_dict, + _pickle_loads_mac, +) from paddle.fluid.io import _legacy_save as _legacy_static_save from paddle.fluid.io import _open_file_buffer, _is_file_path, _is_memory_buffer -from paddle.fluid.framework import Variable, _varbase_creator, _dygraph_tracer, _non_static_mode, ParamBase, EagerParamBase, _current_expected_place, Program +from paddle.fluid.framework import ( + Variable, + _varbase_creator, + _dygraph_tracer, + _non_static_mode, + ParamBase, + EagerParamBase, + _current_expected_place, + Program, +) from paddle.fluid.dygraph.jit import _SaveLoadConfig -from paddle.fluid.dygraph.io import _construct_program_holders, _construct_params_and_buffers -from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX +from paddle.fluid.dygraph.io import ( + _construct_program_holders, + _construct_params_and_buffers, +) +from paddle.fluid.dygraph.io import ( + INFER_MODEL_SUFFIX, + INFER_PARAMS_SUFFIX, + INFER_PARAMS_INFO_SUFFIX, +) + try: from collections.abc import Iterable except: @@ -70,7 +91,8 @@ def _load_state_dict_from_save_inference_model(model_path, config): # 2. load layer parameters & buffers with fluid.dygraph.guard(): persistable_var_dict = _construct_params_and_buffers( - model_path, programs, config.params_filename, append_suffix=False) + model_path, programs, config.params_filename, append_suffix=False + ) # 3. construct state_dict load_param_dict = dict() @@ -86,10 +108,15 @@ def _load_state_dict_from_save_inference_model(model_path, config): structured_para_dict = dict() for var_name in load_param_dict: structured_name = extra_var_info[var_name].get( - 'structured_name', None) - assert structured_name is not None, "Cannot find saved variable (%s)'s structured name in saved model." % var_name + 'structured_name', None + ) + assert structured_name is not None, ( + "Cannot find saved variable (%s)'s structured name in saved model." + % var_name + ) structured_para_dict[structured_name] = load_param_dict[ - var_name] + var_name + ] load_param_dict = structured_para_dict return load_param_dict @@ -117,7 +144,8 @@ def _load_state_dict_from_save_params(model_path): type='load', inputs={}, outputs={'Out': new_var}, - attrs={'file_path': os.path.join(model_path, name)}) + attrs={'file_path': os.path.join(model_path, name)}, + ) load_var_list.append(new_var) # 3. 
construct state_dict @@ -153,7 +181,8 @@ def _build_load_path_and_config(path, config): raise ValueError( "The %s.pdmodel and %s directory exist at the same time, " "don't know which one to load, please make sure that the specified target " - "of ``path`` is unique." % (path, path)) + "of ``path`` is unique." % (path, path) + ) elif not prefix_format_exist and not directory_format_exist: error_msg = "The ``path`` (%s) to load model not exists." # if current path is a prefix, and the path.pdparams or path.pdopt @@ -162,10 +191,12 @@ def _build_load_path_and_config(path, config): params_file_path = path + ".pdparams" opti_file_path = path + ".pdopt" if os.path.exists(params_file_path) or os.path.exists(opti_file_path): - error_msg += " If you want to load the results saved by `fluid.save_dygraph`, " \ - "please specify the full file name, not just the file name prefix. For " \ - "example, it should be written as `paddle.load('model.pdparams')` instead of " \ + error_msg += ( + " If you want to load the results saved by `fluid.save_dygraph`, " + "please specify the full file name, not just the file name prefix. For " + "example, it should be written as `paddle.load('model.pdparams')` instead of " "`paddle.load('model')`." + ) raise ValueError(error_msg % path) else: if prefix_format_exist: @@ -175,13 +206,15 @@ def _build_load_path_and_config(path, config): warnings.warn( "When loading the result saved with the " "specified file prefix, the ``model_filename`` config does " - "not take effect.") + "not take effect." + ) config.model_filename = file_prefix + INFER_MODEL_SUFFIX if config.params_filename is not None: warnings.warn( "When loading the result saved with the " "specified file prefix, the ``params_filename`` config does " - "not take effect.") + "not take effect." + ) config.params_filename = file_prefix + INFER_PARAMS_SUFFIX else: # Compatible with the old save_inference_model format @@ -192,7 +225,10 @@ def _build_load_path_and_config(path, config): def _parse_load_config(configs): supported_configs = [ - 'model_filename', 'params_filename', 'keep_name_table', 'return_numpy' + 'model_filename', + 'params_filename', + 'keep_name_table', + 'return_numpy', ] # input check @@ -200,7 +236,8 @@ def _parse_load_config(configs): if key not in supported_configs: raise ValueError( "The additional config (%s) of `paddle.load` is not supported." - % key) + % key + ) # construct inner config inner_config = _SaveLoadConfig() @@ -220,7 +257,8 @@ def _parse_save_config(configs): if key not in supported_configs: raise ValueError( "The additional config (%s) of `paddle.save` is not supported." - % key) + % key + ) # construct inner config inner_config = _SaveLoadConfig() @@ -233,19 +271,22 @@ def _parse_save_config(configs): def _pickle_save(obj, f, protocol): # TODO(weixin):add support for BytesIO. 
if not isinstance(protocol, int): - raise ValueError("The 'protocol' MUST be `int`, but received {}".format( - type(protocol))) + raise ValueError( + "The 'protocol' MUST be `int`, but received {}".format( + type(protocol) + ) + ) if protocol < 2 or protocol > 4: raise ValueError( - "Expected 1<'protocol'<5, but received protocol={}".format( - protocol)) + "Expected 1<'protocol'<5, but received protocol={}".format(protocol) + ) def reduce_varbase(self): data = self.numpy() name = self.name - return (tuple, ((name, data), )) + return (tuple, ((name, data),)) def reduce_LoDTensor(self): data = np.array(self) @@ -254,7 +295,8 @@ def reduce_LoDTensor(self): def reduce_Layer(self): raise ValueError( - "paddle do not support saving `paddle.nn.Layer` object.") + "paddle do not support saving `paddle.nn.Layer` object." + ) dispatch_table_layer = dict() @@ -262,8 +304,9 @@ def create_layer_dispatch_table(layer): dispatch_table_layer[layer.__class__] = reduce_Layer return layer - _parse_every_object(obj, lambda v: isinstance(v, fluid.Layer), - create_layer_dispatch_table) + _parse_every_object( + obj, lambda v: isinstance(v, fluid.Layer), create_layer_dispatch_table + ) def add_dispatch_table(): # This is not a good method, because the pickle module has been modified. @@ -291,7 +334,7 @@ def pop_dispatch_table(): max_bytes = 2**30 for i in range(0, len(pickle_bytes), max_bytes): - f.write(pickle_bytes[i:i + max_bytes]) + f.write(pickle_bytes[i : i + max_bytes]) else: pickler = pickle.Pickler(f, protocol) pickler.dispatch_table = copyreg.dispatch_table.copy() @@ -308,7 +351,8 @@ def pop_dispatch_table(): def _contain_x(obj, condition_func): if isinstance(obj, core.SelectedRows): raise NotImplementedError( - "`paddle.save` do not support saving 'SelectedRows'.") + "`paddle.save` do not support saving 'SelectedRows'." + ) if condition_func(obj): return True @@ -332,8 +376,16 @@ def _is_state_dict(obj): def condition(obj): return isinstance( - obj, (fluid.Layer, Program, core.VarBase, core.eager.Tensor, - core.LoDTensor, core.SelectedRows)) + obj, + ( + fluid.Layer, + Program, + core.VarBase, + core.eager.Tensor, + core.LoDTensor, + core.SelectedRows, + ), + ) # If the value of a dict is a core.VarBase/LoDTensor or a dict # that does not contain a paddle type(Layer, Program, VarBase, LoDTensor, SelectedRows), @@ -344,7 +396,8 @@ def condition(obj): if _contain_x(v, condition): return False elif not isinstance( - value, (core.VarBase, core.eager.Tensor, core.LoDTensor)): + value, (core.VarBase, core.eager.Tensor, core.LoDTensor) + ): return False return True @@ -372,8 +425,10 @@ def _transformed_from_lodtensor(obj): def _to_LodTensor(ndarray): if not isinstance(ndarray, np.ndarray): raise TypeError( - 'Type of `ndarray` should be numpy.ndarray, but received {}.'. 
- format(type(ndarray))) + 'Type of `ndarray` should be numpy.ndarray, but received {}.'.format( + type(ndarray) + ) + ) t = core.LoDTensor() place = _current_expected_place() t.set(ndarray, place) @@ -420,26 +475,30 @@ def _parse_every_object(obj, condition_func, convert_func): if condition_func(obj[key]): obj[key] = convert_func(obj[key]) else: - obj[key] = _parse_every_object(obj[key], condition_func, - convert_func) + obj[key] = _parse_every_object( + obj[key], condition_func, convert_func + ) return obj elif type(obj) == tuple: return tuple( - _parse_every_object(list(obj), condition_func, convert_func)) + _parse_every_object(list(obj), condition_func, convert_func) + ) elif type(obj) == set: return set(_parse_every_object(list(obj), condition_func, convert_func)) else: if isinstance(obj, Iterable) and not isinstance( - obj, - (str, np.ndarray, core.VarBase, core.eager.Tensor, core.LoDTensor)): + obj, + (str, np.ndarray, core.VarBase, core.eager.Tensor, core.LoDTensor), + ): raise NotImplementedError( - "The iteratable objects supported are tuple, list, dict, OrderedDict, string. But received {}." - .format(type(obj))) + "The iteratable objects supported are tuple, list, dict, OrderedDict, string. But received {}.".format( + type(obj) + ) + ) return obj def _parse_load_result(obj, return_numpy): - def is_layer(obj): return isinstance(obj, fluid.Layer) @@ -465,13 +524,15 @@ def ndarray_to_tensor(obj): # tuple(name, ndarry) was converted from varbase of paddle2.1, # and all tuple(name, ndarry) are converted to tensor. if _contain_x(obj, _transformed_from_varbase): - return _parse_every_object(obj, _transformed_from_varbase, - tuple_to_tensor) + return _parse_every_object( + obj, _transformed_from_varbase, tuple_to_tensor + ) # If there is no tuple(name, ndary), it is considered to be saved by paddle2.0 # or converted from LoDTensor, and all ndarrays are converted to tensor. else: - return _parse_every_object(obj, _transformed_from_lodtensor, - ndarray_to_tensor) + return _parse_every_object( + obj, _transformed_from_lodtensor, ndarray_to_tensor + ) def _save_lod_tensor(tensor, file_name): @@ -492,8 +553,10 @@ def _save_lod_tensor(tensor, file_name): else: raise NotImplementedError( - 'Only supports saving objects to file or BytesIO, but received {}'. - format(type(file_name))) + 'Only supports saving objects to file or BytesIO, but received {}'.format( + type(file_name) + ) + ) return _seek @@ -511,8 +574,10 @@ def _load_lod_tensor(file_name): else: raise NotImplementedError( - 'Only supports load objects from file or BytesIO, but received {}'. - format(type(file_name))) + 'Only supports load objects from file or BytesIO, but received {}'.format( + type(file_name) + ) + ) return temp_t, _seek @@ -531,8 +596,10 @@ def _save_selected_rows(selected_rows, file_name): _seek = f.tell() else: raise NotImplementedError( - 'Only supports saving objects to file or BytesIO, but received {}'. - format(type(file_name))) + 'Only supports saving objects to file or BytesIO, but received {}'.format( + type(file_name) + ) + ) return _seek @@ -546,13 +613,16 @@ def _load_selected_rows(file_name): with _open_file_buffer(file_name, 'rb') as f: selected_rows_bytes = f.read() paddle.fluid.core.load_selected_rows_from_memory( - temp_sr, selected_rows_bytes) + temp_sr, selected_rows_bytes + ) _seek = f.tell() else: raise NotImplementedError( - 'Only supports load objects from file or BytesIO, but received {}'. 
- format(type(file_name))) + 'Only supports load objects from file or BytesIO, but received {}'.format( + type(file_name) + ) + ) return temp_sr, _seek @@ -567,34 +637,36 @@ def _save_binary_var(obj, path): else: # Since the concept of 'Tensor' is only exposed to users, the error message can only contain tensor instead of 'LoDTensor' or 'SelectedRows' raise NotImplementedError( - "When use_binary_format = True, `paddle.save` expected Tensor, but received {}." - .format(type(obj))) + "When use_binary_format = True, `paddle.save` expected Tensor, but received {}.".format( + type(obj) + ) + ) def save(obj, path, protocol=4, **configs): ''' Save an object to the specified path. - - .. note:: + + Note: Now supports saving ``state_dict`` of Layer/Optimizer, Tensor and nested structure containing Tensor, Program. - .. note:: - Different from ``paddle.jit.save``, since the save result of ``paddle.save`` is a single file, - there is no need to distinguish multiple saved files by adding a suffix. The argument ``path`` - of ``paddle.save`` will be directly used as the saved file name instead of a prefix. + Note: + Different from ``paddle.jit.save``, since the save result of ``paddle.save`` is a single file, + there is no need to distinguish multiple saved files by adding a suffix. The argument ``path`` + of ``paddle.save`` will be directly used as the saved file name instead of a prefix. In order to unify the saved file name format, we recommend using the paddle standard suffix: - 1. for ``Layer.state_dict`` , recommend to use ``.pdparams`` ; - 2. for ``Optimizer.state_dict`` , recommend to use ``.pdopt`` . + 1. for ``Layer.state_dict`` , recommend to use ``.pdparams`` ; + 2. for ``Optimizer.state_dict`` , recommend to use ``.pdopt`` . For specific examples, please refer to API code examples. - + Args: obj(Object) : The object to be saved. - path(str|BytesIO) : The path/buffer of the object to be saved. - If saved in the current directory, the input path string will be used as the file name. + path(str|BytesIO) : The path/buffer of the object to be saved. + If saved in the current directory, the input path string will be used as the file name. protocol(int, optional): The protocol version of pickle module must be greater than 1 and less than 5. Default: 4 **configs(dict, optional): optional keyword arguments. The following options are currently supported: - use_binary_format(bool): When the saved object is static graph variable, you can specify ``use_binary_for_var``. + use_binary_format(bool): When the saved object is static graph variable, you can specify ``use_binary_for_var``. If True, save the file in the c++ binary format when saving a single static graph variable; otherwise, save it in pickle format. Default: False @@ -687,7 +759,7 @@ def save(obj, path, protocol=4, **configs): paddle.save(state_dict, byio) tensor = paddle.randn([2, 3], dtype='float32') paddle.save(tensor, byio) - + ''' if _is_file_path(path): # 1. input check @@ -696,7 +768,8 @@ def save(obj, path, protocol=4, **configs): raise ValueError( "The input path MUST be format of dirname/filename " "[dirname\\filename in Windows system], but received " - "filename is empty string.") + "filename is empty string." + ) # 2. save object dirname = os.path.dirname(path) @@ -704,15 +777,19 @@ def save(obj, path, protocol=4, **configs): os.makedirs(dirname) elif not _is_memory_buffer(path): raise ValueError( - "only supports saving objects to file and `BytesIO`, but got {}". 
- format(type(path))) + "only supports saving objects to file and `BytesIO`, but got {}".format( + type(path) + ) + ) config = _parse_save_config(configs) if not isinstance(config.use_binary_format, bool): raise TypeError( - "Type of `use_binary_format` should be bool, but received {}.". - format(type(config.use_binary_format))) + "Type of `use_binary_format` should be bool, but received {}.".format( + type(config.use_binary_format) + ) + ) if config.use_binary_format: _save_binary_var(obj, path) @@ -744,19 +821,23 @@ def _legacy_save(obj, path, protocol=2): if not isinstance(obj, dict): raise NotImplementedError( "Now only supports save state_dict of Layer or Optimizer, " - "expect dict, but received %s." % type(obj)) + "expect dict, but received %s." % type(obj) + ) if len(obj) == 0: warnings.warn("The input state dict is empty, no need to save.") if not isinstance(protocol, int): - raise ValueError("The 'protocol' MUST be `int`, but received {}".format( - type(protocol))) + raise ValueError( + "The 'protocol' MUST be `int`, but received {}".format( + type(protocol) + ) + ) if protocol < 2 or protocol > 4: raise ValueError( - "Expected 1<'protocol'<5, but received protocol={}".format( - protocol)) + "Expected 1<'protocol'<5, but received protocol={}".format(protocol) + ) if _is_file_path(path): filename = os.path.basename(path) @@ -764,7 +845,8 @@ def _legacy_save(obj, path, protocol=2): raise ValueError( "The input path MUST be format of dirname/filename " "[dirname\\filename in Windows system], but received " - "filename is empty string.") + "filename is empty string." + ) # 2. save object dirname = os.path.dirname(path) if dirname and not os.path.exists(dirname): @@ -776,13 +858,16 @@ def _legacy_save(obj, path, protocol=2): saved_obj = _unpack_saved_dict(saved_obj, protocol) # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' - if _is_file_path( - path) and sys.platform == 'darwin' and sys.version_info.major == 3: + if ( + _is_file_path(path) + and sys.platform == 'darwin' + and sys.version_info.major == 3 + ): pickle_bytes = pickle.dumps(saved_obj, protocol=protocol) with open(path, 'wb') as f: max_bytes = 2**30 for i in range(0, len(pickle_bytes), max_bytes): - f.write(pickle_bytes[i:i + max_bytes]) + f.write(pickle_bytes[i : i + max_bytes]) else: with _open_file_buffer(path, 'wb') as f: pickle.dump(saved_obj, f, protocol=protocol) @@ -792,46 +877,46 @@ def load(path, **configs): ''' Load an object can be used in paddle from specified path. - .. note:: + Note: Now supports loading ``state_dict`` of Layer/Optimizer, Tensor and nested structure containing Tensor, Program. - .. note:: - In order to use the model parameters saved by paddle more efficiently, - ``paddle.load`` supports loading ``state_dict`` of Layer from the result of - other save APIs except ``paddle.save`` , but the argument ``path`` format is + Note: + In order to use the model parameters saved by paddle more efficiently, + ``paddle.load`` supports loading ``state_dict`` of Layer from the result of + other save APIs except ``paddle.save`` , but the argument ``path`` format is different: - 1. loading from ``paddle.static.save`` or ``paddle.Model().save(training=True)`` , - ``path`` needs to be a complete file name, such as ``model.pdparams`` or - ``model.pdopt`` ; - 2. 
loading from ``paddle.jit.save`` or ``paddle.static.save_inference_model`` - or ``paddle.Model().save(training=False)`` , ``path`` need to be a file prefix, - such as ``model/mnist``, and ``paddle.load`` will get information from + 1. loading from ``paddle.static.save`` or ``paddle.Model().save(training=True)`` , + ``path`` needs to be a complete file name, such as ``model.pdparams`` or + ``model.pdopt`` ; + 2. loading from ``paddle.jit.save`` or ``paddle.static.save_inference_model`` + or ``paddle.Model().save(training=False)`` , ``path`` need to be a file prefix, + such as ``model/mnist``, and ``paddle.load`` will get information from ``mnist.pdmodel`` and ``mnist.pdiparams`` ; - 3. loading from paddle 1.x APIs ``paddle.fluid.io.save_inference_model`` or - ``paddle.fluid.io.save_params/save_persistables`` , ``path`` need to be a + 3. loading from paddle 1.x APIs ``paddle.fluid.io.save_inference_model`` or + ``paddle.fluid.io.save_params/save_persistables`` , ``path`` need to be a directory, such as ``model`` and model is a directory. - .. note:: - If you load ``state_dict`` from the saved result of static mode API such as - ``paddle.static.save`` or ``paddle.static.save_inference_model`` , - the structured variable name in dynamic mode will cannot be restored. - You need to set the argument ``use_structured_name=False`` when using + Note: + If you load ``state_dict`` from the saved result of static mode API such as + ``paddle.static.save`` or ``paddle.static.save_inference_model`` , + the structured variable name in dynamic mode will cannot be restored. + You need to set the argument ``use_structured_name=False`` when using ``Layer.set_state_dict`` later. Args: - path(str|BytesIO) : The path/buffer to load the target object. Generally, the path is the target - file path. When loading state_dict from the saved result of the API used to save + path(str|BytesIO) : The path/buffer to load the target object. Generally, the path is the target + file path. When loading state_dict from the saved result of the API used to save the inference model, the path may be a file prefix or directory. - **configs (dict, optional): other load configuration options for compatibility. We do not - recommend using these configurations, they may be removed in the future. If not necessary, + **configs (dict, optional): other load configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, DO NOT use them. Default None. The following options are currently supported: - (1) model_filename (str): The inference model file name of the paddle 1.x - ``save_inference_model`` save format. Default file name is :code:`__model__` . - (2) params_filename (str): The persistable variables file name of the paddle 1.x - ``save_inference_model`` save format. No default file name, save variables separately - by default. - (3) return_numpy(bool): If specified as True, return tensor as numpy.ndarray, otherwise return tensor as paddle.Tensor. + (1) model_filename (str): The inference model file name of the paddle 1.x + ``save_inference_model`` save format. Default file name is :code:`__model__` . + (2) params_filename (str): The persistable variables file name of the paddle 1.x + ``save_inference_model`` save format. No default file name, save variables separately + by default. + (3) return_numpy(bool): If specified as True, return tensor as numpy.ndarray, otherwise return tensor as paddle.Tensor. Default False. 
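A minimal round trip tying ``paddle.save`` and ``paddle.load`` together, consistent with the usage described above (the layer and the file name are purely illustrative):

.. code-block:: python

    import paddle

    layer = paddle.nn.Linear(4, 2)
    paddle.save(layer.state_dict(), "linear_net.pdparams")   # recommended .pdparams suffix

    state_dict = paddle.load("linear_net.pdparams")
    layer.set_state_dict(state_dict)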
Returns: @@ -949,9 +1034,11 @@ def load(path, **configs): try: with _open_file_buffer(path, 'rb') as f: # When value of dict is lager than 4GB ,there is a Bug on 'MAC python3' - if _is_file_path( - path - ) and sys.platform == 'darwin' and sys.version_info.major == 3: + if ( + _is_file_path(path) + and sys.platform == 'darwin' + and sys.version_info.major == 3 + ): load_result = _pickle_loads_mac(path, f) else: load_result = pickle.load(f, encoding='latin1') @@ -965,18 +1052,24 @@ def load(path, **configs): for key in load_result["StructuredToParameterName@@"]: if isinstance(load_result[key], np.ndarray): load_result[key] = _ndarray_to_tensor( - load_result[key], config.return_numpy) + load_result[key], config.return_numpy + ) - if not config.keep_name_table and "StructuredToParameterName@@" in load_result: + if ( + not config.keep_name_table + and "StructuredToParameterName@@" in load_result + ): del load_result["StructuredToParameterName@@"] else: # paddle2.1 static.save/load load_result = _parse_load_result( - load_result, config.return_numpy) + load_result, config.return_numpy + ) else: - load_result = _parse_load_result(load_result, - config.return_numpy) + load_result = _parse_load_result( + load_result, config.return_numpy + ) except exception_type as msg_pickle: try: @@ -996,12 +1089,15 @@ def load(path, **configs): with _open_file_buffer(path, "rb") as f: program_desc_str = f.read() program = Program.parse_from_string( - program_desc_str) + program_desc_str + ) return program except: raise ValueError( "`paddle.load` can not parse the file:{}.".format( - path)) + path + ) + ) else: load_result = _legacy_load(path, **configs) @@ -1018,7 +1114,10 @@ def _legacy_load(path, **configs): with _open_file_buffer(path, 'rb') as f: load_result = pickle.load(f, encoding='latin1') load_result = _pack_loaded_dict(load_result) - if not config.keep_name_table and "StructuredToParameterName@@" in load_result: + if ( + not config.keep_name_table + and "StructuredToParameterName@@" in load_result + ): del load_result["StructuredToParameterName@@"] else: # file prefix and directory are compatible cases @@ -1039,7 +1138,8 @@ def _legacy_load(path, **configs): # the user to configure the `use_structured_name` argument when `set_state_dict` # NOTE(chenweihang): `jit.save` doesn't save optimizer state load_result = _load_state_dict_from_save_inference_model( - model_path, config) + model_path, config + ) else: # load state dict by `io.save_params/persistables` save format # TODO(chenweihang): [ Now only supports loading parameters separately ] diff --git a/python/paddle/geometric/math.py b/python/paddle/geometric/math.py index 7a6db7d10aa991..22b0f32114a221 100644 --- a/python/paddle/geometric/math.py +++ b/python/paddle/geometric/math.py @@ -32,16 +32,15 @@ def segment_sum(data, segment_ids, name=None): Args: data (Tensor): A tensor, available data type float32, float64, int32, int64, float16. segment_ids (Tensor): A 1-D tensor, which have the same size - with the first dimension of input data. + with the first dimension of input data. Available data type is int32, int64. - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - output (Tensor): the reduced result. + - output (Tensor), the reduced result. Examples: - .. 
code-block:: python import paddle @@ -54,29 +53,30 @@ def segment_sum(data, segment_ids, name=None): if in_dygraph_mode(): return _C_ops.segment_pool(data, segment_ids, "SUM")[0] if _in_legacy_dygraph(): - out, tmp = _legacy_C_ops.segment_pool(data, segment_ids, 'pooltype', - "SUM") + out, tmp = _legacy_C_ops.segment_pool( + data, segment_ids, 'pooltype', "SUM" + ) return out check_variable_and_dtype( - data, "X", ("float32", "float64", "int32", "int64", "float16"), - "segment_pool") - check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), - "segment_pool") + data, + "X", + ("float32", "float64", "int32", "int64", "float16"), + "segment_pool", + ) + check_variable_and_dtype( + segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool" + ) helper = LayerHelper("segment_sum", **locals()) out = helper.create_variable_for_type_inference(dtype=data.dtype) summed_ids = helper.create_variable_for_type_inference(dtype=data.dtype) - helper.append_op(type="segment_pool", - inputs={ - "X": data, - "SegmentIds": segment_ids - }, - outputs={ - "Out": out, - "SummedIds": summed_ids - }, - attrs={"pooltype": "SUM"}) + helper.append_op( + type="segment_pool", + inputs={"X": data, "SegmentIds": segment_ids}, + outputs={"Out": out, "SummedIds": summed_ids}, + attrs={"pooltype": "SUM"}, + ) return out @@ -84,7 +84,7 @@ def segment_mean(data, segment_ids, name=None): r""" Segment mean Operator. - Ihis operator calculate the mean value of input `data` which + This operator calculate the mean value of input `data` which with the same index in `segment_ids`. It computes a tensor such that $out_i = \\frac{1}{n_i} \\sum_{j} data[j]$ where sum is over j such that 'segment_ids[j] == i' and $n_i$ is the number @@ -92,17 +92,16 @@ def segment_mean(data, segment_ids, name=None): Args: data (tensor): a tensor, available data type float32, float64, int32, int64, float16. - segment_ids (tensor): a 1-d tensor, which have the same size - with the first dimension of input data. + segment_ids (tensor): a 1-d tensor, which have the same size + with the first dimension of input data. available data type is int32, int64. - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - output (Tensor): the reduced result. + - output (Tensor), the reduced result. Examples: - .. 
code-block:: python import paddle @@ -116,29 +115,30 @@ def segment_mean(data, segment_ids, name=None): if in_dygraph_mode(): return _C_ops.segment_pool(data, segment_ids, "MEAN")[0] if _in_legacy_dygraph(): - out, tmp = _legacy_C_ops.segment_pool(data, segment_ids, 'pooltype', - "MEAN") + out, tmp = _legacy_C_ops.segment_pool( + data, segment_ids, 'pooltype', "MEAN" + ) return out check_variable_and_dtype( - data, "X", ("float32", "float64", "int32", "int64", "float16"), - "segment_pool") - check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), - "segment_pool") + data, + "X", + ("float32", "float64", "int32", "int64", "float16"), + "segment_pool", + ) + check_variable_and_dtype( + segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool" + ) helper = LayerHelper("segment_mean", **locals()) out = helper.create_variable_for_type_inference(dtype=data.dtype) summed_ids = helper.create_variable_for_type_inference(dtype=data.dtype) - helper.append_op(type="segment_pool", - inputs={ - "X": data, - "SegmentIds": segment_ids - }, - outputs={ - "Out": out, - "SummedIds": summed_ids - }, - attrs={"pooltype": "MEAN"}) + helper.append_op( + type="segment_pool", + inputs={"X": data, "SegmentIds": segment_ids}, + outputs={"Out": out, "SummedIds": summed_ids}, + attrs={"pooltype": "MEAN"}, + ) return out @@ -154,16 +154,15 @@ def segment_min(data, segment_ids, name=None): Args: data (tensor): a tensor, available data type float32, float64, int32, int64, float16. segment_ids (tensor): a 1-d tensor, which have the same size - with the first dimension of input data. + with the first dimension of input data. available data type is int32, int64. - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - output (Tensor): the reduced result. + - output (Tensor), the reduced result. Examples: - .. code-block:: python import paddle @@ -177,29 +176,30 @@ def segment_min(data, segment_ids, name=None): if in_dygraph_mode(): return _C_ops.segment_pool(data, segment_ids, "MIN")[0] if _in_legacy_dygraph(): - out, tmp = _legacy_C_ops.segment_pool(data, segment_ids, 'pooltype', - "MIN") + out, tmp = _legacy_C_ops.segment_pool( + data, segment_ids, 'pooltype', "MIN" + ) return out check_variable_and_dtype( - data, "X", ("float32", "float64", "int32", "int64", "float16"), - "segment_pool") - check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), - "segment_pool") + data, + "X", + ("float32", "float64", "int32", "int64", "float16"), + "segment_pool", + ) + check_variable_and_dtype( + segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool" + ) helper = LayerHelper("segment_min", **locals()) out = helper.create_variable_for_type_inference(dtype=data.dtype) summed_ids = helper.create_variable_for_type_inference(dtype=data.dtype) - helper.append_op(type="segment_pool", - inputs={ - "X": data, - "SegmentIds": segment_ids - }, - outputs={ - "Out": out, - "SummedIds": summed_ids - }, - attrs={"pooltype": "MIN"}) + helper.append_op( + type="segment_pool", + inputs={"X": data, "SegmentIds": segment_ids}, + outputs={"Out": out, "SummedIds": summed_ids}, + attrs={"pooltype": "MIN"}, + ) return out @@ -215,16 +215,15 @@ def segment_max(data, segment_ids, name=None): Args: data (tensor): a tensor, available data type float32, float64, int32, int64, float16. 
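To make the per-segment reduction concrete, here is a NumPy-only reference (not the Paddle kernel; the toy arrays are made up) of what the sum and mean variants compute, reducing rows of ``data`` that share a segment id:

.. code-block:: python

    import numpy as np

    data = np.array([[1., 2., 3.],
                     [3., 2., 1.],
                     [4., 5., 6.]], dtype=np.float32)
    segment_ids = np.array([0, 0, 1])            # one id per row of data

    num_segments = segment_ids.max() + 1
    out_sum = np.zeros((num_segments, data.shape[1]), dtype=data.dtype)
    np.add.at(out_sum, segment_ids, data)        # scatter-add each row into its segment
    counts = np.bincount(segment_ids, minlength=num_segments).reshape(-1, 1)
    out_mean = out_sum / counts

    print(out_sum)    # [[4. 4. 4.] [4. 5. 6.]]
    print(out_mean)   # [[2. 2. 2.] [4. 5. 6.]]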
segment_ids (tensor): a 1-d tensor, which have the same size - with the first dimension of input data. + with the first dimension of input data. available data type is int32, int64. - name (str, optional): Name for the operation (optional, default is None). + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - output (Tensor): the reduced result. + - output (Tensor), the reduced result. Examples: - .. code-block:: python import paddle @@ -238,27 +237,28 @@ def segment_max(data, segment_ids, name=None): if in_dygraph_mode(): return _C_ops.segment_pool(data, segment_ids, "MAX")[0] if _in_legacy_dygraph(): - out, tmp = _legacy_C_ops.segment_pool(data, segment_ids, 'pooltype', - "MAX") + out, tmp = _legacy_C_ops.segment_pool( + data, segment_ids, 'pooltype', "MAX" + ) return out check_variable_and_dtype( - data, "X", ("float32", "float64", "int32", "int64", "float16"), - "segment_pool") - check_variable_and_dtype(segment_ids, "SegmentIds", ("int32", "int64"), - "segment_pool") + data, + "X", + ("float32", "float64", "int32", "int64", "float16"), + "segment_pool", + ) + check_variable_and_dtype( + segment_ids, "SegmentIds", ("int32", "int64"), "segment_pool" + ) helper = LayerHelper("segment_max", **locals()) out = helper.create_variable_for_type_inference(dtype=data.dtype) summed_ids = helper.create_variable_for_type_inference(dtype=data.dtype) - helper.append_op(type="segment_pool", - inputs={ - "X": data, - "SegmentIds": segment_ids - }, - outputs={ - "Out": out, - "SummedIds": summed_ids - }, - attrs={"pooltype": "MAX"}) + helper.append_op( + type="segment_pool", + inputs={"X": data, "SegmentIds": segment_ids}, + outputs={"Out": out, "SummedIds": summed_ids}, + attrs={"pooltype": "MAX"}, + ) return out diff --git a/python/paddle/geometric/message_passing/__init__.py b/python/paddle/geometric/message_passing/__init__.py index f215e5be74a48b..c07f9bc40c6b39 100644 --- a/python/paddle/geometric/message_passing/__init__.py +++ b/python/paddle/geometric/message_passing/__init__.py @@ -16,8 +16,4 @@ from .send_recv import send_ue_recv # noqa: F401 from .send_recv import send_uv # noqa: F401 -__all__ = [ - 'send_u_recv', - 'send_ue_recv', - 'send_uv', -] +__all__ = [] diff --git a/python/paddle/geometric/message_passing/send_recv.py b/python/paddle/geometric/message_passing/send_recv.py index 03a272aa6af08f..839b6e93e80a27 100644 --- a/python/paddle/geometric/message_passing/send_recv.py +++ b/python/paddle/geometric/message_passing/send_recv.py @@ -14,29 +14,38 @@ import numpy as np from paddle.fluid.layer_helper import LayerHelper -from paddle.fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode +from paddle.fluid.framework import ( + _non_static_mode, + _in_legacy_dygraph, + in_dygraph_mode, +) from paddle.fluid.framework import Variable -from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype +from paddle.fluid.data_feeder import ( + check_variable_and_dtype, + check_type, + check_dtype, + convert_dtype, +) from paddle import _C_ops, _legacy_C_ops -from .utils import convert_out_size_to_list, get_out_size_tensor_inputs, reshape_lhs_rhs +from .utils import ( + convert_out_size_to_list, + get_out_size_tensor_inputs, + reshape_lhs_rhs, +) __all__ = [] -def send_u_recv(x, - src_index, - dst_index, - reduce_op="sum", - out_size=None, - name=None): +def send_u_recv( + x, src_index, dst_index, reduce_op="sum", out_size=None, name=None +): 
""" - Graph Learning message passing api. - This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory + This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` - to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor + to gather the corresponding data, and then use `dst_index` to update the corresponding position of output tensor in different reduce ops, like sum, mean, max, or min. Besides, we can use `out_size` to set necessary output shape. .. code-block:: text @@ -65,21 +74,20 @@ def send_u_recv(x, x (Tensor): The input tensor, and the available data type is float32, float64, int32, int64. And we support float16 in gpu version. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. - dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. - The available data type is int32, int64. + dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. + The available data type is int32, int64. reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`. Default value is `sum`. - out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or + out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or out_size is smaller or equal to 0, then this input will not be used. - Otherwise, `out_size` should be equal with or larger than + Otherwise, `out_size` should be equal with or larger than max(dst_index) + 1. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - out (Tensor): The output tensor, should have the same shape and same dtype as input tensor `x`. - If `out_size` is set correctly, then it should have the same shape as `x` except - the 0th dimension. + - out (Tensor), the output tensor, should have the same shape and same dtype as input tensor `x`. + If `out_size` is set correctly, then it should have the same shape as `x` except the 0th dimension. Examples: .. code-block:: python @@ -110,74 +118,93 @@ def send_u_recv(x, if reduce_op not in ["sum", "mean", "max", "min"]: raise ValueError( "reduce_op should be `sum`, `mean`, `max` or `min`, but received %s" - % reduce_op) + % reduce_op + ) # TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1. 
if _in_legacy_dygraph(): out_size = convert_out_size_to_list(out_size) - out, tmp = _legacy_C_ops.graph_send_recv(x, src_index, dst_index, - None, 'reduce_op', - reduce_op.upper(), 'out_size', - out_size) + out, tmp = _legacy_C_ops.graph_send_recv( + x, + src_index, + dst_index, + None, + 'reduce_op', + reduce_op.upper(), + 'out_size', + out_size, + ) return out if in_dygraph_mode(): out_size = convert_out_size_to_list(out_size) - return _C_ops.graph_send_recv(x, src_index, dst_index, - reduce_op.upper(), out_size) + return _C_ops.graph_send_recv( + x, src_index, dst_index, reduce_op.upper(), out_size + ) check_variable_and_dtype( - x, "X", ("float32", "float64", "int32", "int64", "float16"), - "graph_send_recv") - check_variable_and_dtype(src_index, "Src_index", ("int32", "int64"), - "graph_send_recv") - check_variable_and_dtype(dst_index, "Dst_index", ("int32", "int64"), - "graph_send_recv") + x, + "X", + ("float32", "float64", "int32", "int64", "float16"), + "graph_send_recv", + ) + check_variable_and_dtype( + src_index, "Src_index", ("int32", "int64"), "graph_send_recv" + ) + check_variable_and_dtype( + dst_index, "Dst_index", ("int32", "int64"), "graph_send_recv" + ) if out_size: - check_type(out_size, 'out_size', (int, np.int32, np.int64, Variable), - 'graph_send_recv') + check_type( + out_size, + 'out_size', + (int, np.int32, np.int64, Variable), + 'graph_send_recv', + ) if isinstance(out_size, Variable): - check_dtype(out_size.dtype, 'out_size', ['int32', 'int64'], - 'graph_send_recv') + check_dtype( + out_size.dtype, 'out_size', ['int32', 'int64'], 'graph_send_recv' + ) helper = LayerHelper("send_u_recv", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - dst_count = helper.create_variable_for_type_inference(dtype="int32", - stop_gradient=True) + dst_count = helper.create_variable_for_type_inference( + dtype="int32", stop_gradient=True + ) inputs = {"X": x, "Src_index": src_index, "Dst_index": dst_index} attrs = {"reduce_op": reduce_op.upper()} - get_out_size_tensor_inputs(inputs=inputs, - attrs=attrs, - out_size=out_size, - op_type='graph_send_recv') - - helper.append_op(type="graph_send_recv", - inputs=inputs, - outputs={ - "Out": out, - "Dst_count": dst_count - }, - attrs=attrs) + get_out_size_tensor_inputs( + inputs=inputs, attrs=attrs, out_size=out_size, op_type='graph_send_recv' + ) + + helper.append_op( + type="graph_send_recv", + inputs=inputs, + outputs={"Out": out, "Dst_count": dst_count}, + attrs=attrs, + ) return out -def send_ue_recv(x, - y, - src_index, - dst_index, - message_op="add", - reduce_op="sum", - out_size=None, - name=None): +def send_ue_recv( + x, + y, + src_index, + dst_index, + message_op="add", + reduce_op="sum", + out_size=None, + name=None, +): """ Graph Learning message passing api. - This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory + This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory consumption in the process of message passing. Take `x` as the input tensor, we first use `src_index` - to gather the corresponding data, after computing with `y` in different message ops like add/sub/mul/div, then use `dst_index` to - update the corresponding position of output tensor in different reduce ops, like sum, mean, max, or min. 
+ to gather the corresponding data, after computing with `y` in different message ops like add/sub/mul/div, then use `dst_index` to + update the corresponding position of output tensor in different reduce ops, like sum, mean, max, or min. Besides, we can use `out_size` to set necessary output shape. .. code-block:: text @@ -205,28 +232,28 @@ def send_ue_recv(x, out = [[1, 3, 4], [4, 10, 12], [2, 5, 6]] + Args: x (Tensor): The input node feature tensor, and the available data type is float32, float64, int32, int64. And we support float16 in gpu version. y (Tensor): The input edge feature tensor, and the available data type is float32, float64, int32, int64. And we support float16 in gpu version. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. - dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. + dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. The available data type is int32, int64. - message_op (str): Different message ops for x and e, including `add`, `sub`, `mul`, `div`. - reduce_op (str): Different reduce ops, including `sum`, `mean`, `max`, `min`. + message_op (str, optional): Different message ops for x and e, including `add`, `sub`, `mul`, `div`. + reduce_op (str, optional): Different reduce ops, including `sum`, `mean`, `max`, `min`. Default value is `sum`. - out_size (int|Tensor|None): We can set `out_size` to get necessary output shape. If not set or + out_size (int|Tensor, optional): We can set `out_size` to get necessary output shape. If not set or out_size is smaller or equal to 0, then this input will not be used. Otherwise, `out_size` should be equal with or larger than - max(dst_index) + 1. + max(dst_index) + 1. Default value is `None`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - out (Tensor): The output tensor, should have the same shape and same dtype as input tensor `x`. - If `out_size` is set correctly, then it should have the same shape as `x` except - the 0th dimension. + - out (Tensor), the output tensor, should have the same shape and same dtype as input tensor `x`. + If `out_size` is set correctly, then it should have the same shape as `x` except the 0th dimension. Examples: .. code-block:: python @@ -259,13 +286,15 @@ def send_ue_recv(x, if message_op not in ["add", "sub", "mul", "div"]: raise ValueError( - "message_op should be `add`, `sub`, `mul`, `div`, but received %s" % - message_op) + "message_op should be `add`, `sub`, `mul`, `div`, but received %s" + % message_op + ) if reduce_op not in ["sum", "mean", "max", "min"]: raise ValueError( "reduce_op should be `sum`, `mean`, `max` or `min`, but received %s" - % reduce_op) + % reduce_op + ) x, y = reshape_lhs_rhs(x, y) @@ -274,61 +303,89 @@ def send_ue_recv(x, y = -y if message_op == "div": message_op = 'mul' - y = 1. / (y + 1e-12) + y = 1.0 / (y + 1e-12) # TODO(daisiming): Should we add judgement for out_size: max(dst_index) + 1. 
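Putting the pieces together, a NumPy-only sketch (not the actual kernel; the toy graph and edge features are made up) of what ``send_ue_recv`` computes for ``message_op='add'`` and ``reduce_op='sum'``, plus a check that the ``div``-to-``mul`` rewrite just above only adds a harmless 1e-12 guard against division by zero:

.. code-block:: python

    import numpy as np

    x = np.array([[0., 2., 3.],
                  [1., 4., 5.],
                  [2., 6., 7.]])          # node features
    e = np.array([[1., 1., 1.],
                  [2., 2., 2.],
                  [3., 3., 3.],
                  [4., 4., 4.]])          # one edge feature per (src, dst) pair
    src_index = np.array([0, 1, 2, 0])
    dst_index = np.array([1, 2, 1, 0])

    msg = x[src_index] + e                # "send": gather source rows, apply message_op='add'
    out = np.zeros_like(x)                # out_size defaults to x.shape[0]
    np.add.at(out, dst_index, msg)        # "recv": scatter-reduce with reduce_op='sum'
    print(out)
    # [[ 4.  6.  7.]
    #  [ 6. 12. 14.]
    #  [ 3.  6.  7.]]

    # div rewritten as mul by a guarded reciprocal, as in the code above.
    np.testing.assert_allclose(
        x[src_index] / e, x[src_index] * (1.0 / (e + 1e-12)), rtol=1e-9)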
if _in_legacy_dygraph(): out_size = convert_out_size_to_list(out_size) - out, tmp = _legacy_C_ops.graph_send_ue_recv(x, y, src_index, dst_index, - None, 'message_op', - message_op.upper(), - 'reduce_op', - reduce_op.upper(), - 'out_size', out_size) + out, tmp = _legacy_C_ops.graph_send_ue_recv( + x, + y, + src_index, + dst_index, + None, + 'message_op', + message_op.upper(), + 'reduce_op', + reduce_op.upper(), + 'out_size', + out_size, + ) return out if in_dygraph_mode(): out_size = convert_out_size_to_list(out_size) - return _C_ops.graph_send_ue_recv(x, y, src_index, dst_index, - message_op.upper(), reduce_op.upper(), - out_size) + return _C_ops.graph_send_ue_recv( + x, + y, + src_index, + dst_index, + message_op.upper(), + reduce_op.upper(), + out_size, + ) check_variable_and_dtype( - x, "X", ("float32", "float64", "int32", "int64", "float16"), - "graph_send_ue_recv") + x, + "X", + ("float32", "float64", "int32", "int64", "float16"), + "graph_send_ue_recv", + ) + check_variable_and_dtype( + y, + "Y", + ("float32", "float64", "int32", "int64", "float16"), + "graph_send_ue_recv", + ) + check_variable_and_dtype( + src_index, "Src_index", ("int32", "int64"), "graph_send_ue_recv" + ) check_variable_and_dtype( - y, "Y", ("float32", "float64", "int32", "int64", "float16"), - "graph_send_ue_recv") - check_variable_and_dtype(src_index, "Src_index", ("int32", "int64"), - "graph_send_ue_recv") - check_variable_and_dtype(dst_index, "Dst_index", ("int32", "int64"), - "graph_send_ue_recv") + dst_index, "Dst_index", ("int32", "int64"), "graph_send_ue_recv" + ) if out_size: - check_type(out_size, 'out_size', (int, np.int32, np.int64, Variable), - 'graph_send_ue_recv') + check_type( + out_size, + 'out_size', + (int, np.int32, np.int64, Variable), + 'graph_send_ue_recv', + ) if isinstance(out_size, Variable): - check_dtype(out_size.dtype, 'out_size', ['int32', 'int64'], - 'graph_send_ue_recv') + check_dtype( + out_size.dtype, 'out_size', ['int32', 'int64'], 'graph_send_ue_recv' + ) helper = LayerHelper("send_ue_recv", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - dst_count = helper.create_variable_for_type_inference(dtype="int32", - stop_gradient=True) + dst_count = helper.create_variable_for_type_inference( + dtype="int32", stop_gradient=True + ) inputs = {"X": x, "Y": y, "Src_index": src_index, "Dst_index": dst_index} attrs = {"message_op": message_op.upper(), "reduce_op": reduce_op.upper()} - get_out_size_tensor_inputs(inputs=inputs, - attrs=attrs, - out_size=out_size, - op_type='graph_send_ue_recv') - - helper.append_op(type="graph_send_ue_recv", - inputs=inputs, - outputs={ - "Out": out, - "Dst_count": dst_count - }, - attrs=attrs) + get_out_size_tensor_inputs( + inputs=inputs, + attrs=attrs, + out_size=out_size, + op_type='graph_send_ue_recv', + ) + + helper.append_op( + type="graph_send_ue_recv", + inputs=inputs, + outputs={"Out": out, "Dst_count": dst_count}, + attrs=attrs, + ) return out @@ -337,8 +394,8 @@ def send_uv(x, y, src_index, dst_index, message_op="add", name=None): Graph Learning message passing api. - This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory - consumption in the process of message passing. Take `x` as the source node feature tensor, take `y` as + This api is mainly used in Graph Learning domain, and the main purpose is to reduce intermediate memory + consumption in the process of message passing. Take `x` as the source node feature tensor, take `y` as the destination node feature tensor. 
Then we use `src_index` and `dst_index` to gather the corresponding data, and then compute the edge features in different message_ops like `add`, `sub`, `mul`, `div`. @@ -371,16 +428,17 @@ def send_uv(x, y, src_index, dst_index, message_op="add", name=None): x (Tensor): The source node feature tensor, and the available data type is float32, float64, int32, int64. And we support float16 in gpu version. y (Tensor): The destination node feature tensor, and the available data type is float32, float64, int32, int64. And we support float16 in gpu version. src_index (Tensor): An 1-D tensor, and the available data type is int32, int64. - dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. - The available data type is int32, int64. + dst_index (Tensor): An 1-D tensor, and should have the same shape as `src_index`. + The available data type is int32, int64. message_op (str): Different message ops for x and y, including `add`, `sub`, `mul` and `div`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - out (Tensor): The output tensor. + - out (Tensor), the output tensor. Examples: + .. code-block:: python import paddle @@ -397,8 +455,9 @@ def send_uv(x, y, src_index, dst_index, message_op="add", name=None): if message_op not in ['add', 'sub', 'mul', 'div']: raise ValueError( - "message_op should be `add`, `sub`, `mul`, `div`, but received %s" % - message_op) + "message_op should be `add`, `sub`, `mul`, `div`, but received %s" + % message_op + ) x, y = reshape_lhs_rhs(x, y) @@ -407,38 +466,50 @@ def send_uv(x, y, src_index, dst_index, message_op="add", name=None): y = -y if message_op == 'div': message_op = 'mul' - y = 1. / (y + 1e-12) + y = 1.0 / (y + 1e-12) if in_dygraph_mode(): - return _C_ops.graph_send_uv(x, y, src_index, dst_index, - message_op.upper()) + return _C_ops.graph_send_uv( + x, y, src_index, dst_index, message_op.upper() + ) else: if _in_legacy_dygraph(): - return _legacy_C_ops.graph_send_uv(x, y, src_index, dst_index, - "message_op", message_op.upper()) + return _legacy_C_ops.graph_send_uv( + x, y, src_index, dst_index, "message_op", message_op.upper() + ) else: helper = LayerHelper("send_uv", **locals()) check_variable_and_dtype( - x, 'x', ['int32', 'int64', 'float32', 'float64', 'float16'], - 'graph_send_uv') + x, + 'x', + ['int32', 'int64', 'float32', 'float64', 'float16'], + 'graph_send_uv', + ) + check_variable_and_dtype( + y, + 'y', + ['int32', 'int64', 'float32', 'float64', 'float16'], + 'graph_send_uv', + ) + check_variable_and_dtype( + src_index, 'src_index', ['int32', 'int64'], 'graph_send_uv' + ) check_variable_and_dtype( - y, 'y', ['int32', 'int64', 'float32', 'float64', 'float16'], - 'graph_send_uv') - check_variable_and_dtype(src_index, 'src_index', ['int32', 'int64'], - 'graph_send_uv') - check_variable_and_dtype(dst_index, 'dst_index', ['int32', 'int64'], - 'graph_send_uv') + dst_index, 'dst_index', ['int32', 'int64'], 'graph_send_uv' + ) out = helper.create_variable_for_type_inference(dtype=x.dtype) inputs = { 'x': x, 'y': y, 'src_index': src_index, - 'dst_index': dst_index + 'dst_index': dst_index, } attrs = {'message_op': message_op.upper()} - helper.append_op(type="graph_send_uv", - inputs=inputs, - attrs=attrs, - outputs={"out": out}) + helper.append_op( + type="graph_send_uv", + inputs=inputs, + attrs=attrs, + outputs={"out": out}, + ) return out diff --git a/python/paddle/geometric/reindex.py b/python/paddle/geometric/reindex.py index 
9580ff5c4ee1f2..3b68931dfb99e7 100644 --- a/python/paddle/geometric/reindex.py +++ b/python/paddle/geometric/reindex.py @@ -22,214 +22,205 @@ __all__ = [] -def reindex_graph(x, - neighbors, - count, - value_buffer=None, - index_buffer=None, - name=None): +def reindex_graph( + x, neighbors, count, value_buffer=None, index_buffer=None, name=None +): """ + Reindex Graph API. This API is mainly used in Graph Learning domain, which should be used - in conjunction with `graph_sample_neighbors` API. And the main purpose - is to reindex the ids information of the input nodes, and return the + in conjunction with `paddle.geometric.sample_neighbors` API. And the main purpose + is to reindex the ids information of the input nodes, and return the corresponding graph edges after reindex. - **Notes**: - The number in x should be unique, otherwise it would cause potential errors. - We will reindex all the nodes from 0. - - Take input nodes x = [0, 1, 2] as an example. - If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], - then we know that the neighbors of 0 is [8, 9], the neighbors of 1 - is [0, 4, 7], and the neighbors of 2 is [6, 7]. - Then after graph_reindex, we will have 3 different outputs: - 1. reindex_src: [3, 4, 0, 5, 6, 7, 6] - 2. reindex_dst: [0, 0, 1, 1, 1, 2, 2] - 3. out_nodes: [0, 1, 2, 8, 9, 4, 7, 6] - We can see that the numbers in `reindex_src` and `reindex_dst` is the corresponding index + Take input nodes x = [0, 1, 2] as an example. If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], + then we know that the neighbors of 0 is [8, 9], the neighbors of 1 is [0, 4, 7], and the neighbors of 2 is [6, 7]. + Then after graph_reindex, we will have 3 different outputs: reindex_src: [3, 4, 0, 5, 6, 7, 6], reindex_dst: [0, 0, 1, 1, 1, 2, 2] + and out_nodes: [0, 1, 2, 8, 9, 4, 7, 6]. We can see that the numbers in `reindex_src` and `reindex_dst` is the corresponding index of nodes in `out_nodes`. + Note: + The number in x should be unique, otherwise it would cause potential errors. We will reindex all the nodes from 0. + Args: x (Tensor): The input nodes which we sample neighbors for. The available data type is int32, int64. neighbors (Tensor): The neighbors of the input nodes `x`. The data type should be the same with `x`. - count (Tensor): The neighbor count of the input nodes `x`. And the + count (Tensor): The neighbor count of the input nodes `x`. And the data type should be int32. - value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, + value_buffer (Tensor, optional): Value buffer for hashtable. The data type should be int32, + and should be filled with -1. Only useful for gpu version. Default is None. + index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32, and should be filled with -1. Only useful for gpu version. - index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, - and should be filled with -1. Only useful for gpu version. - `value_buffer` and `index_buffer` should be both not None - if you want to speed up by using hashtable buffer. + `value_buffer` and `index_buffer` should be both not None + if you want to speed up by using hashtable buffer. Default is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: - reindex_src (Tensor): The source node index of graph edges after reindex. 
- reindex_dst (Tensor): The destination node index of graph edges after reindex. - out_nodes (Tensor): The index of unique input nodes and neighbors before reindex, - where we put the input nodes `x` in the front, and put neighbor - nodes in the back. + - reindex_src (Tensor), the source node index of graph edges after reindex. + + - reindex_dst (Tensor), the destination node index of graph edges after reindex. + + - out_nodes (Tensor), the index of unique input nodes and neighbors before reindex, where we put the input nodes `x` in the front, and put neighbor nodes in the back. Examples: - .. code-block:: python - import paddle - - x = [0, 1, 2] - neighbors = [8, 9, 0, 4, 7, 6, 7] - count = [2, 3, 2] - x = paddle.to_tensor(x, dtype="int64") - neighbors = paddle.to_tensor(neighbors, dtype="int64") - count = paddle.to_tensor(count, dtype="int32") + import paddle - reindex_src, reindex_dst, out_nodes = \ - paddle.geometric.reindex_graph(x, neighbors, count) - # reindex_src: [3, 4, 0, 5, 6, 7, 6] - # reindex_dst: [0, 0, 1, 1, 1, 2, 2] - # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6] + x = [0, 1, 2] + neighbors = [8, 9, 0, 4, 7, 6, 7] + count = [2, 3, 2] + x = paddle.to_tensor(x, dtype="int64") + neighbors = paddle.to_tensor(neighbors, dtype="int64") + count = paddle.to_tensor(count, dtype="int32") + reindex_src, reindex_dst, out_nodes = paddle.geometric.reindex_graph(x, neighbors, count) + # reindex_src: [3, 4, 0, 5, 6, 7, 6] + # reindex_dst: [0, 0, 1, 1, 1, 2, 2] + # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6] """ - use_buffer_hashtable = True if value_buffer is not None \ - and index_buffer is not None else False + use_buffer_hashtable = ( + True if value_buffer is not None and index_buffer is not None else False + ) if _non_static_mode(): - reindex_src, reindex_dst, out_nodes = \ - _legacy_C_ops.graph_reindex(x, neighbors, count, value_buffer, index_buffer, - "flag_buffer_hashtable", use_buffer_hashtable) + reindex_src, reindex_dst, out_nodes = _legacy_C_ops.graph_reindex( + x, + neighbors, + count, + value_buffer, + index_buffer, + "flag_buffer_hashtable", + use_buffer_hashtable, + ) return reindex_src, reindex_dst, out_nodes check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex") - check_variable_and_dtype(neighbors, "Neighbors", ("int32", "int64"), - "graph_reindex") + check_variable_and_dtype( + neighbors, "Neighbors", ("int32", "int64"), "graph_reindex" + ) check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex") if use_buffer_hashtable: - check_variable_and_dtype(value_buffer, "HashTable_Value", ("int32"), - "graph_reindex") - check_variable_and_dtype(index_buffer, "HashTable_Index", ("int32"), - "graph_reindex") + check_variable_and_dtype( + value_buffer, "HashTable_Value", ("int32"), "graph_reindex" + ) + check_variable_and_dtype( + index_buffer, "HashTable_Index", ("int32"), "graph_reindex" + ) helper = LayerHelper("reindex_graph", **locals()) reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype) reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype) out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="graph_reindex", - inputs={ - "X": - x, - "Neighbors": - neighbors, - "Count": - count, - "HashTable_Value": - value_buffer if use_buffer_hashtable else None, - "HashTable_Index": - index_buffer if use_buffer_hashtable else None, - }, - outputs={ - "Reindex_Src": reindex_src, - "Reindex_Dst": reindex_dst, - "Out_Nodes": out_nodes - }, - attrs={"flag_buffer_hashtable": use_buffer_hashtable}) + 
helper.append_op( + type="graph_reindex", + inputs={ + "X": x, + "Neighbors": neighbors, + "Count": count, + "HashTable_Value": value_buffer if use_buffer_hashtable else None, + "HashTable_Index": index_buffer if use_buffer_hashtable else None, + }, + outputs={ + "Reindex_Src": reindex_src, + "Reindex_Dst": reindex_dst, + "Out_Nodes": out_nodes, + }, + attrs={"flag_buffer_hashtable": use_buffer_hashtable}, + ) return reindex_src, reindex_dst, out_nodes -def reindex_heter_graph(x, - neighbors, - count, - value_buffer=None, - index_buffer=None, - name=None): +def reindex_heter_graph( + x, neighbors, count, value_buffer=None, index_buffer=None, name=None +): """ + Reindex HeterGraph API. This API is mainly used in Graph Learning domain, which should be used - in conjunction with `graph_sample_neighbors` API. And the main purpose + in conjunction with `paddle.geometric.sample_neighbors` API. And the main purpose is to reindex the ids information of the input nodes, and return the corresponding graph edges after reindex. - **Notes**: - The number in x should be unique, otherwise it would cause potential errors. - We support multi-edge-types neighbors reindexing in reindex_heter_graph api. - We will reindex all the nodes from 0. - - Take input nodes x = [0, 1, 2] as an example. - For graph A, suppose we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], - then we know that the neighbors of 0 is [8, 9], the neighbors of 1 - is [0, 4, 7], and the neighbors of 2 is [6, 7]. - For graph B, suppose we have neighbors = [0, 2, 3, 5, 1], and count = [1, 3, 1], - then we know that the neighbors of 0 is [0], the neighbors of 1 is [2, 3, 5], - and the neighbors of 3 is [1]. - We will get following outputs: - 1. reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1] - 2. reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2] - 3. out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5] + Take input nodes x = [0, 1, 2] as an example. For graph A, suppose we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], + then we know that the neighbors of 0 is [8, 9], the neighbors of 1 is [0, 4, 7], and the neighbors of 2 is [6, 7]. For graph B, + suppose we have neighbors = [0, 2, 3, 5, 1], and count = [1, 3, 1], then we know that the neighbors of 0 is [0], the neighbors of 1 is [2, 3, 5], + and the neighbors of 3 is [1]. We will get following outputs: reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1], reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2] + and out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5]. + + Note: + The number in x should be unique, otherwise it would cause potential errors. We support multi-edge-types neighbors reindexing in reindex_heter_graph api. We will reindex all the nodes from 0. Args: x (Tensor): The input nodes which we sample neighbors for. The available data type is int32, int64. - neighbors (list|tuple): The neighbors of the input nodes `x` from different graphs. + neighbors (list|tuple): The neighbors of the input nodes `x` from different graphs. The data type should be the same with `x`. - count (list|tuple): The neighbor counts of the input nodes `x` from different graphs. + count (list|tuple): The neighbor counts of the input nodes `x` from different graphs. And the data type should be int32. - value_buffer (Tensor|None): Value buffer for hashtable. The data type should be int32, - and should be filled with -1. Only useful for gpu version. - index_buffer (Tensor|None): Index buffer for hashtable. The data type should be int32, + value_buffer (Tensor, optional): Value buffer for hashtable. 
The data type should be int32, + and should be filled with -1. Only useful for gpu version. Default is None. + index_buffer (Tensor, optional): Index buffer for hashtable. The data type should be int32, and should be filled with -1. Only useful for gpu version. `value_buffer` and `index_buffer` should be both not None - if you want to speed up by using hashtable buffer. + if you want to speed up by using hashtable buffer. Default is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - reindex_src (Tensor): The source node index of graph edges after reindex. - reindex_dst (Tensor): The destination node index of graph edges after reindex. - out_nodes (Tensor): The index of unique input nodes and neighbors before reindex, - where we put the input nodes `x` in the front, and put neighbor - nodes in the back. + - reindex_src (Tensor), the source node index of graph edges after reindex. - Examples: + - reindex_dst (Tensor), the destination node index of graph edges after reindex. - .. code-block:: python + - out_nodes (Tensor), the index of unique input nodes and neighbors before reindex, + where we put the input nodes `x` in the front, and put neighbor + nodes in the back. - import paddle - - x = [0, 1, 2] - neighbors_a = [8, 9, 0, 4, 7, 6, 7] - count_a = [2, 3, 2] - x = paddle.to_tensor(x, dtype="int64") - neighbors_a = paddle.to_tensor(neighbors_a, dtype="int64") - count_a = paddle.to_tensor(count_a, dtype="int32") - - neighbors_b = [0, 2, 3, 5, 1] - count_b = [1, 3, 1] - neighbors_b = paddle.to_tensor(neighbors_b, dtype="int64") - count_b = paddle.to_tensor(count_b, dtype="int32") + Examples: + .. code-block:: python - neighbors = [neighbors_a, neighbors_b] - count = [count_a, count_b] - reindex_src, reindex_dst, out_nodes = \ - paddle.geometric.reindex_heter_graph(x, neighbors, count) - # reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1] - # reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2] - # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5] + import paddle + + x = [0, 1, 2] + neighbors_a = [8, 9, 0, 4, 7, 6, 7] + count_a = [2, 3, 2] + x = paddle.to_tensor(x, dtype="int64") + neighbors_a = paddle.to_tensor(neighbors_a, dtype="int64") + count_a = paddle.to_tensor(count_a, dtype="int32") + neighbors_b = [0, 2, 3, 5, 1] + count_b = [1, 3, 1] + neighbors_b = paddle.to_tensor(neighbors_b, dtype="int64") + count_b = paddle.to_tensor(count_b, dtype="int32") + neighbors = [neighbors_a, neighbors_b] + count = [count_a, count_b] + reindex_src, reindex_dst, out_nodes = paddle.geometric.reindex_heter_graph(x, neighbors, count) + # reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1] + # reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2] + # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5] """ - use_buffer_hashtable = True if value_buffer is not None \ - and index_buffer is not None else False + use_buffer_hashtable = ( + True if value_buffer is not None and index_buffer is not None else False + ) if _non_static_mode(): neighbors = paddle.concat(neighbors, axis=0) count = paddle.concat(count, axis=0) - reindex_src, reindex_dst, out_nodes = \ - _legacy_C_ops.graph_reindex(x, neighbors, count, value_buffer, index_buffer, - "flag_buffer_hashtable", use_buffer_hashtable) + reindex_src, reindex_dst, out_nodes = _legacy_C_ops.graph_reindex( + x, + neighbors, + count, + value_buffer, + index_buffer, + "flag_buffer_hashtable", + use_buffer_hashtable, + ) return reindex_src, reindex_dst, out_nodes if isinstance(neighbors, 
Variable): @@ -241,15 +232,18 @@ def reindex_heter_graph(x, count = paddle.concat(count, axis=0) check_variable_and_dtype(x, "X", ("int32", "int64"), "heter_graph_reindex") - check_variable_and_dtype(neighbors, "Neighbors", ("int32", "int64"), - "graph_reindex") + check_variable_and_dtype( + neighbors, "Neighbors", ("int32", "int64"), "graph_reindex" + ) check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex") if use_buffer_hashtable: - check_variable_and_dtype(value_buffer, "HashTable_Value", ("int32"), - "graph_reindex") - check_variable_and_dtype(index_buffer, "HashTable_Index", ("int32"), - "graph_reindex") + check_variable_and_dtype( + value_buffer, "HashTable_Value", ("int32"), "graph_reindex" + ) + check_variable_and_dtype( + index_buffer, "HashTable_Index", ("int32"), "graph_reindex" + ) helper = LayerHelper("reindex_heter_graph", **locals()) reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -257,23 +251,20 @@ def reindex_heter_graph(x, out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype) neighbors = paddle.concat(neighbors, axis=0) count = paddle.concat(count, axis=0) - helper.append_op(type="graph_reindex", - inputs={ - "X": - x, - "Neighbors": - neighbors, - "Count": - count, - "HashTable_Value": - value_buffer if use_buffer_hashtable else None, - "HashTable_Index": - index_buffer if use_buffer_hashtable else None, - }, - outputs={ - "Reindex_Src": reindex_src, - "Reindex_Dst": reindex_dst, - "Out_Nodes": out_nodes - }, - attrs={"flag_buffer_hashtable": use_buffer_hashtable}) + helper.append_op( + type="graph_reindex", + inputs={ + "X": x, + "Neighbors": neighbors, + "Count": count, + "HashTable_Value": value_buffer if use_buffer_hashtable else None, + "HashTable_Index": index_buffer if use_buffer_hashtable else None, + }, + outputs={ + "Reindex_Src": reindex_src, + "Reindex_Dst": reindex_dst, + "Out_Nodes": out_nodes, + }, + attrs={"flag_buffer_hashtable": use_buffer_hashtable}, + ) return reindex_src, reindex_dst, out_nodes diff --git a/python/paddle/geometric/sampling/__init__.py b/python/paddle/geometric/sampling/__init__.py index 282fb8fd56792e..2e5b24fdd60b7f 100644 --- a/python/paddle/geometric/sampling/__init__.py +++ b/python/paddle/geometric/sampling/__init__.py @@ -14,6 +14,4 @@ from .neighbors import sample_neighbors # noqa: F401 -__all__ = [ - 'sample_neighbors', -] +__all__ = [] diff --git a/python/paddle/geometric/sampling/neighbors.py b/python/paddle/geometric/sampling/neighbors.py index a9619d54a852ed..a52570576b04c6 100644 --- a/python/paddle/geometric/sampling/neighbors.py +++ b/python/paddle/geometric/sampling/neighbors.py @@ -21,25 +21,28 @@ __all__ = [] -def sample_neighbors(row, - colptr, - input_nodes, - sample_size=-1, - eids=None, - return_eids=False, - perm_buffer=None, - name=None): +def sample_neighbors( + row, + colptr, + input_nodes, + sample_size=-1, + eids=None, + return_eids=False, + perm_buffer=None, + name=None, +): """ + Graph Sample Neighbors API. This API is mainly used in Graph Learning domain, and the main purpose is to - provide high performance of graph sampling method. For example, we get the - CSC(Compressed Sparse Column) format of the input graph edges as `row` and + provide high performance of graph sampling method. For example, we get the + CSC(Compressed Sparse Column) format of the input graph edges as `row` and `colptr`, so as to convert graph data into a suitable format for sampling. 
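# A hedged, end-to-end sketch of the pipeline these reindex APIs are documented
# to form with paddle.geometric.sample_neighbors: sample neighbors from a CSC
# graph, then reindex the sampled subgraph from 0. The graph below mirrors the
# docstring examples in this patch; the explicit int32 cast of the counts is an
# assumption made to satisfy the count dtype that reindex_graph documents.
import paddle

row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")

# Sample up to 2 neighbors per input node, then relabel all nodes starting at 0.
neighbors, counts = paddle.geometric.sample_neighbors(row, colptr, nodes, sample_size=2)
reindex_src, reindex_dst, out_nodes = paddle.geometric.reindex_graph(
    nodes, neighbors, counts.astype("int32")
)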
- `input_nodes` means the nodes we need to sample neighbors, and `sample_sizes` + `input_nodes` means the nodes we need to sample neighbors, and `sample_sizes` means the number of neighbors and number of layers we want to sample. - Besides, we support fisher-yates sampling in GPU version. + Besides, we support fisher-yates sampling in GPU version. Args: row (Tensor): One of the components of the CSC format of the input graph, and @@ -50,95 +53,121 @@ def sample_neighbors(row, The data type should be the same with `row`. input_nodes (Tensor): The input nodes we need to sample neighbors for, and the data type should be the same with `row`. - sample_size (int): The number of neighbors we need to sample. Default value is -1, + sample_size (int, optional): The number of neighbors we need to sample. Default value is -1, which means returning all the neighbors of the input nodes. - eids (Tensor): The eid information of the input graph. If return_eids is True, - then `eids` should not be None. The data type should be the + eids (Tensor, optional): The eid information of the input graph. If return_eids is True, + then `eids` should not be None. The data type should be the same with `row`. Default is None. - return_eids (bool): Whether to return eid information of sample edges. Default is False. - perm_buffer (Tensor): Permutation buffer for fisher-yates sampling. If `use_perm_buffer` + return_eids (bool, optional): Whether to return eid information of sample edges. Default is False. + perm_buffer (Tensor, optional): Permutation buffer for fisher-yates sampling. If `use_perm_buffer` is True, then `perm_buffer` should not be None. The data type should be the same with `row`. If not None, we will use fiser-yates sampling - to speed up. Only useful for gpu version. + to speed up. Only useful for gpu version. Default is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - out_neighbors (Tensor): The sample neighbors of the input nodes. - out_count (Tensor): The number of sampling neighbors of each input node, and the shape - should be the same with `input_nodes`. - out_eids (Tensor): If `return_eids` is True, we will return the eid information of the - sample edges. + - out_neighbors (Tensor), the sample neighbors of the input nodes. + + - out_count (Tensor), the number of sampling neighbors of each input node, and the shape + should be the same with `input_nodes`. + + - out_eids (Tensor), if `return_eids` is True, we will return the eid information of the + sample edges. Examples: .. 
code-block:: python - import paddle - # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), - # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) - row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] - colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] - nodes = [0, 8, 1, 2] - sample_size = 2 - row = paddle.to_tensor(row, dtype="int64") - colptr = paddle.to_tensor(colptr, dtype="int64") - nodes = paddle.to_tensor(nodes, dtype="int64") - out_neighbors, out_count = \ - paddle.geometric.sample_neighbors(row, colptr, nodes, - sample_size=sample_size) + + import paddle + + # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), + # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) + row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] + colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] + nodes = [0, 8, 1, 2] + sample_size = 2 + row = paddle.to_tensor(row, dtype="int64") + colptr = paddle.to_tensor(colptr, dtype="int64") + nodes = paddle.to_tensor(nodes, dtype="int64") + out_neighbors, out_count = paddle.geometric.sample_neighbors(row, colptr, nodes, sample_size=sample_size) """ if return_eids: if eids is None: raise ValueError( - f"`eids` should not be None if `return_eids` is True.") + f"`eids` should not be None if `return_eids` is True." + ) use_perm_buffer = True if perm_buffer is not None else False if _non_static_mode(): - out_neighbors, out_count, out_eids = _legacy_C_ops.graph_sample_neighbors( - row, colptr, input_nodes, eids, perm_buffer, "sample_size", - sample_size, "return_eids", return_eids, "flag_perm_buffer", - use_perm_buffer) + ( + out_neighbors, + out_count, + out_eids, + ) = _legacy_C_ops.graph_sample_neighbors( + row, + colptr, + input_nodes, + eids, + perm_buffer, + "sample_size", + sample_size, + "return_eids", + return_eids, + "flag_perm_buffer", + use_perm_buffer, + ) if return_eids: return out_neighbors, out_count, out_eids return out_neighbors, out_count - check_variable_and_dtype(row, "Row", ("int32", "int64"), - "graph_sample_neighbors") - check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), - "graph_sample_neighbors") - check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), - "graph_sample_neighbors") + check_variable_and_dtype( + row, "Row", ("int32", "int64"), "graph_sample_neighbors" + ) + check_variable_and_dtype( + colptr, "Col_Ptr", ("int32", "int64"), "graph_sample_neighbors" + ) + check_variable_and_dtype( + input_nodes, "X", ("int32", "int64"), "graph_sample_neighbors" + ) if return_eids: - check_variable_and_dtype(eids, "Eids", ("int32", "int64"), - "graph_sample_neighbors") + check_variable_and_dtype( + eids, "Eids", ("int32", "int64"), "graph_sample_neighbors" + ) if use_perm_buffer: - check_variable_and_dtype(perm_buffer, "Perm_Buffer", ("int32", "int64"), - "graph_sample_neighbors") + check_variable_and_dtype( + perm_buffer, + "Perm_Buffer", + ("int32", "int64"), + "graph_sample_neighbors", + ) helper = LayerHelper("sample_neighbors", **locals()) out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype) out_count = helper.create_variable_for_type_inference(dtype=row.dtype) out_eids = helper.create_variable_for_type_inference(dtype=row.dtype) - helper.append_op(type="graph_sample_neighbors", - inputs={ - "Row": row, - "Col_Ptr": colptr, - "X": input_nodes, - "Eids": eids if return_eids else None, - "Perm_Buffer": perm_buffer if use_perm_buffer else None - }, - outputs={ - "Out": out_neighbors, - "Out_Count": out_count, - "Out_Eids": out_eids - }, - attrs={ - "sample_size": sample_size, - "return_eids": return_eids, - 
"flag_perm_buffer": use_perm_buffer - }) + helper.append_op( + type="graph_sample_neighbors", + inputs={ + "Row": row, + "Col_Ptr": colptr, + "X": input_nodes, + "Eids": eids if return_eids else None, + "Perm_Buffer": perm_buffer if use_perm_buffer else None, + }, + outputs={ + "Out": out_neighbors, + "Out_Count": out_count, + "Out_Eids": out_eids, + }, + attrs={ + "sample_size": sample_size, + "return_eids": return_eids, + "flag_perm_buffer": use_perm_buffer, + }, + ) if return_eids: return out_neighbors, out_count, out_eids return out_neighbors, out_count diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py index 1ba33a6b52bd7c..bdd79b35a499ab 100644 --- a/python/paddle/hapi/callbacks.py +++ b/python/paddle/hapi/callbacks.py @@ -20,7 +20,7 @@ import numpy as np import paddle -from paddle.distributed import ParallelEnv +from paddle.fluid.dygraph.parallel import ParallelEnv from paddle.utils import try_import from .progressbar import ProgressBar diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 16b3646a4a81a4..cea4951d8ef683 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -47,9 +47,10 @@ from paddle.io import DistributedBatchSampler from paddle.metric import Metric from paddle.static import InputSpec as Input -import paddle.distributed as dist -import paddle.distributed.fleet as fleet from paddle.distributed.fleet.base import role_maker +from paddle.autograd import no_grad +from paddle.distributed import fleet +from paddle.distributed.parallel import init_parallel_env from .callbacks import config_callbacks, EarlyStopping from .model_summary import summary @@ -68,8 +69,9 @@ def to_list(value): def to_numpy(var): - assert isinstance(var, (Variable, fluid.core.VarBase, - fluid.core.eager.Tensor)), "not a variable" + assert isinstance( + var, (Variable, fluid.core.VarBase, fluid.core.eager.Tensor) + ), "not a variable" if isinstance(var, (fluid.core.VarBase, fluid.core.eager.Tensor)): return var.numpy() t = global_scope().find_var(var.name).get_tensor() @@ -104,10 +106,9 @@ def extract_args(func): def _all_gather(x, nranks, ring_id=0, use_calc_stream=True): - return collective._c_allgather(x, - nranks, - ring_id=ring_id, - use_calc_stream=use_calc_stream) + return collective._c_allgather( + x, nranks, ring_id=ring_id, use_calc_stream=use_calc_stream + ) def wait_server_ready(endpoints): @@ -118,7 +119,8 @@ def wait_server_ready(endpoints): for ep in endpoints: ip_port = ep.split(":") with contextlib.closing( - socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + socket.socket(socket.AF_INET, socket.SOCK_STREAM) + ) as sock: sock.settimeout(2) result = sock.connect_ex((ip_port[0], int(ip_port[1]))) if result != 0: @@ -130,8 +132,9 @@ def wait_server_ready(endpoints): break -def init_communicator(program, rank, nranks, wait_port, current_endpoint, - endpoints): +def init_communicator( + program, rank, nranks, wait_port, current_endpoint, endpoints +): if nranks < 2: return other_endpoints = endpoints[:] @@ -143,53 +146,66 @@ def init_communicator(program, rank, nranks, wait_port, current_endpoint, nccl_id_var = block.create_var( name=fluid.unique_name.generate('nccl_id'), persistable=True, - type=fluid.core.VarDesc.VarType.RAW) - - block.append_op(type='c_gen_nccl_id', - inputs={}, - outputs={'Out': nccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints - }) - - block.append_op(type='c_comm_init', - inputs={'X': nccl_id_var}, - outputs={}, - attrs={ - 
'nranks': nranks, - 'rank': rank, - 'ring_id': 0, - }) + type=fluid.core.VarDesc.VarType.RAW, + ) + + block.append_op( + type='c_gen_nccl_id', + inputs={}, + outputs={'Out': nccl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + }, + ) + + block.append_op( + type='c_comm_init', + inputs={'X': nccl_id_var}, + outputs={}, + attrs={ + 'nranks': nranks, + 'rank': rank, + 'ring_id': 0, + }, + ) elif core.is_compiled_with_npu(): hccl_id_var = block.create_var( name=fluid.unique_name.generate('hccl_id'), persistable=True, - type=core.VarDesc.VarType.RAW) - block.append_op(type='c_gen_hccl_id', - inputs={}, - outputs={'Out': hccl_id_var}, - attrs={ - 'rank': rank, - 'endpoint': current_endpoint, - 'other_endpoints': other_endpoints - }) - block.append_op(type='c_comm_init_hccl', - inputs={'X': hccl_id_var}, - outputs={}, - attrs={ - 'rank': rank, - 'ring_id': 0, - 'device_id': int(os.getenv("FLAGS_selected_npus")), - 'rank_ids': nranks - }) + type=core.VarDesc.VarType.RAW, + ) + block.append_op( + type='c_gen_hccl_id', + inputs={}, + outputs={'Out': hccl_id_var}, + attrs={ + 'rank': rank, + 'endpoint': current_endpoint, + 'other_endpoints': other_endpoints, + }, + ) + block.append_op( + type='c_comm_init_hccl', + inputs={'X': hccl_id_var}, + outputs={}, + attrs={ + 'rank': rank, + 'ring_id': 0, + 'device_id': int(os.getenv("FLAGS_selected_npus")), + 'rank_ids': nranks, + }, + ) def prepare_distributed_context(place=None): if place is None: - place = fluid.CUDAPlace(ParallelEnv().dev_id) if ParallelEnv().nranks > 1 \ + place = ( + fluid.CUDAPlace(ParallelEnv().dev_id) + if ParallelEnv().nranks > 1 else fluid.CUDAPlace(0) + ) place = _get_paddle_place(place) strategy = fluid.dygraph.parallel.ParallelStrategy() @@ -207,9 +223,14 @@ def prepare_distributed_context(place=None): def _init_context(): communicator_prog = fluid.Program() - init_communicator(communicator_prog, strategy.local_rank, - strategy.nranks, True, strategy.current_endpoint, - strategy.trainer_endpoints) + init_communicator( + communicator_prog, + strategy.local_rank, + strategy.nranks, + True, + strategy.current_endpoint, + strategy.trainer_endpoints, + ) exe = fluid.Executor(place) exe.run(communicator_prog) @@ -219,7 +240,7 @@ def _init_context(): fluid.enable_dygraph(place) else: - assert ("Only support CUDAPlace for now.") + assert "Only support CUDAPlace for now." _parallel_context_initialized = True return strategy @@ -245,7 +266,9 @@ def _update_input_info(inputs): class StaticGraphAdapter(object): """ + Model traning/inference with a static graph. + """ def __init__(self, model): @@ -268,7 +291,7 @@ def __init__(self, model): 'eval_total': 0, 'test_total': 0, 'eval_batch': 0, - 'test_batch': 0 + 'test_batch': 0, } self._nranks = ParallelEnv().nranks @@ -288,10 +311,13 @@ def mode(self, value): self.model.mode = value def train_batch(self, inputs, labels=None, update=True): - assert self.model._optimizer, \ - "model not ready, please call `model.prepare()` first" + assert ( + self.model._optimizer + ), "model not ready, please call `model.prepare()` first" self.mode = 'train' - assert update is True, "Does not support `update == False` in static mode by now." + assert ( + update is True + ), "Does not support `update == False` in static mode by now." 
return self._run(inputs, labels) def eval_batch(self, inputs, labels=None): @@ -306,7 +332,6 @@ def parameters(self, *args, **kwargs): return self.model.network.parameters(*args, **kwargs) def save(self, path): - def _save(state, path): if not state: return @@ -330,8 +355,7 @@ def _save(state, path): # XXX `optimizer.state_dict()` only work in dygraph mode optim_path = path + ".pdopt" optim = { - p.name: p - for p in filter(is_belong_to_optimizer, prog.list_vars()) + p.name: p for p in filter(is_belong_to_optimizer, prog.list_vars()) } if not optim: return @@ -347,8 +371,10 @@ def load(self, param_state_pairs, optim_state): # restore parameter states fluid.core._create_loaded_parameter( - [param for param, state in param_state_pairs], global_scope(), - executor) + [param for param, state in param_state_pairs], + global_scope(), + executor, + ) for param, state in param_state_pairs: self._set_var(param, state) @@ -376,9 +402,10 @@ def _load_optimizer(self, state, executor): # static-graph, since the time of global_step to increase is # different. state_val = ( - np.array(converted_state.pop("global_step")) - 1 - ) if "global_step" in converted_state else converted_state.pop( - "@LR_DECAY_COUNTER@", None) + (np.array(converted_state.pop("global_step")) - 1) + if "global_step" in converted_state + else converted_state.pop("@LR_DECAY_COUNTER@", None) + ) if state_val is not None: converted_state[var.name] = state_val elif var.name.startswith("learning_rate_"): @@ -395,36 +422,61 @@ def _load_optimizer(self, state, executor): opt_cls_name = self.model._optimizer.__class__.__name__ opt_unq_name = None for name in self.model._optimizer._accumulators.keys(): - accum_name = name if opt_name is None else name[ - len(opt_name) + 1:] - for param_name, state_var in self.model._optimizer._accumulators[ - name].items(): + accum_name = ( + name + if opt_name is None + else name[len(opt_name) + 1 :] + ) + for ( + param_name, + state_var, + ) in self.model._optimizer._accumulators[name].items(): if opt_unq_name is None: # can not infer out the exact unique(opt_name), # thus try to extract rather than generate - for state_key in sorted(state.keys(), - key=lambda x: len(x), - reverse=True): - prefix = param_name + "_" + ( - opt_cls_name - if opt_name is None else opt_name) + "_" + for state_key in sorted( + state.keys(), + key=lambda x: len(x), + reverse=True, + ): + prefix = ( + param_name + + "_" + + ( + opt_cls_name + if opt_name is None + else opt_name + ) + + "_" + ) if state_key.startswith(prefix): - prefix_offset = state_key[len( - prefix):].find("_") + len(prefix) + prefix_offset = state_key[ + len(prefix) : + ].find("_") + len(prefix) opt_unq_name = state_key[ - len(param_name + "_"):prefix_offset] + len( + param_name + "_" + ) : prefix_offset + ] # TODO: assert # assert opt_unq_name is None # gen(param.name + "_" + gen(opt_name) + "_" + accum_name) # always end with "_0" since the unique optimizer._name - dy_state_name = (param_name + "_" + opt_unq_name + - "_" + accum_name + "_0") + dy_state_name = ( + param_name + + "_" + + opt_unq_name + + "_" + + accum_name + + "_0" + ) converted_state[ - state_var.name] = converted_state.pop( - dy_state_name) + state_var.name + ] = converted_state.pop(dy_state_name) - assert var.name in converted_state, \ - "variable [{}] is not in optimizer state file".format(var.name) + assert ( + var.name in converted_state + ), "variable [{}] is not in optimizer state file".format(var.name) self._set_var(var, converted_state[var.name]) def _set_var(self, var, ndarray): @@ 
-443,15 +495,17 @@ def _set_var(self, var, ndarray): def _run(self, inputs, labels=None): compiled_prog = self._compiled_progs.get(self.mode, None) - assert compiled_prog, \ - "Model is not ready, please call `model.prepare()` first" + assert ( + compiled_prog + ), "Model is not ready, please call `model.prepare()` first" inputs = to_list(inputs) if labels is not None: labels = to_list(labels) - assert len(inputs) == len(self._input_vars[self.mode]), \ - "number of inputs" \ + assert len(inputs) == len(self._input_vars[self.mode]), ( + "number of inputs" + " does not match number of arguments of `forward` method" + ) feed = {} input_names = [v.name for v in self._input_vars[self.mode]] @@ -461,8 +515,10 @@ def _run(self, inputs, labels=None): # train and test may take different arguments if inputs[idx] is not None: feed[n] = inputs[idx] - if self._amp_level == 'O2' and input_dtypes[ - idx] == core.VarDesc.VarType.FP16: + if ( + self._amp_level == 'O2' + and input_dtypes[idx] == core.VarDesc.VarType.FP16 + ): if isinstance(feed[n], core.LoDTensor): feed[n] = feed[n]._as_type(core.VarDesc.VarType.FP16) elif isinstance(feed[n], np.array): @@ -490,10 +546,12 @@ def _run(self, inputs, labels=None): else: pruned_fetch_list.append(fetch_var) - rets = self._executor.run(compiled_prog, - feed=feed, - fetch_list=pruned_fetch_list, - return_numpy=False) + rets = self._executor.run( + compiled_prog, + feed=feed, + fetch_list=pruned_fetch_list, + return_numpy=False, + ) # restore pruned fetch_list Variable from feeds for i, name in enumerate(pruned_fetch_idx_name_map): @@ -509,20 +567,24 @@ def _run(self, inputs, labels=None): metrics = [] for metric, state in zip(self.model._metrics, metric_states): # cut off padding size - if self.mode != 'train' and self.model._test_dataloader is not None \ - and isinstance(self.model._test_dataloader, DataLoader) \ - and self._nranks > 1: + if ( + self.mode != 'train' + and self.model._test_dataloader is not None + and isinstance(self.model._test_dataloader, DataLoader) + and self._nranks > 1 + ): total_size = len(self.model._test_dataloader.dataset) # TODO: fixme if have better way to get batch size samples = state[0].shape[0] current_count = self._merge_count.get(self.mode + '_total', 0) if current_count + samples >= total_size: state = [ - s[:int(total_size - current_count), ...] for s in state + s[: int(total_size - current_count), ...] 
for s in state ] self._merge_count[self.mode + '_total'] = 0 - self._merge_count[self.mode + '_batch'] = int(total_size - - current_count) + self._merge_count[self.mode + '_batch'] = int( + total_size - current_count + ) else: self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_batch'] = samples @@ -554,8 +616,11 @@ def _make_program(self, mode): if mode != 'train': for op in list(prog.global_block().ops): prog.global_block()._remove_op(0) - if mode == 'train' and self.model._optimizer \ - and self.model._optimizer._learning_rate_map: + if ( + mode == 'train' + and self.model._optimizer + and self.model._optimizer._learning_rate_map + ): # HACK workaround learning rate map issue lr_var = self.model._optimizer._learning_rate_map[self._orig_prog] new_lr_var = prog.global_block().vars[lr_var.name] @@ -593,20 +658,27 @@ def _make_program(self, mode): dist_strategy.amp = True dist_strategy.amp_configs = self._amp_configs.copy() dist_strategy.amp_configs.update(self._amp_custom_lists) - dist_strategy.amp_configs[ - 'use_pure_fp16'] = self._amp_level == 'O2' + dist_strategy.amp_configs['use_pure_fp16'] = ( + self._amp_level == 'O2' + ) self.model._optimizer = fleet.distributed_optimizer( - self.model._optimizer, strategy=dist_strategy) + self.model._optimizer, strategy=dist_strategy + ) elif self._amp_level != "O0" and core.is_compiled_with_cuda: - amp_lists = paddle.static.amp.AutoMixedPrecisionLists( - **self._amp_custom_lists - ) if self._amp_custom_lists else None + amp_lists = ( + paddle.static.amp.AutoMixedPrecisionLists( + **self._amp_custom_lists + ) + if self._amp_custom_lists + else None + ) self.model._optimizer = paddle.static.amp.decorate( self.model._optimizer, amp_lists=amp_lists, use_pure_fp16=self._amp_level == "O2", use_fp16_guard=self._use_fp16_guard, - **self._amp_configs) + **self._amp_configs + ) self.model._optimizer.minimize(self._loss_endpoint) @@ -619,7 +691,7 @@ def _make_program(self, mode): self._endpoints[mode] = { "output": outputs, "loss": to_list(losses), - "metric": metrics + "metric": metrics, } def _compile_and_initialize(self, prog, mode): @@ -627,8 +699,9 @@ def _compile_and_initialize(self, prog, mode): if compiled_prog is not None: return compiled_prog - assert self.model._place is not None, \ - "device is not set, please call `model.prepare()` first" + assert ( + self.model._place is not None + ), "device is not set, please call `model.prepare()` first" place = self.model._place @@ -641,8 +714,11 @@ def _compile_and_initialize(self, prog, mode): uninitialized = [] for var_py in self._startup_prog.list_vars(): var = fluid.global_scope().find_var(var_py.name) - if not var_py.name.startswith('nccl_id') and var and \ - var.get_tensor()._is_initialized(): + if ( + not var_py.name.startswith('nccl_id') + and var + and var.get_tensor()._is_initialized() + ): continue uninitialized.append(var_py) @@ -650,7 +726,10 @@ def _compile_and_initialize(self, prog, mode): startup_prog = self._startup_prog._prune(uninitialized) self._executor.run(startup_prog) - if self._amp_level == "O2" and mode == 'train' and core.is_compiled_with_cuda( + if ( + self._amp_level == "O2" + and mode == 'train' + and core.is_compiled_with_cuda() ): self.model._optimizer.amp_init(place) @@ -663,7 +742,6 @@ def _compile_and_initialize(self, prog, mode): class DynamicGraphAdapter(object): - def __init__(self, model): super(DynamicGraphAdapter, self).__init__() self.model = model @@ -673,7 +751,7 @@ def __init__(self, model): 'eval_total': 0, 'test_total': 0, 
'eval_batch': 0, - 'test_batch': 0 + 'test_batch': 0, } self._input_info = None @@ -683,14 +761,15 @@ def __init__(self, model): self._use_fp16_guard = True if self._nranks > 1: - dist.init_parallel_env() + init_parallel_env() stradegy = fluid.dygraph.parallel.ParallelStrategy() stradegy.nranks = ParallelEnv().nranks stradegy.local_rank = ParallelEnv().local_rank stradegy.trainer_endpoints = ParallelEnv().trainer_endpoints stradegy.current_endpoint = ParallelEnv().current_endpoint self.ddp_model = fluid.dygraph.parallel.DataParallel( - self.model.network, stradegy) + self.model.network, stradegy + ) @property def mode(self): @@ -702,8 +781,9 @@ def mode(self, value): # TODO multi device in dygraph mode not implemented at present time def train_batch(self, inputs, labels=None, update=True): - assert self.model._optimizer, \ - "model not ready, please call `model.prepare()` first" + assert ( + self.model._optimizer + ), "model not ready, please call `model.prepare()` first" self.model.network.train() self.mode = 'train' inputs = to_list(inputs) @@ -715,15 +795,15 @@ def train_batch(self, inputs, labels=None, update=True): if self._amp_level != "O0" and self.model._scaler is None: self.model._scaler = paddle.amp.GradScaler(**self._amp_configs) - with paddle.amp.auto_cast(enable=self._amp_level != 'O0', - **self._amp_custom_lists, - level=self._amp_level): + with paddle.amp.auto_cast( + enable=self._amp_level != 'O0', + **self._amp_custom_lists, + level=self._amp_level + ): if self._nranks > 1: - outputs = self.ddp_model.forward( - *[to_variable(x) for x in inputs]) + outputs = self.ddp_model(*[to_variable(x) for x in inputs]) else: - outputs = self.model.network.forward( - *[to_variable(x) for x in inputs]) + outputs = self.model.network(*[to_variable(x) for x in inputs]) losses = self.model._loss(*(to_list(outputs) + labels)) losses = to_list(losses) @@ -747,8 +827,11 @@ def train_batch(self, inputs, labels=None, update=True): m = metric.update(*[to_numpy(m) for m in to_list(metric_outs)]) metrics.append(m) - return ([to_numpy(l) for l in losses], metrics) \ - if len(metrics) > 0 else [to_numpy(l) for l in losses] + return ( + ([to_numpy(l) for l in losses], metrics) + if len(metrics) > 0 + else [to_numpy(l) for l in losses] + ) def eval_batch(self, inputs, labels=None): self.model.network.eval() @@ -758,7 +841,7 @@ def eval_batch(self, inputs, labels=None): labels = labels or [] labels = [to_variable(l) for l in to_list(labels)] - outputs = self.model.network.forward(*[to_variable(x) for x in inputs]) + outputs = self.model.network(*[to_variable(x) for x in inputs]) # Transfrom data to expected device expected_device = paddle.device.get_device() @@ -778,21 +861,25 @@ def eval_batch(self, inputs, labels=None): metrics = [] for metric in self.model._metrics: # cut off padding value. 
- if self.model._test_dataloader is not None and self._nranks > 1 \ - and isinstance(self.model._test_dataloader, DataLoader): + if ( + self.model._test_dataloader is not None + and self._nranks > 1 + and isinstance(self.model._test_dataloader, DataLoader) + ): total_size = len(self.model._test_dataloader.dataset) samples = outputs[0].shape[0] current_count = self._merge_count.get(self.mode + '_total', 0) if current_count + samples >= total_size: outputs = [ - o[:int(total_size - current_count)] for o in outputs + o[: int(total_size - current_count)] for o in outputs ] labels = [ - l[:int(total_size - current_count)] for l in labels + l[: int(total_size - current_count)] for l in labels ] self._merge_count[self.mode + '_total'] = 0 - self._merge_count[self.mode + '_batch'] = int(total_size - - current_count) + self._merge_count[self.mode + '_batch'] = int( + total_size - current_count + ) else: self._merge_count[self.mode + '_total'] += samples self._merge_count[self.mode + '_batch'] = samples @@ -813,7 +900,7 @@ def predict_batch(self, inputs): self.mode = 'test' inputs = [to_variable(x) for x in to_list(inputs)] self._input_info = _update_input_info(inputs) - outputs = self.model.network.forward(*inputs) + outputs = self.model.network(*inputs) if self._nranks > 1 and isinstance(self.model._place, fluid.CUDAPlace): outputs = [_all_gather(o, self._nranks) for o in to_list(outputs)] @@ -859,38 +946,48 @@ def load(self, param_state_pairs, optim_state, scaler_state=None): opt_unq_name = '' opt_cls_name = self.model._optimizer.__class__.__name__ - opt_name = opt_unq_name[:opt_unq_name.rfind("_")] # remove suffix idx + opt_name = opt_unq_name[: opt_unq_name.rfind("_")] # remove suffix idx param_names = [param.name for param in self.model.network.parameters()] - for var_name, state_var in sorted(optim_state.items(), - key=lambda x: len(x[0]), - reverse=True): + for var_name, state_var in sorted( + optim_state.items(), key=lambda x: len(x[0]), reverse=True + ): if var_name in ["@LR_DECAY_COUNTER@", "global_step"]: # NOTE: dygraph saved global_step is 1 larger than that in # static-graph, since the time of global_step to increase is # different. 
if var_name == "@LR_DECAY_COUNTER@": - converted_state["global_step"] = np.array( - converted_state.pop("@LR_DECAY_COUNTER@")) + 1 + converted_state["global_step"] = ( + np.array(converted_state.pop("@LR_DECAY_COUNTER@")) + 1 + ) else: # moment and other accumulators # extend state dict to include promising dygraph names for param_name in param_names: if var_name.startswith(param_name + "_" + opt_name): # when init optimizer with name - accum_name = var_name[len(param_name + "_" + opt_name + - "_"):] - elif var_name.startswith(param_name + - "_") and opt_name == opt_cls_name: + accum_name = var_name[ + len(param_name + "_" + opt_name + "_") : + ] + elif ( + var_name.startswith(param_name + "_") + and opt_name == opt_cls_name + ): # when init optimizer without name - accum_name = var_name[len(param_name + "_"):] + accum_name = var_name[len(param_name + "_") :] else: continue # remove suffix idx - accum_name = accum_name[:accum_name.rfind("_")] + accum_name = accum_name[: accum_name.rfind("_")] # state names always end with "_0" in dygraph because of the # unique optimizer._name - dy_state_name = (param_name + "_" + opt_unq_name + "_" + - accum_name + "_0") + dy_state_name = ( + param_name + + "_" + + opt_unq_name + + "_" + + accum_name + + "_0" + ) converted_state[dy_state_name] = state_var if not hasattr(self.model._optimizer, 'set_state_dict'): @@ -902,18 +999,23 @@ def load(self, param_state_pairs, optim_state, scaler_state=None): self.model._optimizer.set_state_dict(converted_state) def prepare(self): - if self._amp_level == "O2" and self.model.mode == 'train' and core.is_compiled_with_cuda( + if ( + self._amp_level == "O2" + and self.model.mode == 'train' + and core.is_compiled_with_cuda() ): self.model.network, self.model._optimizer = paddle.amp.decorate( models=self.model.network, optimizers=self.model._optimizer, - level='O2') + level='O2', + ) if self._amp_level != "O0": self.model._scaler = None class Model(object): """ + An Model object is network with training and inference features. Dynamic graph and static graph are supported at the same time, switched by `paddle.enable_static()`. The usage is as follows. @@ -921,7 +1023,7 @@ class Model(object): instantiating a Model. The input description, i.e, paddle.static.InputSpec, must be required for static graph. - When training on GPU, auto mixed precision (AMP O1) and pure float16 + When training on GPU, auto mixed precision (AMP O1) and pure float16 (AMP O2) training are both supported in static mode and dynamic mode. In static graph mode, before training with pure float16 (AMP O2), `multi_precision` could be set to True when creating optimizer, which can @@ -966,7 +1068,7 @@ class Model(object): # inputs and labels are not required for dynamic graph. input = InputSpec([None, 784], 'float32', 'x') label = InputSpec([None, 1], 'int64', 'label') - + model = paddle.Model(net, input, label) optim = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters()) @@ -1054,16 +1156,17 @@ def __init__(self, network, inputs=None, labels=None): def train_batch(self, inputs, labels=None, update=True): """ + Run one training step on one batch of data. And using `update` indicates whether optimizer update gradients computing by this batch. Args: - inputs (numpy.ndarray|Tensor|list): Batch of input data. It could - be a numpy array or paddle.Tensor, or a list of arrays or + inputs (numpy.ndarray|Tensor|list): Batch of input data. 
It could + be a numpy array or paddle.Tensor, or a list of arrays or tensors (in case the model has multiple inputs). - labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be - a numpy array or paddle.Tensor, or a list of arrays or tensors - (in case the model has multiple labels). If has no labels, + labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be + a numpy array or paddle.Tensor, or a list of arrays or tensors + (in case the model has multiple labels). If has no labels, set None. Default: None. update (bool, optional): Whether update parameters after loss.backward() computing. Set it to False to accumulate gradients. Default: True. @@ -1076,7 +1179,7 @@ def train_batch(self, inputs, labels=None, update=True): Examples: .. code-block:: python - + import paddle import paddle.nn as nn from paddle.static import InputSpec @@ -1099,24 +1202,26 @@ def train_batch(self, inputs, labels=None, update=True): loss = model.train_batch([data], [label]) print(loss) # [array([2.192784], dtype=float32)] + """ loss = self._adapter.train_batch(inputs, labels, update) if fluid._non_static_mode() and self._input_info is None: self._update_inputs() return loss - @paddle.no_grad() + @no_grad() def eval_batch(self, inputs, labels=None): """ + Run one evaluating step on a batch of data. Args: - inputs (numpy.ndarray|Tensor|list): Batch of input data. It could - be a numpy array or paddle.Tensor, or a list of arrays or + inputs (numpy.ndarray|Tensor|list): Batch of input data. It could + be a numpy array or paddle.Tensor, or a list of arrays or tensors (in case the model has multiple inputs). - labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be - a numpy array or paddle.Tensor, or a list of arrays or tensors - (in case the model has multiple labels). If has no labels, + labels (numpy.ndarray|Tensor|list, optional): Batch of labels. It could be + a numpy array or paddle.Tensor, or a list of arrays or tensors + (in case the model has multiple labels). If has no labels, set None. Default: None. Returns: @@ -1151,20 +1256,22 @@ def eval_batch(self, inputs, labels=None): loss, acc = model.eval_batch([data], [label]) print(loss, acc) # [array([2.8825705], dtype=float32)] [0.0] + """ loss = self._adapter.eval_batch(inputs, labels) if fluid._non_static_mode() and self._input_info is None: self._update_inputs() return loss - @paddle.no_grad() + @no_grad() def predict_batch(self, inputs): """ + Run one predicting step on a batch of data. Args: - inputs (numpy.ndarray|Tensor|list): Batch of input data. It could - be a numpy array or paddle.Tensor, or a list of arrays or + inputs (numpy.ndarray|Tensor|list): Batch of input data. It could + be a numpy array or paddle.Tensor, or a list of arrays or tensors (in case the model has multiple inputs). 
Returns: @@ -1180,7 +1287,7 @@ def predict_batch(self, inputs): from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' - + input = InputSpec([None, 784], 'float32', 'x') label = InputSpec([None, 1], 'int64', 'label') @@ -1198,6 +1305,7 @@ def predict_batch(self, inputs): # [array([[0.08189095, 0.16740078, 0.06889386, 0.05085445, 0.10729759, # 0.02217775, 0.14518553, 0.1591538 , 0.01808308, 0.17906217]], # dtype=float32)] + """ loss = self._adapter.predict_batch(inputs) if fluid._non_static_mode() and self._input_info is None: @@ -1205,12 +1313,13 @@ def predict_batch(self, inputs): return loss def save(self, path, training=True): - """ - This function saves parameters, optimizer information or model and + """ + + This function saves parameters, optimizer information or model and paramters only for inference to path. It depends on the parameter `training`. - If `training` is set to True, the parameters saved contain all + If `training` is set to True, the parameters saved contain all the trainable Variable, will save to a file with suffix ".pdparams". The optimizer information contains all the variable used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. All the @@ -1269,10 +1378,11 @@ def forward(self, x): T.Normalize([127.5], [127.5]) ]) data = paddle.vision.datasets.MNIST(mode='train', transform=transform) - + model.fit(data, epochs=1, batch_size=32, verbose=0) model.save('checkpoint/test') # save for training model.save('inference_model', False) # save for inference + """ if ParallelEnv().local_rank == 0: @@ -1283,6 +1393,7 @@ def forward(self, x): def load(self, path, skip_mismatch=False, reset_optimizer=False): """ + Load from files storing the model states and optimizer states. The file for optimizer states is not necessary if no need to restore the optimizer. @@ -1330,6 +1441,7 @@ def load(self, path, skip_mismatch=False, reset_optimizer=False): model.save('checkpoint/test') model.load('checkpoint/test') + """ def _load_state_from_path(path): @@ -1342,17 +1454,24 @@ def _check_match(key, param): state = param_state.get(key, None) if state is None: raise ValueError( - "{} is not found in the providing file.".format(key)) + "{} is not found in the providing file.".format(key) + ) if list(state.shape) != list(param.shape): raise ValueError( - "{} receives a shape {}, but the expected shape is {}.". - format(key, list(state.shape), list(param.shape))) + "{} receives a shape {}, but the expected shape is {}.".format( + key, list(state.shape), list(param.shape) + ) + ) return param, state def _strip_postfix(path): path, ext = os.path.splitext(path) - assert ext in ['', '.pdparams', '.pdopt', '.pdmodel'], \ - "Unknown postfix {} from weights".format(ext) + assert ext in [ + '', + '.pdparams', + '.pdopt', + '.pdmodel', + ], "Unknown postfix {} from weights".format(ext) return path path = _strip_postfix(path) @@ -1366,15 +1485,17 @@ def _strip_postfix(path): except ValueError as err: if skip_mismatch: warnings.warn( - ("Skip loading for {}. ".format(key) + str(err))) + ("Skip loading for {}. 
".format(key) + str(err)) + ) # reset optimizer when mismatch happens reset_optimizer = True else: raise err matched_param_state.append(match_res) - optim_state = None if reset_optimizer else _load_state_from_path( - path + ".pdopt") + optim_state = ( + None if reset_optimizer else _load_state_from_path(path + ".pdopt") + ) # TODO: support save/load scaler state in static graph if _non_static_mode(): @@ -1383,13 +1504,15 @@ def _strip_postfix(path): if os.path.exists(path + '.pdscaler'): scaler_state = paddle.load(path + '.pdscaler') - return self._adapter.load(matched_param_state, optim_state, - scaler_state) + return self._adapter.load( + matched_param_state, optim_state, scaler_state + ) else: return self._adapter.load(matched_param_state, optim_state) def parameters(self, *args, **kwargs): """ + Returns a list of parameters of the model. Returns: @@ -1399,30 +1522,32 @@ def parameters(self, *args, **kwargs): Examples: .. code-block:: python - + import paddle import paddle.nn as nn from paddle.static import InputSpec input = InputSpec([None, 784], 'float32', 'x') - + model = paddle.Model(nn.Sequential( nn.Linear(784, 200), nn.Tanh(), nn.Linear(200, 10)), input) params = model.parameters() + """ return self._adapter.parameters() def _prepare_amp(self, amp_configs): - def _check_pure_fp16_configs(): # pure float16 training has some restricts now if self._adapter._amp_level == "O2" and self._optimizer._grad_clip: # clip by value is not supported - assert isinstance(self._optimizer._grad_clip, (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm)), \ - "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently." + assert isinstance( + self._optimizer._grad_clip, + (paddle.nn.ClipGradByGlobalNorm, paddle.nn.ClipGradByNorm), + ), "Only GradientClipByNorm and GradientClipByGlobalNorm are supported in amp training with level=O2 currently." self._adapter._amp_custom_lists = {} self._adapter._amp_configs = {} @@ -1434,7 +1559,8 @@ def _check_pure_fp16_configs(): elif isinstance(amp_configs, str): if amp_configs not in ('O0', 'O1', 'O2'): raise ValueError( - "The level of amp_configs should be 'O0', 'O1' or 'O2'.") + "The level of amp_configs should be 'O0', 'O1' or 'O2'." + ) self._adapter._amp_level = amp_configs _check_pure_fp16_configs() return @@ -1443,7 +1569,8 @@ def _check_pure_fp16_configs(): self._adapter._amp_level = 'O1' elif amp_configs['level'] not in ('O0', 'O1', 'O2'): raise ValueError( - "amp_configs['level'] should be 'O0', 'O1' or 'O2'.") + "amp_configs['level'] should be 'O0', 'O1' or 'O2'." + ) else: self._adapter._amp_level = amp_configs['level'] amp_config_key_set = set(amp_configs.keys()) - {'level'} @@ -1460,12 +1587,14 @@ def _check_pure_fp16_configs(): # construct amp_custom_lists if self._adapter._amp_level != 'O0' and amp_config_key_set: for param_name in [ - 'custom_white_list', 'custom_black_list', - 'custom_black_varnames' + 'custom_white_list', + 'custom_black_list', + 'custom_black_varnames', ]: if param_name in amp_config_key_set: self._adapter._amp_custom_lists[param_name] = amp_configs[ - param_name] + param_name + ] amp_config_key_set -= {param_name} def _check_amp_configs(amp_config_key_set): @@ -1480,13 +1609,16 @@ def _check_amp_configs(amp_config_key_set): } if amp_config_key_set - accepted_param_set: raise ValueError( - "Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized." 
- .format(tuple(amp_config_key_set - accepted_param_set))) + "Except for 'level', the keys of 'amp_configs' must be accepted by mixed precision APIs, but {} could not be recognized.".format( + tuple(amp_config_key_set - accepted_param_set) + ) + ) if 'use_fp16_guard' in amp_config_key_set: if _non_static_mode(): raise ValueError( - "'use_fp16_guard' is supported in static mode only.") + "'use_fp16_guard' is supported in static mode only." + ) self._adapter._use_fp16_guard = amp_configs['use_fp16_guard'] amp_config_key_set.remove('use_fp16_guard') @@ -1496,12 +1628,11 @@ def _check_amp_configs(amp_config_key_set): for key in amp_configs_set: self._adapter._amp_configs[key] = amp_configs[key] - def prepare(self, - optimizer=None, - loss=None, - metrics=None, - amp_configs=None): + def prepare( + self, optimizer=None, loss=None, metrics=None, amp_configs=None + ): """ + Configures the model before runing. Args: @@ -1533,6 +1664,7 @@ def prepare(self, Returns: None + """ self._place = _get_device() if isinstance(self._place, fluid.CUDAPlace): @@ -1540,15 +1672,17 @@ def prepare(self, if ParallelEnv().nranks > 1 and not _parallel_context_initialized: if fluid._non_static_mode(): main_prog_seed = fluid.default_main_program().random_seed - startup_prog_seed = fluid.default_startup_program( - ).random_seed + startup_prog_seed = ( + fluid.default_startup_program().random_seed + ) fluid.disable_dygraph() paddle.disable_static(self._place) # enable_dygraph would create and switch to a new program, # thus also copy seed to the new program fluid.default_main_program().random_seed = main_prog_seed - fluid.default_startup_program( - ).random_seed = startup_prog_seed + fluid.default_startup_program().random_seed = ( + startup_prog_seed + ) else: prepare_distributed_context(self._place) _parallel_context_initialized = True @@ -1563,43 +1697,46 @@ def prepare(self, metrics = metrics or [] for metric in to_list(metrics): - assert isinstance(metric, Metric), \ - "{} is not sub class of Metric".format( - metric.__class__.__name__) + assert isinstance( + metric, Metric + ), "{} is not sub class of Metric".format(metric.__class__.__name__) self._metrics = to_list(metrics) self._prepare_amp(amp_configs) self._adapter.prepare() - def fit(self, - train_data=None, - eval_data=None, - batch_size=1, - epochs=1, - eval_freq=1, - log_freq=10, - save_dir=None, - save_freq=1, - verbose=2, - drop_last=False, - shuffle=True, - num_workers=0, - callbacks=None, - accumulate_grad_batches=1, - num_iters=None): + def fit( + self, + train_data=None, + eval_data=None, + batch_size=1, + epochs=1, + eval_freq=1, + log_freq=10, + save_dir=None, + save_freq=1, + verbose=2, + drop_last=False, + shuffle=True, + num_workers=0, + callbacks=None, + accumulate_grad_batches=1, + num_iters=None, + ): """ + Trains the model for a fixed number of epochs. If `eval_data` is set, evaluation will be done at the end of each epoch. Args: - train_data (Dataset|DataLoader, optional): An iterable data loader is used for - train. An instance of paddle paddle.io.Dataset or + train_data (Dataset|DataLoader, optional): An iterable data loader is used for + train. An instance of paddle paddle.io.Dataset or paddle.io.Dataloader is recomended. Default: None. eval_data (Dataset|DataLoader, optional): An iterable data loader is used for - evaluation at the end of epoch. If None, will not do evaluation. - An instance of paddle.io.Dataset or paddle.io.Dataloader + evaluation at the end of epoch. If None, will not do evaluation. 
+ An instance of paddle.io.Dataset or paddle.io.Dataloader is recomended. Default: None. - batch_size (int, optional): The batch size of train_data and eval_data. When + batch_size (int, optional): The batch size of train_data and eval_data. When train_data and eval_data are both the instance of Dataloader, this parameter will be ignored. Default: 1. epochs (int, optional): The number of epochs to train the model. Default: 1. @@ -1627,7 +1764,7 @@ def fit(self, callbacks (Callback|None, optional): A list of `Callback` instances to apply during training. If None, :ref:`api_paddle_callbacks_ProgBarLogger` and :ref:`api_paddle_callbacks_ModelCheckpoint` are automatically inserted. Default: None. - accumulate_grad_batches (int, optional): The number of batches to accumulate gradident + accumulate_grad_batches (int, optional): The number of batches to accumulate gradident during training process before optimizer updates. It can mimic large batch size. Default: 1. num_iters (int|None, optional): The number of iterations to evaluate the model. @@ -1642,7 +1779,7 @@ def fit(self, How to make a batch is done internally. .. code-block:: python - :name: code-example1 + :name: code-example3 import paddle import paddle.vision.transforms as T @@ -1682,7 +1819,7 @@ def fit(self, DataLoader. .. code-block:: python - :name: code-example2 + :name: code-example4 import paddle import paddle.vision.transforms as T @@ -1692,7 +1829,7 @@ def fit(self, dynamic = True if not dynamic: paddle.enable_static() - + transform = T.Compose([ T.Transpose(), T.Normalize([127.5], [127.5]) @@ -1719,31 +1856,38 @@ def fit(self, val_loader, epochs=2, save_dir='mnist_checkpoint') + """ - assert train_data is not None, \ - "train_data must be given!" + assert train_data is not None, "train_data must be given!" if isinstance(train_data, Dataset): - train_sampler = DistributedBatchSampler(train_data, - batch_size=batch_size, - shuffle=shuffle, - drop_last=drop_last) - train_loader = DataLoader(train_data, - batch_sampler=train_sampler, - places=self._place, - num_workers=num_workers, - return_list=True) + train_sampler = DistributedBatchSampler( + train_data, + batch_size=batch_size, + shuffle=shuffle, + drop_last=drop_last, + ) + train_loader = DataLoader( + train_data, + batch_sampler=train_sampler, + places=self._place, + num_workers=num_workers, + return_list=True, + ) else: train_loader = train_data if eval_data is not None and isinstance(eval_data, Dataset): - eval_sampler = DistributedBatchSampler(eval_data, - batch_size=batch_size) - eval_loader = DataLoader(eval_data, - batch_sampler=eval_sampler, - places=self._place, - num_workers=num_workers, - return_list=True) + eval_sampler = DistributedBatchSampler( + eval_data, batch_size=batch_size + ) + eval_loader = DataLoader( + eval_data, + batch_sampler=eval_sampler, + places=self._place, + num_workers=num_workers, + return_list=True, + ) elif eval_data is not None: eval_loader = eval_data else: @@ -1756,8 +1900,11 @@ def fit(self, steps = self._len_data_loader(train_loader) self.num_iters = num_iters - if num_iters is not None and isinstance(num_iters, int) and isinstance( - steps, int): + if ( + num_iters is not None + and isinstance(num_iters, int) + and isinstance(steps, int) + ): assert num_iters > 0, "num_iters must be greater than 0!" 
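For orientation, a minimal sketch of how the `accumulate_grad_batches` and `num_iters` arguments documented above combine with the usual hapi training loop; the LeNet/MNIST setup is borrowed from the fit docstring examples, and the specific values are illustrative:

.. code-block:: python

    import paddle
    import paddle.vision.transforms as T
    from paddle.static import InputSpec

    transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
    train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform)

    input = InputSpec([None, 1, 28, 28], 'float32', 'image')
    label = InputSpec([None, 1], 'int64', 'label')
    model = paddle.Model(paddle.vision.models.LeNet(), input, label)
    model.prepare(
        paddle.optimizer.Adam(learning_rate=0.001, parameters=model.parameters()),
        paddle.nn.CrossEntropyLoss(),
        paddle.metric.Accuracy(topk=(1, 2)))

    # Accumulate gradients over 4 consecutive batches (mimicking batch_size=256)
    # and stop after at most 100 iterations rather than a full epoch.
    model.fit(train_dataset,
              epochs=1,
              batch_size=64,
              accumulate_grad_batches=4,
              num_iters=100,
              verbose=0)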
epochs = (num_iters // steps) + 1 steps = min(num_iters, steps) @@ -1785,10 +1932,10 @@ def fit(self, if do_eval and epoch % eval_freq == 0: eval_steps = self._len_data_loader(eval_loader) - cbks.on_begin('eval', { - 'steps': eval_steps, - 'metrics': self._metrics_name() - }) + cbks.on_begin( + 'eval', + {'steps': eval_steps, 'metrics': self._metrics_name()}, + ) eval_logs = self._run_one_epoch(eval_loader, cbks, 'eval') @@ -1799,20 +1946,22 @@ def fit(self, cbks.on_end('train', logs) self._test_dataloader = None - def evaluate(self, - eval_data, - batch_size=1, - log_freq=10, - verbose=2, - num_workers=0, - callbacks=None, - num_iters=None): + def evaluate( + self, + eval_data, + batch_size=1, + log_freq=10, + verbose=2, + num_workers=0, + callbacks=None, + num_iters=None, + ): """ Evaluate the loss and metrics of the model on input dataset. Args: eval_data (Dataset|DataLoader): An iterable data loader is used for - evaluation. An instance of paddle.io.Dataset or + evaluation. An instance of paddle.io.Dataset or paddle.io.Dataloader is recomended. batch_size (int, optional): The batch size of train_data and eval_data. When eval_data is the instance of Dataloader, this argument will be @@ -1860,13 +2009,16 @@ def evaluate(self, """ if eval_data is not None and isinstance(eval_data, Dataset): - eval_sampler = DistributedBatchSampler(eval_data, - batch_size=batch_size) - eval_loader = DataLoader(eval_data, - batch_sampler=eval_sampler, - places=self._place, - num_workers=num_workers, - return_list=True) + eval_sampler = DistributedBatchSampler( + eval_data, batch_size=batch_size + ) + eval_loader = DataLoader( + eval_data, + batch_sampler=eval_sampler, + places=self._place, + num_workers=num_workers, + return_list=True, + ) else: eval_loader = eval_data @@ -1882,15 +2034,17 @@ def evaluate(self, eval_steps = self._len_data_loader(eval_loader) self.num_iters = num_iters - if num_iters is not None and isinstance(num_iters, int) and isinstance( - eval_steps, int): + if ( + num_iters is not None + and isinstance(num_iters, int) + and isinstance(eval_steps, int) + ): assert num_iters > 0, "num_iters must be greater than 0!" eval_steps = min(num_iters, eval_steps) self.num_iters = eval_steps - cbks.on_begin('eval', { - 'steps': eval_steps, - 'metrics': self._metrics_name() - }) + cbks.on_begin( + 'eval', {'steps': eval_steps, 'metrics': self._metrics_name()} + ) logs = self._run_one_epoch(eval_loader, cbks, 'eval') @@ -1904,13 +2058,15 @@ def evaluate(self, return eval_result - def predict(self, - test_data, - batch_size=1, - num_workers=0, - stack_outputs=False, - verbose=1, - callbacks=None): + def predict( + self, + test_data, + batch_size=1, + num_workers=0, + stack_outputs=False, + verbose=1, + callbacks=None, + ): """ Compute the output predictions on testing data. @@ -1920,7 +2076,7 @@ def predict(self, is recomended. batch_size (int, optional): The batch size of test_data. When test_data is the instance of Dataloader, this argument will be ignored. Default: 1. - num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess + num_workers (int, optional): The number of subprocess to load data, 0 for no subprocess used and loading data in main process. When test_data is the instance of Dataloader, this argument will be ignored. Default: 0. 
stack_outputs (bool, optional): Whether stack output field like a batch, as for an output @@ -1981,13 +2137,16 @@ def __len__(self): """ if test_data is not None and isinstance(test_data, Dataset): - test_sampler = DistributedBatchSampler(test_data, - batch_size=batch_size) - test_loader = DataLoader(test_data, - batch_sampler=test_sampler, - places=self._place, - num_workers=num_workers, - return_list=True) + test_sampler = DistributedBatchSampler( + test_data, batch_size=batch_size + ) + test_loader = DataLoader( + test_data, + batch_sampler=test_sampler, + places=self._place, + num_workers=num_workers, + return_list=True, + ) else: test_loader = test_data @@ -2037,7 +2196,8 @@ def _save_inference_model(self, path): if self._is_shape_inferred: warnings.warn( "'inputs' was not specified when Model initialization, so the input shape to be saved will be the shape derived from the user's actual inputs. The input shape to be saved is %s. For saving correct input shapes, please provide 'inputs' for Model initialization." - % self._input_info[0]) + % self._input_info[0] + ) paddle.jit.save(layer, path, input_spec=self._inputs) @@ -2048,7 +2208,8 @@ def _save_inference_model(self, path): raise ValueError( "The input path MUST be format of dirname/file_prefix " "[dirname\\file_prefix in Windows system], but received " - "file_prefix is empty string.") + "file_prefix is empty string." + ) dirname = os.path.dirname(path) if dirname and not os.path.exists(dirname): @@ -2059,21 +2220,24 @@ def _save_inference_model(self, path): params_filename = file_prefix + INFER_PARAMS_SUFFIX prog = self._adapter._progs.get('test', None) - assert prog, \ - "Model is not ready, please call `model.prepare()` first" + assert ( + prog + ), "Model is not ready, please call `model.prepare()` first" infer_prog = prog.clone(for_test=True) input_names = [v.name for v in self._adapter._input_vars['test']] endpoints = self._adapter._endpoints['test']['output'] - fluid.io.save_inference_model(model_path, - input_names, - endpoints, - self._adapter._executor, - main_program=infer_prog, - model_filename=model_filename, - params_filename=params_filename) + fluid.io.save_inference_model( + model_path, + input_names, + endpoints, + self._adapter._executor, + main_program=infer_prog, + model_filename=model_filename, + params_filename=params_filename, + ) def _run_one_epoch( self, @@ -2099,16 +2263,21 @@ def _run_one_epoch( # LoDTensor.shape is callable, where LoDTensor comes from # DataLoader in static graph - batch_size = data[0].shape()[0] if callable( - data[0].shape) else data[0].shape[0] + batch_size = ( + data[0].shape()[0] + if callable(data[0].shape) + else data[0].shape[0] + ) callbacks.on_batch_begin(mode, step, logs) if mode != 'predict': - _inputs = [data[:len(self._inputs)], data[len(self._inputs):]] + _inputs = [data[: len(self._inputs)], data[len(self._inputs) :]] if mode == 'train': - _inputs.append((step + 1) % self._accumulate == 0 - or step + 1 == len(data_loader)) + _inputs.append( + (step + 1) % self._accumulate == 0 + or step + 1 == len(data_loader) + ) outs = getattr(self, mode + '_batch')(*_inputs) @@ -2129,15 +2298,17 @@ def _run_one_epoch( logs[k] = v else: if self._inputs is not None: - outs = self.predict_batch(data[:len(self._inputs)]) + outs = self.predict_batch(data[: len(self._inputs)]) else: outs = self.predict_batch(data) outputs.append(outs) logs['step'] = step - if mode == 'train' or self._adapter._merge_count.get( - mode + '_batch', 0) <= 0: + if ( + mode == 'train' + or 
self._adapter._merge_count.get(mode + '_batch', 0) <= 0 + ): logs['batch_size'] = batch_size * ParallelEnv().nranks else: logs['batch_size'] = self._adapter._merge_count[mode + '_batch'] @@ -2159,10 +2330,10 @@ def summary(self, input_size=None, dtype=None): """Prints a string summary of the network. Args: - input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. - if not set, input_size will get from ``self._inputs`` if network only have - one input, input_size can be tuple or InputSpec. if model have multiple - input, input_size must be a list which contain every input's shape. + input_size (tuple|InputSpec|list[tuple|InputSpec], optional): size of input tensor. + if not set, input_size will get from ``self._inputs`` if network only have + one input, input_size can be tuple or InputSpec. if model have multiple + input, input_size must be a list which contain every input's shape. Default: None. dtype (str, optional): if dtype is None, 'float32' will be used, Default: None. @@ -2191,8 +2362,9 @@ def summary(self, input_size=None, dtype=None): # {'total_params': 61610, 'trainable_params': 61610} """ - assert (input_size is not None or self._inputs - is not None), "'input_size' or 'self._input' must be set" + assert ( + input_size is not None or self._inputs is not None + ), "'input_size' or 'self._input' must be set" if input_size is not None: _input_size = input_size else: @@ -2209,7 +2381,10 @@ def _verify_spec(self, specs, shapes=None, dtypes=None, is_input=False): if is_input: arg_names = extract_args(self.network.forward)[1:] # While Saving inference model in dygraph, and providing inputs only in running. - if shapes is not None and dtypes is not None and fluid._non_static_mode( + if ( + shapes is not None + and dtypes is not None + and fluid._non_static_mode() ): out_specs = [ Input(name=n, dtype=dtypes[i], shape=shapes[i]) @@ -2222,7 +2397,8 @@ def _verify_spec(self, specs, shapes=None, dtypes=None, is_input=False): elif isinstance(specs, dict): assert is_input is False out_specs = [ - specs[n] for n in extract_args(self.network.forward) + specs[n] + for n in extract_args(self.network.forward) if n != 'self' ] else: @@ -2233,8 +2409,10 @@ def _verify_spec(self, specs, shapes=None, dtypes=None, is_input=False): assert isinstance(spec, Input) if spec.name is None: raise ValueError( - "Requires Input[{}].name != None, but receive `None` with {}." - .format(i, spec)) + "Requires Input[{}].name != None, but receive `None` with {}.".format( + i, spec + ) + ) return out_specs @@ -2259,6 +2437,7 @@ def _update_inputs(self): "Update self._inputs according to given inputs." 
self._input_info = self._adapter._input_info if self._input_info is not None and len(self._input_info) == 2: - self._inputs = self._verify_spec(None, self._input_info[0], - self._input_info[1], True) + self._inputs = self._verify_spec( + None, self._input_info[0], self._input_info[1], True + ) self._is_shape_inferred = True diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index 6928bc75f5f714..26bf28ed10afdf 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -19,7 +19,7 @@ import paddle import paddle.nn as nn from paddle.static import InputSpec - +from paddle.autograd import no_grad from collections import OrderedDict __all__ = [] @@ -229,7 +229,7 @@ def _check_input(input_size): return params_info -@paddle.no_grad() +@no_grad() def summary_string(model, input_size=None, dtypes=None, input=None): def _all_is_numper(items): diff --git a/python/paddle/incubate/__init__.py b/python/paddle/incubate/__init__.py index 543b0b815c16ee..68c7db054991a0 100644 --- a/python/paddle/incubate/__init__.py +++ b/python/paddle/incubate/__init__.py @@ -31,7 +31,6 @@ from . import autograd #noqa: F401 from . import autotune #noqa: F401 -from . import sparse #noqa: F401 from . import nn #noqa: F401 from . import asp #noqa: F401 diff --git a/python/paddle/incubate/autograd/primapi.py b/python/paddle/incubate/autograd/primapi.py index ba7a2537df1339..18a06af5dca7fc 100644 --- a/python/paddle/incubate/autograd/primapi.py +++ b/python/paddle/incubate/autograd/primapi.py @@ -22,14 +22,14 @@ def forward_grad(outputs, inputs, grad_inputs=None): """Forward mode of automatic differentiation. - .. note:: + Note: **ONLY available in the static mode and primitive operators.** Args: outputs(Tensor|Sequence[Tensor]): The output tensor or tensors. inputs(Tensor|Sequence[Tensor]): The input tensor or tensors. - grad_inputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or - Tensors of inputs which has the same shape with inputs, Defaults to + grad_inputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or + Tensors of inputs which has the same shape with inputs, Defaults to None, in this case is equivalent to all ones. Returns: @@ -50,7 +50,7 @@ def forward_grad(outputs, inputs, grad_inputs=None): with paddle.static.program_guard(main_program, startup_program): x = paddle.static.data('x', shape=[1], dtype='float32') - y = x * x + y = x * x y_grad = paddle.incubate.autograd.forward_grad(y, x) paddle.incubate.autograd.prim2orig() @@ -64,25 +64,35 @@ def forward_grad(outputs, inputs, grad_inputs=None): paddle.disable_static() """ if not utils.prim_enabled(): - raise RuntimeError('forward_grad must be running on primitive' - 'operators, use enable_prim to turn it on.') + raise RuntimeError( + 'forward_grad must be running on primitive' + 'operators, use enable_prim to turn it on.' + ) if not isinstance(outputs, (framework.Variable, typing.Sequence)): - raise TypeError(f'Expected outputs is Tensor|Sequence[Tesnor], ' - f'but got {type(outputs)}.') + raise TypeError( + f'Expected outputs is Tensor|Sequence[Tesnor], ' + f'but got {type(outputs)}.' + ) if not isinstance(inputs, (framework.Variable, typing.Sequence)): - raise TypeError(f'Expected inputs is Tensor|Sequence[Tesnor], ' - f'but got {type(inputs)}.') + raise TypeError( + f'Expected inputs is Tensor|Sequence[Tesnor], ' + f'but got {type(inputs)}.' 
+ ) - ys, xs, xs_dot = utils.as_tensors(outputs), utils.as_tensors( - inputs), utils.as_tensors(grad_inputs) + ys, xs, xs_dot = ( + utils.as_tensors(outputs), + utils.as_tensors(inputs), + utils.as_tensors(grad_inputs), + ) block = framework.default_main_program().current_block() if any(x.block != block for x in xs + ys): raise RuntimeError( 'Variable in inputs and targets should exist in current block of ' - 'main program.') + 'main program.' + ) primx.orig2prim(block) ad = primx.Transform(ys[0].block) @@ -95,18 +105,18 @@ def forward_grad(outputs, inputs, grad_inputs=None): def grad(outputs, inputs, grad_outputs=None): """Reverse mode of automatic differentiation. - .. note:: + Note: **ONLY available in the static mode and primitive operators** Args: outputs(Tensor|Sequence[Tensor]): The output Tensor or Tensors. inputs(Tensor|Sequence[Tensor]): The input Tensor or Tensors. - grad_outputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or - Tensors of outputs which has the same shape with outputs, Defaults + grad_outputs(Tensor|Sequence[Tensor]): Optional, the gradient Tensor or + Tensors of outputs which has the same shape with outputs, Defaults to None, in this case is equivalent to all ones. Returns: - grad_inputs(Tensor|Tensors): The gradients for inputs. + grad_inputs(Tensor|Tensors): The gradients for inputs. Examples: @@ -123,7 +133,7 @@ def grad(outputs, inputs, grad_outputs=None): with paddle.static.program_guard(main_program, startup_program): x = paddle.static.data('x', shape=[1], dtype='float32') x.stop_gradients = False - y = x * x + y = x * x x_grad = paddle.incubate.autograd.grad(y, x) paddle.incubate.autograd.prim2orig() @@ -132,7 +142,7 @@ def grad(outputs, inputs, grad_outputs=None): x_grad = exe.run(main_program, feed={'x': np.array([2.]).astype('float32')}, fetch_list=[x_grad]) print(x_grad) # [array([4.], dtype=float32)] - + paddle.incubate.autograd.disable_prim() paddle.disable_static() """ @@ -141,22 +151,32 @@ def grad(outputs, inputs, grad_outputs=None): # backward.gradients returns a list though the inputs is a signle Tensor. # The follow code snippet fixes the problem by return the first element # of grad_inputs when the inputs is a signle Tensor. - if isinstance(inputs, framework.Variable) and isinstance( - grad_inputs, typing.Sequence) and len(grad_inputs) > 0: + if ( + isinstance(inputs, framework.Variable) + and isinstance(grad_inputs, typing.Sequence) + and len(grad_inputs) > 0 + ): return grad_inputs[0] else: return grad_inputs if not isinstance(outputs, (framework.Variable, typing.Sequence)): - raise TypeError(f'Expected outputs is Tensor|Sequence[Tesnor], ' - f'but got {type(outputs)}.') + raise TypeError( + f'Expected outputs is Tensor|Sequence[Tesnor], ' + f'but got {type(outputs)}.' + ) if not isinstance(inputs, (framework.Variable, typing.Sequence)): - raise TypeError(f'Expected inputs is Tensor|Sequence[Tesnor], ' - f'but got {type(inputs)}.') + raise TypeError( + f'Expected inputs is Tensor|Sequence[Tesnor], ' + f'but got {type(inputs)}.' 
+ ) - ys, xs, ys_bar = utils.as_tensors(outputs), utils.as_tensors( - inputs), utils.as_tensors(grad_outputs) + ys, xs, ys_bar = ( + utils.as_tensors(outputs), + utils.as_tensors(inputs), + utils.as_tensors(grad_outputs), + ) block = framework.default_main_program().current_block() if any((x is not None and x.block != block) for x in xs + ys): raise RuntimeError( diff --git a/python/paddle/incubate/autograd/primops.py b/python/paddle/incubate/autograd/primops.py index cb67d3f23e911d..636dc892204905 100644 --- a/python/paddle/incubate/autograd/primops.py +++ b/python/paddle/incubate/autograd/primops.py @@ -137,6 +137,11 @@ def exp(x, out=None): return _simple_unop(LayerHelper('exp_p', **locals())) +@REGISTER_FN('abs_p', 'X', 'Y') +def abs(x, out=None): + return _simple_unop(LayerHelper('abs_p', **locals())) + + @REGISTER_FN('reshape_p', 'X', 'Y') def reshape(x, shape, out=None): return _manipulation_unop(LayerHelper('reshape_p', **locals())) @@ -193,15 +198,17 @@ def concat(xs, axis=0, out=None): return out -@REGISTER_FN('reduce_p', 'X', 'Y') -def reduce(x, axis, keepdim=False, out=None): +@REGISTER_FN('reduce_sum_p', 'X', 'Y') +def reduce_sum(x, axis=None, keepdim=False, out=None): + axes = axis or tuple(range(0, len(x.shape))) + axes = (axes, ) if isinstance(axes, int) else axes if not isinstance(axis, (tuple, list)): raise TypeError(f'axis must be tuple or list, but got {type(axis)}') if not isinstance(keepdim, bool): raise TypeError(f'keepdim must be bool, but got {type(keepdim)}') - attrs = {'axis': axis, 'keepdim': keepdim} - helper = LayerHelper('reduce_p', **locals()) + attrs = {'axis': axis, 'keepdim': keepdim} + helper = LayerHelper('reduce_sum_p', **locals()) if out is None: out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -347,6 +354,21 @@ def eq(x, y, out=None): return _simple_binop(LayerHelper('eq_p', **locals())) +@REGISTER_FN('gt_p', 'X', 'Y', 'Z') +def gt(x, y, out=None): + return _simple_binop(LayerHelper('gt_p', **locals())) + + +@REGISTER_FN('ge_p', 'X', 'Y', 'Z') +def ge(x, y, out=None): + return _simple_binop(LayerHelper('ge_p', **locals())) + + +@REGISTER_FN('ne_p', 'X', 'Y', 'Z') +def ne(x, y, out=None): + return _simple_binop(LayerHelper('ne_p', **locals())) + + @REGISTER_FN('pow_p', 'X', 'Y', 'Z') def pow(x, y, out=None): return _simple_binop(LayerHelper('pow_p', **locals())) @@ -360,3 +382,15 @@ def max(x, y, out=None): @REGISTER_FN('erf_p', 'X', 'Y') def erf(x, out=None): return _simple_unop(LayerHelper('erf_p', **locals())) + + +@REGISTER_FN('cast_p', 'X', 'Y') +def cast(x, dtype, out=None): + helper = LayerHelper('cast_p', **locals()) + if out is None: + out = helper.create_variable_for_type_inference(dtype) + helper.append_op(type=helper.layer_type, + inputs={'X': x}, + outputs={'Y': out}, + attrs={'dtype': dtype}) + return out diff --git a/python/paddle/incubate/autograd/primreg.py b/python/paddle/incubate/autograd/primreg.py index 6c3ece09a6be1f..34b1c7f4833484 100644 --- a/python/paddle/incubate/autograd/primreg.py +++ b/python/paddle/incubate/autograd/primreg.py @@ -80,7 +80,7 @@ def div(x, y, out=None): """ args = _primop_position_argnames.lookup(op.type) - assert args is not None, 'args should not be None in op_position_inputs().' + assert args is not None, f'args of {op.type} should not be None in op_position_inputs().' 
*input_names, _ = args inputs = [] diff --git a/python/paddle/incubate/autograd/primrules.py b/python/paddle/incubate/autograd/primrules.py index 9e14c8633307aa..4625cfd362f070 100644 --- a/python/paddle/incubate/autograd/primrules.py +++ b/python/paddle/incubate/autograd/primrules.py @@ -11,21 +11,26 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -import typing +import functools import math +import operator +import typing import paddle from . import primops -from .primops import (add, broadcast, concat, cos, div, exp, fill_const, gather, - matmul, mul, neg, reduce, reshape, scatter_add, set_value, +from .primops import (add, broadcast, concat, cos, div, eq, erf, exp, + fill_const, gather, ge, gt, log, matmul, max, mul, ne, + neg, reduce_sum, reshape, scatter_add, select, set_value, sin, slice_assign, slice_select, split, sqrt, sub, tanh, - transpose, log, select, eq, max, erf) + transpose) from .primreg import (REGISTER_JVP, REGISTER_ORIG2PRIM, REGISTER_PRIM2ORIG, REGISTER_TRANSPOSE, lookup_fn, lookup_jvp, lookup_orig2prim, lookup_prim2orig, lookup_transpose, op_position_inputs, op_position_output) from .utils import INT_DTYPE_2_STRING, get_input_var_list, get_output_var_list +from paddle.fluid.data_feeder import convert_dtype +from paddle.fluid.framework import convert_np_dtype_to_dtype_ def _orig2prim(op, *args): @@ -63,6 +68,7 @@ def linear_jvp(op, *args, **kwargs): elementwise_mul tanh fill_zeros_like +fill_any_like sum index_select scale @@ -152,6 +158,13 @@ def elementwise_mul_orig2prim(op, x, y): return z +@REGISTER_ORIG2PRIM('elementwise_div') +def elementwise_div_orig2prim(op, x, y): + if x.shape != y.shape: + y = broadcast(y, shape=x.shape) + return primops.div(x, y) + + @REGISTER_ORIG2PRIM('tanh') def tanh_orig2prim(op, x): return tanh(x) @@ -177,6 +190,11 @@ def erf_orig2prim(op, x): return erf(x) +@REGISTER_ORIG2PRIM('abs') +def abs_orig2prim(op, x): + return primops.abs(x) + + @REGISTER_ORIG2PRIM('log') def log_orig2prim(op, x): return log(x) @@ -187,6 +205,16 @@ def fill_zeros_like_orig2prim(op, x): return fill_const(value=0.0, shape=x.shape, dtype=x.dtype) +@REGISTER_ORIG2PRIM('fill_any_like') +def fill_any_like_orig2prim(op, x): + if op.attr('dtype') == -1: + return fill_const(value=op.attr('value'), shape=x.shape, dtype=x.dtype) + return fill_const(value=op.attr('value'), + shape=x.shape, + dtype=convert_np_dtype_to_dtype_( + convert_dtype(INT_DTYPE_2_STRING[op.attr('dtype')]))) + + @REGISTER_ORIG2PRIM('sum') def sum_orig2prim(op, xs): x0 = xs[0] @@ -294,13 +322,18 @@ def num_el(shape): x = reshape(x, shape=[num_el(x.shape)]) if abs(op.attr('porder') - 2.0) < 1e-5: - return sqrt(reduce(mul(x, x), axis=[0])) + return sqrt(reduce_sum(mul(x, x), axis=[0])) elif abs(op.attr('porder') - 1.0) < 1e-5: - return reduce(sqrt(mul(x, x)), axis=[0]) + return reduce_sum(sqrt(mul(x, x)), axis=[0]) else: raise RuntimeError('Only support lower l2/l1 norm currently') +@REGISTER_ORIG2PRIM('cast') +def cast_orig2prim(op, x): + return primops.cast(x, paddle.dtype(op.attr('out_dtype'))) + + # TODO: support broadcast @REGISTER_ORIG2PRIM('where') def select_orig2prim(op, condition, x, y): @@ -314,15 +347,48 @@ def equal_orig2prim(op, x, y): return eq(x, y) +@REGISTER_ORIG2PRIM('not_equal') +def ne_orig2prim(op, x, y): + if x.shape != y.shape: + y = broadcast(y, shape=x.shape) + return ne(x, y) + + +@REGISTER_ORIG2PRIM('greater_than') +def gt_orig2prim(op, x, y): 
+ if x.shape != y.shape: + y = broadcast(y, shape=x.shape) + return gt(x, y) + + +@REGISTER_ORIG2PRIM('greater_equal') +def ge_orig2prim(op, x, y): + if x.shape != y.shape: + y = broadcast(y, shape=x.shape) + return ge(x, y) + + +# paddle.pow API use "elementwise_pow" operator when y is a Tensor. @REGISTER_ORIG2PRIM('elementwise_pow') def elementwise_pow_orig2prim(op, x, y): if x.shape != y.shape: y = broadcast(y, shape=x.shape) - z = primops.pow(x, y) return z +# paddle.pow API use "pow" operator when y is a scalar. +@REGISTER_ORIG2PRIM('pow') +def pow_orig2prim(op, x, y): + # x is factorTensor defined in paddle phi op. Currently it is None. + return primops.pow(y, fill_const(op.attr('factor'), y.shape, y.dtype)) + + +@REGISTER_ORIG2PRIM('square') +def square_orig2prim(op, x): + return primops.pow(x, fill_const(2., x.shape, x.dtype)) + + @REGISTER_ORIG2PRIM('elementwise_max') def elementwise_max_orig2prim(op, x, y): if x.shape != y.shape: @@ -354,6 +420,31 @@ def gelu_orig2prim(op, x): erf(mul(x, fill_const(1 / math.sqrt(2.), x.shape, x.dtype))))) +@REGISTER_ORIG2PRIM('reduce_sum') +def reduce_sum_orig2prim(op, x): + axes = tuple(range(0, len( + x.shape))) if op.attr('reduce_all') else op.attr('dim') + return reduce_sum(x, axis=axes, keepdim=op.attr('keep_dim')) + + +@REGISTER_ORIG2PRIM('reduce_mean') +def reduce_mean_orig2prim(op, x): + axes = tuple(range(0, len( + x.shape))) if op.attr('reduce_all') else op.attr('dim') + sum = reduce_sum(x, axis=axes, keepdim=op.attr('keep_dim')) + norm = fill_const(shape=sum.shape, + value=functools.reduce(operator.mul, + [x.shape[axis] for axis in axes]), + dtype=sum.dtype) + return div(sum, norm) + + +@REGISTER_ORIG2PRIM('size') +def size_orig2prim(op, x): + return fill_const(functools.reduce(operator.mul, x.shape), (1, ), + paddle.int64) + + ## Register prim2orig lower rules @REGISTER_PRIM2ORIG('add_p') def add_prim2orig(op, x, y): @@ -405,6 +496,11 @@ def erf_prim2orig(op, x): return paddle.erf(x) +@REGISTER_PRIM2ORIG('abs_p') +def abs_prim2orig(op, x): + return paddle.abs(x) + + @REGISTER_PRIM2ORIG('log_p') def log_prim2orig(op, x): return paddle.log(x) @@ -440,7 +536,7 @@ def concat_prim2orig(op, xs): return paddle.concat(xs, axis=op.attr('axis')) -@REGISTER_PRIM2ORIG('reduce_p') +@REGISTER_PRIM2ORIG('reduce_sum_p') def reduce_prim2orig(op, x): return paddle.sum(x, axis=op.attr('axis'), keepdim=op.attr('keepdim')) @@ -501,6 +597,21 @@ def eq_prim2orig(op, x, y): return paddle.equal(x, y) +@REGISTER_PRIM2ORIG('gt_p') +def gt_prim2orig(op, x, y): + return paddle.greater_than(x, y) + + +@REGISTER_PRIM2ORIG('ge_p') +def ge_prim2orig(op, x, y): + return paddle.greater_equal(x, y) + + +@REGISTER_PRIM2ORIG('ne_p') +def ne_prim2orig(op, x, y): + return paddle.not_equal(x, y) + + @REGISTER_PRIM2ORIG('pow_p') def pow_prim2orig(op, x, y): return paddle.pow(x, y) @@ -511,6 +622,11 @@ def max_prim2orig(op, x, y): return paddle.maximum(x, y) +@REGISTER_PRIM2ORIG('cast_p') +def cast_prim2orig(op, x): + return paddle.cast(x, paddle.dtype(op.attr('dtype'))) + + ## Register linearize rules @REGISTER_JVP('add_p') def add_jvp(op, x_dot, y_dot): @@ -616,6 +732,14 @@ def erf_jvp(op, x_dot): mul(x_dot, exp(neg(primops.pow(x, fill_const(2., x.shape, x.dtype)))))) +@REGISTER_JVP('abs_p') +def abs_jvp(op, x_dot): + if x_dot is None: + return None + x, = op_position_inputs(op) + return select(ge(x, fill_const(0., x.shape, x.dtype)), x_dot, neg(x_dot)) + + @REGISTER_JVP('log_p') def log_jvp(op, x_dot): if x_dot is None: @@ -665,8 +789,8 @@ def concat_jvp(op, xs_dot): return 
linear_jvp(op, xs_dot, axis=axis) -@REGISTER_JVP('reduce_p') -def reduce_jvp(op, x_dot): +@REGISTER_JVP('reduce_sum_p') +def reduce_sum_jvp(op, x_dot): if x_dot is None: return None axis = op.attr('axis') @@ -765,6 +889,33 @@ def eq_jvp(op, x_dot, y_dot): return z_dot +@REGISTER_JVP('gt_p') +def gt_jvp(op, x_dot, y_dot): + if x_dot is None and y_dot is None: + return None + x, _ = op_position_inputs(op) + z_dot = fill_const(value=0., shape=x.shape, dtype=x.dtype) + return z_dot + + +@REGISTER_JVP('ge_p') +def ge_jvp(op, x_dot, y_dot): + if x_dot is None and y_dot is None: + return None + x, _ = op_position_inputs(op) + z_dot = fill_const(value=0., shape=x.shape, dtype=x.dtype) + return z_dot + + +@REGISTER_JVP('ne_p') +def ne_jvp(op, x_dot, y_dot): + if x_dot is None and y_dot is None: + return None + x, _ = op_position_inputs(op) + z_dot = fill_const(value=0., shape=x.shape, dtype=x.dtype) + return z_dot + + @REGISTER_JVP('pow_p') def pow_jvp(op, x_dot, y_dot): @@ -812,6 +963,12 @@ def max_jvp(op, x_dot, y_dot): return select(eq(y, z), y_dot, x_dot) +@REGISTER_JVP('cast_p') +def cast_jvp(op, x_dot): + y = op_position_output(op) + return primops.cast(x_dot, y.dtype) + + ## Register transpose rules @@ -873,7 +1030,7 @@ def broadcast_transpose(op, check_dot, y_bar): keepdim = [(bat + i) for i, s in enumerate(x.shape) if s == 1] axis += keepdim # TODO: Change it. keepdim boolean - out = reduce(y_bar, axis=axis, keepdim=False) + out = reduce_sum(y_bar, axis=axis, keepdim=False) return reshape(out, x.shape) @@ -908,8 +1065,8 @@ def concat_transpose(op, check_dot, y_bar): return split(y_bar, num_or_sections=sections, axis=axis) -@REGISTER_TRANSPOSE('reduce_p') -def reduce_transpose(op, check_dot, y_bar): +@REGISTER_TRANSPOSE('reduce_sum_p') +def reduce_sum_transpose(op, check_dot, y_bar): x, = op_position_inputs(op) assert check_dot(x), 'check_dot(x) must be True' axes = op.attr('axis') @@ -1016,3 +1173,9 @@ def select_transpose(op, check_dot, z_bar): y_bar = select(cond, zeros_y, z_bar) if check_dot(y) else None return cond_bar, x_bar, y_bar + + +@REGISTER_TRANSPOSE('cast_p') +def cast_transpose(op, check_dot, y_bar): + x, = op_position_inputs(op) + return primops.cast(y_bar, x.dtype) diff --git a/python/paddle/incubate/autograd/primx.py b/python/paddle/incubate/autograd/primx.py index 19f87dd9292154..66a400d9c06f1f 100644 --- a/python/paddle/incubate/autograd/primx.py +++ b/python/paddle/incubate/autograd/primx.py @@ -12,23 +12,34 @@ # See the License for the specific language governing permissions and # limitations under the License. 
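Taken together with the new JVP/transpose rules above, these lowerings let programs built from ops such as abs and reduce_mean run through the primitive-based autodiff path. A minimal sketch follows, mirroring the pattern of the grad/prim2orig docstring examples; the choice of paddle.mean(paddle.abs(x)) is illustrative and not taken from the patch:

.. code-block:: python

    import numpy as np
    import paddle
    from paddle.incubate.autograd import enable_prim, disable_prim, prim2orig

    paddle.enable_static()
    enable_prim()

    main_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    with paddle.static.program_guard(main_program, startup_program):
        x = paddle.static.data('x', shape=[2, 2], dtype='float32')
        x.stop_gradient = False
        # abs lowers to abs_p, mean to reduce_sum_p plus a divide by the element count.
        y = paddle.mean(paddle.abs(x))
        x_grad = paddle.incubate.autograd.grad(y, x)
        prim2orig()

    exe = paddle.static.Executor()
    exe.run(startup_program)
    grad_val, = exe.run(main_program,
                        feed={'x': np.ones([2, 2], dtype='float32')},
                        fetch_list=[x_grad])
    print(grad_val)  # 0.25 in every position for all-positive input

    disable_prim()
    paddle.disable_static()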
+from collections import OrderedDict + import paddle -from paddle.fluid import framework as framework -from paddle.fluid.framework import default_main_program -from paddle.fluid.framework import Operator from paddle import compat as cpt -from .primops import fill_const, add -from .primreg import op_position_inputs, op_position_output, lookup_orig2prim, lookup_prim2orig -from .primrules import _orig2prim, _prim2orig, _jvp, _transpose -from .utils import get_input_var_list, get_output_var_list, flatten, flatten_and_remove_none -from collections import OrderedDict +from paddle.fluid import framework as framework +from paddle.fluid.framework import Operator, default_main_program from paddle.incubate.autograd.utils import as_tensors +from .primops import add, fill_const +from .primreg import ( + lookup_orig2prim, + lookup_prim2orig, + op_position_inputs, + op_position_output, +) +from .primrules import _jvp, _orig2prim, _prim2orig, _transpose +from .utils import ( + flatten, + flatten_and_remove_none, + get_input_var_list, + get_output_var_list, +) + def topo_path(xs, ys, block=None): - """ Returns the list of ops on the path from `xs` to `ys` in topological + """Returns the list of ops on the path from `xs` to `ys` in topological order. - + TODO(Tongxin): supporting control flow and nested blocks. Args: xs: a list|tuple of vars as source @@ -48,13 +59,16 @@ def topo_path(xs, ys, block=None): # Initialize reached vars for x in xs: - assert x is None or x.block == block, f'x is not None and x.block != block' + assert ( + x is None or x.block == block + ), f'x is not None and x.block != block' reached_vars[id(x)] = x # Reaching test, returning whether an op is reached from the given input reaching = lambda op: any( id(v) in reached_vars - for v in flatten_and_remove_none(get_input_var_list(op))) + for v in flatten_and_remove_none(get_input_var_list(op)) + ) # block.ops are supposedly in the order that preserves correct data # dependence. @@ -68,7 +82,8 @@ def topo_path(xs, ys, block=None): used_vars = OrderedDict((id(y), y) for y in ys if id(y) in reached_vars) back_reaching = lambda op: any( id(out) in used_vars - for out in flatten_and_remove_none(get_output_var_list(op))) + for out in flatten_and_remove_none(get_output_var_list(op)) + ) # Backward pass to find all used variables for op in reversed(path): @@ -84,9 +99,9 @@ def topo_path(xs, ys, block=None): def output_vars_on_path(path): - """ Returns the output variables of all the ops on the path from `xs` + """Returns the output variables of all the ops on the path from `xs` to `ys`. - + Args: path: a list of ops on which to find the output variables @@ -102,8 +117,8 @@ def output_vars_on_path(path): class VarMap(object): - """ A general map data structure for linking variables to variables. - + """A general map data structure for linking variables to variables. + An example is linking variables to their gradients. """ @@ -123,7 +138,8 @@ def add_rec(self, key_vars, value_vars): if isinstance(key_vars, paddle.fluid.framework.Variable): if not isinstance(value_vars, paddle.fluid.framework.Variable): raise TypeError( - f'value_vars must be Variable, but got {type(value_vars)}') + f'value_vars must be Variable, but got {type(value_vars)}' + ) self.tab[id(key_vars)] = id(value_vars) else: assert len(key_vars) == len(value_vars), ( @@ -166,11 +182,12 @@ def contain_value(self, value_var): # TODO(lml): supporting control flow, nested blocks, and block other than current block of main program. 
class Transform(object): - """ An object that maintains the state of transformations applied to a - primitve program. """ + """An object that maintains the state of transformations applied to a + primitve program.""" def __init__(self, block): - assert block == default_main_program().current_block( + assert ( + block == default_main_program().current_block() ), f'only support transform on current block of main program.' self.block = block self.vars = self.init_vars(block) @@ -222,7 +239,7 @@ def erase_dots(self, vars_to_erase): block._sync_with_cpp() def var2dot_rec(self, vars): - """ Lookup var2dot recursively.""" + """Lookup var2dot recursively.""" if isinstance(vars, paddle.fluid.framework.Variable): dot = self.var2dot.lookup(vars) return dot @@ -241,9 +258,9 @@ def dot2bar_rec(self, dots): return bars def linearize(self, xs, ys, xs_dot=None): - """ Performs the linearization transform, a.k.a, forward mode AD + """Performs the linearization transform, a.k.a, forward mode AD transform, on a primitive lowered program. - + Args: xs: a list of input variables ys: a list of output variables @@ -253,9 +270,9 @@ def linearize(self, xs, ys, xs_dot=None): Returns: (xs_dot, ys_dot): a tuple of two lists. `xs_dot` is the list of - gradient inputs of the resulting linearized program. `ys_dot` is + gradient inputs of the resulting linearized program. `ys_dot` is the list gradient outputs of the resulting linearized program - + """ if xs_dot is None: xs_dot = [fill_const(1.0, shape=x.shape, dtype=x.dtype) for x in xs] @@ -263,15 +280,18 @@ def linearize(self, xs, ys, xs_dot=None): else: assert len(xs) == len(xs_dot), ( f'len(xs) should be equal to len(xs_dot), ' - f'but len(xs)={len(xs)} and len(xs_dot)={len(xs_dot)}') + f'but len(xs)={len(xs)} and len(xs_dot)={len(xs_dot)}' + ) for x, dot in zip(xs, xs_dot): assert x.dtype == dot.dtype, ( f'x.dtype should be equal to dot.dtype, ' - f'but x.dtype={x.dtype} and dot.dtype={dot.dtype}') + f'but x.dtype={x.dtype} and dot.dtype={dot.dtype}' + ) assert x.shape == dot.shape, ( f'x.shape should be equal to dot.shape, ' - f'but x.shape={x.shape} and dot.shape={dot.shape}') + f'but x.shape={x.shape} and dot.shape={dot.shape}' + ) self.var2dot.add(x, dot) path, unused_xs, _ = topo_path(xs, ys, self.block) @@ -297,23 +317,23 @@ def linearize(self, xs, ys, xs_dot=None): return xs_dot, ys_dot def transpose(self, ys_dot, xs_dot, ys_bar=None, retain_fwd=False): - """ Performs the transpose transform, a.k.a, reverse mode AD + """Performs the transpose transform, a.k.a, reverse mode AD transform, on a linearized primitive program. Note, `transpose` is supposed to be used in couple with `linearize`. - + Args: ys_dot: a list of outputs of the linearized program. xs_dot: a list of inputs of the linearized program. - ys_bar: optional, a list of inputs of the resulting transposed + ys_bar: optional, a list of inputs of the resulting transposed program. The list size must be equal to `len(ys_dot)`. The shape and dtype of each element must be the same as in `ys_dot` Returns: (ys_bar, xs_bar): a tuple of two lists. `ys_bar` is the list of - inputs of the resulting transposed program. `xs_bar` is + inputs of the resulting transposed program. `xs_bar` is the list outputs of the resulting transposed program - + """ assert all(v is not None for v in xs_dot), f'`xs_dot` includes None.' assert all(v is not None for v in ys_dot), f'`ys_dot` includes None.' 
@@ -326,7 +346,8 @@ def transpose(self, ys_dot, xs_dot, ys_bar=None, retain_fwd=False): else: assert len(ys_dot) == len(ys_bar), ( f'len(ys_dot) should be equal to len(ys_bar), ' - f'but len(ys_dot)={len(ys_dot)} and len(ys_bar)={len(ys_bar)}') + f'but len(ys_dot)={len(ys_dot)} and len(ys_bar)={len(ys_bar)}' + ) for y_dot, y_bar in zip(ys_dot, ys_bar): assert y_dot.shape == y_bar.shape, ( f'y_dot.shape should be equal to y_bar.shape, ' @@ -370,7 +391,8 @@ def transpose(self, ys_dot, xs_dot, ys_bar=None, retain_fwd=False): ins = flatten(op_position_inputs(op)) assert len(ins) == len(ins_bar), ( f'len(ins) should be equal to len(ins_bar), ' - f'but len(ins)={len(ins)} and len(ins_bar)={len(ins_bar)}') + f'but len(ins)={len(ins)} and len(ins_bar)={len(ins_bar)}' + ) for dot, bar in zip(ins, ins_bar): if bar is not None: @@ -389,7 +411,8 @@ def transpose(self, ys_dot, xs_dot, ys_bar=None, retain_fwd=False): vars_to_remove = set() for op in path: vars_to_remove.update( - flatten_and_remove_none(get_output_var_list(op))) + flatten_and_remove_none(get_output_var_list(op)) + ) op_indexes = [] @@ -457,10 +480,12 @@ def expand_nested_list(xs): bind(input_args, to_bind, value_table) for orig_out, new_out in zip( - expand_nested_list(get_output_var_list(op)), - expand_nested_list(as_tensors(lower_fn(op, *input_args)))): + expand_nested_list(get_output_var_list(op)), + expand_nested_list(as_tensors(lower_fn(op, *input_args))), + ): assert not (orig_out is None) ^ ( - new_out is None), "orig_out and new_out should match." + new_out is None + ), "orig_out and new_out should match." vars_to_remove.add(new_out.name) value_table[new_out.name] = new_out to_bind[orig_out.name] = new_out.name @@ -469,7 +494,8 @@ def expand_nested_list(xs): inputs = {} for i in range(len(op.input_names)): inputs[op.input_names[i]] = bind_name( - op.input(op.input_names[i]), to_bind) + op.input(op.input_names[i]), to_bind + ) outputs = {} for i in range(len(op.output_names)): @@ -479,14 +505,17 @@ def expand_nested_list(xs): for name in sorted(op.attr_names): attrs[name] = op.attr(name) from paddle.fluid.dygraph.base import param_guard + new_op_desc = block.desc.append_op() with param_guard(inputs), param_guard(outputs): - op = Operator(block=block, - desc=new_op_desc, - type=op.type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + op = Operator( + block=block, + desc=new_op_desc, + type=op.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + ) block.ops.append(op) # Step3: Do some post-processing work @@ -506,8 +535,9 @@ def expand_nested_list(xs): op._rename_output(out_name, to_bind_rev[out_name]) for var_name in sorted(vars_to_remove): - assert var_name in to_bind_rev, 'var_name "{}" is not in to_bind_rev.'.format( - var_name) + assert ( + var_name in to_bind_rev + ), 'var_name "{}" is not in to_bind_rev.'.format(var_name) if var_name != to_bind_rev[var_name]: block.desc._remove_var(cpt.to_bytes(var_name)) del block.vars[var_name] @@ -516,8 +546,8 @@ def expand_nested_list(xs): @framework.static_only def orig2prim(block=None): - """ - .. note:: + """ + Note: **This API is ONLY available in the static mode.** **Args block must be None or current block of main program.** @@ -525,7 +555,7 @@ def orig2prim(block=None): If it is an original operator, it will be transformed into one or a series of automatic differential basic operators with equivalent function. - + Args: block(paddle.static.Block|None, optional): The target block to process on. 
Default None, and will @@ -533,7 +563,8 @@ def orig2prim(block=None): """ block = default_main_program().current_block() if block is None else block - assert block == default_main_program().current_block( + assert ( + block == default_main_program().current_block() ), f'block is neither None nor current block of main program' _lower(block, reverse=False, blacklist=[]) @@ -541,7 +572,7 @@ def orig2prim(block=None): @framework.static_only def prim2orig(block=None, blacklist=None): """ - .. note:: + Note: **ONLY available in the static mode.** **Args block must be None or current block of main program.** @@ -549,7 +580,7 @@ def prim2orig(block=None, blacklist=None): If it is an automatic differential basic operator, it will be transformed into one or a series of original operators with equivalent function to support execution. - + Args: block(paddle.static.Block|None, optional): The target block to process on. Default None, and will @@ -565,10 +596,10 @@ def prim2orig(block=None, blacklist=None): import paddle from paddle.incubate.autograd import enable_prim, prim_enabled, prim2orig - + paddle.enable_static() enable_prim() - + x = paddle.ones(shape=[2, 2], dtype='float32') x.stop_gradients = False y = x * x @@ -578,7 +609,8 @@ def prim2orig(block=None, blacklist=None): """ block = default_main_program().current_block() if block is None else block - assert block == default_main_program().current_block( + assert ( + block == default_main_program().current_block() ), f'block is neither None nor current block of main program' blacklist = [] if blacklist is None else blacklist _lower(block, reverse=True, blacklist=blacklist) diff --git a/python/paddle/incubate/autograd/utils.py b/python/paddle/incubate/autograd/utils.py index 96faf7f7440ca5..2b8082bf48de7a 100644 --- a/python/paddle/incubate/autograd/utils.py +++ b/python/paddle/incubate/autograd/utils.py @@ -18,7 +18,6 @@ class PrimOption(object): - def __init__(self): self.enable_prim = False @@ -35,12 +34,12 @@ def set_status(self, flag): @framework.static_only def prim_enabled(): """ - .. note:: + Note: **ONLY available in the static mode.** - Shows whether the automatic differentiation mechanism based on + Shows whether the automatic differentiation mechanism based on automatic differential basic operators is ON. Defaults to OFF. - + Returns: flag(bool): Whether the automatic differentiation mechanism based on automatic differential basic operators is ON. @@ -50,7 +49,7 @@ def prim_enabled(): import paddle from paddle.incubate.autograd import enable_prim, disable_prim, prim_enabled - + paddle.enable_static() enable_prim() @@ -66,19 +65,19 @@ def prim_enabled(): @framework.static_only def enable_prim(): """ - .. note:: + Note: **ONLY available in the static mode.** - Turns ON automatic differentiation mechanism based on automatic + Turns ON automatic differentiation mechanism based on automatic differential basic operators. - + Examples: .. code-block:: python import paddle from paddle.incubate.autograd import enable_prim, prim_enabled - + paddle.enable_static() enable_prim() @@ -90,19 +89,19 @@ def enable_prim(): @framework.static_only def disable_prim(): """ - .. note:: + Note: **ONLY available in the static mode.** - Turns OFF automatic differentiation mechanism based on automatic + Turns OFF automatic differentiation mechanism based on automatic differential basic operators. - + Examples: .. 
code-block:: python import paddle from paddle.incubate.autograd import enable_prim, disable_prim, prim_enabled - + paddle.enable_static() enable_prim() @@ -175,7 +174,7 @@ def flatten_and_remove_none(inp): def as_tensors(xs): if isinstance(xs, framework.Variable): - return (xs, ) + return (xs,) elif isinstance(xs, typing.Sequence): return tuple(xs) else: diff --git a/python/paddle/incubate/distributed/fleet/__init__.py b/python/paddle/incubate/distributed/fleet/__init__.py new file mode 100644 index 00000000000000..94e1a7c8bbe77b --- /dev/null +++ b/python/paddle/incubate/distributed/fleet/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.distributed.fleet.recompute import recompute_sequential, recompute_hybrid + +__all__ = ["recompute_sequential", "recompute_hybrid"] diff --git a/python/paddle/incubate/distributed/models/moe/moe_layer.py b/python/paddle/incubate/distributed/models/moe/moe_layer.py index ebf300abf95457..0a0fe32a8e9186 100644 --- a/python/paddle/incubate/distributed/models/moe/moe_layer.py +++ b/python/paddle/incubate/distributed/models/moe/moe_layer.py @@ -26,7 +26,7 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F -from paddle.distributed.utils import global_scatter, global_gather +from paddle.distributed.utils.moe_utils import global_scatter, global_gather from paddle.distributed import alltoall, all_gather from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker @@ -34,9 +34,9 @@ from paddle.autograd import PyLayer from .gate import NaiveGate, GShardGate, SwitchGate, BaseGate from .utils import count_by_gate -from paddle.distributed.fleet.meta_parallel.pp_utils.utils import _hp_recompute from paddle import fluid from paddle.fluid.framework import in_dygraph_mode +from paddle.incubate.distributed.fleet import recompute_hybrid def _local_scatter(inp, pos): @@ -265,7 +265,6 @@ class MoELayer(nn.Layer): from paddle.distributed import fleet moe_group = Group(fleet.worker_index(), - fleet.worker_num(), 0, list(range(fleet.worker_num()))) mp_group = None @@ -424,8 +423,8 @@ def experts_fwd(x, fwd_expert_count, experts): if self.recompute_interval <= 0 or x.shape[0] == 0: x = experts_fwd(x, fwd_expert_count.numpy(), self.experts) else: - x = _hp_recompute(experts_fwd, x, fwd_expert_count.numpy(), - self.experts) + x = recompute_hybrid(self.recompute_ctx, experts_fwd, x, + fwd_expert_count.numpy(), self.experts) out_batch_size = inp.shape[0] if len(gate.shape) == 2: diff --git a/python/paddle/incubate/multiprocessing/__init__.py b/python/paddle/incubate/multiprocessing/__init__.py index 27c23be3a89411..df0f98f74d58bc 100644 --- a/python/paddle/incubate/multiprocessing/__init__.py +++ b/python/paddle/incubate/multiprocessing/__init__.py @@ -19,8 +19,6 @@ from multiprocessing import * # noqa: F403 -__all__ += multiprocessing.__all__ # type: ignore[attr-defined] - # Only support linux for now # Only support file_system sharing strategy. 
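The new package above only re-exports the recompute helpers, so callers such as MoELayer can move from the private _hp_recompute helper to the public entry point. A minimal sketch of what the re-export guarantees (the identity check itself is illustrative):

.. code-block:: python

    # Both import paths resolve to the same callables after this patch.
    from paddle.distributed.fleet.recompute import recompute_hybrid, recompute_sequential
    from paddle.incubate.distributed.fleet import (
        recompute_hybrid as public_hybrid,
        recompute_sequential as public_sequential,
    )

    assert recompute_hybrid is public_hybrid
    assert recompute_sequential is public_sequential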
diff --git a/python/paddle/incubate/nn/functional/fused_transformer.py b/python/paddle/incubate/nn/functional/fused_transformer.py index b1d759b6953c36..02b844751a8898 100644 --- a/python/paddle/incubate/nn/functional/fused_transformer.py +++ b/python/paddle/incubate/nn/functional/fused_transformer.py @@ -28,26 +28,28 @@ def _verify_dropout_rate(dropout_rate): raise ValueError("dropout_rate argument should between 0 and 1") -def fused_feedforward(x, - linear1_weight, - linear2_weight, - linear1_bias=None, - linear2_bias=None, - ln1_scale=None, - ln1_bias=None, - ln2_scale=None, - ln2_bias=None, - dropout1_rate=0.5, - dropout2_rate=0.5, - activation="relu", - ln1_epsilon=1e-5, - ln2_epsilon=1e-5, - pre_layer_norm=False, - training=True, - mode='upscale_in_train', - ring_id=-1, - add_residual=True, - name=None): +def fused_feedforward( + x, + linear1_weight, + linear2_weight, + linear1_bias=None, + linear2_bias=None, + ln1_scale=None, + ln1_bias=None, + ln2_scale=None, + ln2_bias=None, + dropout1_rate=0.5, + dropout2_rate=0.5, + activation="relu", + ln1_epsilon=1e-5, + ln2_epsilon=1e-5, + pre_layer_norm=False, + training=True, + mode='upscale_in_train', + ring_id=-1, + add_residual=True, + name=None, +): r""" This is a fusion operator to compute feed forward layer in transformer model architecture. This operator only supports running on GPU. The function of the operator is consistent with @@ -126,116 +128,167 @@ def fused_feedforward(x, raise ValueError( "mode argument should be 'downscale_in_infer' or 'upscale_in_train'" ) - mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + mode = ( + 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode + ) # semantic transfer if _non_static_mode(): if default_main_program().random_seed != 0: seed = default_main_program().random_seed out, _, _, _, _, _, _, _, _, _, _ = _legacy_C_ops.fused_feedforward( - x, None, None, linear1_weight, linear1_bias, linear2_weight, - linear2_bias, ln1_scale, ln1_bias, ln2_scale, ln2_bias, - 'pre_layer_norm', pre_layer_norm, 'ln1_epsilon', ln1_epsilon, - 'ln2_epsilon', ln2_epsilon, 'act_method', activation, - 'dropout1_rate', dropout1_rate, 'dropout2_rate', dropout2_rate, - "is_test", not training, "dropout1_fix_seed", seed is not None, - "dropout2_fix_seed", seed is not None, "dropout1_seed", - seed if seed is not None else 0, "dropout2_seed", - seed if seed is not None else 0, 'dropout1_implementation', mode, - 'dropout2_implementation', mode, 'add_residual', add_residual, - 'ring_id', ring_id) + x, + None, + None, + linear1_weight, + linear1_bias, + linear2_weight, + linear2_bias, + ln1_scale, + ln1_bias, + ln2_scale, + ln2_bias, + 'pre_layer_norm', + pre_layer_norm, + 'ln1_epsilon', + ln1_epsilon, + 'ln2_epsilon', + ln2_epsilon, + 'act_method', + activation, + 'dropout1_rate', + dropout1_rate, + 'dropout2_rate', + dropout2_rate, + "is_test", + not training, + "dropout1_fix_seed", + seed is not None, + "dropout2_fix_seed", + seed is not None, + "dropout1_seed", + seed if seed is not None else 0, + "dropout2_seed", + seed if seed is not None else 0, + 'dropout1_implementation', + mode, + 'dropout2_implementation', + mode, + 'add_residual', + add_residual, + 'ring_id', + ring_id, + ) return out helper = LayerHelper("fused_feedforward") dtype = x.dtype - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'fused_feedforward') - check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], - 'fused_feedforward') + check_variable_and_dtype( + x, 'x', 
['float16', 'float32', 'float64'], 'fused_feedforward' + ) + check_dtype( + dtype, 'dtype', ['float16', 'float32', 'float64'], 'fused_feedforward' + ) out = helper.create_variable_for_type_inference(x.dtype) dropout1_mask = helper.create_variable_for_type_inference( - 'uint8', stop_gradient=True) + 'uint8', stop_gradient=True + ) dropout2_mask = helper.create_variable_for_type_inference( - 'uint8', stop_gradient=True) - ln1_mean = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - ln1_variance = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - ln2_mean = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - ln2_variance = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - linear1_out = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - ln1_out = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - dropout1_out = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) - dropout2_out = helper.create_variable_for_type_inference(x.dtype, - stop_gradient=True) + 'uint8', stop_gradient=True + ) + ln1_mean = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True + ) + ln1_variance = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True + ) + ln2_mean = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True + ) + ln2_variance = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True + ) + linear1_out = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True + ) + ln1_out = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True + ) + dropout1_out = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True + ) + dropout2_out = helper.create_variable_for_type_inference( + x.dtype, stop_gradient=True + ) if (seed is None or seed == 0) and helper.main_program.random_seed != 0: seed = helper.main_program.random_seed - helper.append_op(type='fused_feedforward', - inputs={ - 'X': x, - 'Linear1Weight': linear1_weight, - 'Linear1Bias': linear1_bias, - 'Linear2Weight': linear2_weight, - 'Linear2Bias': linear2_bias, - 'Ln1Scale': ln1_scale, - 'Ln1Bias': ln1_bias, - 'Ln2Scale': ln2_scale, - 'Ln2Bias': ln2_bias, - }, - outputs={ - 'Out': out, - 'Dropout1Mask': dropout1_mask, - 'Dropout2Mask': dropout2_mask, - 'Ln1Mean': ln1_mean, - 'Ln1Variance': ln1_variance, - 'Ln2Mean': ln2_mean, - 'Ln2Variance': ln2_variance, - 'Linear1Out': linear1_out, - 'Ln1Out': ln1_out, - 'Dropout1Out': dropout1_out, - 'Dropout2Out': dropout2_out, - }, - attrs={ - 'dropout1_rate': dropout1_rate, - 'dropout2_rate': dropout2_rate, - 'act_method': activation, - 'pre_layer_norm': pre_layer_norm, - 'ln1_epsilon': ln1_epsilon, - 'ln2_epsilon': ln2_epsilon, - 'is_test': not training, - 'dropout1_fix_seed': seed is not None, - 'dropout2_fix_seed': seed is not None, - 'dropout1_seed': seed if seed is not None else 0, - 'dropout2_seed': seed if seed is not None else 0, - 'dropout1_implementation': mode, - 'dropout2_implementation': mode, - 'add_residual': add_residual, - 'ring_id': ring_id, - }) + helper.append_op( + type='fused_feedforward', + inputs={ + 'X': x, + 'Linear1Weight': linear1_weight, + 'Linear1Bias': linear1_bias, + 'Linear2Weight': linear2_weight, + 'Linear2Bias': linear2_bias, + 'Ln1Scale': ln1_scale, + 'Ln1Bias': ln1_bias, + 'Ln2Scale': ln2_scale, + 'Ln2Bias': ln2_bias, + }, + outputs={ + 'Out': out, + 'Dropout1Mask': dropout1_mask, + 'Dropout2Mask': 
dropout2_mask, + 'Ln1Mean': ln1_mean, + 'Ln1Variance': ln1_variance, + 'Ln2Mean': ln2_mean, + 'Ln2Variance': ln2_variance, + 'Linear1Out': linear1_out, + 'Ln1Out': ln1_out, + 'Dropout1Out': dropout1_out, + 'Dropout2Out': dropout2_out, + }, + attrs={ + 'dropout1_rate': dropout1_rate, + 'dropout2_rate': dropout2_rate, + 'act_method': activation, + 'pre_layer_norm': pre_layer_norm, + 'ln1_epsilon': ln1_epsilon, + 'ln2_epsilon': ln2_epsilon, + 'is_test': not training, + 'dropout1_fix_seed': seed is not None, + 'dropout2_fix_seed': seed is not None, + 'dropout1_seed': seed if seed is not None else 0, + 'dropout2_seed': seed if seed is not None else 0, + 'dropout1_implementation': mode, + 'dropout2_implementation': mode, + 'add_residual': add_residual, + 'ring_id': ring_id, + }, + ) return out -def fused_bias_dropout_residual_layer_norm(x, - residual, - bias=None, - ln_scale=None, - ln_bias=None, - dropout_rate=0.5, - ln_epsilon=1e-5, - training=True, - mode='upscale_in_train', - name=None): +def fused_bias_dropout_residual_layer_norm( + x, + residual, + bias=None, + ln_scale=None, + ln_bias=None, + dropout_rate=0.5, + ln_epsilon=1e-5, + training=True, + mode='upscale_in_train', + name=None, +): r""" + The fused_bias_dropout_residual_layer_norm operator. The pseudo code is as follows: .. code-block:: python + y = layer_norm(residual + dropout(bias + x)) Parameters: @@ -264,10 +317,9 @@ def fused_bias_dropout_residual_layer_norm(x, name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: The output Tensor, the data type and shape is same as `x`. + Tensor, The output Tensor, the data type and shape is same as `x`. Examples: - .. code-block:: python # required: gpu @@ -285,43 +337,79 @@ def fused_bias_dropout_residual_layer_norm(x, x, residual, bias) # [2, 4, 128] print(output.shape) + """ seed = None if mode not in ('downscale_in_infer', 'upscale_in_train'): raise ValueError( "mode argument should be 'downscale_in_infer' or 'upscale_in_train'" ) - mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + mode = ( + 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode + ) # semantic transfer if ln_scale is not None: - assert len(ln_scale.shape - ) == 1, "The dims of the shape of ln_scale should be 1." - assert x.shape[len(x.shape) - 1] == ln_scale.shape[ - 0], "The dim of ln_scale must equal to the last dim of x." + assert ( + len(ln_scale.shape) == 1 + ), "The dims of the shape of ln_scale should be 1." + assert ( + x.shape[len(x.shape) - 1] == ln_scale.shape[0] + ), "The dim of ln_scale must equal to the last dim of x." if ln_bias is not None: - assert len( - ln_bias.shape) == 1, "The dims of the shape of ln_bias should be 1." - assert x.shape[len(x.shape) - 1] == ln_bias.shape[ - 0], "The dim of ln_bias must equal to the last dim of x." + assert ( + len(ln_bias.shape) == 1 + ), "The dims of the shape of ln_bias should be 1." + assert ( + x.shape[len(x.shape) - 1] == ln_bias.shape[0] + ), "The dim of ln_bias must equal to the last dim of x." 
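# A rough eager-mode sketch of the pseudo code shown in the fused_bias_dropout_residual_layer_norm
# docstring above, i.e. y = layer_norm(residual + dropout(bias + x)). It assumes only standard
# paddle.nn.functional ops and ignores the fused kernel's fixed dropout seeds and its extra
# mask/mean/variance outputs; it is an illustration of the semantics, not the fused implementation.
import paddle
import paddle.nn.functional as F

def unfused_reference(x, residual, bias=None, ln_scale=None, ln_bias=None,
                      dropout_rate=0.5, ln_epsilon=1e-5, training=True):
    # bias add (skipped when no bias is given), then dropout in 'upscale_in_train' mode
    out = x if bias is None else x + bias
    out = F.dropout(out, p=dropout_rate, training=training, mode='upscale_in_train')
    # residual add followed by layer norm over the last dimension
    out = residual + out
    return F.layer_norm(out, out.shape[-1], weight=ln_scale, bias=ln_bias,
                        epsilon=ln_epsilon)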
if _non_static_mode(): if default_main_program().random_seed != 0: seed = default_main_program().random_seed - _, _, _, _, final_out = _legacy_C_ops.fused_bias_dropout_residual_layer_norm( - x, residual, bias, ln_scale, ln_bias, 'dropout_rate', dropout_rate, - 'ln_epsilon', ln_epsilon, 'is_test', not training, - 'dropout_fix_seed', seed is not None, 'dropout_seed', - seed if seed is not None else 0, 'dropout_implementation', mode) + ( + _, + _, + _, + _, + final_out, + ) = _legacy_C_ops.fused_bias_dropout_residual_layer_norm( + x, + residual, + bias, + ln_scale, + ln_bias, + 'dropout_rate', + dropout_rate, + 'ln_epsilon', + ln_epsilon, + 'is_test', + not training, + 'dropout_fix_seed', + seed is not None, + 'dropout_seed', + seed if seed is not None else 0, + 'dropout_implementation', + mode, + ) return final_out else: - helper = LayerHelper('fused_bias_dropout_residual_layer_norm', - **locals()) + helper = LayerHelper( + 'fused_bias_dropout_residual_layer_norm', **locals() + ) dtype = x.dtype # check dtypes - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'fused_bias_dropout_residual_layer_norm') - check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], - 'fused_bias_dropout_residual_layer_norm') + check_variable_and_dtype( + x, + 'x', + ['float16', 'float32', 'float64'], + 'fused_bias_dropout_residual_layer_norm', + ) + check_dtype( + dtype, + 'dtype', + ['float16', 'float32', 'float64'], + 'fused_bias_dropout_residual_layer_norm', + ) # set inputs inputs = dict() inputs['X'] = [x] @@ -345,50 +433,57 @@ def fused_bias_dropout_residual_layer_norm(x, } # set outputs dropout_mask_out = helper.create_variable_for_type_inference( - dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True + ) ln_mean_out = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True + ) ln_variance_out = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True + ) bias_dropout_residual_out = helper.create_variable_for_type_inference( - dtype=dtype) + dtype=dtype + ) final_out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type='fused_bias_dropout_residual_layer_norm', - inputs=inputs, - outputs={ - "BiasDropoutResidualOut": - bias_dropout_residual_out, - "DropoutMaskOut": dropout_mask_out, - "LnMean": ln_mean_out, - "LnVariance": ln_variance_out, - 'Y': final_out, - }, - attrs=attrs) + helper.append_op( + type='fused_bias_dropout_residual_layer_norm', + inputs=inputs, + outputs={ + "BiasDropoutResidualOut": bias_dropout_residual_out, + "DropoutMaskOut": dropout_mask_out, + "LnMean": ln_mean_out, + "LnVariance": ln_variance_out, + 'Y': final_out, + }, + attrs=attrs, + ) return final_out -def fused_multi_head_attention(x, - qkv_weight, - linear_weight, - pre_layer_norm=False, - pre_ln_scale=None, - pre_ln_bias=None, - ln_scale=None, - ln_bias=None, - pre_ln_epsilon=1e-05, - qkv_bias=None, - linear_bias=None, - cache_kv=None, - attn_mask=None, - dropout_rate=0.5, - attn_dropout_rate=0.5, - ln_epsilon=1e-05, - training=True, - mode='upscale_in_train', - ring_id=-1, - add_residual=True, - name=None): +def fused_multi_head_attention( + x, + qkv_weight, + linear_weight, + pre_layer_norm=False, + pre_ln_scale=None, + pre_ln_bias=None, + ln_scale=None, + ln_bias=None, + pre_ln_epsilon=1e-05, + qkv_bias=None, + linear_bias=None, + cache_kv=None, + attn_mask=None, + dropout_rate=0.5, + attn_dropout_rate=0.5, 
+ ln_epsilon=1e-05, + training=True, + mode='upscale_in_train', + ring_id=-1, + add_residual=True, + name=None, +): r""" Attention mapps queries and a set of key-value pairs to outputs, and Multi-Head Attention performs multiple parallel attention to jointly attending @@ -512,7 +607,9 @@ def fused_multi_head_attention(x, raise ValueError( "mode argument should be 'downscale_in_infer' or 'upscale_in_train'" ) - mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + mode = ( + 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode + ) # semantic transfer if _non_static_mode(): if default_main_program().random_seed != 0: @@ -520,29 +617,83 @@ def fused_multi_head_attention(x, # pre_ln_mean, pre_ln_variance, pre_ln_out, qkv_out, qkv_bias_out, transpose_out, qk_out, # qktv_out, softmax_out, attn_dropout_mask_out, attn_dropout_out, attn_mask_out, fmha_out, # linear_out, dropout_mask_out, ln_mean_out, ln_var_out, bias_dropout_residual_out, final_out - assert len(qkv_weight.shape - ) == 4, "The dims of the shape of qkv_weight should be 4." - assert qkv_weight.shape[ - 0] == 3, "The shape of qkv_weight should be [3, num_head, head_dim, embed_dim]." - assert qkv_weight.shape[3] == x.shape[ - 2], "The 3rd dim of qkv_weight and 2nd dim of x should be the same, i.e., embed_dim." + assert ( + len(qkv_weight.shape) == 4 + ), "The dims of the shape of qkv_weight should be 4." + assert ( + qkv_weight.shape[0] == 3 + ), "The shape of qkv_weight should be [3, num_head, head_dim, embed_dim]." + assert ( + qkv_weight.shape[3] == x.shape[2] + ), "The 3rd dim of qkv_weight and 2nd dim of x should be the same, i.e., embed_dim." if ring_id == -1: # under mp, the num head will be split, this equation will not hold - assert qkv_weight.shape[1] * qkv_weight.shape[2] == qkv_weight.shape[ - 3], "embed_dim must be divisible by num_heads." - - _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, cache_kv_out, final_out = _legacy_C_ops.fused_attention( - x, pre_ln_scale, pre_ln_bias, qkv_weight, qkv_bias, cache_kv, - attn_mask, linear_weight, linear_bias, ln_scale, ln_bias, - 'pre_layer_norm', pre_layer_norm, 'epsilon', pre_ln_epsilon, - 'dropout_rate', dropout_rate, 'attn_dropout_rate', - attn_dropout_rate, 'ln_epsilon', ln_epsilon, 'is_test', - not training, 'attn_dropout_fix_seed', seed is not None, - 'dropout_fix_seed', seed is not None, 'attn_dropout_seed', - seed if seed is not None else 0, 'dropout_seed', - seed if seed is not None else 0, 'attn_dropout_implementation', - mode, 'dropout_implementation', mode, 'add_residual', add_residual, - 'ring_id', ring_id) + assert ( + qkv_weight.shape[1] * qkv_weight.shape[2] == qkv_weight.shape[3] + ), "embed_dim must be divisible by num_heads." 
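# A minimal usage sketch for fused_multi_head_attention, sized from the shape checks above:
# x is [batch, seq_len, embed_dim], qkv_weight is [3, num_heads, head_dim, embed_dim] with
# num_heads * head_dim == embed_dim, and linear_weight is [embed_dim, embed_dim]. The concrete
# sizes are illustrative only, and the op runs only on GPU builds that include the fused kernels.
import paddle
import paddle.incubate.nn.functional as F

x = paddle.rand(shape=(2, 4, 128), dtype="float32")               # [batch, seq_len, embed_dim]
qkv_weight = paddle.rand(shape=(3, 2, 64, 128), dtype="float32")  # [3, num_heads, head_dim, embed_dim]
qkv_bias = paddle.rand(shape=(3, 2, 64), dtype="float32")
linear_weight = paddle.rand(shape=(128, 128), dtype="float32")
linear_bias = paddle.rand(shape=(128,), dtype="float32")
attn_mask = paddle.rand(shape=(2, 2, 4, 4), dtype="float32")      # [batch, num_heads, seq_len, seq_len]
out = F.fused_multi_head_attention(
    x, qkv_weight, linear_weight,
    qkv_bias=qkv_bias, linear_bias=linear_bias, attn_mask=attn_mask)
print(out.shape)  # [2, 4, 128]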
+ + ( + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + cache_kv_out, + final_out, + ) = _legacy_C_ops.fused_attention( + x, + pre_ln_scale, + pre_ln_bias, + qkv_weight, + qkv_bias, + cache_kv, + attn_mask, + linear_weight, + linear_bias, + ln_scale, + ln_bias, + 'pre_layer_norm', + pre_layer_norm, + 'epsilon', + pre_ln_epsilon, + 'dropout_rate', + dropout_rate, + 'attn_dropout_rate', + attn_dropout_rate, + 'ln_epsilon', + ln_epsilon, + 'is_test', + not training, + 'attn_dropout_fix_seed', + seed is not None, + 'dropout_fix_seed', + seed is not None, + 'attn_dropout_seed', + seed if seed is not None else 0, + 'dropout_seed', + seed if seed is not None else 0, + 'attn_dropout_implementation', + mode, + 'dropout_implementation', + mode, + 'add_residual', + add_residual, + 'ring_id', + ring_id, + ) if cache_kv is not None: return final_out, cache_kv_out return final_out @@ -550,10 +701,18 @@ def fused_multi_head_attention(x, helper = LayerHelper('fused_multi_head_attention', **locals()) dtype = x.dtype # check dtypes - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'fused_multihead_attention') - check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], - 'fused_multi_head_attention') + check_variable_and_dtype( + x, + 'x', + ['float16', 'float32', 'float64'], + 'fused_multihead_attention', + ) + check_dtype( + dtype, + 'dtype', + ['float16', 'float32', 'float64'], + 'fused_multi_head_attention', + ) # set inputs inputs = dict() @@ -573,7 +732,8 @@ def fused_multi_head_attention(x, inputs['Ln2Scale'] = [ln_scale] if ln_bias: inputs['Ln2Bias'] = [ln_bias] - if cache_kv: inputs['CacheKV'] = [cache_kv] + if cache_kv: + inputs['CacheKV'] = [cache_kv] if (seed is None or seed == 0) and helper.main_program.random_seed != 0: seed = helper.main_program.random_seed @@ -593,14 +753,16 @@ def fused_multi_head_attention(x, 'attn_dropout_implementation': mode, 'dropout_implementation': mode, 'add_residual': add_residual, - 'ring_id': ring_id + 'ring_id': ring_id, } # set outputs pre_ln_mean_out = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True + ) pre_ln_variance_out = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True + ) pre_ln_out = helper.create_variable_for_type_inference(dtype=dtype) qkv_out = helper.create_variable_for_type_inference(dtype=dtype) @@ -611,78 +773,87 @@ def fused_multi_head_attention(x, qktv_out = helper.create_variable_for_type_inference(dtype=dtype) softmax_out = helper.create_variable_for_type_inference(dtype=dtype) attn_dropout_mask_out = helper.create_variable_for_type_inference( - dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True + ) attn_dropout_out = helper.create_variable_for_type_inference( - dtype=dtype) + dtype=dtype + ) attn_mask_out = helper.create_variable_for_type_inference(dtype=dtype) fmha_out = helper.create_variable_for_type_inference(dtype=dtype) out_linear_out = helper.create_variable_for_type_inference(dtype=dtype) dropout_mask_out = helper.create_variable_for_type_inference( - dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True + ) ln_mean_out = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + dtype=dtype, stop_gradient=True + ) ln_variance_out = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=True) + 
dtype=dtype, stop_gradient=True + ) bias_dropout_residual_out = helper.create_variable_for_type_inference( - dtype=dtype) + dtype=dtype + ) final_out = helper.create_variable_for_type_inference(dtype=dtype) cache_kv_out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type='fused_attention', - inputs=inputs, - outputs={ - "LnMean": pre_ln_mean_out, - "LnVariance": pre_ln_variance_out, - "LnOut": pre_ln_out, - "QKVOut": qkv_out, - "QKVBiasOut": qkv_bias_out, - "TransposeOut2": transpose_out, - "QKOut": qk_out, - "QKTVOut": qktv_out, - "SoftmaxOut": softmax_out, - "AttnDropoutMaskOut": attn_dropout_mask_out, - "AttnDropoutOut": attn_dropout_out, - "SrcMaskOut": attn_mask_out, - "FMHAOut": fmha_out, - "OutLinearOut": out_linear_out, - "DropoutMaskOut": dropout_mask_out, - "Ln2Mean": ln_mean_out, - "Ln2Variance": ln_variance_out, - "BiasDropoutResidualOut": - bias_dropout_residual_out, - 'Y': final_out, - 'CacheKVOut': cache_kv_out - }, - attrs=attrs) + helper.append_op( + type='fused_attention', + inputs=inputs, + outputs={ + "LnMean": pre_ln_mean_out, + "LnVariance": pre_ln_variance_out, + "LnOut": pre_ln_out, + "QKVOut": qkv_out, + "QKVBiasOut": qkv_bias_out, + "TransposeOut2": transpose_out, + "QKOut": qk_out, + "QKTVOut": qktv_out, + "SoftmaxOut": softmax_out, + "AttnDropoutMaskOut": attn_dropout_mask_out, + "AttnDropoutOut": attn_dropout_out, + "SrcMaskOut": attn_mask_out, + "FMHAOut": fmha_out, + "OutLinearOut": out_linear_out, + "DropoutMaskOut": dropout_mask_out, + "Ln2Mean": ln_mean_out, + "Ln2Variance": ln_variance_out, + "BiasDropoutResidualOut": bias_dropout_residual_out, + 'Y': final_out, + 'CacheKVOut': cache_kv_out, + }, + attrs=attrs, + ) return (final_out, cache_kv_out) if cache_kv else final_out -def fused_multi_transformer(x, - ln_scales, - ln_biases, - qkv_weights, - qkv_biases, - linear_weights, - linear_biases, - ffn_ln_scales, - ffn_ln_biases, - ffn1_weights, - ffn1_biases, - ffn2_weights, - ffn2_biases, - pre_layer_norm=True, - epsilon=1e-05, - cache_kvs=None, - time_step=None, - attn_mask=None, - dropout_rate=0.0, - activation="gelu", - training=False, - mode='upscale_in_train', - trans_qkvw=True, - ring_id=-1, - name=None): +def fused_multi_transformer( + x, + ln_scales, + ln_biases, + qkv_weights, + qkv_biases, + linear_weights, + linear_biases, + ffn_ln_scales, + ffn_ln_biases, + ffn1_weights, + ffn1_biases, + ffn2_weights, + ffn2_biases, + pre_layer_norm=True, + epsilon=1e-05, + cache_kvs=None, + time_step=None, + attn_mask=None, + dropout_rate=0.0, + activation="gelu", + training=False, + mode='upscale_in_train', + trans_qkvw=True, + ring_id=-1, + name=None, +): r""" This is a fusion operator to compute multi transformer layers in transformer model architecture. This operator only supports running on GPU. 
The function of the transformer layer is consistent @@ -776,7 +947,6 @@ def fused_multi_transformer(x, # required: gpu import paddle import paddle.incubate.nn.functional as F - import numpy as np # input: [batch_size, seq_len, embed_dim] x = paddle.rand(shape=(2, 4, 128), dtype="float32") @@ -821,17 +991,46 @@ def fused_multi_transformer(x, raise ValueError( "mode argument should be 'downscale_in_infer' or 'upscale_in_train'" ) - mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + mode = ( + 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode + ) # semantic transfer if _non_static_mode(): cache_kv_out, final_out = _legacy_C_ops.fused_multi_transformer( - x, ln_scales, ln_biases, qkv_weights, qkv_biases, cache_kvs, - time_step, attn_mask, linear_weights, linear_biases, ffn_ln_scales, - ffn_ln_biases, ffn1_weights, ffn1_biases, ffn2_weights, ffn2_biases, - cache_kvs, 'pre_layer_norm', pre_layer_norm, 'epsilon', epsilon, - 'dropout_rate', dropout_rate, 'is_test', not training, - 'dropout_implementation', mode, 'act_method', activation, - 'trans_qkvw', trans_qkvw, 'ring_id', ring_id) + x, + ln_scales, + ln_biases, + qkv_weights, + qkv_biases, + cache_kvs, + time_step, + attn_mask, + linear_weights, + linear_biases, + ffn_ln_scales, + ffn_ln_biases, + ffn1_weights, + ffn1_biases, + ffn2_weights, + ffn2_biases, + cache_kvs, + 'pre_layer_norm', + pre_layer_norm, + 'epsilon', + epsilon, + 'dropout_rate', + dropout_rate, + 'is_test', + not training, + 'dropout_implementation', + mode, + 'act_method', + activation, + 'trans_qkvw', + trans_qkvw, + 'ring_id', + ring_id, + ) if cache_kvs is not None: return final_out, cache_kv_out return final_out @@ -839,10 +1038,12 @@ def fused_multi_transformer(x, helper = LayerHelper('fused_multi_transformer', **locals()) dtype = x.dtype # check dtypes - check_variable_and_dtype(x, 'x', ['float16', 'float32'], - 'fused_multi_transformer') - check_dtype(dtype, 'dtype', ['float16', 'float32'], - 'fused_multi_transformer') + check_variable_and_dtype( + x, 'x', ['float16', 'float32'], 'fused_multi_transformer' + ) + check_dtype( + dtype, 'dtype', ['float16', 'float32'], 'fused_multi_transformer' + ) # set inputs inputs = dict() @@ -880,7 +1081,7 @@ def fused_multi_transformer(x, 'dropout_implementation': mode, 'act_method': activation, 'trans_qkvw': trans_qkvw, - 'ring_id': ring_id + 'ring_id': ring_id, } outputs = dict() @@ -890,9 +1091,11 @@ def fused_multi_transformer(x, # NOTE: inplace outputs['CacheKVOut'] = cache_kvs - helper.append_op(type='fused_multi_transformer', - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type='fused_multi_transformer', + inputs=inputs, + outputs=outputs, + attrs=attrs, + ) return (final_out, cache_kvs) if cache_kvs else final_out diff --git a/python/paddle/incubate/nn/layer/fused_transformer.py b/python/paddle/incubate/nn/layer/fused_transformer.py index ba14ac5b865299..c3655c9d93a274 100644 --- a/python/paddle/incubate/nn/layer/fused_transformer.py +++ b/python/paddle/incubate/nn/layer/fused_transformer.py @@ -16,7 +16,10 @@ from paddle.nn import Layer from paddle.framework import ParamAttr import paddle -from paddle.nn.layer.transformer import _convert_attention_mask, _convert_param_attr_to_list +from paddle.nn.layer.transformer import ( + _convert_attention_mask, + _convert_param_attr_to_list, +) from paddle.nn.initializer import Constant from paddle.fluid.dygraph import no_grad from paddle.fluid.framework import convert_np_dtype_to_dtype_, _non_static_mode @@ 
-51,7 +54,8 @@ def _to_dtype(t, dtype): if t.place.is_gpu_place(): size_dtype = core.size_of_dtype(dtype) waiting_alloc_memory = ( - (np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 + ((np.prod(t.shape) * size_dtype) / 256 + 1) * 256 * 1.2 + ) gpu_memory_available = core.gpu_memory_available() if gpu_memory_available < waiting_alloc_memory: t_used = t._copy_to(paddle.CPUPlace(), False) @@ -106,31 +110,38 @@ class FusedBiasDropoutResidualLayerNorm(Layer): output = fused_bias_dropout_residual_ln(x, residual) # [2, 4, 128] """ - def __init__(self, - embed_dim, - dropout_rate=0.5, - weight_attr=None, - bias_attr=None, - epsilon=1e-5, - name=None): + def __init__( + self, + embed_dim, + dropout_rate=0.5, + weight_attr=None, + bias_attr=None, + epsilon=1e-5, + name=None, + ): super(FusedBiasDropoutResidualLayerNorm, self).__init__() - assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but recieved {}".format(embed_dim)) + assert embed_dim > 0, ( + "Expected embed_dim to be greater than 0, " + "but recieved {}".format(embed_dim) + ) self._dtype = self._helper.get_default_dtype() self._bias_attr = bias_attr self._weight_attr = weight_attr self.embed_dim = embed_dim - self.linear_bias = self.create_parameter(shape=[embed_dim], - attr=self._bias_attr, - dtype=self._dtype, - is_bias=True) + self.linear_bias = self.create_parameter( + shape=[embed_dim], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True, + ) self.ln_scale = self.create_parameter( attr=self._weight_attr, shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.ln_bias = self.create_parameter(attr=self._bias_attr, - shape=[embed_dim], - is_bias=True) + default_initializer=Constant(value=1.0), + ) + self.ln_bias = self.create_parameter( + attr=self._bias_attr, shape=[embed_dim], is_bias=True + ) self.dropout_rate = dropout_rate self._epsilon = epsilon @@ -163,14 +174,20 @@ def forward(self, x, residual): ln_epsilon=self._epsilon, training=self.training, mode='upscale_in_train', - name=self.name) + name=self.name, + ) return out def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' return 'embed_dim={}, seq_len={}, dropout_rate={}, epsilon={}, dtype={}{}'.format( - self.embed_dim, self.seq_len, self.dropout_rate, self._epsilon, - self._dtype, name_str) + self.embed_dim, + self.seq_len, + self.dropout_rate, + self._epsilon, + self._dtype, + name_str, + ) class FusedMultiHeadAttention(Layer): @@ -246,33 +263,40 @@ class FusedMultiHeadAttention(Layer): output = multi_head_attn(query, None, None, attn_mask=attn_mask) # [2, 4, 128] """ - def __init__(self, - embed_dim, - num_heads, - dropout_rate=0.5, - attn_dropout_rate=0.5, - kdim=None, - vdim=None, - normalize_before=False, - need_weights=False, - qkv_weight_attr=None, - qkv_bias_attr=None, - linear_weight_attr=None, - linear_bias_attr=None, - pre_ln_scale_attr=None, - pre_ln_bias_attr=None, - ln_scale_attr=None, - ln_bias_attr=None, - epsilon=1e-5, - nranks=1, - ring_id=-1, - name=None): + def __init__( + self, + embed_dim, + num_heads, + dropout_rate=0.5, + attn_dropout_rate=0.5, + kdim=None, + vdim=None, + normalize_before=False, + need_weights=False, + qkv_weight_attr=None, + qkv_bias_attr=None, + linear_weight_attr=None, + linear_bias_attr=None, + pre_ln_scale_attr=None, + pre_ln_bias_attr=None, + ln_scale_attr=None, + ln_bias_attr=None, + epsilon=1e-5, + nranks=1, + ring_id=-1, + name=None, + ): super(FusedMultiHeadAttention, self).__init__() - assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " 
- "but received {}".format(embed_dim)) - assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but received {}".format(num_heads)) + assert embed_dim > 0, ( + "Expected embed_dim to be greater than 0, " + "but received {}".format(embed_dim) + ) + assert ( + num_heads > 0 + ), "Expected nhead to be greater than 0, " "but received {}".format( + num_heads + ) self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() @@ -285,7 +309,9 @@ def __init__(self, self.kdim = kdim self.vdim = vdim self.need_weights = need_weights - assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + assert ( + self.head_dim * num_heads == embed_dim + ), "embed_dim must be divisible by num_heads" assert need_weights is False, "Only support need_weight is False now." # tensor model parallel @@ -296,21 +322,26 @@ def __init__(self, shape=[3, num_heads, self.head_dim, embed_dim], attr=qkv_weight_attr, dtype=self._dtype, - is_bias=False) + is_bias=False, + ) self.qkv_bias = self.create_parameter( shape=[3, num_heads, self.head_dim], attr=qkv_bias_attr, dtype=self._dtype, - is_bias=True) + is_bias=True, + ) self.linear_weight = self.create_parameter( shape=[num_heads * self.head_dim, embed_dim], attr=linear_weight_attr, dtype=self._dtype, - is_bias=False) - self.linear_bias = self.create_parameter(shape=[embed_dim], - attr=linear_bias_attr, - dtype=self._dtype, - is_bias=True) + is_bias=False, + ) + self.linear_bias = self.create_parameter( + shape=[embed_dim], + attr=linear_bias_attr, + dtype=self._dtype, + is_bias=True, + ) # tensor model parallel if nranks > 1: @@ -325,10 +356,11 @@ def __init__(self, self.pre_ln_scale = self.create_parameter( attr=pre_ln_scale_attr, shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.pre_ln_bias = self.create_parameter(attr=pre_ln_bias_attr, - shape=[embed_dim], - is_bias=True) + default_initializer=Constant(value=1.0), + ) + self.pre_ln_bias = self.create_parameter( + attr=pre_ln_bias_attr, shape=[embed_dim], is_bias=True + ) self.ln_scale = None self.ln_bias = None else: @@ -337,10 +369,11 @@ def __init__(self, self.ln_scale = self.create_parameter( attr=ln_scale_attr, shape=[embed_dim], - default_initializer=Constant(value=1.0)) - self.ln_bias = self.create_parameter(attr=ln_bias_attr, - shape=[embed_dim], - is_bias=True) + default_initializer=Constant(value=1.0), + ) + self.ln_bias = self.create_parameter( + attr=ln_bias_attr, shape=[embed_dim], is_bias=True + ) self.dropout_rate = dropout_rate self.attn_dropout_rate = attn_dropout_rate @@ -404,15 +437,25 @@ def forward(self, query, key=None, value=None, attn_mask=None, cache=None): ln_epsilon=self._epsilon, training=self.training, ring_id=self._ring_id, - name=self.name) + name=self.name, + ) return out def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' return 'embed_dim={}, num_heads={}, dropout_rate={}, attn_dropout_rate={}, epsilon={}, kdim={}, vdim={}, normalize_before={}, need_weights={}, dtype={}{}'.format( - self.embed_dim, self.num_heads, self.dropout_rate, - self.attn_dropout_rate, self._epsilon, self.kdim, self.vdim, - self.normalize_before, self.need_weights, self._dtype, name_str) + self.embed_dim, + self.num_heads, + self.dropout_rate, + self.attn_dropout_rate, + self._epsilon, + self.kdim, + self.vdim, + self.normalize_before, + self.need_weights, + self._dtype, + name_str, + ) def _amp_decorate(self, dtype): # tmp fix for amp.decorator(O2) @@ -495,33 +538,39 @@ class 
FusedFeedForward(Layer): # (1, 8, 8) """ - def __init__(self, - d_model, - dim_feedforward, - dropout_rate=0.1, - epsilon=1e-05, - activation="relu", - act_dropout_rate=None, - normalize_before=False, - linear1_weight_attr=None, - linear1_bias_attr=None, - linear2_weight_attr=None, - linear2_bias_attr=None, - ln1_scale_attr=None, - ln1_bias_attr=None, - ln2_scale_attr=None, - ln2_bias_attr=None, - nranks=1, - ring_id=-1, - name=None): + def __init__( + self, + d_model, + dim_feedforward, + dropout_rate=0.1, + epsilon=1e-05, + activation="relu", + act_dropout_rate=None, + normalize_before=False, + linear1_weight_attr=None, + linear1_bias_attr=None, + linear2_weight_attr=None, + linear2_bias_attr=None, + ln1_scale_attr=None, + ln1_bias_attr=None, + ln2_scale_attr=None, + ln2_bias_attr=None, + nranks=1, + ring_id=-1, + name=None, + ): super(FusedFeedForward, self).__init__() - assert d_model > 0, ( - "Expected d_model to be greater than 0, but received {}".format( - d_model)) - assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but received {}". - format(dim_feedforward)) + assert ( + d_model > 0 + ), "Expected d_model to be greater than 0, but received {}".format( + d_model + ) + assert ( + dim_feedforward > 0 + ), "Expected dim_feedforward to be greater than 0, but received {}".format( + dim_feedforward + ) self._dtype = self._helper.get_default_dtype() self._d_model = d_model @@ -530,7 +579,9 @@ def __init__(self, dim_feedforward = dim_feedforward // nranks self._dim_feedforward = dim_feedforward self._dropout_rate = dropout_rate - self._act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate + self._act_dropout_rate = ( + dropout_rate if act_dropout_rate is None else act_dropout_rate + ) self._act_method = activation self._normalize_before = normalize_before self._epsilon = epsilon @@ -540,22 +591,28 @@ def __init__(self, shape=[d_model, dim_feedforward], attr=linear1_weight_attr, dtype=self._dtype, - is_bias=False) - self._linear1_bias = self.create_parameter(shape=[dim_feedforward], - attr=linear1_bias_attr, - dtype=self._dtype, - is_bias=True) + is_bias=False, + ) + self._linear1_bias = self.create_parameter( + shape=[dim_feedforward], + attr=linear1_bias_attr, + dtype=self._dtype, + is_bias=True, + ) self._linear2_weight = self.create_parameter( shape=[dim_feedforward, d_model], attr=linear2_weight_attr, dtype=self._dtype, - is_bias=False) + is_bias=False, + ) - self._linear2_bias = self.create_parameter(shape=[d_model], - attr=linear2_bias_attr, - dtype=self._dtype, - is_bias=True) + self._linear2_bias = self.create_parameter( + shape=[d_model], + attr=linear2_bias_attr, + dtype=self._dtype, + is_bias=True, + ) if nranks > 1: assert ring_id != -1 @@ -569,10 +626,11 @@ def __init__(self, shape=[d_model], attr=ln1_scale_attr, is_bias=False, - default_initializer=Constant(1.0)) - self._ln1_bias = self.create_parameter(shape=[d_model], - attr=ln1_bias_attr, - is_bias=True) + default_initializer=Constant(1.0), + ) + self._ln1_bias = self.create_parameter( + shape=[d_model], attr=ln1_bias_attr, is_bias=True + ) self._ln2_scale = None self._ln2_bias = None else: @@ -582,10 +640,11 @@ def __init__(self, shape=[d_model], attr=ln2_scale_attr, is_bias=False, - default_initializer=Constant(1.0)) - self._ln2_bias = self.create_parameter(shape=[d_model], - attr=ln2_bias_attr, - is_bias=True) + default_initializer=Constant(1.0), + ) + self._ln2_bias = self.create_parameter( + shape=[d_model], attr=ln2_bias_attr, is_bias=True + ) self.name = 
name @@ -608,15 +667,23 @@ def forward(self, src, cache=None): pre_layer_norm=self._normalize_before, training=self.training, ring_id=self._ring_id, - name=self.name) + name=self.name, + ) return out def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' return 'd_model={}, dim_feedforward={}, dropout_rate={}, epsilon={}, activation={}, act_dropout_rate={}, normalize_before={}, dtype={}{}'.format( - self._d_model, self._dim_feedforward, self._dropout_rate, - self._epsilon, self._act_method, self._act_dropout_rate, - self._normalize_before, self._dtype, name_str) + self._d_model, + self._dim_feedforward, + self._dropout_rate, + self._epsilon, + self._act_method, + self._act_dropout_rate, + self._normalize_before, + self._dtype, + name_str, + ) def _amp_decorate(self, dtype): # tmp fix for amp.decorator(O2) @@ -640,6 +707,7 @@ def _amp_decorate(self, dtype): class FusedTransformerEncoderLayer(Layer): """ + FusedTransformerEncoderLayer is composed of two sub-layers which are self (multi-head) attention and feedforward network. Before and after each sub-layer, pre-process and post-precess would be applied on the input and output accordingly. If @@ -681,10 +749,9 @@ class FusedTransformerEncoderLayer(Layer): Examples: - .. code-block:: python - # required: gpu + # required: gpu import paddle from paddle.incubate.nn import FusedTransformerEncoderLayer @@ -694,33 +761,47 @@ class FusedTransformerEncoderLayer(Layer): attn_mask = paddle.rand((2, 2, 4, 4)) encoder_layer = FusedTransformerEncoderLayer(128, 2, 512) enc_output = encoder_layer(enc_input, attn_mask) # [2, 4, 128] + """ - def __init__(self, - d_model, - nhead, - dim_feedforward, - dropout_rate=0.1, - activation="relu", - attn_dropout_rate=None, - act_dropout_rate=None, - normalize_before=False, - weight_attr=None, - bias_attr=None): + def __init__( + self, + d_model, + nhead, + dim_feedforward, + dropout_rate=0.1, + activation="relu", + attn_dropout_rate=None, + act_dropout_rate=None, + normalize_before=False, + weight_attr=None, + bias_attr=None, + ): self._config = locals() self._config.pop("self") self._config.pop("__class__", None) # py3 super(FusedTransformerEncoderLayer, self).__init__() - assert d_model > 0, ("Expected d_model to be greater than 0, " - "but received {}".format(d_model)) - assert nhead > 0, ("Expected nhead to be greater than 0, " - "but received {}".format(nhead)) + assert ( + d_model > 0 + ), "Expected d_model to be greater than 0, " "but received {}".format( + d_model + ) + assert ( + nhead > 0 + ), "Expected nhead to be greater than 0, " "but received {}".format( + nhead + ) assert dim_feedforward > 0, ( "Expected dim_feedforward to be greater than 0, " - "but received {}".format(dim_feedforward)) - attn_dropout_rate = dropout_rate if attn_dropout_rate is None else attn_dropout_rate - act_dropout_rate = dropout_rate if act_dropout_rate is None else act_dropout_rate + "but received {}".format(dim_feedforward) + ) + attn_dropout_rate = ( + dropout_rate if attn_dropout_rate is None else attn_dropout_rate + ) + act_dropout_rate = ( + dropout_rate if act_dropout_rate is None else act_dropout_rate + ) self.normalize_before = normalize_before weight_attrs = _convert_param_attr_to_list(weight_attr, 2) @@ -739,22 +820,27 @@ def __init__(self, pre_ln_scale_attr=weight_attrs[0], pre_ln_bias_attr=bias_attrs[0], ln_scale_attr=weight_attrs[0], - ln_bias_attr=bias_attrs[0]) - - self.ffn = FusedFeedForward(d_model, - dim_feedforward, - dropout_rate=dropout_rate, - activation=activation, - 
act_dropout_rate=act_dropout_rate, - normalize_before=self.normalize_before, - linear1_weight_attr=weight_attrs[1], - linear1_bias_attr=bias_attrs[1], - linear2_weight_attr=weight_attrs[1], - linear2_bias_attr=bias_attrs[1]) + ln_bias_attr=bias_attrs[0], + ) + + self.ffn = FusedFeedForward( + d_model, + dim_feedforward, + dropout_rate=dropout_rate, + activation=activation, + act_dropout_rate=act_dropout_rate, + normalize_before=self.normalize_before, + linear1_weight_attr=weight_attrs[1], + linear1_bias_attr=bias_attrs[1], + linear2_weight_attr=weight_attrs[1], + linear2_bias_attr=bias_attrs[1], + ) def forward(self, src, src_mask=None, cache=None): """ + Applies a Transformer encoder layer on the input. + Parameters: src (Tensor): The input of Transformer encoder layer. It is a tensor with shape `[batch_size, sequence_length, d_model]`. @@ -770,25 +856,27 @@ def forward(self, src, src_mask=None, cache=None): `-INF` values and the others have 0 values. It can be None when nothing wanted or needed to be prevented attention to. Default None. cache (Tensor, optional): It is an instance of `MultiHeadAttention.Cache`. - See `TransformerEncoderLayer.gen_cache` for more details. It is + See :ref:`api_paddle_nn_TransformerEncoderLayer`.gen_cache for more details. It is only used for inference and should be None for training. Default None. + Returns: - Tensor|tuple: It is a tensor that has the same shape and data type \ + Tensor|tuple, It is a tensor that has the same shape and data type \ as `enc_input`, representing the output of Transformer encoder \ layer. Or a tuple if `cache` is not None, except for encoder \ layer output, the tuple includes the new cache which is same \ as input `cache` argument but `incremental_cache` has an \ incremental length. See `MultiHeadAttention.gen_cache` and \ `MultiHeadAttention.forward` for more details. 
+ """ src_mask = _convert_attention_mask(src_mask, src.dtype) if cache is None: attn_out = self.fused_attn(src, attn_mask=src_mask) else: - attn_out, incremental_cache = self.fused_attn(src, - attn_mask=src_mask, - cache=cache) + attn_out, incremental_cache = self.fused_attn( + src, attn_mask=src_mask, cache=cache + ) ffn_out = self.ffn(attn_out) @@ -889,21 +977,23 @@ class FusedTransformer(Layer): cross_attn_mask) # [2, 6, 128] """ - def __init__(self, - d_model=512, - nhead=8, - num_encoder_layers=6, - num_decoder_layers=6, - dim_feedforward=2048, - dropout=0.1, - activation="relu", - attn_dropout=None, - act_dropout=None, - normalize_before=False, - weight_attr=None, - bias_attr=None, - custom_encoder=None, - custom_decoder=None): + def __init__( + self, + d_model=512, + nhead=8, + num_encoder_layers=6, + num_decoder_layers=6, + dim_feedforward=2048, + dropout=0.1, + activation="relu", + attn_dropout=None, + act_dropout=None, + normalize_before=False, + weight_attr=None, + bias_attr=None, + custom_encoder=None, + custom_decoder=None, + ): super(fusedTransformer, self).__init__() raise NotImplementedError() @@ -1071,40 +1161,49 @@ class FusedMultiTransformer(Layer): enc_output = encoder_layers(enc_input, attn_mask) # [2, 4, 128] """ - def __init__(self, - embed_dim, - num_heads, - dim_feedforward, - dropout_rate=0.0, - activation="gelu", - normalize_before=True, - ln_scale_attrs=None, - ln_bias_attrs=None, - qkv_weight_attrs=None, - qkv_bias_attrs=None, - linear_weight_attrs=None, - linear_bias_attrs=None, - ffn_ln_scale_attrs=None, - ffn_ln_bias_attrs=None, - ffn1_weight_attrs=None, - ffn1_bias_attrs=None, - ffn2_weight_attrs=None, - ffn2_bias_attrs=None, - epsilon=1e-5, - num_layers=-1, - nranks=1, - trans_qkvw=True, - ring_id=-1, - name=None): + def __init__( + self, + embed_dim, + num_heads, + dim_feedforward, + dropout_rate=0.0, + activation="gelu", + normalize_before=True, + ln_scale_attrs=None, + ln_bias_attrs=None, + qkv_weight_attrs=None, + qkv_bias_attrs=None, + linear_weight_attrs=None, + linear_bias_attrs=None, + ffn_ln_scale_attrs=None, + ffn_ln_bias_attrs=None, + ffn1_weight_attrs=None, + ffn1_bias_attrs=None, + ffn2_weight_attrs=None, + ffn2_bias_attrs=None, + epsilon=1e-5, + num_layers=-1, + nranks=1, + trans_qkvw=True, + ring_id=-1, + name=None, + ): super(FusedMultiTransformer, self).__init__() - assert embed_dim > 0, ("Expected embed_dim to be greater than 0, " - "but received {}".format(embed_dim)) - assert num_heads > 0, ("Expected nhead to be greater than 0, " - "but received {}".format(num_heads)) - assert dim_feedforward > 0, ( - "Expected dim_feedforward to be greater than 0, but received {}". 
- format(dim_feedforward)) + assert embed_dim > 0, ( + "Expected embed_dim to be greater than 0, " + "but received {}".format(embed_dim) + ) + assert ( + num_heads > 0 + ), "Expected nhead to be greater than 0, " "but received {}".format( + num_heads + ) + assert ( + dim_feedforward > 0 + ), "Expected dim_feedforward to be greater than 0, but received {}".format( + dim_feedforward + ) self.normalize_before = normalize_before self._dtype = self._helper.get_default_dtype() @@ -1115,7 +1214,9 @@ def __init__(self, self.embed_dim = embed_dim self.num_heads = num_heads self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == embed_dim, "embed_dim must be divisible by num_heads" + assert ( + self.head_dim * num_heads == embed_dim + ), "embed_dim must be divisible by num_heads" # tensor model parallel if nranks > 1: @@ -1161,57 +1262,71 @@ def get_attr(attrs, idx): ln_scale = self.create_parameter( attr=ln_scale_attr, shape=[embed_dim], - default_initializer=Constant(value=1.0)) - ln_bias = self.create_parameter(attr=ln_bias_attr, - shape=[embed_dim], - is_bias=True) + default_initializer=Constant(value=1.0), + ) + ln_bias = self.create_parameter( + attr=ln_bias_attr, shape=[embed_dim], is_bias=True + ) qkv_weight = self.create_parameter( shape=[3, num_heads, self.head_dim, embed_dim] - if trans_qkvw else [embed_dim, 3, num_heads, self.head_dim], + if trans_qkvw + else [embed_dim, 3, num_heads, self.head_dim], attr=qkv_weight_attr, dtype=self._dtype, - is_bias=False) + is_bias=False, + ) qkv_bias = self.create_parameter( shape=[3, num_heads, self.head_dim], attr=qkv_bias_attr, dtype=self._dtype, - is_bias=True) + is_bias=True, + ) linear_weight = self.create_parameter( shape=[num_heads * self.head_dim, embed_dim], attr=linear_weight_attr, dtype=self._dtype, - is_bias=False) - linear_bias = self.create_parameter(shape=[embed_dim], - attr=linear_bias_attr, - dtype=self._dtype, - is_bias=True) + is_bias=False, + ) + linear_bias = self.create_parameter( + shape=[embed_dim], + attr=linear_bias_attr, + dtype=self._dtype, + is_bias=True, + ) ffn_ln_scale = self.create_parameter( shape=[embed_dim], attr=ffn_ln_scale_attr, is_bias=False, - default_initializer=Constant(1.0)) - ffn_ln_bias = self.create_parameter(shape=[embed_dim], - attr=ffn_ln_bias_attr, - is_bias=True) + default_initializer=Constant(1.0), + ) + ffn_ln_bias = self.create_parameter( + shape=[embed_dim], attr=ffn_ln_bias_attr, is_bias=True + ) ffn1_weight = self.create_parameter( shape=[embed_dim, dim_feedforward], attr=ffn1_weight_attr, dtype=self._dtype, - is_bias=False) - ffn1_bias = self.create_parameter(shape=[dim_feedforward], - attr=ffn1_bias_attr, - dtype=self._dtype, - is_bias=True) + is_bias=False, + ) + ffn1_bias = self.create_parameter( + shape=[dim_feedforward], + attr=ffn1_bias_attr, + dtype=self._dtype, + is_bias=True, + ) ffn2_weight = self.create_parameter( shape=[dim_feedforward, embed_dim], attr=ffn2_weight_attr, dtype=self._dtype, - is_bias=False) - ffn2_bias = self.create_parameter(shape=[embed_dim], - attr=ffn2_bias_attr, - dtype=self._dtype, - is_bias=True) + is_bias=False, + ) + ffn2_bias = self.create_parameter( + shape=[embed_dim], + attr=ffn2_bias_attr, + dtype=self._dtype, + is_bias=True, + ) # tensor model parallel if nranks > 1: @@ -1300,5 +1415,6 @@ def forward(self, src, attn_mask=None, caches=None, time_step=None): mode='upscale_in_train', trans_qkvw=self._trans_qkvw, ring_id=self._ring_id, - name=self.name) + name=self.name, + ) return out diff --git 
a/python/paddle/incubate/operators/graph_khop_sampler.py b/python/paddle/incubate/operators/graph_khop_sampler.py index 58e0fdafab6793..b23eb5e6305164 100644 --- a/python/paddle/incubate/operators/graph_khop_sampler.py +++ b/python/paddle/incubate/operators/graph_khop_sampler.py @@ -20,104 +20,134 @@ from paddle import _C_ops, _legacy_C_ops -def graph_khop_sampler(row, - colptr, - input_nodes, - sample_sizes, - sorted_eids=None, - return_eids=False, - name=None): +def graph_khop_sampler( + row, + colptr, + input_nodes, + sample_sizes, + sorted_eids=None, + return_eids=False, + name=None, +): """ + Graph Khop Sampler API. - This API is mainly used in Graph Learning domain, and the main purpose is to + This API is mainly used in Graph Learning domain, and the main purpose is to provide high performance graph khop sampling method with subgraph reindex step. For example, we get the CSC(Compressed Sparse Column) format of the input graph - edges as `row` and `colptr`, so as to covert graph data into a suitable format + edges as `row` and `colptr`, so as to covert graph data into a suitable format for sampling. And the `input_nodes` means the nodes we need to sample neighbors, and `sample_sizes` means the number of neighbors and number of layers we want - to sample. + to sample. Args: - row (Tensor): One of the components of the CSC format of the input graph, and + row (Tensor): One of the components of the CSC format of the input graph, and the shape should be [num_edges, 1] or [num_edges]. The available data type is int32, int64. colptr (Tensor): One of the components of the CSC format of the input graph, - and the shape should be [num_nodes + 1, 1] or [num_nodes]. + and the shape should be [num_nodes + 1, 1] or [num_nodes]. The data type should be the same with `row`. - input_nodes (Tensor): The input nodes we need to sample neighbors for, and the + input_nodes (Tensor): The input nodes we need to sample neighbors for, and the data type should be the same with `row`. sample_sizes (list|tuple): The number of neighbors and number of layers we want to sample. The data type should be int, and the shape should only have one dimension. - sorted_eids (Tensor): The sorted edge ids, should not be None when `return_eids` + sorted_eids (Tensor, optional): The sorted edge ids, should not be None when `return_eids` is True. The shape should be [num_edges, 1], and the data - type should be the same with `row`. - return_eids (bool): Whether to return the id of the sample edges. Default is False. + type should be the same with `row`. Default is None. + return_eids (bool, optional): Whether to return the id of the sample edges. Default is False. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - edge_src (Tensor): The src index of the output edges, also means the first column of - the edges. The shape is [num_sample_edges, 1] currently. - edge_dst (Tensor): The dst index of the output edges, also means the second column - of the edges. The shape is [num_sample_edges, 1] currently. - sample_index (Tensor): The original id of the input nodes and sampled neighbor nodes. - reindex_nodes (Tensor): The reindex id of the input nodes. - edge_eids (Tensor): Return the id of the sample edges if `return_eids` is True. + - edge_src (Tensor), The src index of the output edges, also means the first column of + the edges. The shape is [num_sample_edges, 1] currently. 
+ - edge_dst (Tensor), The dst index of the output edges, also means the second column + of the edges. The shape is [num_sample_edges, 1] currently. + - sample_index (Tensor), The original id of the input nodes and sampled neighbor nodes. + - reindex_nodes (Tensor), The reindex id of the input nodes. + - edge_eids (Tensor), Return the id of the sample edges if `return_eids` is True. Examples: - .. code-block:: python - import paddle + import paddle + + row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] + colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] + nodes = [0, 8, 1, 2] + sample_sizes = [2, 2] + row = paddle.to_tensor(row, dtype="int64") + colptr = paddle.to_tensor(colptr, dtype="int64") + nodes = paddle.to_tensor(nodes, dtype="int64") - row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] - colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] - nodes = [0, 8, 1, 2] - sample_sizes = [2, 2] - row = paddle.to_tensor(row, dtype="int64") - colptr = paddle.to_tensor(colptr, dtype="int64") - nodes = paddle.to_tensor(nodes, dtype="int64") - - edge_src, edge_dst, sample_index, reindex_nodes = \ - paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False) + edge_src, edge_dst, sample_index, reindex_nodes = paddle.incubate.graph_khop_sampler(row, colptr, nodes, sample_sizes, False) """ if _non_static_mode(): if return_eids: if sorted_eids is None: - raise ValueError(f"`sorted_eid` should not be None " - f"if return_eids is True.") - edge_src, edge_dst, sample_index, reindex_nodes, edge_eids = \ - _legacy_C_ops.graph_khop_sampler(row, sorted_eids, - colptr, input_nodes, - "sample_sizes", sample_sizes, - "return_eids", True) + raise ValueError( + f"`sorted_eid` should not be None " + f"if return_eids is True." + ) + ( + edge_src, + edge_dst, + sample_index, + reindex_nodes, + edge_eids, + ) = _legacy_C_ops.graph_khop_sampler( + row, + sorted_eids, + colptr, + input_nodes, + "sample_sizes", + sample_sizes, + "return_eids", + True, + ) return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids else: - edge_src, edge_dst, sample_index, reindex_nodes, _ = \ - _legacy_C_ops.graph_khop_sampler(row, None, - colptr, input_nodes, - "sample_sizes", sample_sizes, - "return_eids", False) + ( + edge_src, + edge_dst, + sample_index, + reindex_nodes, + _, + ) = _legacy_C_ops.graph_khop_sampler( + row, + None, + colptr, + input_nodes, + "sample_sizes", + sample_sizes, + "return_eids", + False, + ) return edge_src, edge_dst, sample_index, reindex_nodes - check_variable_and_dtype(row, "Row", ("int32", "int64"), - "graph_khop_sampler") + check_variable_and_dtype( + row, "Row", ("int32", "int64"), "graph_khop_sampler" + ) if return_eids: if sorted_eids is None: - raise ValueError(f"`sorted_eid` should not be None " - f"if return_eids is True.") - check_variable_and_dtype(sorted_eids, "Eids", ("int32", "int64"), - "graph_khop_sampler") + raise ValueError( + f"`sorted_eid` should not be None " f"if return_eids is True." 
+ ) + check_variable_and_dtype( + sorted_eids, "Eids", ("int32", "int64"), "graph_khop_sampler" + ) - check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), - "graph_khop_sampler") - check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), - "graph_khop_sampler") + check_variable_and_dtype( + colptr, "Col_Ptr", ("int32", "int64"), "graph_khop_sampler" + ) + check_variable_and_dtype( + input_nodes, "X", ("int32", "int64"), "graph_khop_sampler" + ) helper = LayerHelper("graph_khop_sampler", **locals()) edge_src = helper.create_variable_for_type_inference(dtype=row.dtype) @@ -125,24 +155,23 @@ def graph_khop_sampler(row, sample_index = helper.create_variable_for_type_inference(dtype=row.dtype) reindex_nodes = helper.create_variable_for_type_inference(dtype=row.dtype) edge_eids = helper.create_variable_for_type_inference(dtype=row.dtype) - helper.append_op(type="graph_khop_sampler", - inputs={ - "Row": row, - "Eids": sorted_eids, - "Col_Ptr": colptr, - "X": input_nodes - }, - outputs={ - "Out_Src": edge_src, - "Out_Dst": edge_dst, - "Sample_Index": sample_index, - "Reindex_X": reindex_nodes, - "Out_Eids": edge_eids - }, - attrs={ - "sample_sizes": sample_sizes, - "return_eids": return_eids - }) + helper.append_op( + type="graph_khop_sampler", + inputs={ + "Row": row, + "Eids": sorted_eids, + "Col_Ptr": colptr, + "X": input_nodes, + }, + outputs={ + "Out_Src": edge_src, + "Out_Dst": edge_dst, + "Sample_Index": sample_index, + "Reindex_X": reindex_nodes, + "Out_Eids": edge_eids, + }, + attrs={"sample_sizes": sample_sizes, "return_eids": return_eids}, + ) if return_eids: return edge_src, edge_dst, sample_index, reindex_nodes, edge_eids else: diff --git a/python/paddle/incubate/operators/graph_reindex.py b/python/paddle/incubate/operators/graph_reindex.py index e7e940c2750cca..f1c771ba45cdc9 100644 --- a/python/paddle/incubate/operators/graph_reindex.py +++ b/python/paddle/incubate/operators/graph_reindex.py @@ -21,18 +21,23 @@ import paddle.utils.deprecated as deprecated -@deprecated(since="2.4.0", - update_to="paddle.geometric.reindex_graph", - level=1, - reason="paddle.incubate.graph_reindex will be removed in future") -def graph_reindex(x, - neighbors, - count, - value_buffer=None, - index_buffer=None, - flag_buffer_hashtable=False, - name=None): +@deprecated( + since="2.4.0", + update_to="paddle.geometric.reindex_graph", + level=1, + reason="paddle.incubate.graph_reindex will be removed in future", +) +def graph_reindex( + x, + neighbors, + count, + value_buffer=None, + index_buffer=None, + flag_buffer_hashtable=False, + name=None, +): """ + Graph Reindex API. This API is mainly used in Graph Learning domain, which should be used @@ -40,11 +45,11 @@ def graph_reindex(x, is to reindex the ids information of the input nodes, and return the corresponding graph edges after reindex. - **Notes**: + Notes: The number in x should be unique, otherwise it would cause potential errors. - Besides, we also support multi-edge-types neighbors reindexing. If we have different - edge_type neighbors for x, we should concatenate all the neighbors and count of x. - We will reindex all the nodes from 0. + Besides, we also support multi-edge-types neighbors reindexing. If we have different + edge_type neighbors for x, we should concatenate all the neighbors and count of x. + We will reindex all the nodes from 0. Take input nodes x = [0, 1, 2] as an example. If we have neighbors = [8, 9, 0, 4, 7, 6, 7], and count = [2, 3, 2], @@ -58,98 +63,105 @@ def graph_reindex(x, should be the same with `x`. 
count (Tensor): The neighbor count of the input nodes `x`. And the data type should be int32. - value_buffer (Tensor|None): Value buffer for hashtable. The data type should - be int32, and should be filled with -1. - index_buffer (Tensor|None): Index buffer for hashtable. The data type should - be int32, and should be filled with -1. - flag_buffer_hashtable (bool): Whether to use buffer for hashtable to speed up. + value_buffer (Tensor, optional): Value buffer for hashtable. The data type should + be int32, and should be filled with -1. Default is None. + index_buffer (Tensor, optional): Index buffer for hashtable. The data type should + be int32, and should be filled with -1. Default is None. + flag_buffer_hashtable (bool, optional): Whether to use buffer for hashtable to speed up. Default is False. Only useful for gpu version currently. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - reindex_src (Tensor): The source node index of graph edges after reindex. - reindex_dst (Tensor): The destination node index of graph edges after reindex. - out_nodes (Tensor): The index of unique input nodes and neighbors before reindex, - where we put the input nodes `x` in the front, and put neighbor - nodes in the back. + - reindex_src (Tensor), The source node index of graph edges after reindex. + - reindex_dst (Tensor), The destination node index of graph edges after reindex. + - out_nodes (Tensor), The index of unique input nodes and neighbors before reindex, + where we put the input nodes `x` in the front, and put neighbor + nodes in the back. Examples: - .. code-block:: python - import paddle - - x = [0, 1, 2] - neighbors_e1 = [8, 9, 0, 4, 7, 6, 7] - count_e1 = [2, 3, 2] - x = paddle.to_tensor(x, dtype="int64") - neighbors_e1 = paddle.to_tensor(neighbors_e1, dtype="int64") - count_e1 = paddle.to_tensor(count_e1, dtype="int32") - - reindex_src, reindex_dst, out_nodes = \ - paddle.incubate.graph_reindex(x, neighbors_e1, count_e1) - # reindex_src: [3, 4, 0, 5, 6, 7, 6] - # reindex_dst: [0, 0, 1, 1, 1, 2, 2] - # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6] - - neighbors_e2 = [0, 2, 3, 5, 1] - count_e2 = [1, 3, 1] - neighbors_e2 = paddle.to_tensor(neighbors_e2, dtype="int64") - count_e2 = paddle.to_tensor(count_e2, dtype="int32") - - neighbors = paddle.concat([neighbors_e1, neighbors_e2]) - count = paddle.concat([count_e1, count_e2]) - reindex_src, reindex_dst, out_nodes = \ - paddle.incubate.graph_reindex(x, neighbors, count) - # reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1] - # reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2] - # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5] + import paddle + + x = [0, 1, 2] + neighbors_e1 = [8, 9, 0, 4, 7, 6, 7] + count_e1 = [2, 3, 2] + x = paddle.to_tensor(x, dtype="int64") + neighbors_e1 = paddle.to_tensor(neighbors_e1, dtype="int64") + count_e1 = paddle.to_tensor(count_e1, dtype="int32") + + reindex_src, reindex_dst, out_nodes = \ + paddle.incubate.graph_reindex(x, neighbors_e1, count_e1) + # reindex_src: [3, 4, 0, 5, 6, 7, 6] + # reindex_dst: [0, 0, 1, 1, 1, 2, 2] + # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6] + + neighbors_e2 = [0, 2, 3, 5, 1] + count_e2 = [1, 3, 1] + neighbors_e2 = paddle.to_tensor(neighbors_e2, dtype="int64") + count_e2 = paddle.to_tensor(count_e2, dtype="int32") + + neighbors = paddle.concat([neighbors_e1, neighbors_e2]) + count = paddle.concat([count_e1, count_e2]) + reindex_src, reindex_dst, out_nodes = \ + paddle.incubate.graph_reindex(x, neighbors, count) + 
# reindex_src: [3, 4, 0, 5, 6, 7, 6, 0, 2, 8, 9, 1] + # reindex_dst: [0, 0, 1, 1, 1, 2, 2, 0, 1, 1, 1, 2] + # out_nodes: [0, 1, 2, 8, 9, 4, 7, 6, 3, 5] """ if flag_buffer_hashtable: if value_buffer is None or index_buffer is None: - raise ValueError(f"`value_buffer` and `index_buffer` should not" - "be None if `flag_buffer_hashtable` is True.") + raise ValueError( + f"`value_buffer` and `index_buffer` should not" + "be None if `flag_buffer_hashtable` is True." + ) if _non_static_mode(): - reindex_src, reindex_dst, out_nodes = \ - _legacy_C_ops.graph_reindex(x, neighbors, count, value_buffer, index_buffer, - "flag_buffer_hashtable", flag_buffer_hashtable) + reindex_src, reindex_dst, out_nodes = _legacy_C_ops.graph_reindex( + x, + neighbors, + count, + value_buffer, + index_buffer, + "flag_buffer_hashtable", + flag_buffer_hashtable, + ) return reindex_src, reindex_dst, out_nodes check_variable_and_dtype(x, "X", ("int32", "int64"), "graph_reindex") - check_variable_and_dtype(neighbors, "Neighbors", ("int32", "int64"), - "graph_reindex") + check_variable_and_dtype( + neighbors, "Neighbors", ("int32", "int64"), "graph_reindex" + ) check_variable_and_dtype(count, "Count", ("int32"), "graph_reindex") if flag_buffer_hashtable: - check_variable_and_dtype(value_buffer, "HashTable_Value", ("int32"), - "graph_reindex") - check_variable_and_dtype(index_buffer, "HashTable_Index", ("int32"), - "graph_reindex") + check_variable_and_dtype( + value_buffer, "HashTable_Value", ("int32"), "graph_reindex" + ) + check_variable_and_dtype( + index_buffer, "HashTable_Index", ("int32"), "graph_reindex" + ) helper = LayerHelper("graph_reindex", **locals()) reindex_src = helper.create_variable_for_type_inference(dtype=x.dtype) reindex_dst = helper.create_variable_for_type_inference(dtype=x.dtype) out_nodes = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="graph_reindex", - inputs={ - "X": - x, - "Neighbors": - neighbors, - "Count": - count, - "HashTable_Value": - value_buffer if flag_buffer_hashtable else None, - "HashTable_Index": - index_buffer if flag_buffer_hashtable else None, - }, - outputs={ - "Reindex_Src": reindex_src, - "Reindex_Dst": reindex_dst, - "Out_Nodes": out_nodes - }, - attrs={"flag_buffer_hashtable": flag_buffer_hashtable}) + helper.append_op( + type="graph_reindex", + inputs={ + "X": x, + "Neighbors": neighbors, + "Count": count, + "HashTable_Value": value_buffer if flag_buffer_hashtable else None, + "HashTable_Index": index_buffer if flag_buffer_hashtable else None, + }, + outputs={ + "Reindex_Src": reindex_src, + "Reindex_Dst": reindex_dst, + "Out_Nodes": out_nodes, + }, + attrs={"flag_buffer_hashtable": flag_buffer_hashtable}, + ) return reindex_src, reindex_dst, out_nodes diff --git a/python/paddle/incubate/operators/graph_sample_neighbors.py b/python/paddle/incubate/operators/graph_sample_neighbors.py index b230b2a45d58dc..980071b384b3f7 100644 --- a/python/paddle/incubate/operators/graph_sample_neighbors.py +++ b/python/paddle/incubate/operators/graph_sample_neighbors.py @@ -25,17 +25,21 @@ since="2.4.0", update_to="paddle.geometric.sample_neighbors", level=1, - reason="paddle.incubate.graph_sample_neighbors will be removed in future") -def graph_sample_neighbors(row, - colptr, - input_nodes, - eids=None, - perm_buffer=None, - sample_size=-1, - return_eids=False, - flag_perm_buffer=False, - name=None): + reason="paddle.incubate.graph_sample_neighbors will be removed in future", +) +def graph_sample_neighbors( + row, + colptr, + input_nodes, + eids=None, + 
perm_buffer=None, + sample_size=-1, + return_eids=False, + flag_perm_buffer=False, + name=None, +): """ + Graph Sample Neighbors API. This API is mainly used in Graph Learning domain, and the main purpose is to @@ -71,86 +75,109 @@ def graph_sample_neighbors(row, For more information, please refer to :ref:`api_guide_Name`. Returns: - out_neighbors (Tensor): The sample neighbors of the input nodes. - out_count (Tensor): The number of sampling neighbors of each input node, and the shape - should be the same with `input_nodes`. - out_eids (Tensor): If `return_eids` is True, we will return the eid information of the - sample edges. + - out_neighbors (Tensor), The sample neighbors of the input nodes. + - out_count (Tensor), The number of sampling neighbors of each input node, and the shape should be the same with `input_nodes`. + - out_eids (Tensor), If `return_eids` is True, we will return the eid information of the sample edges. Examples: .. code-block:: python - import paddle - # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), - # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) - row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] - colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] - nodes = [0, 8, 1, 2] - sample_size = 2 - row = paddle.to_tensor(row, dtype="int64") - colptr = paddle.to_tensor(colptr, dtype="int64") - nodes = paddle.to_tensor(nodes, dtype="int64") - out_neighbors, out_count = \ - paddle.incubate.graph_sample_neighbors(row, colptr, nodes, - sample_size=sample_size) + + import paddle + # edges: (3, 0), (7, 0), (0, 1), (9, 1), (1, 2), (4, 3), (2, 4), + # (9, 5), (3, 5), (9, 6), (1, 6), (9, 8), (7, 8) + row = [3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7] + colptr = [0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13] + nodes = [0, 8, 1, 2] + sample_size = 2 + row = paddle.to_tensor(row, dtype="int64") + colptr = paddle.to_tensor(colptr, dtype="int64") + nodes = paddle.to_tensor(nodes, dtype="int64") + out_neighbors, out_count = \ + paddle.incubate.graph_sample_neighbors(row, colptr, nodes, + sample_size=sample_size) """ if return_eids: if eids is None: raise ValueError( - f"`eids` should not be None if `return_eids` is True.") + f"`eids` should not be None if `return_eids` is True." + ) if flag_perm_buffer: if perm_buffer is None: raise ValueError( f"`perm_buffer` should not be None if `flag_perm_buffer`" - "is True.") + "is True." 
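# Illustrative usage sketch (not part of the patch above). The docstring and the
# check above state that `eids` must be supplied when `return_eids=True`, in which
# case a third output carrying the sampled edge ids is returned. This reuses the
# CSC graph from the docstring example, with one hypothetical id per edge:
import paddle

row = paddle.to_tensor([3, 7, 0, 9, 1, 4, 2, 9, 3, 9, 1, 9, 7], dtype="int64")
colptr = paddle.to_tensor([0, 2, 4, 5, 6, 7, 9, 11, 11, 13, 13], dtype="int64")
nodes = paddle.to_tensor([0, 8, 1, 2], dtype="int64")
eids = paddle.arange(13, dtype="int64")  # hypothetical edge ids, one per entry of `row`

out_neighbors, out_count, out_eids = paddle.incubate.graph_sample_neighbors(
    row, colptr, nodes, eids=eids, sample_size=2, return_eids=True
)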
+ ) if _non_static_mode(): - out_neighbors, out_count, out_eids = _legacy_C_ops.graph_sample_neighbors( - row, colptr, input_nodes, eids, perm_buffer, "sample_size", - sample_size, "return_eids", return_eids, "flag_perm_buffer", - flag_perm_buffer) + ( + out_neighbors, + out_count, + out_eids, + ) = _legacy_C_ops.graph_sample_neighbors( + row, + colptr, + input_nodes, + eids, + perm_buffer, + "sample_size", + sample_size, + "return_eids", + return_eids, + "flag_perm_buffer", + flag_perm_buffer, + ) if return_eids: return out_neighbors, out_count, out_eids return out_neighbors, out_count - check_variable_and_dtype(row, "Row", ("int32", "int64"), - "graph_sample_neighbors") - check_variable_and_dtype(colptr, "Col_Ptr", ("int32", "int64"), - "graph_sample_neighbors") - check_variable_and_dtype(input_nodes, "X", ("int32", "int64"), - "graph_sample_neighbors") + check_variable_and_dtype( + row, "Row", ("int32", "int64"), "graph_sample_neighbors" + ) + check_variable_and_dtype( + colptr, "Col_Ptr", ("int32", "int64"), "graph_sample_neighbors" + ) + check_variable_and_dtype( + input_nodes, "X", ("int32", "int64"), "graph_sample_neighbors" + ) if return_eids: - check_variable_and_dtype(eids, "Eids", ("int32", "int64"), - "graph_sample_neighbors") + check_variable_and_dtype( + eids, "Eids", ("int32", "int64"), "graph_sample_neighbors" + ) if flag_perm_buffer: - check_variable_and_dtype(perm_buffer, "Perm_Buffer", ("int32", "int64"), - "graph_sample_neighbors") + check_variable_and_dtype( + perm_buffer, + "Perm_Buffer", + ("int32", "int64"), + "graph_sample_neighbors", + ) helper = LayerHelper("graph_sample_neighbors", **locals()) out_neighbors = helper.create_variable_for_type_inference(dtype=row.dtype) out_count = helper.create_variable_for_type_inference(dtype=row.dtype) out_eids = helper.create_variable_for_type_inference(dtype=row.dtype) - helper.append_op(type="graph_sample_neighbors", - inputs={ - "Row": row, - "Col_Ptr": colptr, - "X": input_nodes, - "Eids": eids if return_eids else None, - "Perm_Buffer": - perm_buffer if flag_perm_buffer else None - }, - outputs={ - "Out": out_neighbors, - "Out_Count": out_count, - "Out_Eids": out_eids - }, - attrs={ - "sample_size": sample_size, - "return_eids": return_eids, - "flag_perm_buffer": flag_perm_buffer - }) + helper.append_op( + type="graph_sample_neighbors", + inputs={ + "Row": row, + "Col_Ptr": colptr, + "X": input_nodes, + "Eids": eids if return_eids else None, + "Perm_Buffer": perm_buffer if flag_perm_buffer else None, + }, + outputs={ + "Out": out_neighbors, + "Out_Count": out_count, + "Out_Eids": out_eids, + }, + attrs={ + "sample_size": sample_size, + "return_eids": return_eids, + "flag_perm_buffer": flag_perm_buffer, + }, + ) if return_eids: return out_neighbors, out_count, out_eids return out_neighbors, out_count diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index d230b6afca2995..3f3df92be5ebeb 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -13,13 +13,13 @@ # limitations under the License. 
import os +import paddle from paddle.fluid import framework, core, layers, unique_name from paddle.fluid.framework import Variable from paddle.fluid.clip import ClipGradByGlobalNorm from paddle.fluid.initializer import Constant from paddle.fluid.layer_helper import LayerHelper from paddle.fluid.optimizer import Optimizer -from paddle.distributed import get_rank, get_world_size from paddle.distributed.collective import new_group from paddle.fluid.executor import global_scope from paddle.fluid.framework import name_scope @@ -288,8 +288,8 @@ def _apply_gradients_impl(self, params_grads): step = self._get_or_create_step() - rank = get_rank() - nranks = get_world_size() + rank = paddle.distributed.get_rank() + nranks = paddle.distributed.get_world_size() if self._nproc_per_node is None: nproc_per_node = nranks else: diff --git a/python/paddle/incubate/optimizer/lookahead.py b/python/paddle/incubate/optimizer/lookahead.py index 8f70f321c0db61..c04eaadffd8c1f 100644 --- a/python/paddle/incubate/optimizer/lookahead.py +++ b/python/paddle/incubate/optimizer/lookahead.py @@ -14,7 +14,14 @@ from paddle.optimizer import Optimizer from paddle.fluid import core, framework, layers, unique_name -from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, default_startup_program, device_guard +from paddle.fluid.framework import ( + Program, + Variable, + name_scope, + default_main_program, + default_startup_program, + device_guard, +) from paddle.fluid.layer_helper import LayerHelper import paddle import numpy as np @@ -29,18 +36,18 @@ class LookAhead(Optimizer): paper : https://arxiv.org/abs/1907.08610. Lookahead keeps two sets of params: the fast_params and - the slow_params. inner_optimizer update fast_params every - training step. Lookahead updates the slow_params and fast_params + the slow_params. inner_optimizer update fast_params every + training step. Lookahead updates the slow_params and fast_params every k training steps as follows: .. math:: - + slow\_param_t &= slow\_param_{t-1} + \\alpha * (fast\_param_{t-1} - slow\_param_{t-1}) - + fast\_param_t &= slow\_param_t Args: - inner_optimizer (Optimizer): The optimizer that update fast params step by step. + inner_optimizer (Optimizer): The optimizer that update fast params step by step. alpha (float, optinal): The learning rate of Lookahead. The default value is 0.5. k (int, optinal): The slow params is updated every k steps. The default value is 5. name (str, optional): Normally there is no need for user to set this property. @@ -50,7 +57,7 @@ class LookAhead(Optimizer): Examples: .. 
code-block:: python - + import numpy as np import paddle import paddle.nn as nn @@ -109,31 +116,34 @@ def train(layer, loader, loss_fn, opt): shuffle=True, drop_last=True, num_workers=2) - + train(layer, loader, loss_fn, lookahead) """ _slow_str = "slow" def __init__(self, inner_optimizer, alpha=0.5, k=5, name=None): - assert (inner_optimizer is not None), "inner optimizer can not be None" + assert inner_optimizer is not None, "inner optimizer can not be None" assert ( 0.0 <= alpha <= 1.0 ), "alpha should be larger or equal to 0.0, and less or equal than 1.0" - assert (isinstance(k, int) and k > 0), "k should be a positive integer" + assert isinstance(k, int) and k > 0, "k should be a positive integer" self.inner_optimizer = inner_optimizer if self.inner_optimizer._parameter_list is None: - parameters = framework.default_main_program().global_block( - ).all_parameters() + parameters = ( + framework.default_main_program().global_block().all_parameters() + ) else: parameters = self.inner_optimizer._parameter_list - super(LookAhead, self).__init__(learning_rate=alpha, - parameters=parameters, - weight_decay=None, - grad_clip=None, - name=name) + super(LookAhead, self).__init__( + learning_rate=alpha, + parameters=parameters, + weight_decay=None, + grad_clip=None, + name=name, + ) self.alpha = alpha self.k = k @@ -147,7 +157,7 @@ def __init__(self, inner_optimizer, alpha=0.5, k=5, name=None): def step(self): """ Execute the optimizer and update parameters once. - + Returns: None @@ -156,8 +166,7 @@ def step(self): .. code-block:: python import paddle - import numpy as np - inp = paddle.to_tensor(np.random.random([1, 10]).astype('float32')) + inp = paddle.rand([1,10], dtype="float32") linear = paddle.nn.Linear(10, 1) out = linear(inp) loss = paddle.mean(out) @@ -179,9 +188,9 @@ def step(self): grad_var = param._grad_ivar() params_grads.append((param, grad_var)) - self._apply_optimize(loss=None, - startup_program=None, - params_grads=params_grads) + self._apply_optimize( + loss=None, startup_program=None, params_grads=params_grads + ) def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) @@ -196,24 +205,28 @@ def _increment_global_var(self): shape=[1], value=0, dtype='int32', - persistable=True) + persistable=True, + ) - self.helper.append_op(type='increment', - inputs={'X': [self._global_step_var]}, - outputs={'Out': [self._global_step_var]}, - attrs={'step': 1.0}) + self.helper.append_op( + type='increment', + inputs={'X': [self._global_step_var]}, + outputs={'Out': [self._global_step_var]}, + attrs={'step': 1.0}, + ) def _append_optimize_op(self, block, param_and_grad): one_var = paddle.ones(shape=[1], dtype='int32', name='lookahead_ones') - zero_var = paddle.zeros(shape=[1], - dtype='int32', - name='lookahead_zeros') + zero_var = paddle.zeros( + shape=[1], dtype='int32', name='lookahead_zeros' + ) k_var = layers.create_global_var( name=unique_name.generate("lookahead_k"), shape=[1], value=self.k, dtype='int32', - persistable=True) + persistable=True, + ) mod = paddle.remainder(self._global_step_var, k_var) @@ -236,11 +249,9 @@ def _append_optimize_op(self, block, param_and_grad): paddle.assign(tmp_var_1, slow_var) @imperative_base.no_grad - def minimize(self, - loss, - startup_program=None, - parameters=None, - no_grad_set=None): + def minimize( + self, loss, startup_program=None, parameters=None, no_grad_set=None + ): """ Add operations to minimize ``loss`` by updating ``parameters``. 
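# Schematic sketch of the LookAhead rule documented above (plain tensors stand in
# for real parameters, and the fixed -0.1 step is only a stand-in for one update of
# the inner optimizer): every k steps the slow weights are pulled toward the fast
# weights by a factor alpha, and the fast weights are then reset to them.
import paddle

alpha, k = 0.5, 5
slow = paddle.to_tensor([0.0])
fast = paddle.to_tensor([0.0])
for step in range(1, 21):
    fast = fast - 0.1  # stand-in for one inner-optimizer update of the fast params
    if step % k == 0:
        slow = slow + alpha * (fast - slow)  # slow_param_t = slow_param_{t-1} + alpha * (fast_param_{t-1} - slow_param_{t-1})
        fast = slow.clone()                  # fast_param_t = slow_param_t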
@@ -259,8 +270,8 @@ def minimize(self, tuple: tuple (optimize_ops, params_grads), A list of operators appended by minimize and a list of (param, grad) tensor pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. - In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to - indicate program pruning. If so, the program will be pruned by ``feed`` and + In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. Examples: @@ -268,8 +279,8 @@ def minimize(self, .. code-block:: python import paddle - import numpy as np - inp = paddle.to_tensor(np.random.random([1, 10]).astype('float32')) + + inp = paddle.rand([1, 10], dtype="float32") linear = paddle.nn.Linear(10, 1) out = linear(inp) loss = paddle.mean(out) @@ -287,12 +298,13 @@ def minimize(self, loss, startup_program=startup_program, parameters=parameters, - no_grad_set=no_grad_set) + no_grad_set=no_grad_set, + ) self._increment_global_var() - _ = self._apply_optimize(loss, - startup_program=startup_program, - params_grads=params_grads) + _ = self._apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads + ) return optimize_ops, params_grads diff --git a/python/paddle/incubate/optimizer/modelaverage.py b/python/paddle/incubate/optimizer/modelaverage.py index 67be022c288f21..6247ce6e72768c 100644 --- a/python/paddle/incubate/optimizer/modelaverage.py +++ b/python/paddle/incubate/optimizer/modelaverage.py @@ -163,17 +163,21 @@ def evaluate(layer, loader, loss_fn): """ - def __init__(self, - average_window_rate, - parameters=None, - min_average_window=10000, - max_average_window=10000, - name=None): - super(ModelAverage, self).__init__(learning_rate=0.0, - parameters=parameters, - weight_decay=None, - grad_clip=None, - name=name) + def __init__( + self, + average_window_rate, + parameters=None, + min_average_window=10000, + max_average_window=10000, + name=None, + ): + super(ModelAverage, self).__init__( + learning_rate=0.0, + parameters=parameters, + weight_decay=None, + grad_clip=None, + name=name, + ) self.helper = LayerHelper(self.__class__.__name__) self.average_window = average_window_rate @@ -183,7 +187,8 @@ def __init__(self, if not framework._non_static_mode(): global_block = framework.default_main_program().global_block() - all_parameters = parameters if parameters else global_block.all_parameters( + all_parameters = ( + parameters if parameters else global_block.all_parameters() ) self._create_accumulators(global_block, all_parameters) @@ -208,18 +213,15 @@ def _create_accumulators(self, block, parameters): self._add_accumulator('sum_2', param) self._add_accumulator('sum_3', param) self._add_accumulator('restore', param) - self._add_accumulator('num_accumulates', - param, - dtype='int64', - shape=[1]) - self._add_accumulator('old_num_accumulates', - param, - dtype='int64', - shape=[1]) - self._add_accumulator('num_updates', - param, - dtype='int64', - shape=[1]) + self._add_accumulator( + 'num_accumulates', param, dtype='int64', shape=[1] + ) + self._add_accumulator( + 'old_num_accumulates', param, dtype='int64', shape=[1] + ) + self._add_accumulator( + 'num_updates', param, dtype='int64', shape=[1] + ) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -227,26 +229,50 @@ def _append_optimize_op(self, block, 
param_and_grad): sum_1 = self._get_accumulator('sum_1', param_and_grad[0]) sum_2 = self._get_accumulator('sum_2', param_and_grad[0]) sum_3 = self._get_accumulator('sum_3', param_and_grad[0]) - num_accumulates = self._get_accumulator('num_accumulates', - param_and_grad[0]) - old_num_accumulates = self._get_accumulator('old_num_accumulates', - param_and_grad[0]) + num_accumulates = self._get_accumulator( + 'num_accumulates', param_and_grad[0] + ) + old_num_accumulates = self._get_accumulator( + 'old_num_accumulates', param_and_grad[0] + ) num_updates = self._get_accumulator('num_updates', param_and_grad[0]) if in_dygraph_mode(): _, _, _, _, _, _ = _C_ops.average_accumulates_( - param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, - old_num_accumulates, num_updates, self.average_window, - self.max_average_window, self.min_average_window) + param_and_grad[0], + sum_1, + sum_2, + sum_3, + num_accumulates, + old_num_accumulates, + num_updates, + self.average_window, + self.max_average_window, + self.min_average_window, + ) return None elif framework._non_static_mode(): _, _, _, _, _, _ = _legacy_C_ops.average_accumulates( - param_and_grad[0], sum_1, sum_2, sum_3, num_accumulates, - old_num_accumulates, num_updates, sum_1, sum_2, sum_3, - num_accumulates, old_num_accumulates, num_updates, - 'average_window', self.average_window, 'min_average_window', - self.min_average_window, 'max_average_window', - self.max_average_window) + param_and_grad[0], + sum_1, + sum_2, + sum_3, + num_accumulates, + old_num_accumulates, + num_updates, + sum_1, + sum_2, + sum_3, + num_accumulates, + old_num_accumulates, + num_updates, + 'average_window', + self.average_window, + 'min_average_window', + self.min_average_window, + 'max_average_window', + self.max_average_window, + ) return None block = framework.default_main_program().global_block() @@ -263,7 +289,7 @@ def _append_optimize_op(self, block, param_and_grad): "in_sum_3": sum_3, "in_num_accumulates": num_accumulates, "in_old_num_accumulates": old_num_accumulates, - "in_num_updates": num_updates + "in_num_updates": num_updates, } outputs = { @@ -275,23 +301,23 @@ def _append_optimize_op(self, block, param_and_grad): "out_num_updates": num_updates, } - average_accumulates_op = block.append_op(type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True) + average_accumulates_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, + ) return average_accumulates_op @imperative_base.no_grad - def minimize(self, - loss, - startup_program=None, - parameters=None, - no_grad_set=None): + def minimize( + self, loss, startup_program=None, parameters=None, no_grad_set=None + ): """ Add operations to minimize ``loss`` by updating ``parameters``. - + Args: loss (Tensor): A ``Tensor`` containing the value to minimize. startup_program (Program, optional): :ref:`api_fluid_Program` for @@ -302,22 +328,21 @@ def minimize(self, will be updated. no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name`` that don't need to be updated. The default value is None. - + Returns: tuple: tuple (optimize_ops, params_grads), A list of operators appended by minimize and a list of (param, grad) tensor pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. - In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to - indicate program pruning. 
If so, the program will be pruned by ``feed`` and + In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. - + Examples: - + .. code-block:: python import paddle - import numpy as np - inp = paddle.to_tensor(np.random.random([1, 10]).astype('float32')) + inp = paddle.rand([1, 10], dtype="float32") linear = paddle.nn.Linear(10, 1) out = linear(inp) loss = paddle.mean(out) @@ -343,7 +368,7 @@ def minimize(self, def step(self): """ Execute the optimizer and update parameters once. - + Returns: None @@ -352,8 +377,7 @@ def step(self): .. code-block:: python import paddle - import numpy as np - inp = paddle.to_tensor(np.random.random([1, 10]).astype('float32')) + inp = paddle.rand([1, 10], dtype="float32") linear = paddle.nn.Linear(10, 1) out = linear(inp) loss = paddle.mean(out) @@ -399,8 +423,7 @@ def apply(self, executor=None, need_restore=True): .. code-block:: python import paddle - import numpy as np - inp = paddle.to_tensor(np.random.random([1, 10]).astype('float32')) + inp = paddle.rand([1, 10], dtype="float32") linear = paddle.nn.Linear(10, 1) out = linear(inp) loss = paddle.mean(out) @@ -414,7 +437,7 @@ def apply(self, executor=None, need_restore=True): max_average_window=4) sgd.step() modelaverage.step() - + with modelaverage.apply(): for param in linear.parameters(): print(param) @@ -424,10 +447,12 @@ def apply(self, executor=None, need_restore=True): """ if framework._non_static_mode(): for param in self._parameter_list: - num_accumulates = self._get_accumulator('num_accumulates', - param) + num_accumulates = self._get_accumulator( + 'num_accumulates', param + ) old_num_accumulates = self._get_accumulator( - 'old_num_accumulates', param) + 'old_num_accumulates', param + ) sum_1 = self._get_accumulator('sum_1', param) sum_2 = self._get_accumulator('sum_2', param) sum_3 = self._get_accumulator('sum_3', param) @@ -437,8 +462,9 @@ def apply(self, executor=None, need_restore=True): total_param = sum_1 + sum_2 + sum_3 total_accumulates = num_accumulates + old_num_accumulates total_param = paddle.cast(total_param, dtype='float32') - total_accumulates = paddle.cast(total_accumulates, - dtype='float32') + total_accumulates = paddle.cast( + total_accumulates, dtype='float32' + ) average_param = total_param / total_accumulates paddle.assign(average_param, param) try: @@ -449,7 +475,8 @@ def apply(self, executor=None, need_restore=True): return if executor is None: raise RuntimeError( - "Executor should not be None in static graph mode.") + "Executor should not be None in static graph mode." + ) executor.run(self.apply_program) try: yield @@ -461,7 +488,7 @@ def apply(self, executor=None, need_restore=True): def restore(self, executor=None): """ Restore ``Parameter`` values of current model. - + Args: executor(Executor): The network executor in static-graph mode. The default value is None in dygraph mode @@ -470,8 +497,7 @@ def restore(self, executor=None): .. 
code-block:: python import paddle - import numpy as np - inp = paddle.to_tensor(np.random.random([1, 10]).astype('float32')) + inp = paddle.rand([1, 10], dtype="float32") linear = paddle.nn.Linear(10, 1) out = linear(inp) loss = paddle.mean(out) @@ -485,7 +511,7 @@ def restore(self, executor=None): max_average_window=4) sgd.step() modelaverage.step() - + with modelaverage.apply(need_restore=False): for param in linear.parameters(): print(param) @@ -505,7 +531,8 @@ def restore(self, executor=None): return if executor is None: raise RuntimeError( - "Executor should not be None in static graph mode.") + "Executor should not be None in static graph mode." + ) executor.run(self.restore_program) def _add_average_apply_op(self, block, param): @@ -515,18 +542,22 @@ def _add_average_apply_op(self, block, param): sum_2 = block._clone_variable(self._get_accumulator('sum_2', param)) sum_3 = block._clone_variable(self._get_accumulator('sum_3', param)) num_accumulates = block._clone_variable( - self._get_accumulator('num_accumulates', param)) + self._get_accumulator('num_accumulates', param) + ) old_num_accumulates = block._clone_variable( - self._get_accumulator('old_num_accumulates', param)) + self._get_accumulator('old_num_accumulates', param) + ) # backup param value to grad layers.assign(input=param, output=grad) # param = (sum_1 + sum_2 + sum_3) / (num_accumulates + old_num_accumulates) tmp = layers.sum(x=[num_accumulates, old_num_accumulates]) sum = layers.sum(x=[sum_1, sum_2, sum_3]) tmp = layers.cast( - x=tmp, dtype='float32' if self._dtype is None else self._dtype) + x=tmp, dtype='float32' if self._dtype is None else self._dtype + ) sum = layers.cast( - x=sum, dtype='float32' if self._dtype is None else self._dtype) + x=sum, dtype='float32' if self._dtype is None else self._dtype + ) layers.ops._elementwise_div(x=sum, y=tmp, out=param) def _add_average_restore_op(self, block, param): diff --git a/python/paddle/incubate/xpu/resnet_block.py b/python/paddle/incubate/xpu/resnet_block.py index 39b439730759cf..6c83f5bda498ab 100644 --- a/python/paddle/incubate/xpu/resnet_block.py +++ b/python/paddle/incubate/xpu/resnet_block.py @@ -36,106 +36,232 @@ __all__ = ['resnet_basic_block', 'ResNetBasicBlock'] -def resnet_basic_block(x, - filter1, - scale1, - bias1, - mean1, - var1, - filter2, - scale2, - bias2, - mean2, - var2, - filter3, - scale3, - bias3, - mean3, - var3, - stride1, - stride2, - stride3, - padding1, - padding2, - padding3, - dilation1, - dilation2, - dilation3, - groups, - momentum, - eps, - data_format, - has_shortcut, - use_global_stats=None, - training=False, - trainable_statistics=False, - find_conv_max=True): +def resnet_basic_block( + x, + filter1, + scale1, + bias1, + mean1, + var1, + filter2, + scale2, + bias2, + mean2, + var2, + filter3, + scale3, + bias3, + mean3, + var3, + stride1, + stride2, + stride3, + padding1, + padding2, + padding3, + dilation1, + dilation2, + dilation3, + groups, + momentum, + eps, + data_format, + has_shortcut, + use_global_stats=None, + training=False, + trainable_statistics=False, + find_conv_max=True, +): if fluid.framework.in_dygraph_mode(): - attrs = ('stride1', stride1, 'stride2', stride2, 'stride3', stride3, - 'padding1', padding1, 'padding2', padding2, 'padding3', - padding3, 'dilation1', dilation1, 'dilation2', dilation2, - 'dilation3', dilation3, 'group', groups, 'momentum', momentum, - 'epsilon', eps, 'data_format', data_format, 'has_shortcut', - has_shortcut, 'use_global_stats', use_global_stats, - "trainable_statistics", 
trainable_statistics, 'is_test', - not training, 'act_type', "relu", 'find_conv_input_max', - find_conv_max) - - out, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _, _ = \ - getattr(_C_ops, "resnet_basic_block")(x, filter1, scale1, bias1, mean1, var1, filter2, scale2, bias2, mean2, var2, \ - filter3, scale3, bias3, mean3, var3, mean1, var1, mean2, var2, mean3, var3, *attrs) + attrs = ( + 'stride1', + stride1, + 'stride2', + stride2, + 'stride3', + stride3, + 'padding1', + padding1, + 'padding2', + padding2, + 'padding3', + padding3, + 'dilation1', + dilation1, + 'dilation2', + dilation2, + 'dilation3', + dilation3, + 'group', + groups, + 'momentum', + momentum, + 'epsilon', + eps, + 'data_format', + data_format, + 'has_shortcut', + has_shortcut, + 'use_global_stats', + use_global_stats, + "trainable_statistics", + trainable_statistics, + 'is_test', + not training, + 'act_type', + "relu", + 'find_conv_input_max', + find_conv_max, + ) + + ( + out, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + _, + ) = getattr(_C_ops, "resnet_basic_block")( + x, + filter1, + scale1, + bias1, + mean1, + var1, + filter2, + scale2, + bias2, + mean2, + var2, + filter3, + scale3, + bias3, + mean3, + var3, + mean1, + var1, + mean2, + var2, + mean3, + var3, + *attrs + ) return out helper = LayerHelper('resnet_basic_block', **locals()) bn_param_dtype = fluid.core.VarDesc.VarType.FP32 max_dtype = fluid.core.VarDesc.VarType.FP32 - out = helper.create_variable_for_type_inference(dtype=x.dtype, - stop_gradient=True) - conv1 = helper.create_variable_for_type_inference(dtype=x.dtype, - stop_gradient=True) + out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) + conv1 = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) saved_mean1 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) + dtype=bn_param_dtype, stop_gradient=True + ) saved_invstd1 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) - running_mean1 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) if mean1 is None else mean1 - running_var1 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) if var1 is None else var1 - conv2 = helper.create_variable_for_type_inference(dtype=x.dtype, - stop_gradient=True) - conv2_input = helper.create_variable_for_type_inference(dtype=x.dtype, - stop_gradient=True) + dtype=bn_param_dtype, stop_gradient=True + ) + running_mean1 = ( + helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True + ) + if mean1 is None + else mean1 + ) + running_var1 = ( + helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True + ) + if var1 is None + else var1 + ) + conv2 = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) + conv2_input = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) saved_mean2 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) + dtype=bn_param_dtype, stop_gradient=True + ) saved_invstd2 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) - running_mean2 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) if mean2 is None else mean2 - running_var2 = helper.create_variable_for_type_inference( - 
dtype=bn_param_dtype, stop_gradient=True) if var2 is None else var2 - conv3 = helper.create_variable_for_type_inference(dtype=x.dtype, - stop_gradient=True) + dtype=bn_param_dtype, stop_gradient=True + ) + running_mean2 = ( + helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True + ) + if mean2 is None + else mean2 + ) + running_var2 = ( + helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True + ) + if var2 is None + else var2 + ) + conv3 = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) saved_mean3 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) + dtype=bn_param_dtype, stop_gradient=True + ) saved_invstd3 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) - running_mean3 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) if mean3 is None else mean3 - running_var3 = helper.create_variable_for_type_inference( - dtype=bn_param_dtype, stop_gradient=True) if var3 is None else var3 + dtype=bn_param_dtype, stop_gradient=True + ) + running_mean3 = ( + helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True + ) + if mean3 is None + else mean3 + ) + running_var3 = ( + helper.create_variable_for_type_inference( + dtype=bn_param_dtype, stop_gradient=True + ) + if var3 is None + else var3 + ) conv1_input_max = helper.create_variable_for_type_inference( - dtype=max_dtype, stop_gradient=True) + dtype=max_dtype, stop_gradient=True + ) conv1_filter_max = helper.create_variable_for_type_inference( - dtype=max_dtype, stop_gradient=True) + dtype=max_dtype, stop_gradient=True + ) conv2_input_max = helper.create_variable_for_type_inference( - dtype=max_dtype, stop_gradient=True) + dtype=max_dtype, stop_gradient=True + ) conv2_filter_max = helper.create_variable_for_type_inference( - dtype=max_dtype, stop_gradient=True) + dtype=max_dtype, stop_gradient=True + ) conv3_input_max = helper.create_variable_for_type_inference( - dtype=max_dtype, stop_gradient=True) + dtype=max_dtype, stop_gradient=True + ) conv3_filter_max = helper.create_variable_for_type_inference( - dtype=max_dtype, stop_gradient=True) + dtype=max_dtype, stop_gradient=True + ) inputs = { 'X': x, @@ -175,7 +301,7 @@ def resnet_basic_block(x, "trainable_statistics": trainable_statistics, 'is_test': not training, 'act_type': "relu", - 'find_conv_input_max': find_conv_max + 'find_conv_input_max': find_conv_max, } outputs = { @@ -203,88 +329,172 @@ def resnet_basic_block(x, 'MaxInput3': conv3_input_max, 'MaxFilter3': conv3_filter_max, } - helper.append_op(type='resnet_basic_block', - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type='resnet_basic_block', inputs=inputs, outputs=outputs, attrs=attrs + ) return out class ResNetBasicBlock(Layer): - """ + r""" + ResNetBasicBlock is designed for optimize the performence of the basic unit of ssd resnet block. - The fusion op architecture like this: - has_shortcut = True: else: - X X - / / - | | | | - CONV1 | CONV1 | - | | | | - BN1 | BN1 | - | | | | - RELU1 | RELU1 | - | | | | - CONV2 CONV3 CONV2 | - | | | | - BN2 BN3 BN2 | - \ / \ / - ADD ADD - | | - RELU RELU - | | - Y Y + If has_shortcut = True, it can calculate 3 Conv2D, 3 BatchNorm and 2 ReLU in one time. + If has_shortcut = False, it can calculate 2 Conv2D, 2 BatchNorm and 2 ReLU in one time. In this + case the shape of output is same with input. 
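# Rough functional equivalent of the fused block described above (a sketch under
# illustrative hyper-parameters, not the fused XPU kernel; the real op additionally
# manages the BN running statistics and the conv max-value buffers):
import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class UnfusedBasicBlock(nn.Layer):
    def __init__(self, ch_in, ch_out, has_shortcut=True):
        super().__init__()
        self.conv1 = nn.Conv2D(ch_in, ch_out, 3, padding=1, bias_attr=False)
        self.bn1 = nn.BatchNorm2D(ch_out)
        self.conv2 = nn.Conv2D(ch_out, ch_out, 3, padding=1, bias_attr=False)
        self.bn2 = nn.BatchNorm2D(ch_out)
        self.has_shortcut = has_shortcut
        if has_shortcut:
            self.conv3 = nn.Conv2D(ch_in, ch_out, 1, bias_attr=False)
            self.bn3 = nn.BatchNorm2D(ch_out)

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)))  # CONV1 -> BN1 -> RELU1
        y = self.bn2(self.conv2(y))          # CONV2 -> BN2
        shortcut = self.bn3(self.conv3(x)) if self.has_shortcut else x  # CONV3 -> BN3, or identity
        return F.relu(y + shortcut)          # ADD -> RELU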
+
+
+    Args:
+        num_channels (int): The number of input image channels.
+        num_filter (int): The number of filters. It is the same as the number of output image channels.
+        filter_size (int|list|tuple): The filter size. If filter_size
+            is a tuple, it must contain two integers, (filter_size_height,
+            filter_size_width). Otherwise, filter_size_height = filter_size_width =\
+            filter_size.
+        stride (int, optional): The stride size. It means the stride in convolution.
+            If stride is a tuple, it must contain two integers, (stride_height, stride_width).
+            Otherwise, stride_height = stride_width = stride. Default: stride = 1.
+        act (str, optional): Activation type, if it is set to None, activation is not appended.
+            Default: None.
+        momentum (float, optional): The value used for the moving_mean and
+            moving_var computation. This should be a float number or a Tensor with
+            shape [1] and data type as float32. The updated formula is:
+            :math:`moving\_mean = moving\_mean * momentum + new\_mean * (1. - momentum)`
+            :math:`moving\_var = moving\_var * momentum + new\_var * (1. - momentum)`
+            Default is 0.9.
+        eps (float, optional): A value added to the denominator for
+            numerical stability. Default is 1e-5.
+        data_format (str, optional): Specify the data format of the input, and the data format of the output
+            will be consistent with that of the input. Currently only `"NCHW"` is supported; the data is stored in
+            the order of: `[batch_size, input_channels, input_height, input_width]`.
+        has_shortcut (bool, optional): Whether to calculate CONV3 and BN3. Default: False.
+        use_global_stats (bool, optional): Whether to use global mean and
+            variance. In inference or test mode, setting use_global_stats to true
+            or is_test to true is equivalent.
+            In train mode, when setting use_global_stats True, the global mean
+            and variance are also used during the training period. Default: False.
+        is_test (bool, optional): A flag indicating whether it is in
+            the test phase or not. Default: False.
+        filter_attr (ParamAttr, optional): The parameter attribute for learnable parameters/weights
+            of conv2d. If it is set to None or one attribute of ParamAttr, conv2d
+            will create ParamAttr as param_attr. Default: None.
+        scale_attr (ParamAttr, optional): The parameter attribute for Parameter `scale`
+            of batch_norm. If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr
+            as param_attr, and the name of scale can be set in ParamAttr. If the Initializer of the param_attr is not set,
+            the parameter is initialized with Xavier. Default: None.
+        bias_attr (ParamAttr, optional): The parameter attribute for the bias of batch_norm.
+            If it is set to None or one attribute of ParamAttr, batch_norm
+            will create ParamAttr as bias_attr, and the name of bias can be set in ParamAttr.
+            If the Initializer of the bias_attr is not set, the bias is initialized to zero.
+            Default: None.
+        moving_mean_name (str, optional): The name of moving_mean which stores the global Mean. If it
+            is set to None, batch_norm will save the global mean with a random name; otherwise, batch_norm
+            will save the global mean with the string. Default: None.
+        moving_var_name (str, optional): The name of the moving_variance which stores the global Variance.
+            If it is set to None, batch_norm will save the global variance with a random name; otherwise, batch_norm
+            will save the global variance with the string. Default: None.
+        padding (int, optional): The padding size. Only padding_height = padding_width = padding is supported.
+            Default: padding = 0.
+ dilation (int, optional): The dilation size. It means the spacing between the kernel + points. It is only spupport dilation_height = dilation_width = dilation. + Default: dilation = 1. + trainable_statistics (bool, optional): Whether to calculate mean and var in eval mode. In eval mode, when + setting trainable_statistics True, mean and variance will be calculated by current batch statistics. + Default: False. + find_conv_max (bool, optional): Whether to calculate max value of each conv2d. Default: True. + + + Returns: + A Tensor representing the ResNetBasicBlock, whose data type is the same with input. + + + Examples: + .. code-block:: python + + # required: xpu + import paddle + from paddle.incubate.xpu.resnet_block import ResNetBasicBlock + + ch_in = 4 + ch_out = 8 + x = paddle.uniform((2, ch_in, 16, 16), dtype='float32', min=-1., max=1.) + resnet_basic_block = ResNetBasicBlock(num_channels1=ch_in, + num_filter1=ch_out, + filter1_size=3, + num_channels2=ch_out, + num_filter2=ch_out, + filter2_size=3, + num_channels3=ch_in, + num_filter3=ch_out, + filter3_size=1, + stride1=1, + stride2=1, + stride3=1, + act='relu', + padding1=1, + padding2=1, + padding3=0, + has_shortcut=True) + out = resnet_basic_block.forward(x) + + print(out.shape) # [2, 8, 16, 16] + """ - def __init__(self, - num_channels1, - num_filter1, - filter1_size, - num_channels2, - num_filter2, - filter2_size, - num_channels3, - num_filter3, - filter3_size, - stride1=1, - stride2=1, - stride3=1, - act='relu', - momentum=0.9, - eps=1e-5, - data_format='NCHW', - has_shortcut=False, - use_global_stats=False, - is_test=False, - filter1_attr=None, - scale1_attr=None, - bias1_attr=None, - moving_mean1_name=None, - moving_var1_name=None, - filter2_attr=None, - scale2_attr=None, - bias2_attr=None, - moving_mean2_name=None, - moving_var2_name=None, - filter3_attr=None, - scale3_attr=None, - bias3_attr=None, - moving_mean3_name=None, - moving_var3_name=None, - padding1=0, - padding2=0, - padding3=0, - dilation1=1, - dilation2=1, - dilation3=1, - trainable_statistics=False, - find_conv_max=True): + def __init__( + self, + num_channels1, + num_filter1, + filter1_size, + num_channels2, + num_filter2, + filter2_size, + num_channels3, + num_filter3, + filter3_size, + stride1=1, + stride2=1, + stride3=1, + act='relu', + momentum=0.9, + eps=1e-5, + data_format='NCHW', + has_shortcut=False, + use_global_stats=False, + is_test=False, + filter1_attr=None, + scale1_attr=None, + bias1_attr=None, + moving_mean1_name=None, + moving_var1_name=None, + filter2_attr=None, + scale2_attr=None, + bias2_attr=None, + moving_mean2_name=None, + moving_var2_name=None, + filter3_attr=None, + scale3_attr=None, + bias3_attr=None, + moving_mean3_name=None, + moving_var3_name=None, + padding1=0, + padding2=0, + padding3=0, + dilation1=1, + dilation2=1, + dilation3=1, + trainable_statistics=False, + find_conv_max=True, + ): super(ResNetBasicBlock, self).__init__() self._stride1 = stride1 self._stride2 = stride2 - self._kernel1_size = utils.convert_to_list(filter1_size, 2, - 'filter1_size') - self._kernel2_size = utils.convert_to_list(filter2_size, 2, - 'filter2_size') + self._kernel1_size = utils.convert_to_list( + filter1_size, 2, 'filter1_size' + ) + self._kernel2_size = utils.convert_to_list( + filter2_size, 2, 'filter2_size' + ) self._dilation1 = dilation1 self._dilation2 = dilation2 self._padding1 = padding1 @@ -301,8 +511,9 @@ def __init__(self, self._find_conv_max = find_conv_max if has_shortcut: - self._kernel3_size = utils.convert_to_list(filter3_size, 2, 
- 'filter3_size') + self._kernel3_size = utils.convert_to_list( + filter3_size, 2, 'filter3_size' + ) self._padding3 = padding3 self._stride3 = stride3 self._dilation3 = dilation3 @@ -317,11 +528,13 @@ def __init__(self, if data_format not in valid_format: raise ValueError( "conv_format must be one of {}, but got conv_format={}".format( - valid_format, data_format)) + valid_format, data_format + ) + ) def _get_default_param_initializer(channels, kernel_size): filter_elem_num = np.prod(kernel_size) * channels - std = (2.0 / filter_elem_num)**0.5 + std = (2.0 / filter_elem_num) ** 0.5 return I.Normal(0.0, std) # init filter @@ -335,92 +548,128 @@ def _get_default_param_initializer(channels, kernel_size): shape=filter1_shape, attr=filter1_attr, default_initializer=_get_default_param_initializer( - num_channels1, self._kernel1_size)) + num_channels1, self._kernel1_size + ), + ) self.scale_1 = self.create_parameter( shape=bn1_param_shape, attr=scale1_attr, dtype=bn_param_dtype, - default_initializer=I.Constant(1.0)) - self.bias_1 = self.create_parameter(shape=bn1_param_shape, - attr=bias1_attr, - dtype=bn_param_dtype, - is_bias=True) - self.mean_1 = self.create_parameter(attr=ParamAttr( - name=moving_mean1_name, - initializer=I.Constant(0.0), - trainable=False), - shape=bn1_param_shape, - dtype=bn_param_dtype) + default_initializer=I.Constant(1.0), + ) + self.bias_1 = self.create_parameter( + shape=bn1_param_shape, + attr=bias1_attr, + dtype=bn_param_dtype, + is_bias=True, + ) + self.mean_1 = self.create_parameter( + attr=ParamAttr( + name=moving_mean1_name, + initializer=I.Constant(0.0), + trainable=False, + ), + shape=bn1_param_shape, + dtype=bn_param_dtype, + ) self.mean_1.stop_gradient = True self.var_1 = self.create_parameter( - attr=ParamAttr(name=moving_var1_name, - initializer=I.Constant(1.0), - trainable=False), + attr=ParamAttr( + name=moving_var1_name, + initializer=I.Constant(1.0), + trainable=False, + ), shape=bn1_param_shape, - dtype=bn_param_dtype) + dtype=bn_param_dtype, + ) self.var_1.stop_gradient = True self.filter_2 = self.create_parameter( shape=filter2_shape, attr=filter2_attr, default_initializer=_get_default_param_initializer( - num_channels2, self._kernel2_size)) + num_channels2, self._kernel2_size + ), + ) self.scale_2 = self.create_parameter( shape=bn2_param_shape, attr=scale2_attr, dtype=bn_param_dtype, - default_initializer=I.Constant(1.0)) - self.bias_2 = self.create_parameter(shape=bn2_param_shape, - attr=bias2_attr, - dtype=bn_param_dtype, - is_bias=True) - self.mean_2 = self.create_parameter(attr=ParamAttr( - name=moving_mean2_name, - initializer=I.Constant(0.0), - trainable=False), - shape=bn2_param_shape, - dtype=bn_param_dtype) + default_initializer=I.Constant(1.0), + ) + self.bias_2 = self.create_parameter( + shape=bn2_param_shape, + attr=bias2_attr, + dtype=bn_param_dtype, + is_bias=True, + ) + self.mean_2 = self.create_parameter( + attr=ParamAttr( + name=moving_mean2_name, + initializer=I.Constant(0.0), + trainable=False, + ), + shape=bn2_param_shape, + dtype=bn_param_dtype, + ) self.mean_2.stop_gradient = True self.var_2 = self.create_parameter( - attr=ParamAttr(name=moving_var2_name, - initializer=I.Constant(1.0), - trainable=False), + attr=ParamAttr( + name=moving_var2_name, + initializer=I.Constant(1.0), + trainable=False, + ), shape=bn2_param_shape, - dtype=bn_param_dtype) + dtype=bn_param_dtype, + ) self.var_2.stop_gradient = True if has_shortcut: bn3_param_shape = [1, 1, num_filter3] filter3_shape = [ - num_filter3, num_channels3, filter3_size, 
filter3_size + num_filter3, + num_channels3, + filter3_size, + filter3_size, ] self.filter_3 = self.create_parameter( shape=filter3_shape, attr=filter3_attr, default_initializer=_get_default_param_initializer( - num_channels3, self._kernel3_size)) + num_channels3, self._kernel3_size + ), + ) self.scale_3 = self.create_parameter( shape=bn3_param_shape, attr=scale3_attr, dtype=bn_param_dtype, - default_initializer=I.Constant(1.0)) - self.bias_3 = self.create_parameter(shape=bn3_param_shape, - attr=bias3_attr, - dtype=bn_param_dtype, - is_bias=True) - self.mean_3 = self.create_parameter(attr=ParamAttr( - name=moving_mean3_name, - initializer=I.Constant(0.0), - trainable=False), - shape=bn3_param_shape, - dtype=bn_param_dtype) + default_initializer=I.Constant(1.0), + ) + self.bias_3 = self.create_parameter( + shape=bn3_param_shape, + attr=bias3_attr, + dtype=bn_param_dtype, + is_bias=True, + ) + self.mean_3 = self.create_parameter( + attr=ParamAttr( + name=moving_mean3_name, + initializer=I.Constant(0.0), + trainable=False, + ), + shape=bn3_param_shape, + dtype=bn_param_dtype, + ) self.mean_3.stop_gradient = True - self.var_3 = self.create_parameter(attr=ParamAttr( - name=moving_var3_name, - initializer=I.Constant(1.0), - trainable=False), - shape=bn3_param_shape, - dtype=bn_param_dtype) + self.var_3 = self.create_parameter( + attr=ParamAttr( + name=moving_var3_name, + initializer=I.Constant(1.0), + trainable=False, + ), + shape=bn3_param_shape, + dtype=bn_param_dtype, + ) self.var_3.stop_gradient = True else: self.filter_3 = None @@ -464,5 +713,6 @@ def forward(self, x): use_global_stats=self._use_global_stats, training=self.training, trainable_statistics=self._trainable_statistics, - find_conv_max=self._find_conv_max) + find_conv_max=self._find_conv_max, + ) return out diff --git a/python/paddle/inference/__init__.py b/python/paddle/inference/__init__.py index b33fb1fdfe9d10..e033a366d9ae81 100644 --- a/python/paddle/inference/__init__.py +++ b/python/paddle/inference/__init__.py @@ -16,10 +16,10 @@ from ..fluid.inference import DataType # noqa: F401 from ..fluid.inference import PlaceType # noqa: F401 from ..fluid.inference import PrecisionType # noqa: F401 -from ..fluid.inference import BackendType # noqa: F401 from ..fluid.inference import Tensor # noqa: F401 from ..fluid.inference import Predictor # noqa: F401 from ..fluid.inference import create_predictor # noqa: F401 +from ..fluid.inference import _get_phi_kernel_name from ..fluid.inference import get_version # noqa: F401 from ..fluid.inference import get_trt_compile_version # noqa: F401 from ..fluid.inference import get_trt_runtime_version # noqa: F401 @@ -28,8 +28,8 @@ from ..fluid.inference import PredictorPool # noqa: F401 __all__ = [ # noqa - 'Config', 'DataType', 'PlaceType', 'PrecisionType', 'BackendType', 'Tensor', - 'Predictor', 'create_predictor', 'get_version', 'get_trt_compile_version', - 'convert_to_mixed_precision', 'get_trt_runtime_version', - 'get_num_bytes_of_data_type', 'PredictorPool' + 'Config', 'DataType', 'PlaceType', 'PrecisionType', 'Tensor', 'Predictor', + 'create_predictor', 'get_version', '_get_phi_kernel_name', + 'get_trt_compile_version', 'convert_to_mixed_precision', + 'get_trt_runtime_version', 'get_num_bytes_of_data_type', 'PredictorPool' ] diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 7cebcbbfcabe21..cda61ade77d8d0 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -23,7 +23,11 @@ import 
warnings from ...fluid.layer_helper import LayerHelper from ...fluid.framework import convert_np_dtype_to_dtype_ -from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode +from ...fluid.framework import ( + _in_legacy_dygraph, + in_dygraph_mode, + _non_static_mode, +) from ...fluid.data_feeder import check_variable_and_dtype, check_dtype import paddle from paddle import _C_ops, _legacy_C_ops, in_dynamic_mode @@ -44,8 +48,7 @@ def celu(x, alpha=1.0, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. alpha (float, optional): The 'alpha' value of the CELU formulation. Default is 1.0. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -71,10 +74,12 @@ def celu(x, alpha=1.0, name=None): check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'celu') helper = LayerHelper("celu", **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='celu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'alpha': alpha}) + helper.append_op( + type='celu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'alpha': alpha}, + ) return out @@ -95,8 +100,7 @@ def elu(x, alpha=1.0, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. alpha (float, optional): The 'alpha' value of the ELU formulation. Default is 1.0. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -122,10 +126,12 @@ def elu(x, alpha=1.0, name=None): check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'elu') helper = LayerHelper("elu", **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='elu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'alpha': alpha}) + helper.append_op( + type='elu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'alpha': alpha}, + ) return out @@ -135,7 +141,7 @@ def elu_(x, alpha=1.0, name=None): Inplace version of ``elu`` API, the output Tensor will be inplaced with input ``x``. Please refer to :ref:`api_nn_cn_elu`. """ - assert alpha >= 0., "elu_ only support alpha >= 0, please use elu instead." + assert alpha >= 0.0, "elu_ only support alpha >= 0, please use elu instead." if in_dygraph_mode(): return _C_ops.elu_(x, alpha) return _legacy_C_ops.elu_(x, 'alpha', alpha) @@ -145,6 +151,8 @@ def gelu(x, approximate=False, name=None): r""" gelu activation. + The activation function of Gelu is calculated element by element. More information refers to :ref: `Gaussian Error Linear Units`. + if approximate is True .. math:: @@ -159,9 +167,8 @@ def gelu(x, approximate=False, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. - approximate (bool, optional): Wether to enable approximation. Default is False. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + approximate (bool, optional): Whether to enable approximation. Default is False. 
+ name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -190,10 +197,12 @@ def gelu(x, approximate=False, name=None): check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'gelu') helper = LayerHelper("gelu", **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='gelu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'approximate': approximate}) + helper.append_op( + type='gelu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'approximate': approximate}, + ) return out @@ -214,9 +223,8 @@ def hardshrink(x, threshold=0.5, name=None): Args: x (Tensor): The input Tensor with data type float32, float64. - threshold (float, optional): The value of threshold for hardthrink. Default is 0.5 - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + threshold (float, optional): The value of threshold for hardthrink. Default is 0.5. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -237,20 +245,23 @@ def hardshrink(x, threshold=0.5, name=None): if _in_legacy_dygraph(): return _legacy_C_ops.hard_shrink(x, 'threshold', threshold) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'hardshrink') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'hardshrink' + ) helper = LayerHelper('hardshrink', **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='hard_shrink', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'threshold': threshold}) + helper.append_op( + type='hard_shrink', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold}, + ) return out def hardtanh(x, min=-1.0, max=1.0, name=None): r""" - hardtanh activation + hardtanh activation. Calculate the `hardtanh` of input `x`. .. math:: @@ -267,8 +278,7 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): x (Tensor): The input Tensor with data type float32, float64. min (float, optional): The minimum value of the linear region range. Default is -1. max (float, optional): The maximum value of the linear region range. Default is 1. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -278,9 +288,8 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): import paddle import paddle.nn.functional as F - import numpy as np - x = paddle.to_tensor(np.array([-1.5, 0.3, 2.5])) + x = paddle.to_tensor([-1.5, 0.3, 2.5]) out = F.hardtanh(x) # [-1., 0.3, 1.] 
""" @@ -290,25 +299,24 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): if _in_legacy_dygraph(): return _legacy_C_ops.brelu(x, 't_min', min, 't_max', max) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'hardtanh') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'hardtanh' + ) helper = LayerHelper('hardtanh', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='brelu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 't_min': min, - 't_max': max - }) + helper.append_op( + type='brelu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'t_min': min, 't_max': max}, + ) return out def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None): r""" - hardsigmoid activation. - + hardsigmoid activation. Calculate the `hardsigmoid` of input `x`. A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), which is much faster than sigmoid. @@ -327,8 +335,7 @@ def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None): x (Tensor): The input Tensor with data type float32, float64. slope (float, optional): The slope of hardsigmoid function. Default is 0.1666667. offset (float, optional): The offset of hardsigmoid function. Default is 0.5. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -349,28 +356,26 @@ def hardsigmoid(x, slope=0.1666667, offset=0.5, name=None): if _in_legacy_dygraph(): return _legacy_C_ops.hard_sigmoid(x, 'slope', slope, 'offset', offset) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'hardsigmoid') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'hardsigmoid' + ) helper = LayerHelper('hardsigmoid', **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='hard_sigmoid', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'slope': slope, - 'offset': offset - }) + helper.append_op( + type='hard_sigmoid', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'slope': slope, 'offset': offset}, + ) return out def hardswish(x, name=None): r""" - hardswish activation - - hardswish is proposed in MobileNetV3, and performs better in computational stability - and efficiency compared to swish function. For more details please refer - to: https://arxiv.org/pdf/1905.02244.pdf + hardswish activation. hardswish is proposed in MobileNetV3, and performs + better in computational stability and efficiency compared to swish function. + For more details please refer to: https://arxiv.org/pdf/1905.02244.pdf .. math:: @@ -385,8 +390,7 @@ def hardswish(x, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . 
@@ -406,8 +410,9 @@ def hardswish(x, name=None): if in_dygraph_mode(): return _C_ops.hard_swish(x, 6, 6, 3) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'hardswish') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'hardswish' + ) helper = LayerHelper('hardswish', **locals()) out = helper.create_variable_for_type_inference(x.dtype) @@ -417,7 +422,7 @@ def hardswish(x, name=None): def leaky_relu(x, negative_slope=0.01, name=None): r""" - leaky_relu activation + leaky_relu activation. The calculation formula is: .. math:: leaky\_relu(x)= @@ -432,8 +437,7 @@ def leaky_relu(x, negative_slope=0.01, name=None): x (Tensor): The input Tensor with data type float32, float64. negative_slope (float, optional): Slope of the activation function at :math:`x < 0` . Default is 0.01. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -456,14 +460,17 @@ def leaky_relu(x, negative_slope=0.01, name=None): if _in_legacy_dygraph(): return _legacy_C_ops.leaky_relu(x, 'alpha', negative_slope) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'leaky_relu') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'leaky_relu' + ) helper = LayerHelper('leaky_relu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='leaky_relu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'alpha': negative_slope}) + helper.append_op( + type='leaky_relu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'alpha': negative_slope}, + ) return out @@ -479,8 +486,7 @@ def prelu(x, weight, data_format="NCHW", name=None): x (Tensor): The input Tensor with data type float32, float64. weight (Tensor): The learnable parameter with data type same as ``x``. The weight shape is [1] or [in], where `in` is the input channel of ``x``. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. data_format(str, optional): Data format that specifies the layout of input. It may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default: "NCHW". @@ -511,60 +517,68 @@ def prelu(x, weight, data_format="NCHW", name=None): # [ 6. , 7. , 8. , 9. ]]]] """ check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'prelu') - check_variable_and_dtype(weight, 'weight', - ['float16', 'float32', 'float64'], 'prelu') + check_variable_and_dtype( + weight, 'weight', ['float16', 'float32', 'float64'], 'prelu' + ) - assert len(weight.shape - ) == 1, "The dim count of weight shape should be 1 in prelu()." + assert ( + len(weight.shape) == 1 + ), "The dim count of weight shape should be 1 in prelu()." 
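# Illustrative sketch of channel-wise prelu as described above: a weight of shape
# [in_channels] selects 'channel' mode; with data_format="NCHW" its length must
# match the second dimension of x (the last dimension for "NHWC"). The shapes and
# slope value here are made up for illustration.
import paddle
import paddle.nn.functional as F

x = paddle.uniform((2, 3, 4, 4), min=-1.0, max=1.0)  # NCHW input with 3 channels
w = paddle.full([3], 0.25)                           # one negative-slope value per channel
out = F.prelu(x, w, data_format="NCHW")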
mode = 'all' if weight.shape[0] > 1: true_data_format = [ - 'NC', 'NCL', 'NCHW', 'NCDHW', 'NLC', 'NHWC', 'NDHWC' + 'NC', + 'NCL', + 'NCHW', + 'NCDHW', + 'NLC', + 'NHWC', + 'NDHWC', ] if data_format not in true_data_format: raise ValueError( "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', " - "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format)) + "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format) + ) data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC' - assert len( - x.shape - ) > 1, "The dim count of x should be equal or larger than 2 in prelu() when weight shape is not [1]." + assert ( + len(x.shape) > 1 + ), "The dim count of x should be equal or larger than 2 in prelu() when weight shape is not [1]." - #NOTE(GuoxiaWang): support NHWC data format + # NOTE(GuoxiaWang): support NHWC data format if data_format == 'NHWC': - assert weight.shape[0] == x.shape[ - -1], "The weight size should be equal to x input channel in prelu() when weight shape is not [1]." + assert ( + weight.shape[0] == x.shape[-1] + ), "The weight size should be equal to x input channel in prelu() when weight shape is not [1]." else: - assert weight.shape[0] == x.shape[ - 1], "The weight size should be equal to x input channel in prelu() when weight shape is not [1]." + assert ( + weight.shape[0] == x.shape[1] + ), "The weight size should be equal to x input channel in prelu() when weight shape is not [1]." mode = 'channel' if in_dygraph_mode(): return _C_ops.prelu(x, weight, data_format, mode) if _in_legacy_dygraph(): - return _legacy_C_ops.prelu(x, weight, 'mode', mode, 'data_format', - data_format) + return _legacy_C_ops.prelu( + x, weight, 'mode', mode, 'data_format', data_format + ) helper = LayerHelper('prelu', **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type="prelu", - inputs={ - "X": x, - "Alpha": weight - }, - outputs={"Out": out}, - attrs={ - "mode": mode, - "data_format": data_format - }) + helper.append_op( + type="prelu", + inputs={"X": x, "Alpha": weight}, + outputs={"Out": out}, + attrs={"mode": mode, "data_format": data_format}, + ) return out -def rrelu(x, lower=1. / 8., upper=1. / 3., training=True, name=None): +def rrelu(x, lower=1.0 / 8.0, upper=1.0 / 3.0, training=True, name=None): r""" rrelu activation. @@ -607,8 +621,7 @@ def rrelu(x, lower=1. / 8., upper=1. / 3., training=True, name=None): lower (float, optional): The lower bound of uniform distribution. Default: 0.125. upper (float, optional): The upper bound of uniform distribution. Default: 0.333. training (bool, optional): Current mode is in training or others. Default is True. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -637,47 +650,56 @@ def rrelu(x, lower=1. / 8., upper=1. / 3., training=True, name=None): """ if not in_dynamic_mode(): - check_variable_and_dtype(x, 'X', ['float16', 'float32', 'float64'], - 'rrelu') + check_variable_and_dtype( + x, 'X', ['float16', 'float32', 'float64'], 'rrelu' + ) if not isinstance(lower, float) or not isinstance(upper, float): raise TypeError( - "The lower and upper values must be float type. Received: lower {}, upper {}." - .format(lower, upper)) + "The lower and upper values must be float type. 
Received: lower {}, upper {}.".format( + lower, upper + ) + ) if lower < 0 or lower > 1: raise ValueError( - "The lower value must be no less than zero or greater than one. Received: {}." - .format(lower)) + "The lower value must be no less than zero or greater than one. Received: {}.".format( + lower + ) + ) if upper < lower: raise ValueError( - "The upper value must be greater than lower value. Received: lower {}, upper {}." - .format(lower, upper)) + "The upper value must be greater than lower value. Received: lower {}, upper {}.".format( + lower, upper + ) + ) if upper > 1: raise ValueError( "The upper value must be no greater than one. Received: {}.".format( - upper)) + upper + ) + ) is_test = not training if _in_legacy_dygraph(): - out, noise = _legacy_C_ops.rrelu(x, 'lower', lower, 'upper', upper, - 'is_test', is_test) + out, noise = _legacy_C_ops.rrelu( + x, 'lower', lower, 'upper', upper, 'is_test', is_test + ) return out helper = LayerHelper('rrelu', **locals()) out = helper.create_variable_for_type_inference(x.dtype) noise = helper.create_variable_for_type_inference(dtype=x.dtype) attrs = {'lower': lower, 'upper': upper, 'is_test': is_test} - helper.append_op(type='rrelu', - inputs={"X": x}, - outputs={ - "Out": out, - "Noise": noise - }, - attrs=attrs) + helper.append_op( + type='rrelu', + inputs={"X": x}, + outputs={"Out": out, "Noise": noise}, + attrs=attrs, + ) return out @@ -691,8 +713,7 @@ def relu(x, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -742,8 +763,7 @@ def log_sigmoid(x, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -764,8 +784,9 @@ def log_sigmoid(x, name=None): if _in_legacy_dygraph(): return _legacy_C_ops.logsigmoid(x) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'log_sigmoid') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'log_sigmoid' + ) helper = LayerHelper("log_sigmoid", **locals()) out = helper.create_variable_for_type_inference(x.dtype) helper.append_op(type='logsigmoid', inputs={'X': x}, outputs={'Out': out}) @@ -803,8 +824,7 @@ def maxout(x, groups, axis=1, name=None): is NHWC. If ``axis`` < 0, it works the same way as :math:`axis + D` , where D is the dimensions of ``x`` . ``axis`` only supports 1, 3 or -1. Default is 1. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type as ``x`` . @@ -835,19 +855,19 @@ def maxout(x, groups, axis=1, name=None): if axis not in [1, -1, 3]: raise ValueError( "Attr(axis) should be 1 when data format is NCHW, -1 or 3 when data format is NHWC. Received " - "Attr(axis): %s." 
% str(axis)) + "Attr(axis): %s." % str(axis) + ) if axis == -1: axis = 3 helper = LayerHelper('maxout', **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='maxout', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'groups': groups, - 'axis': axis - }) + helper.append_op( + type='maxout', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'groups': groups, 'axis': axis}, + ) return out @@ -861,8 +881,7 @@ def relu6(x, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -887,17 +906,21 @@ def relu6(x, name=None): check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'relu6') helper = LayerHelper('relu6', **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='relu6', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'threshold': threshold}) + helper.append_op( + type='relu6', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold}, + ) return out -def selu(x, - scale=1.0507009873554804934193349852946, - alpha=1.6732632423543772848170429916717, - name=None): +def selu( + x, + scale=1.0507009873554804934193349852946, + alpha=1.6732632423543772848170429916717, + name=None, +): r""" selu activation @@ -915,8 +938,7 @@ def selu(x, x (Tensor): The input Tensor with data type float32, float64. scale (float, optional): The value of scale(must be greater than 1.0) for selu. Default is 1.0507009873554804934193349852946 alpha (float, optional): The value of alpha(must be no less than zero) for selu. Default is 1.6732632423543772848170429916717 - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -934,11 +956,13 @@ def selu(x, """ if scale <= 1.0: raise ValueError( - "The scale must be greater than 1.0. Received: {}.".format(scale)) + "The scale must be greater than 1.0. Received: {}.".format(scale) + ) if alpha < 0: raise ValueError( - "The alpha must be no less than zero. Received: {}.".format(alpha)) + "The alpha must be no less than zero. Received: {}.".format(alpha) + ) if in_dygraph_mode(): return _C_ops.selu(x, scale, alpha) @@ -948,13 +972,12 @@ def selu(x, check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'selu') helper = LayerHelper('selu', **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='selu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'scale': scale, - 'alpha': alpha - }) + helper.append_op( + type='selu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'scale': scale, 'alpha': alpha}, + ) return out @@ -965,21 +988,22 @@ def silu(x, name=None): .. math:: silu(x) = \frac{x}{1 + e^{-x}} - + + Where :math:`x` is the input Tensor. + Parameters: x (Tensor): The input Tensor with data type float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. 
- + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. + Returns: - A Tensor with the same data type and shape as ``x`` . - + A Tensor with the same data type and shape as :attr:`x`. + Examples: .. code-block:: python import paddle import paddle.nn.functional as F - + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) out = F.silu(x) # [ 0.731059, 1.761594, 2.857722, 3.928055 ] """ @@ -1074,13 +1098,12 @@ def softmax(x, axis=-1, dtype=None, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. - axis (int, optional): The axis along which to perform log_softmax + axis (int, optional): The axis along which to perform softmax calculations. It should be in range [-D, D), where D is the - dimensions of ``x`` . If ``axis`` < 0, it works the same way as + rank of ``x`` . If ``axis`` < 0, it works the same way as :math:`axis + D` . Default is -1. dtype (str, optional): The data type of the output tensor, can be float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same shape and data type (use ``dtype`` if it is @@ -1091,15 +1114,13 @@ def softmax(x, axis=-1, dtype=None, name=None): import paddle import paddle.nn.functional as F - import numpy as np - x = np.array([[[2.0, 3.0, 4.0, 5.0], + x = paddle.to_tensor([[[2.0, 3.0, 4.0, 5.0], [3.0, 4.0, 5.0, 6.0], [7.0, 8.0, 8.0, 9.0]], [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], - [6.0, 7.0, 8.0, 9.0]]], 'float32') - x = paddle.to_tensor(x) + [6.0, 7.0, 8.0, 9.0]]],dtype='float32') out1 = F.softmax(x) out2 = F.softmax(x, dtype='float64') # out1's data type is float32; out2's data type is float64 @@ -1117,44 +1138,50 @@ def softmax(x, axis=-1, dtype=None, name=None): use_cudnn = True if in_dygraph_mode(): - outs_cast = x if dtype is None \ - else _C_ops.cast(x, dtype) + outs_cast = x if dtype is None else _C_ops.cast(x, dtype) return _C_ops.softmax(outs_cast, axis) if _in_legacy_dygraph(): - outs_cast = x if dtype is None \ + outs_cast = ( + x + if dtype is None else _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) - return _legacy_C_ops.softmax(outs_cast, 'axis', axis, 'use_cudnn', - use_cudnn) + ) + return _legacy_C_ops.softmax( + outs_cast, 'axis', axis, 'use_cudnn', use_cudnn + ) if dtype is None: - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'softmax') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'softmax' + ) else: check_dtype( - dtype, 'dtype', ['float32', 'float64'], 'softmax', - 'If dtype is not None, it only support float32 or float64.') + dtype, + 'dtype', + ['float32', 'float64'], + 'softmax', + 'If dtype is not None, it only support float32 or float64.', + ) helper = LayerHelper("softmax", **locals()) outs_cast = x if dtype is not None: outs_cast = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='cast', - inputs={'X': x}, - outputs={'Out': outs_cast}, - attrs={ - 'in_dtype': x.dtype, - 'out_dtype': dtype - }) + helper.append_op( + type='cast', + inputs={'X': x}, + outputs={'Out': outs_cast}, + attrs={'in_dtype': x.dtype, 'out_dtype': dtype}, + ) outs_softmax = helper.create_variable_for_type_inference(outs_cast.dtype) - helper.append_op(type='softmax', - inputs={'X': outs_cast}, - outputs={'Out': 
outs_softmax}, - attrs={ - 'axis': axis, - 'use_cudnn': use_cudnn - }) + helper.append_op( + type='softmax', + inputs={'X': outs_cast}, + outputs={'Out': outs_softmax}, + attrs={'axis': axis, 'use_cudnn': use_cudnn}, + ) return outs_softmax @@ -1170,15 +1197,22 @@ def softmax_(x, axis=-1, dtype=None, name=None): use_cudnn = True if in_dygraph_mode(): - outs_cast = x if dtype is None \ + outs_cast = ( + x + if dtype is None else _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) + ) return _C_ops.softmax_(outs_cast, axis) if _in_legacy_dygraph(): - outs_cast = x if dtype is None \ + outs_cast = ( + x + if dtype is None else _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) - return _legacy_C_ops.softmax_(outs_cast, 'axis', axis, 'use_cudnn', - use_cudnn) + ) + return _legacy_C_ops.softmax_( + outs_cast, 'axis', axis, 'use_cudnn', use_cudnn + ) def softplus(x, beta=1, threshold=20, name=None): @@ -1186,16 +1220,16 @@ def softplus(x, beta=1, threshold=20, name=None): softplus activation .. math:: - - softplus(x) = \frac{1}{beta} * \log(1 + e^{beta * x}) \\ - \text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} + softplus(x)=\begin{cases} + \frac{1}{\beta} * \log(1 + e^{\beta * x}),&x\leqslant\frac{\varepsilon}{\beta};\\ + x,&x>\frac{\varepsilon}{\beta}. + \end{cases} Parameters: x (Tensor): The input Tensor with data type float32, float64. - beta (float, optional): The value of beta for softplus. Default is 1 - threshold (float, optional): The value of threshold for softplus. Default is 20 - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + beta (float, optional): The value of :math:`\beta` for softplus. Default is 1 + threshold (float, optional): The value of :math:`\varepsilon` for softplus. Default is 20 + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -1205,9 +1239,8 @@ def softplus(x, beta=1, threshold=20, name=None): import paddle import paddle.nn.functional as F - import numpy as np - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3], dtype='float32') out = F.softplus(x) # [0.513015, 0.598139, 0.744397, 0.854355] """ @@ -1217,17 +1250,17 @@ def softplus(x, beta=1, threshold=20, name=None): if _in_legacy_dygraph(): return _legacy_C_ops.softplus(x, 'beta', beta, 'threshold', threshold) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'softplus') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'softplus' + ) helper = LayerHelper('softplus', **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='softplus', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'beta': beta, - 'threshold': threshold - }) + helper.append_op( + type='softplus', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'beta': beta, 'threshold': threshold}, + ) return out @@ -1249,8 +1282,7 @@ def softshrink(x, threshold=0.5, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. threshold (float, optional): The value of threshold(must be no less than zero) for softplus. Default is 0.5 - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. 
+ name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -1260,29 +1292,36 @@ def softshrink(x, threshold=0.5, name=None): import paddle import paddle.nn.functional as F - import numpy as np - x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8])) - out = F.softshrink(x) # [-0.4, 0, 0, 0.3] + x = paddle.to_tensor([-0.9, -0.2, 0.1, 0.8]) + out = F.softshrink(x) + print(out) + # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [-0.39999998, 0. , 0. , 0.30000001]) """ if threshold < 0: raise ValueError( "The threshold must be no less than zero. Received: {}.".format( - threshold)) + threshold + ) + ) if in_dygraph_mode(): return _C_ops.soft_shrink(x, threshold) if _in_legacy_dygraph(): return _legacy_C_ops.softshrink(x, 'lambda', threshold) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'softshrink') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'softshrink' + ) helper = LayerHelper('softshrink', **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='softshrink', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'lambda': threshold}) + helper.append_op( + type='softshrink', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'lambda': threshold}, + ) return out @@ -1296,8 +1335,7 @@ def softsign(x, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -1307,18 +1345,21 @@ def softsign(x, name=None): import paddle import paddle.nn.functional as F - import numpy as np - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) - out = F.softsign(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = F.softsign(x) + print(out) + # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [-0.28571430, -0.16666666, 0.09090909, 0.23076925]) """ if in_dygraph_mode(): return _C_ops.softsign(x) if in_dynamic_mode(): return _legacy_C_ops.softsign(x) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'softsign') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'softsign' + ) helper = LayerHelper('softsign', **locals()) out = helper.create_variable_for_type_inference(x.dtype) helper.append_op(type='softsign', inputs={'X': x}, outputs={'Out': out}) @@ -1335,8 +1376,7 @@ def swish(x, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . 
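The refreshed softsign example above prints -0.28571430, -0.16666666, 0.09090909, 0.23076925, which is consistent with the usual definition softsign(x) = x / (1 + |x|). A minimal check along those lines, assuming paddle is available and using `paddle.abs` / `paddle.allclose` purely as reference helpers:

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3])
    ref = x / (1 + paddle.abs(x))       # softsign(x) = x / (1 + |x|)
    out = F.softsign(x)
    print(paddle.allclose(ref, out))    # expected: True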
@@ -1346,10 +1386,12 @@ def swish(x, name=None): import paddle import paddle.nn.functional as F - import numpy as np - x = paddle.to_tensor(np.array([-2., 0., 1.])) - out = F.swish(x) # [-0.238406, 0., 0.731059] + x = paddle.to_tensor([-2., 0., 1.]) + out = F.swish(x) + print(out) + # Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [-0.23840584, 0. , 0.73105854]) """ if in_dygraph_mode(): return _C_ops.swish(x, 1.0) @@ -1359,10 +1401,9 @@ def swish(x, name=None): check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], 'swish') helper = LayerHelper('swish', **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='swish', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'beta': 1.0}) + helper.append_op( + type='swish', inputs={'X': x}, outputs={'Out': out}, attrs={'beta': 1.0} + ) return out @@ -1381,8 +1422,7 @@ def mish(x, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -1418,8 +1458,7 @@ def tanhshrink(x, name=None): Args: x (Tensor): The input Tensor with data type float32, float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . @@ -1429,10 +1468,12 @@ def tanhshrink(x, name=None): import paddle import paddle.nn.functional as F - import numpy as np - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) - out = F.tanhshrink(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) + out = F.tanhshrink(x) + print(out) + # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [-0.02005106, -0.00262468, 0.00033200, 0.00868741]) """ if in_dygraph_mode(): return _C_ops.tanh_shrink(x) @@ -1440,8 +1481,9 @@ def tanhshrink(x, name=None): if _in_legacy_dygraph(): return _legacy_C_ops.tanh_shrink(x) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'tanhshrink') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'tanhshrink' + ) helper = LayerHelper('tanh_shrink', **locals()) out = helper.create_variable_for_type_inference(x.dtype) helper.append_op(type='tanh_shrink', inputs={'X': x}, outputs={'Out': out}) @@ -1466,8 +1508,7 @@ def thresholded_relu(x, threshold=1.0, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. threshold (float, optional): The value of threshold for thresholded_relu. Default is 1.0 - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same data type and shape as ``x`` . 
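thresholded_relu, whose docstring is cleaned up here, passes values above the threshold through unchanged and zeroes the rest, so it can be mirrored with `paddle.where`; a small sketch under the default threshold of 1.0 (the reference expression is an assumption drawn from the docstring, not code in this diff):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.to_tensor([2.0, 0.5, 1.0])
    ref = paddle.where(x > 1.0, x, paddle.zeros_like(x))  # keep only values strictly above the threshold
    out = F.thresholded_relu(x, threshold=1.0)
    print(out)                                            # [2., 0., 0.]
    print(paddle.allclose(ref, out))                      # expected: True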
@@ -1477,10 +1518,12 @@ def thresholded_relu(x, threshold=1.0, name=None): import paddle import paddle.nn.functional as F - import numpy as np - x = paddle.to_tensor(np.array([2., 0., 1.])) - out = F.thresholded_relu(x) # [2., 0., 0.] + x = paddle.to_tensor([2., 0., 1.]) + out = F.thresholded_relu(x) + print(out) + # Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [2., 0., 0.]) """ if in_dygraph_mode(): @@ -1489,14 +1532,17 @@ def thresholded_relu(x, threshold=1.0, name=None): if _in_legacy_dygraph(): return _legacy_C_ops.thresholded_relu(x, 'threshold', threshold) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'thresholded_relu') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'thresholded_relu' + ) helper = LayerHelper('thresholded_relu', **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='thresholded_relu', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'threshold': threshold}) + helper.append_op( + type='thresholded_relu', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'threshold': threshold}, + ) return out @@ -1524,8 +1570,7 @@ def log_softmax(x, axis=-1, dtype=None, name=None): preventing data type overflows. Supported dtype: float32, float64. If ``dtype`` is None, the output Tensor has the same dtype as x. Default is None. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: A Tensor with the same shape and data type (use ``dtype`` if it is @@ -1570,37 +1615,43 @@ def log_softmax(x, axis=-1, dtype=None, name=None): return _legacy_C_ops.log_softmax(x, 'axis', axis) if dtype is None: - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'log_softmax') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'log_softmax' + ) else: check_dtype( - dtype, 'dtype', ['float32', 'float64'], 'log_softmax', - 'If dtype is not None, it only support float32 or float64.') + dtype, + 'dtype', + ['float32', 'float64'], + 'log_softmax', + 'If dtype is not None, it only support float32 or float64.', + ) helper = LayerHelper("log_softmax", **locals()) out_cast = x if dtype is not None: out_cast = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='cast', - inputs={'X': x}, - outputs={'Out': out_cast}, - attrs={ - 'in_dtype': x.dtype, - 'out_dtype': dtype - }) + helper.append_op( + type='cast', + inputs={'X': x}, + outputs={'Out': out_cast}, + attrs={'in_dtype': x.dtype, 'out_dtype': dtype}, + ) out = helper.create_variable_for_type_inference(out_cast.dtype) - helper.append_op(type='log_softmax', - inputs={'X': out_cast}, - outputs={'Out': out}, - attrs={'axis': axis}) + helper.append_op( + type='log_softmax', + inputs={'X': out_cast}, + outputs={'Out': out}, + attrs={'axis': axis}, + ) return out def glu(x, axis=-1, name=None): r""" - The gated linear unit. The input is evenly splited into 2 parts along a + The gated linear unit. The input is evenly splited into 2 parts along a given axis. The first part is used as the content, and the second part is passed through a sigmoid function then used as the gate. The output is a elementwise multiplication of the content and the gate. 
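The glu description above matches the implementation reformatted later in this file (chunk the input in two along the given axis, then gate the first half with the sigmoid of the second). A minimal equivalence sketch, assuming a 2-D input split along the last axis:

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.randn([2, 6])
    a, b = paddle.chunk(x, 2, axis=-1)   # content half and gate half
    ref = a * F.sigmoid(b)
    out = F.glu(x, axis=-1)
    print(paddle.allclose(ref, out))     # expected: True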
@@ -1611,23 +1662,22 @@ def glu(x, axis=-1, name=None): Parameters: x (Tensor): The input Tensor with data type float32, float64. - axis (int, optional): The axis along which split the input tensor. It - should be in range [-D, D), where D is the dimensions of ``x`` . - If ``axis`` < 0, it works the same way as :math:`axis + D` . + axis (int, optional): The axis along which split the input tensor. It + should be in range [-D, D), where D is the dimensions of ``x`` . + If ``axis`` < 0, it works the same way as :math:`axis + D` . Default is -1. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. + Returns: - A Tensor with the same data type as x. The size of the given aixs is + A Tensor with the same data type as x. The size of the given aixs is halved. - + Examples: .. code-block:: python - + import paddle from paddle.nn import functional as F - + x = paddle.to_tensor( [[-0.22014759, -1.76358426, 0.80566144, 0.04241343], [-1.94900405, -1.89956081, 0.17134808, -1.11280477]] @@ -1635,10 +1685,11 @@ def glu(x, axis=-1, name=None): print(F.glu(x).numpy()) # array([[-0.15216254, -0.9004892 ], # [-1.0577879 , -0.46985325]], dtype=float32) - + """ - check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], - "glu") + check_variable_and_dtype( + x, 'input', ['float16', 'float32', 'float64'], "glu" + ) a, b = chunk(x, 2, axis=axis, name=name) gate = sigmoid(b, name=name) out = paddle.multiply(a, gate, name=name) @@ -1668,24 +1719,23 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None): gumbel\_softmax(v_i)=\frac{e^{v_i/t}}{\sum_{j=1}^n{e^{v_j/t}}},i=1,2,3...n Parameters: - x (Tensor): An N-D Tensor, the first N - 1 dimensions index into a batch - of independent distributions and the last dimension represents + x (Tensor): An N-D Tensor, the first N - 1 dimensions index into a batch + of independent distributions and the last dimension represents a vector of probabilities with datatype float32, float64. temperature (float, optional): non-negative scalar temperature. Default is 1.0. - hard (bool, optional): if True, the returned samples will be discretized as - one-hot vectors, but will be differentiated as if it is the soft sample + hard (bool, optional): if True, the returned samples will be discretized as + one-hot vectors, but will be differentiated as if it is the soft sample in autograd. Default is False. - axis (int, optional): The axis along will be calculated softmax value. + axis (int, optional): The axis along will be calculated softmax value. Default is -1. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. - + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. + Returns: - Sampled tensor of same shape as ``x`` from the Gumbel-Softmax distribution. - If ``hard = True``, the returned samples will be one-hot, otherwise they will be + Sampled tensor of same shape as ``x`` from the Gumbel-Softmax distribution. + If ``hard = True``, the returned samples will be one-hot, otherwise they will be probability distributions that sum to 1 across ``axis``. - + Examples: .. 
code-block:: python @@ -1701,24 +1751,23 @@ def gumbel_softmax(x, temperature=1.0, hard=False, axis=-1, name=None): # [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 1. ], # [0.00000062, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.99999940], # [0.00000000, 0.00000000, 0.00000000, 0.00001258, 0.99998736, 0.00000000]] - + """ if in_dygraph_mode(): return _C_ops.gumbel_softmax(x, temperature, hard, axis) if in_dynamic_mode(): - return _legacy_C_ops.gumbel_softmax(x, 'temperature', temperature, - 'hard', hard, 'axis', axis) + return _legacy_C_ops.gumbel_softmax( + x, 'temperature', temperature, 'hard', hard, 'axis', axis + ) helper = LayerHelper("gumbel_softmax", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'gumbel_softmax') out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='gumbel_softmax', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'temperature': temperature, - 'hard': hard, - 'axis': axis - }) + helper.append_op( + type='gumbel_softmax', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'temperature': temperature, 'hard': hard, 'axis': axis}, + ) return out diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index d4d3e9759e0199..011acc3096cccc 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -20,14 +20,24 @@ from ...tensor.creation import zeros from paddle.static import Variable from ...fluid import dygraph_utils + # TODO: define the common functions to build a neural network from ...tensor.manipulation import squeeze from ...tensor.manipulation import unsqueeze from ...tensor import clip from ...tensor import sum from ...tensor import sqrt -from ...fluid.data_feeder import check_variable_and_dtype, check_dtype, check_type -from ...fluid.framework import _varbase_creator, _in_legacy_dygraph, in_dygraph_mode, _non_static_mode +from ...fluid.data_feeder import ( + check_variable_and_dtype, + check_dtype, + check_type, +) +from ...fluid.framework import ( + _varbase_creator, + _in_legacy_dygraph, + in_dygraph_mode, + _non_static_mode, +) from ...fluid import dygraph_utils @@ -112,26 +122,28 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'unfold') - assert len(x.shape) == 4, \ - "input should be the format of [N, C, H, W]" + assert len(x.shape) == 4, "input should be the format of [N, C, H, W]" if isinstance(kernel_sizes, int): kernel_sizes = [kernel_sizes, kernel_sizes] else: - assert isinstance(kernel_sizes, list) and (len(kernel_sizes) == 2), \ - "kernel_sizes should either be an integer or a list of two integers" + assert isinstance(kernel_sizes, list) and ( + len(kernel_sizes) == 2 + ), "kernel_sizes should either be an integer or a list of two integers" if isinstance(strides, int): strides = [strides, strides] else: - assert isinstance(strides, list) and (len(strides) == 2), \ - "strides should either be an integer or a list of two integers" + assert isinstance(strides, list) and ( + len(strides) == 2 + ), "strides should either be an integer or a list of two integers" if isinstance(dilations, int): dilations = [dilations, dilations] else: - assert isinstance(dilations, list) and (len(dilations) == 2), \ - "dilations should either be an integer or a list of two integers" + assert isinstance(dilations, list) and ( + len(dilations) == 2 + ), "dilations should either be an integer or a list of two integers" if isinstance(paddings, 
int): paddings = [paddings] * 4 @@ -147,35 +159,41 @@ def unfold(x, kernel_sizes, strides=1, paddings=0, dilations=1, name=None): else: raise ValueError( "Unexpected type of paddings, it should be either an integer or a list" - "of 2 or 4 integers") + "of 2 or 4 integers" + ) if in_dygraph_mode(): return _C_ops.unfold(x, kernel_sizes, strides, paddings, dilations) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="unfold", - inputs={"X": x}, - outputs={"Y": out}, - attrs={ - "kernel_sizes": kernel_sizes, - "strides": strides, - "paddings": paddings, - "dilations": dilations - }) + helper.append_op( + type="unfold", + inputs={"X": x}, + outputs={"Y": out}, + attrs={ + "kernel_sizes": kernel_sizes, + "strides": strides, + "paddings": paddings, + "dilations": dilations, + }, + ) return out -def interpolate(x, - size=None, - scale_factor=None, - mode='nearest', - align_corners=False, - align_mode=0, - data_format='NCHW', - name=None): +def interpolate( + x, + size=None, + scale_factor=None, + mode='nearest', + align_corners=False, + align_mode=0, + data_format='NCHW', + name=None, +): """ This API resizes a batch of images. + The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) or 4-D (num_batches, channels, in_h, in_w), or a 5-D Tensor of the shape (num_batches, channels, in_d, in_h, in_w) or (num_batches, in_d, in_h, in_w, channels), @@ -184,16 +202,17 @@ def interpolate(x, and the resizing only applies on the three dimensions(depth, height and width). Supporting resample methods: - 'linear' : Linear interpolation - 'bilinear' : Bilinear interpolation - 'trilinear' : Trilinear interpolation - 'nearest' : Nearest neighbor interpolation - 'bicubic' : Bicubic interpolation - 'area': Area interpolation - - Linear interpolation is the method of using a line connecting two known quantities - to determine the value of an unknown quantity between the two known quantities. - + + - 'linear' : Linear interpolation + - 'bilinear' : Bilinear interpolation + - 'trilinear' : Trilinear interpolation + - 'nearest' : Nearest neighbor interpolation + - 'bicubic' : Bicubic interpolation + - 'area': Area interpolation + + Linear interpolation is the method of using a line connecting two known quantities + to determine the value of an unknown quantity between the two known quantities. + Nearest neighbor interpolation is to perform nearest neighbor interpolation in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. @@ -218,21 +237,21 @@ def interpolate(x, Area interpolation is to perform area interpolation in both the 3rd dimension(in height direction) , the 4th dimension(in width - direction) and the 5th dimension(in depth direction) on input tensor. Set to - area will directly call `paddle.nn.functional.adaptive_avg_pool1d` or + direction) and the 5th dimension(in depth direction) on input tensor. Set to + area will directly call `paddle.nn.functional.adaptive_avg_pool1d` or `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. Example: .. 
code-block:: text - For scale_factor: + # For scale_factor: if align_corners = True && out_size > 1 : scale_factor = (in_size-1.0)/(out_size-1.0) else: scale_factor = float(in_size/out_size) - Linear interpolation: + # Linear interpolation: if: align_corners = False , align_mode = 0 input : (N,C,W_in) @@ -242,8 +261,8 @@ def interpolate(x, input : (N,C,W_in) output: (N,C,W_out) where: W_out = W_{in} * scale_{factor} - - Nearest neighbor interpolation: + + # Nearest neighbor interpolation: align_corners = False input : (N,C,H_in,W_in) @@ -251,7 +270,7 @@ def interpolate(x, H_out = floor (H_{in} * scale_{factor}) W_out = floor (W_{in} * scale_{factor}) - Bilinear interpolation: + # Bilinear interpolation: if: align_corners = False , align_mode = 0 input : (N,C,H_in,W_in) @@ -264,7 +283,7 @@ def interpolate(x, H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - Bicubic interpolation: + # Bicubic interpolation: if: align_corners = False input : (N,C,H_in,W_in) @@ -277,7 +296,7 @@ def interpolate(x, H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - Trilinear interpolation: + # Trilinear interpolation: if: align_corners = False , align_mode = 0 input : (N,C,D_in,H_in,W_in) @@ -294,25 +313,25 @@ def interpolate(x, For details of linear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Linear_interpolation. - + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. - + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation. - + For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation. - + For details of bicubic interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bicubic_interpolation - + Parameters: x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. size (list|tuple|Tensor|None): Output shape of image resize - layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) - when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. + layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) + when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1]. If a Tensor, its dimensions size should be a 1. scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At @@ -345,23 +364,23 @@ def interpolate(x, Examples: .. 
code-block:: python - import paddle - import paddle.nn.functional as F + import paddle + import paddle.nn.functional as F - input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) - output_1 = F.interpolate(x=input_data, size=[12,12]) - print(output_1.shape) - # [2L, 3L, 12L, 12L] + input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) + output_1 = F.interpolate(x=input_data, size=[12,12]) + print(output_1.shape) + # [2L, 3L, 12L, 12L] - # given scale - output_2 = F.interpolate(x=input_data, scale_factor=[2,1]) - print(output_2.shape) - # [2L, 3L, 12L, 10L] + # given scale + output_2 = F.interpolate(x=input_data, scale_factor=[2,1]) + print(output_2.shape) + # [2L, 3L, 12L, 10L] - # bilinear interp - output_3 = F.interpolate(x=input_data, scale_factor=[2,1], mode="bilinear") - print(output_2.shape) - # [2L, 3L, 12L, 10L] + # bilinear interp + output_3 = F.interpolate(x=input_data, scale_factor=[2,1], mode="bilinear") + print(output_2.shape) + # [2L, 3L, 12L, 10L] """ data_format = data_format.upper() resample = mode.upper() @@ -378,7 +397,8 @@ def interpolate(x, if resample not in resample_methods: raise ValueError( "The 'resample' of image_resize can only be 'area', 'linear', 'bilinear', 'trilinear', " - " 'bicubic' or 'nearest' currently.") + " 'bicubic' or 'nearest' currently." + ) if resample in ['LINEAR'] and len(x.shape) != 3: raise ValueError("'linear' only support 3-D tensor.") @@ -405,8 +425,11 @@ def interpolate(x, ) if resample == 'AREA': - if isinstance(size, list) or isinstance(size, tuple) or isinstance( - size, Variable): + if ( + isinstance(size, list) + or isinstance(size, tuple) + or isinstance(size, Variable) + ): if len(size) == 0: raise ValueError("output size can not be empty") if len(x.shape) == 3: @@ -420,19 +443,25 @@ def interpolate(x, dtype = helper.input_dtype(input_param_name='x') if len(x.shape) == 3 and data_format not in ['NCW', 'NWC']: raise ValueError( - "Got wrong value for param `data_format`: " + data_format + - " received but only `NCW` or `NWC` supported for 3-D input.") + "Got wrong value for param `data_format`: " + + data_format + + " received but only `NCW` or `NWC` supported for 3-D input." + ) elif len(x.shape) == 4 and data_format not in ['NCHW', 'NHWC']: raise ValueError( - "Got wrong value for param `data_format`: " + data_format + - " received but only `NCHW` or `NHWC` supported for 4-D input.") + "Got wrong value for param `data_format`: " + + data_format + + " received but only `NCHW` or `NHWC` supported for 4-D input." + ) elif len(x.shape) == 5 and data_format not in ['NCDHW', 'NDHWC']: raise ValueError( - "Got wrong value for param `data_format`: " + data_format + - " received but only `NCDHW` or `NDHWC` supported for 5-D input.") + "Got wrong value for param `data_format`: " + + data_format + + " received but only `NCDHW` or `NDHWC` supported for 5-D input." 
+ ) def _is_list_or_turple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) + return isinstance(data, list) or isinstance(data, tuple) if data_format == 'NCHW' or data_format == 'NCDHW' or data_format == 'NCW': data_layout = 'NCHW' @@ -450,7 +479,7 @@ def _is_list_or_turple_(data): "interp_method": resample_type, "align_corners": align_corners, "align_mode": align_mode, - "data_layout": data_layout + "data_layout": data_layout, } out_shape = size @@ -478,9 +507,9 @@ def _is_list_or_turple_(data): if isinstance(dim_size, Variable): contain_var = True continue - assert dim_size > 0, ( - "Each dimension size given in out_shape must be greater than 0." - ) + assert ( + dim_size > 0 + ), "Each dimension size given in out_shape must be greater than 0." if contain_var: new_size_tensor = [] @@ -491,14 +520,13 @@ def _is_list_or_turple_(data): new_size_tensor.append(dim) size_list.append(-1) else: - assert (isinstance(dim, int)) + assert isinstance(dim, int) temp_out = helper.create_variable_for_type_inference( - 'int32') - fill_constant([1], - 'int32', - dim, - force_cpu=True, - out=temp_out) + 'int32' + ) + fill_constant( + [1], 'int32', dim, force_cpu=True, out=temp_out + ) new_size_tensor.append(temp_out) size_list.append(dim) inputs['SizeTensor'] = new_size_tensor @@ -506,7 +534,8 @@ def _is_list_or_turple_(data): if len(x.shape) == 3: if len(out_shape) != 1: raise ValueError( - "size length should be 2 for input 3-D tensor") + "size length should be 2 for input 3-D tensor" + ) if contain_var: attrs['out_w'] = size_list[0] else: @@ -514,8 +543,9 @@ def _is_list_or_turple_(data): attrs['out_w'] = out_shape[0] if len(x.shape) == 4: if len(out_shape) != 2: - raise ValueError("size length should be 2 for " - "input 4-D tensor.") + raise ValueError( + "size length should be 2 for " "input 4-D tensor." + ) if contain_var: attrs['out_h'] = size_list[0] attrs['out_w'] = size_list[1] @@ -525,8 +555,9 @@ def _is_list_or_turple_(data): attrs['out_w'] = out_shape[1] if len(x.shape) == 5: if len(out_shape) != 3: - raise ValueError("size length should be 3 for " - "input 5-D tensor.") + raise ValueError( + "size length should be 3 for " "input 5-D tensor." 
+ ) if contain_var: attrs['out_d'] = size_list[0] attrs['out_h'] = size_list[1] @@ -552,9 +583,10 @@ def _is_list_or_turple_(data): attrs['scale'] = list(map(float, scale_list)) elif isinstance(scale, list) or isinstance(scale, tuple): if len(scale) != len(x.shape) - 2: - raise ValueError("scale_shape length should be {} for " - "input {}-D tensor.".format( - len(x.shape) - 2, len(x.shape))) + raise ValueError( + "scale_shape length should be {} for " + "input {}-D tensor.".format(len(x.shape) - 2, len(x.shape)) + ) for value in scale: if value <= 0: raise ValueError("Attr(scale) should be greater than zero.") @@ -574,81 +606,116 @@ def _is_list_or_turple_(data): if resample_type == "linear": if in_dygraph_mode(): out = _C_ops.linear_interp( - x, inputs['OutSize'] if 'OutSize' in inputs else None, + x, + inputs['OutSize'] if 'OutSize' in inputs else None, inputs['SizeTensor'] if 'SizeTensor' in inputs else None, inputs['Scale'] if 'Scale' in inputs else None, - attrs['data_layout'], attrs['out_d'], attrs['out_h'], - attrs['out_w'], attrs['scale'] if 'scale' in attrs else [], - attrs['interp_method'], attrs['align_corners'], - attrs['align_mode']) + attrs['data_layout'], + attrs['out_d'], + attrs['out_h'], + attrs['out_w'], + attrs['scale'] if 'scale' in attrs else [], + attrs['interp_method'], + attrs['align_corners'], + attrs['align_mode'], + ) else: out = _legacy_C_ops.linear_interp_v2(x, *dy_attr) elif resample_type == "bilinear": if in_dygraph_mode(): out = _C_ops.bilinear_interp( - x, inputs['OutSize'] if 'OutSize' in inputs else None, + x, + inputs['OutSize'] if 'OutSize' in inputs else None, inputs['SizeTensor'] if 'SizeTensor' in inputs else None, inputs['Scale'] if 'Scale' in inputs else None, - attrs['data_layout'], attrs['out_d'], attrs['out_h'], - attrs['out_w'], attrs['scale'] if 'scale' in attrs else [], - attrs['interp_method'], attrs['align_corners'], - attrs['align_mode']) + attrs['data_layout'], + attrs['out_d'], + attrs['out_h'], + attrs['out_w'], + attrs['scale'] if 'scale' in attrs else [], + attrs['interp_method'], + attrs['align_corners'], + attrs['align_mode'], + ) else: out = _legacy_C_ops.bilinear_interp_v2(x, *dy_attr) elif resample_type == "trilinear": if in_dygraph_mode(): out = _C_ops.trilinear_interp( - x, inputs['OutSize'] if 'OutSize' in inputs else None, + x, + inputs['OutSize'] if 'OutSize' in inputs else None, inputs['SizeTensor'] if 'SizeTensor' in inputs else None, inputs['Scale'] if 'Scale' in inputs else None, - attrs['data_layout'], attrs['out_d'], attrs['out_h'], - attrs['out_w'], attrs['scale'] if 'scale' in attrs else [], - attrs['interp_method'], attrs['align_corners'], - attrs['align_mode']) + attrs['data_layout'], + attrs['out_d'], + attrs['out_h'], + attrs['out_w'], + attrs['scale'] if 'scale' in attrs else [], + attrs['interp_method'], + attrs['align_corners'], + attrs['align_mode'], + ) else: out = _legacy_C_ops.trilinear_interp_v2(x, *dy_attr) elif resample_type == "nearest": if in_dygraph_mode(): out = _C_ops.nearest_interp( - x, inputs['OutSize'] if 'OutSize' in inputs else None, + x, + inputs['OutSize'] if 'OutSize' in inputs else None, inputs['SizeTensor'] if 'SizeTensor' in inputs else None, inputs['Scale'] if 'Scale' in inputs else None, - attrs['data_layout'], attrs['out_d'], attrs['out_h'], - attrs['out_w'], attrs['scale'] if 'scale' in attrs else [], - attrs['interp_method'], attrs['align_corners'], - attrs['align_mode']) + attrs['data_layout'], + attrs['out_d'], + attrs['out_h'], + attrs['out_w'], + attrs['scale'] if 'scale' in 
attrs else [], + attrs['interp_method'], + attrs['align_corners'], + attrs['align_mode'], + ) else: out = _legacy_C_ops.nearest_interp_v2(x, *dy_attr) elif resample_type == "bicubic": if in_dygraph_mode(): out = _C_ops.bicubic_interp( - x, inputs['OutSize'] if 'OutSize' in inputs else None, + x, + inputs['OutSize'] if 'OutSize' in inputs else None, inputs['SizeTensor'] if 'SizeTensor' in inputs else None, inputs['Scale'] if 'Scale' in inputs else None, - attrs['data_layout'], attrs['out_d'], attrs['out_h'], - attrs['out_w'], attrs['scale'] if 'scale' in attrs else [], - attrs['interp_method'], attrs['align_corners'], - attrs['align_mode']) + attrs['data_layout'], + attrs['out_d'], + attrs['out_h'], + attrs['out_w'], + attrs['scale'] if 'scale' in attrs else [], + attrs['interp_method'], + attrs['align_corners'], + attrs['align_mode'], + ) else: out = _legacy_C_ops.bicubic_interp_v2(x, *dy_attr) return out out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='{}_interp_v2'.format(resample_type), - inputs=inputs, - outputs={"Out": out}, - attrs=attrs) + helper.append_op( + type='{}_interp_v2'.format(resample_type), + inputs=inputs, + outputs={"Out": out}, + attrs=attrs, + ) return out -def upsample(x, - size=None, - scale_factor=None, - mode='nearest', - align_corners=False, - align_mode=0, - data_format='NCHW', - name=None): +def upsample( + x, + size=None, + scale_factor=None, + mode='nearest', + align_corners=False, + align_mode=0, + data_format='NCHW', + name=None, +): """ + This API resizes a batch of images. The input must be a 3-D Tensor of the shape (num_batches, channels, in_w) @@ -659,14 +726,15 @@ def upsample(x, and the resizing only applies on the three dimensions(depth, height and width). Supporting resample methods: - 'linear' : Linear interpolation - 'bilinear' : Bilinear interpolation - 'trilinear' : Trilinear interpolation - 'nearest' : Nearest neighbor interpolation - 'bicubic' : Bicubic interpolation - Linear interpolation is the method of using a line connecting two known quantities - to determine the value of an unknown quantity between the two known quantities. - + - 'linear' : Linear interpolation + - 'bilinear' : Bilinear interpolation + - 'trilinear' : Trilinear interpolation + - 'nearest' : Nearest neighbor interpolation + - 'bicubic' : Bicubic interpolation + + Linear interpolation is the method of using a line connecting two known quantities + to determine the value of an unknown quantity between the two known quantities. + Nearest neighbor interpolation is to perform nearest neighbor interpolation in both the 3rd dimension(in height direction) and the 4th dimension(in width direction) on input tensor. @@ -675,7 +743,7 @@ def upsample(x, W-direction in this op) on a rectilinear 2D grid. The key idea is to perform linear interpolation first in one direction, and then again in the other direction. - + Bicubic interpolation is an extension of cubic interpolation for interpolating data points on a two-dimensional regular grid. The interpolated surface is smoother than corresponding surfaces obtained by bilinear interpolation or @@ -696,101 +764,102 @@ def upsample(x, `paddle.nn.functional.adaptive_avg_pool2d` or `paddle.nn.functional.adaptive_avg_pool3d`. Example: - .. code-block:: text - - For scale_factor: - if align_corners = True && out_size > 1 : - scale_factor = (in_size-1.0)/(out_size-1.0) + .. 
code-block:: text + + For scale_factor: + if align_corners = True && out_size > 1 : + scale_factor = (in_size-1.0)/(out_size-1.0) + else: + scale_factor = float(in_size/out_size) + Linear interpolation: + if: + align_corners = False , align_mode = 0 + input : (N,C,W_in) + output: (N,C,W_out) where: + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,W_in) + output: (N,C,W_out) where: + W_out = W_{in} * scale_{factor} + Nearest neighbor interpolation: + if: + align_corners = False + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = floor (H_{in} * scale_{factor}) + W_out = floor (W_{in} * scale_{factor}) else: - scale_factor = float(in_size/out_size) - Linear interpolation: + align_corners = True + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = round(H_{in} * scale_{factor}) + W_out = round(W_{in} * scale_{factor}) + + Bilinear interpolation: if: align_corners = False , align_mode = 0 - input : (N,C,W_in) - output: (N,C,W_out) where: + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 W_out = (W_{in}+0.5) * scale_{factor} - 0.5 else: - input : (N,C,W_in) - output: (N,C,W_out) where: + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = H_{in} * scale_{factor} W_out = W_{in} * scale_{factor} - Nearest neighbor interpolation: - if: - align_corners = False - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = floor (H_{in} * scale_{factor}) - W_out = floor (W_{in} * scale_{factor}) - else: - align_corners = True - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = round(H_{in} * scale_{factor}) - W_out = round(W_{in} * scale_{factor}) - - Bilinear interpolation: - if: - align_corners = False , align_mode = 0 - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = (H_{in}+0.5) * scale_{factor} - 0.5 - W_out = (W_{in}+0.5) * scale_{factor} - 0.5 - else: - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = H_{in} * scale_{factor} - W_out = W_{in} * scale_{factor} - Bicubic interpolation: - if: - align_corners = False - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = (H_{in}+0.5) * scale_{factor} - 0.5 - W_out = (W_{in}+0.5) * scale_{factor} - 0.5 - else: - input : (N,C,H_in,W_in) - output: (N,C,H_out,W_out) where: - H_out = H_{in} * scale_{factor} - W_out = W_{in} * scale_{factor} - Trilinear interpolation: - if: - align_corners = False , align_mode = 0 - input : (N,C,D_in,H_in,W_in) - output: (N,C,D_out,H_out,W_out) where: - D_out = (D_{in}+0.5) * scale_{factor} - 0.5 - H_out = (H_{in}+0.5) * scale_{factor} - 0.5 - W_out = (W_{in}+0.5) * scale_{factor} - 0.5 - else: - input : (N,C,D_in,H_in,W_in) - output: (N,C,D_out,H_out,W_out) where: - D_out = D_{in} * scale_{factor} - H_out = H_{in} * scale_{factor} - W_out = W_{in} * scale_{factor} - https://en.wikipedia.org/wiki/Linear_interpolation. 
+ Bicubic interpolation: + if: + align_corners = False + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,H_in,W_in) + output: (N,C,H_out,W_out) where: + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + Trilinear interpolation: + if: + align_corners = False , align_mode = 0 + input : (N,C,D_in,H_in,W_in) + output: (N,C,D_out,H_out,W_out) where: + D_out = (D_{in}+0.5) * scale_{factor} - 0.5 + H_out = (H_{in}+0.5) * scale_{factor} - 0.5 + W_out = (W_{in}+0.5) * scale_{factor} - 0.5 + else: + input : (N,C,D_in,H_in,W_in) + output: (N,C,D_out,H_out,W_out) where: + D_out = D_{in} * scale_{factor} + H_out = H_{in} * scale_{factor} + W_out = W_{in} * scale_{factor} + For details of linear interpolation, please refer to Wikipedia: - + https://en.wikipedia.org/wiki/Linear_interpolation. + For details of nearest neighbor interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Nearest-neighbor_interpolation. - + For details of bilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation. - + For details of bicubic interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Bicubic_interpolation - + For details of trilinear interpolation, please refer to Wikipedia: https://en.wikipedia.org/wiki/Trilinear_interpolation. - + Parameters: x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. size (list|tuple|Tensor|None, optional): Output shape of image resize - layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) - when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. + layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) + when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1]. If a Tensor , its dimensions size should be a 1. scale_factor (float|Tensor|list|tuple|None, optional): The multiplier for the input height or width. At least one of :attr:`size` or :attr:`scale_factor` must be set. - And :attr:`size` has a higher priority than :attr:`scale_factor`.Has to match input size if + And :attr:`size` has a higher priority than :attr:`scale_factor`.Has to match input size if it is either a list or a tuple or a Tensor. Default: None. mode (str, optional): The resample method. It supports 'linear', 'nearest', 'bilinear', @@ -810,27 +879,29 @@ def upsample(x, name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + Returns: A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels), A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). - Examples: + Examples: .. 
code-block:: python - - import paddle - import paddle.nn as nn - input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) - upsample_out = paddle.nn.Upsample(size=[12,12]) + import paddle + import paddle.nn as nn + + input_data = paddle.randn(shape=(2,3,6,10)).astype(paddle.float32) + upsample_out = paddle.nn.Upsample(size=[12,12]) - output = upsample_out(x=input_data) - print(output.shape) - # [2L, 3L, 12L, 12L] + output = upsample_out(x=input_data) + print(output.shape) + # [2L, 3L, 12L, 12L] """ - return interpolate(x, size, scale_factor, mode, align_corners, align_mode, - data_format) + return interpolate( + x, size, scale_factor, mode, align_corners, align_mode, data_format + ) def bilinear(x1, x2, weight, bias=None, name=None): @@ -853,17 +924,17 @@ def bilinear(x1, x2, weight, bias=None, name=None): Examples: .. code-block:: python - import paddle - import paddle.nn.functional as F + import paddle + import paddle.nn.functional as F - x1 = paddle.randn((5, 5)).astype(paddle.float32) - x2 = paddle.randn((5, 4)).astype(paddle.float32) - w = paddle.randn((1000, 5, 4)).astype(paddle.float32) - b = paddle.randn((1, 1000)).astype(paddle.float32) + x1 = paddle.randn((5, 5)).astype(paddle.float32) + x2 = paddle.randn((5, 4)).astype(paddle.float32) + w = paddle.randn((1000, 5, 4)).astype(paddle.float32) + b = paddle.randn((1, 1000)).astype(paddle.float32) - result = F.bilinear(x1, x2, w, b) - print(result.shape) - # [5, 1000] + result = F.bilinear(x1, x2, w, b) + print(result.shape) + # [5, 1000] """ if in_dygraph_mode(): @@ -881,19 +952,16 @@ def bilinear(x1, x2, weight, bias=None, name=None): helper = LayerHelper("bilinear", **locals()) out = helper.create_variable_for_type_inference(dtype=x1.dtype) - helper.append_op(type="bilinear_tensor_product", - inputs=inputs, - outputs={"Out": out}) + helper.append_op( + type="bilinear_tensor_product", inputs=inputs, outputs={"Out": out} + ) return out -def dropout(x, - p=0.5, - axis=None, - training=True, - mode="upscale_in_train", - name=None): +def dropout( + x, p=0.5, axis=None, training=True, mode="upscale_in_train", name=None +): """ Dropout is a regularization technique for reducing overfitting by preventing neuron co-adaption during training. The dropout operator randomly sets the @@ -907,15 +975,16 @@ def dropout(x, training (bool, optional): A flag indicating whether it is in train phrase or not. Default True. mode(str, optional): ['upscale_in_train'(default) | 'downscale_in_infer']. - 1. upscale_in_train(default), upscale the output at training time + 1. upscale_in_train(default), upscale the output at training time + + - train: out = input * mask / ( 1.0 - dropout_prob ) + - inference: out = input - - train: out = input * mask / ( 1.0 - dropout_prob ) - - inference: out = input + 2. downscale_in_infer, downscale the output at inference - 2. downscale_in_infer, downscale the output at inference + - train: out = input * mask + - inference: out = input * (1.0 - dropout_prob) - - train: out = input * mask - - inference: out = input * (1.0 - dropout_prob) name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -1005,38 +1074,38 @@ def dropout(x, .. 
code-block:: python - import paddle - - x = paddle.to_tensor([[1,2,3], [4,5,6]]).astype(paddle.float32) - y_train = paddle.nn.functional.dropout(x, 0.5) - y_test = paddle.nn.functional.dropout(x, 0.5, training=False) - y_0 = paddle.nn.functional.dropout(x, axis=0) - y_1 = paddle.nn.functional.dropout(x, axis=1) - y_01 = paddle.nn.functional.dropout(x, axis=[0,1]) - print(x) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[1., 2., 3.], - # [4., 5., 6.]]) - print(y_train) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[2. , 0. , 6. ], - # [8. , 0. , 12.]]) - print(y_test) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[1., 2., 3.], - # [4., 5., 6.]]) - print(y_0) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[0. , 0. , 0. ], - # [8. , 10., 12.]]) - print(y_1) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[2. , 0. , 6. ], - # [8. , 0. , 12.]]) - print(y_01) - # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[0. , 0. , 0. ], - # [8. , 0. , 12.]]) + import paddle + + x = paddle.to_tensor([[1,2,3], [4,5,6]]).astype(paddle.float32) + y_train = paddle.nn.functional.dropout(x, 0.5) + y_test = paddle.nn.functional.dropout(x, 0.5, training=False) + y_0 = paddle.nn.functional.dropout(x, axis=0) + y_1 = paddle.nn.functional.dropout(x, axis=1) + y_01 = paddle.nn.functional.dropout(x, axis=[0,1]) + print(x) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[1., 2., 3.], + # [4., 5., 6.]]) + print(y_train) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[2. , 0. , 6. ], + # [8. , 0. , 12.]]) + print(y_test) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[1., 2., 3.], + # [4., 5., 6.]]) + print(y_0) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[0. , 0. , 0. ], + # [8. , 10., 12.]]) + print(y_1) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[2. , 0. , 6. ], + # [8. , 0. , 12.]]) + print(y_01) + # Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[0. , 0. , 0. ], + # [8. , 0. 
, 12.]]) """ if not isinstance(p, (float, int, Variable)): @@ -1044,7 +1113,8 @@ def dropout(x, if isinstance(p, (int, float)): # fast return for p == 0 - if p == 0: return x + if p == 0: + return x elif p < 0 or p > 1: raise ValueError("p argument should between 0 and 1") if mode not in ('downscale_in_infer', 'upscale_in_train'): @@ -1056,41 +1126,63 @@ def dropout(x, if axis == None: # commonly used dropout seed = None - mode = 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode #semantic transfer + mode = ( + 'downgrade_in_infer' if mode == 'downscale_in_infer' else mode + ) # semantic transfer if _non_static_mode(): if default_main_program().random_seed != 0: seed = default_main_program().random_seed if in_dygraph_mode(): - out, mask = _C_ops.dropout( x, None, p, not training, mode, \ - seed if seed is not None else 0, seed is not None) + out, mask = _C_ops.dropout( + x, + None, + p, + not training, + mode, + seed if seed is not None else 0, + seed is not None, + ) return out - out, mask = _legacy_C_ops.dropout(x, 'dropout_prob', p, 'is_test', - not training, 'fix_seed', seed - is not None, 'seed', - seed if seed is not None else 0, - 'dropout_implementation', mode) + out, mask = _legacy_C_ops.dropout( + x, + 'dropout_prob', + p, + 'is_test', + not training, + 'fix_seed', + seed is not None, + 'seed', + seed if seed is not None else 0, + 'dropout_implementation', + mode, + ) return out helper = LayerHelper('dropout', **locals()) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'dropout') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'dropout' + ) out = helper.create_variable_for_type_inference(dtype=x.dtype) mask = helper.create_variable_for_type_inference( - dtype=core.VarDesc.VarType.UINT8, stop_gradient=True) + dtype=core.VarDesc.VarType.UINT8, stop_gradient=True + ) def get_attrs(prog, dropout_prob, is_test, seed): if (seed is None or seed == 0) and prog.random_seed != 0: seed = prog.random_seed - if isinstance(dropout_prob, - Variable) and not dropout_prob.shape != [1]: + if isinstance( + dropout_prob, Variable + ) and not dropout_prob.shape != [1]: raise TypeError( - "Required p.shape == [1] if type(p) is Variable, but received p.shape = {}" - .format(p.shape)) + "Required p.shape == [1] if type(p) is Variable, but received p.shape = {}".format( + p.shape + ) + ) attrs = { 'dropout_prob': dropout_prob, 'is_test': is_test, @@ -1102,38 +1194,45 @@ def get_attrs(prog, dropout_prob, is_test, seed): attrs = get_attrs(helper.main_program, p, not training, seed) - helper.append_op(type='dropout', - inputs={'X': [x]}, - outputs={ - 'Out': [out], - 'Mask': [mask] - }, - attrs=attrs) + helper.append_op( + type='dropout', + inputs={'X': [x]}, + outputs={'Out': [out], 'Mask': [mask]}, + attrs=attrs, + ) return out - else: #sometimes called dropout_nd #TODO: optimize with c++ + else: # sometimes called dropout_nd #TODO: optimize with c++ if not in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'dropout') dtype = x.dtype keep_prob = 1 - p if training: - if p == 1.: - return paddle.scale(x, scale=0.) 
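            # Why the two modes differ (see the mode descriptions above):
            # with keep_prob = 1 - p, 'upscale_in_train' rescales the kept
            # activations by 1/keep_prob during training so their expected
            # value matches the untouched inference output, while
            # 'downscale_in_infer' leaves training output unscaled and
            # multiplies by keep_prob at inference instead. The p == 1 fast
            # path simply returns an all-zero tensor of the same shape.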
+ if in_dynamic_mode() and p == 1.0: + return paddle.scale(x, scale=0.0) - scale_input = paddle.scale( - x, scale=1 / keep_prob) if mode == 'upscale_in_train' else x + scale_input = ( + paddle.scale(x, scale=1 / keep_prob) + if mode == 'upscale_in_train' + else x + ) - #get mask shape + # get mask shape input_shape = x.shape if not in_dynamic_mode(): input_shape_tensor = paddle.shape(x) drop_axes = [axis] if isinstance(axis, int) else list(axis) if min(drop_axes) < 0 or max(drop_axes) > len(input_shape) - 1: - raise ValueError("axis value should be greater than or equal to 0 and less than dimensions of x:{}, but get axis value:{} " \ - .format(len(input_shape), max(drop_axes))) + raise ValueError( + "axis value should be greater than or equal to 0 and less than dimensions of x:{}, but get axis value:{} ".format( + len(input_shape), max(drop_axes) + ) + ) if len(drop_axes) > len(input_shape): raise ValueError( - "length of axis should not be greater than dimensions of x:{}, but get length of axis: {}" - .format(len(input_shape), len(drop_axes))) + "length of axis should not be greater than dimensions of x:{}, but get length of axis: {}".format( + len(input_shape), len(drop_axes) + ) + ) mask_shape = [1] * len(input_shape) if not in_dynamic_mode(): for i in drop_axes: @@ -1142,11 +1241,10 @@ def get_attrs(prog, dropout_prob, is_test, seed): for i in drop_axes: mask_shape[i] = input_shape[i] - #get mask - random_tensor = paddle.uniform(mask_shape, - dtype='float32', - min=0., - max=1.0) + # get mask + random_tensor = paddle.uniform( + mask_shape, dtype='float32', min=0.0, max=1.0 + ) p = full(shape=[1], fill_value=p, dtype='float32') keep_mask = paddle.greater_equal(random_tensor, p) @@ -1155,8 +1253,11 @@ def get_attrs(prog, dropout_prob, is_test, seed): ret = paddle.multiply(scale_input, keep_mask, name=name) return ret else: # test - ret = paddle.scale( - x, scale=keep_prob) if mode == 'downscale_in_infer' else x + ret = ( + paddle.scale(x, scale=keep_prob) + if mode == 'downscale_in_infer' + else x + ) return ret @@ -1197,20 +1298,26 @@ def dropout2d(x, p=0.5, training=True, data_format='NCHW', name=None): """ input_shape = x.shape if len(input_shape) != 4: - raise ValueError("dimensions of x should be 4, but received {} != 4"\ - .format(len(input_shape))) + raise ValueError( + "dimensions of x should be 4, but received {} != 4".format( + len(input_shape) + ) + ) if data_format not in ["NCHW", "NHWC"]: raise ValueError( "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " - "Attr(data_format): %s." % str(data_format)) + "Attr(data_format): %s." % str(data_format) + ) - return dropout(x, - p=p, - axis=[0, 1] if data_format == 'NCHW' else [0, 3], - training=training, - mode="upscale_in_train", - name=name) + return dropout( + x, + p=p, + axis=[0, 1] if data_format == 'NCHW' else [0, 3], + training=training, + mode="upscale_in_train", + name=name, + ) def dropout3d(x, p=0.5, training=True, data_format='NCDHW', name=None): @@ -1236,33 +1343,39 @@ def dropout3d(x, p=0.5, training=True, data_format='NCDHW', name=None): Examples: .. 
code-block:: python - import paddle + import paddle - x = paddle.randn(shape=(2, 3, 4, 5, 6)).astype(paddle.float32) - y_train = paddle.nn.functional.dropout3d(x) #train - y_test = paddle.nn.functional.dropout3d(x, training=False) #test - print(x[0,0,:,:,:]) - print(y_train[0,0,:,:,:]) # may all 0 - print(y_test[0,0,:,:,:]) + x = paddle.randn(shape=(2, 3, 4, 5, 6)).astype(paddle.float32) + y_train = paddle.nn.functional.dropout3d(x) #train + y_test = paddle.nn.functional.dropout3d(x, training=False) #test + print(x[0,0,:,:,:]) + print(y_train[0,0,:,:,:]) # may all 0 + print(y_test[0,0,:,:,:]) """ input_shape = x.shape if len(input_shape) != 5: - raise ValueError("dimensions of x should be 5, but received {} != 5" \ - .format(len(input_shape))) + raise ValueError( + "dimensions of x should be 5, but received {} != 5".format( + len(input_shape) + ) + ) if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " - "Attr(data_format): %s." % str(data_format)) + "Attr(data_format): %s." % str(data_format) + ) - return dropout(x, - p=p, - axis=[0, 1] if data_format == 'NCDHW' else [0, 4], - training=training, - mode="upscale_in_train", - name=name) + return dropout( + x, + p=p, + axis=[0, 1] if data_format == 'NCDHW' else [0, 4], + training=training, + mode="upscale_in_train", + name=name, + ) def alpha_dropout(x, p=0.5, training=True, name=None): @@ -1284,19 +1397,19 @@ def alpha_dropout(x, p=0.5, training=True, name=None): Examples: .. code-block:: python - import paddle - - x = paddle.to_tensor([[-1, 1], [-1, 1]]).astype(paddle.float32) - y_train = paddle.nn.functional.alpha_dropout(x, 0.5) - y_test = paddle.nn.functional.alpha_dropout(x, 0.5, training=False) - print(y_train) - # Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[-0.10721093, -0.77919382], - # [-0.10721093, 1.66559887]]) (randomly) - print(y_test) - # Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, - # [[-1., 1.], - # [-1., 1.]]) + import paddle + + x = paddle.to_tensor([[-1, 1], [-1, 1]]).astype(paddle.float32) + y_train = paddle.nn.functional.alpha_dropout(x, 0.5) + y_test = paddle.nn.functional.alpha_dropout(x, 0.5, training=False) + print(y_train) + # Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[-0.10721093, -0.77919382], + # [-0.10721093, 1.66559887]]) (randomly) + print(y_test) + # Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[-1., 1.], + # [-1., 1.]]) """ if not isinstance(p, (float, int)): raise TypeError("p argument should be a float or int") @@ -1304,37 +1417,40 @@ def alpha_dropout(x, p=0.5, training=True, name=None): raise ValueError("p argument should between 0 and 1") if not in_dynamic_mode(): - check_variable_and_dtype(x, 'x', ['float32', 'float64'], - 'alpha_dropout') + check_variable_and_dtype( + x, 'x', ['float32', 'float64'], 'alpha_dropout' + ) if training: if p == 1: - return paddle.scale(x, scale=0.) 
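        # The constants below come from the SELU activation: dropped units are
        # set to alpha_p = -alpha * scale rather than 0, and the affine
        # correction y -> a * y + b (with a and b computed from p and alpha_p
        # in the lines that follow) keeps the output mean and variance close
        # to those of the input, which is what distinguishes alpha dropout
        # from standard dropout.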
- #get transformation params + return paddle.scale(x, scale=0.0) + # get transformation params alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 alpha_p = -alpha * scale - a = ((1 - p) * (1 + p * alpha_p**2))**-0.5 + a = ((1 - p) * (1 + p * alpha_p**2)) ** -0.5 b = -a * alpha_p * p dtype = x.dtype input_shape = x.shape - #get mask - random_tensor = paddle.uniform(input_shape, - dtype='float32', - min=0., - max=1.0) + # get mask + random_tensor = paddle.uniform( + input_shape, dtype='float32', min=0.0, max=1.0 + ) p = full(shape=[1], fill_value=p, dtype='float32') keep_mask = paddle.greater_equal(random_tensor, p) keep_mask = paddle.cast(keep_mask, dtype) drop_mask = paddle.subtract( - full(shape=input_shape, fill_value=1., dtype=dtype), keep_mask) + full(shape=input_shape, fill_value=1.0, dtype=dtype), keep_mask + ) - #apply mask + # apply mask b = full(shape=[1], fill_value=b, dtype=dtype) - y = paddle.add(paddle.multiply(x, keep_mask), - paddle.scale(drop_mask, scale=alpha_p)) + y = paddle.add( + paddle.multiply(x, keep_mask), + paddle.scale(drop_mask, scale=alpha_p), + ) res = paddle.add(paddle.scale(y, scale=a), b, name=name) return res else: # test @@ -1353,11 +1469,11 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): Parameters: x (Tensor): The input tensor with data type float32/double/int32/int64_t. pad (Tensor|list[int]|tuple[int]): The padding size with data type int. - If mode is 'constant' and length of pad is twice as length of x dimension, then x will + If mode is 'constant' and length of pad is twice as length of x dimension, then x will be padded from the first dimension to the last dimension. Else: 1. If input dimension is 3, then the pad has the form (pad_left, - pad_right). 2. If the input dimension is 4, then the pad has the form (pad_left, pad_right, - pad_top, pad_bottom). 3. If the input dimension is 5, then the pad has the form + pad_right). 2. If the input dimension is 4, then the pad has the form (pad_left, pad_right, + pad_top, pad_bottom). 3. If the input dimension is 5, then the pad has the form (pad_left, pad_right, pad_top, pad_bottom, pad_front, pad_back). mode (str, optional): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'. Default is 'constant' @@ -1370,12 +1486,12 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): data_format (str, optional): An string from: "NCL", "NLC", NHWC", "NCHW", "NCDHW", "NDHWC". Specify the data format of the input data. Default is "NCHW", name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - - Returns: + + Returns: Tensor, a Tensor padded according to pad and mode and data type is same as input. Example: - + .. code-block:: text x = [[[[[1., 2., 3.], @@ -1428,21 +1544,21 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): import paddle import paddle.nn.functional as F - + # example 1 x_shape = (1, 1, 3) x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1 y = F.pad(x, [0, 0, 0, 0, 2, 3], value=1, mode='constant', data_format="NCL") print(y) # [[[1. 1. 1. 2. 3. 1. 1. 1.]]] - + # example 2 x_shape = (1, 1, 3) x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1 y = F.pad(x, [2, 3], value=1, mode='constant', data_format="NCL") print(y) # [[[1. 1. 1. 2. 3. 1. 1. 
1.]]] - + # example 3 x_shape = (1, 1, 2, 3) x = paddle.arange(paddle.prod(paddle.to_tensor(x_shape)), dtype="float32").reshape(x_shape) + 1 @@ -1453,18 +1569,28 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): # [6. 4. 5. 6. 4. 5.] # [3. 1. 2. 3. 1. 2.]]]] """ - assert mode in ['reflect', 'replicate', 'constant', 'circular'], \ - "mode should be one of constant, reflect, replicate, circular, but got {}.".format(mode) + assert mode in [ + 'reflect', + 'replicate', + 'constant', + 'circular', + ], "mode should be one of constant, reflect, replicate, circular, but got {}.".format( + mode + ) data_format = data_format.upper() - assert data_format in ["NCL", "NCHW", "NCDHW", "NLC", "NHWC", "NDHWC"], \ - "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], " \ + assert data_format in ["NCL", "NCHW", "NCDHW", "NLC", "NHWC", "NDHWC"], ( + "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], " "but got {}".format(data_format) + ) x_dim = len(x.shape) - if mode == "constant" and isinstance( - pad, (list, tuple)) and len(pad) == x_dim * 2: + if ( + mode == "constant" + and isinstance(pad, (list, tuple)) + and len(pad) == x_dim * 2 + ): paddings = pad pad_value = value @@ -1472,10 +1598,20 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): out = _C_ops.pad(x, paddings, float(pad_value)) return out - check_variable_and_dtype(x, 'x', [ - 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', - 'complex128' - ], "pad") + check_variable_and_dtype( + x, + 'x', + [ + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + "pad", + ) check_type(pad_value, 'pad_value', (float, int, Variable), 'pad') if isinstance(pad_value, int): @@ -1484,17 +1620,18 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): helper = LayerHelper('pad', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='pad', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'paddings': paddings, - 'pad_value': pad_value - }) + helper.append_op( + type='pad', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'paddings': paddings, 'pad_value': pad_value}, + ) return out assert x_dim in [ - 3, 4, 5 + 3, + 4, + 5, ], "input tesor dimension must be in [3, 4, 5] but got {}".format(x_dim) supported_format_map = { @@ -1502,9 +1639,11 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): 4: ["NCHW", "NHWC"], 5: ["NCDHW", "NDHWC"], } - assert data_format in supported_format_map[x_dim], \ - "input tensor dimension is {}, it's data format should be in {} but got {}".format( - x_dim, supported_format_map[x_dim], data_format) + assert ( + data_format in supported_format_map[x_dim] + ), "input tensor dimension is {}, it's data format should be in {} but got {}".format( + x_dim, supported_format_map[x_dim], data_format + ) unsqueezed_dim = [] @@ -1512,21 +1651,21 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): if data_format in ["NCL", "NCHW", "NCDHW"]: data_format = "NCDHW" if x_dim == 3: - pad = concat([zeros((4, ), dtype="int32"), pad], axis=0) + pad = concat([zeros((4,), dtype="int32"), pad], axis=0) unsqueezed_dim = [3, 4] x = unsqueeze(x, axis=unsqueezed_dim) elif x_dim == 4: - pad = concat([pad, zeros((2, ), dtype="int32")], axis=0) + pad = concat([pad, zeros((2,), dtype="int32")], axis=0) unsqueezed_dim = [2] x = unsqueeze(x, axis=unsqueezed_dim) 
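    # Both layout branches lift 3-D/4-D inputs to 5-D (extending `pad` with
    # zeros for the extra dimensions) so that a single pad3d kernel can serve
    # the "NCL"/"NCHW"/"NCDHW" and "NLC"/"NHWC"/"NDHWC" layouts; the
    # unsqueezed dimensions are squeezed back out after the op runs.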
elif data_format in ["NLC", "NHWC", "NDHWC"]: data_format = "NDHWC" if x_dim == 3: - pad = concat([zeros((4, ), dtype="int32"), pad], axis=0) + pad = concat([zeros((4,), dtype="int32"), pad], axis=0) unsqueezed_dim = [2, 3] x = unsqueeze(x, axis=unsqueezed_dim) elif x_dim == 4: - pad = concat([pad, zeros((2, ), dtype="int32")], axis=0) + pad = concat([pad, zeros((2,), dtype="int32")], axis=0) unsqueezed_dim = [1] x = unsqueeze(x, axis=unsqueezed_dim) else: @@ -1560,9 +1699,19 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): if _in_legacy_dygraph(): if isinstance(pad, Variable): pad = pad.numpy().tolist() - out = _legacy_C_ops.pad3d(x, "paddings", pad, "mode", mode, "value", - value, "data_format", data_format, "name", - name) + out = _legacy_C_ops.pad3d( + x, + "paddings", + pad, + "mode", + mode, + "value", + value, + "data_format", + data_format, + "name", + name, + ) else: attrs = {'mode': mode, 'value': value, 'data_format': data_format} inputs = {'X': [x]} @@ -1576,10 +1725,9 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): dtype = helper.input_dtype(input_param_name='input') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='pad3d', - inputs=inputs, - outputs={"Out": out}, - attrs=attrs) + helper.append_op( + type='pad3d', inputs=inputs, outputs={"Out": out}, attrs=attrs + ) if len(unsqueezed_dim) != 0: out = squeeze(out, axis=unsqueezed_dim) @@ -1601,7 +1749,7 @@ def zeropad2d(x, padding, data_format="NCHW", name=None): name(str, optional): The default value is None. Normally there is no need for user to set this property. - Returns: + Returns: Tensor, padded with 0 according to pad and data type is same as input. Examples: @@ -1620,12 +1768,14 @@ def zeropad2d(x, padding, data_format="NCHW", name=None): # [0. 0. 0. 0. 0. 0.]]]] """ - return pad(x, - pad=padding, - mode='constant', - value=0, - data_format=data_format, - name=name) + return pad( + x, + pad=padding, + mode='constant', + value=0, + data_format=data_format, + name=name, + ) def cosine_similarity(x1, x2, axis=1, eps=1e-8): @@ -1637,8 +1787,8 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): x2 (Tensor): Second input. float32/double. axis (int, optional): Dimension of vectors to compute cosine similarity. Default is 1. eps(float, optional): Small value to avoid division by zero. Default is 1e-8. - - Returns: + + Returns: Tensor, a Tensor representing cosine similarity between x1 and x2 along axis. Examples: @@ -1670,7 +1820,7 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): result = paddle.nn.functional.cosine_similarity(x1, x2, axis=0) print(result) # [0.97689527, 0.99996042, -0.55138415] - + """ w12 = sum(paddle.multiply(x1, x2), axis=axis) w1 = sum(paddle.multiply(x1, x1), axis=axis) @@ -1696,7 +1846,7 @@ def linear(x, weight, bias=None, name=None): input should be a multi-dimensional tensor of shape :math:`[batch\_size, *, in\_features]` , where :math:`*` means any number of additional dimensions. The linear operator multiplies input tensor with - weight and produces an output tensor of shape :math:`[batch\_size, *, out\_features]` , + weight and produces an output tensor of shape :math:`[batch\_size, *, out\_features]` , If :math:`bias` is not None, the bias should be a 1-D tensor of shape :math:`[out\_features]` and will be added to the output. @@ -1714,9 +1864,9 @@ def linear(x, weight, bias=None, name=None): Examples: .. 
code-block:: python - + import paddle - + x = paddle.randn((3, 2), dtype="float32") # x: [[-0.32342386 -1.200079 ] # [ 0.7979031 -0.90978354] @@ -1732,12 +1882,13 @@ def linear(x, weight, bias=None, name=None): # [2.1077576 2.1077576 2.1077576 2.1077576 ]] """ if in_dygraph_mode(): - #TODO(jiabin): using addmm for fast forward route + # TODO(jiabin): using addmm for fast forward route return _C_ops.linear(x, weight, bias) else: if _in_legacy_dygraph(): - pre_bias = _legacy_C_ops.matmul_v2(x, weight, 'trans_x', False, - 'trans_y', False) + pre_bias = _legacy_C_ops.matmul_v2( + x, weight, 'trans_x', False, 'trans_y', False + ) if bias is None: return pre_bias @@ -1747,27 +1898,30 @@ def linear(x, weight, bias=None, name=None): helper = LayerHelper('linear', **locals()) dtype = x.dtype - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'linear') - check_dtype(dtype, 'dtype', ['float16', 'float32', 'float64'], - 'linear') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'linear' + ) + check_dtype( + dtype, 'dtype', ['float16', 'float32', 'float64'], 'linear' + ) inputs = {'X': [x], 'Y': [weight]} attrs = {'trans_x': False, 'trans_y': False} tmp = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='matmul_v2', - inputs=inputs, - outputs={'Out': tmp}, - attrs=attrs) + helper.append_op( + type='matmul_v2', + inputs=inputs, + outputs={'Out': tmp}, + attrs=attrs, + ) if bias is not None: res = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='elementwise_add', - inputs={ - 'X': [tmp], - 'Y': [bias] - }, - outputs={'Out': [res]}, - attrs={'axis': len(x.shape) - 1}) + helper.append_op( + type='elementwise_add', + inputs={'X': [tmp], 'Y': [bias]}, + outputs={'Out': [res]}, + attrs={'axis': len(x.shape) - 1}, + ) else: res = tmp return res @@ -1776,12 +1930,12 @@ def linear(x, weight, bias=None, name=None): def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): r""" Label smoothing is a mechanism to regularize the classifier layer and is called - label-smoothing regularization (LSR). + label-smoothing regularization (LSR).Label smoothing is proposed to encourage + the model to be less confident, since optimizing the log-likelihood of the + correct label directly may cause overfitting and reduce the ability of the + model to adapt. - Label smoothing is proposed to encourage the model to be less confident, - since optimizing the log-likelihood of the correct label directly may - cause overfitting and reduce the ability of the model to adapt. Label - smoothing replaces the ground-truth label :math:`y` with the weighted sum + Label smoothing replaces the ground-truth label :math:`y` with the weighted sum of itself and some fixed distribution :math:`\mu`. For class :math:`k`, i.e. @@ -1818,61 +1972,62 @@ def label_smooth(label, prior_dist=None, epsilon=0.1, name=None): .. 
code-block:: python import paddle - import numpy as np - - x_data = np.array([[[0, 1, 0], - [ 1, 0, 1]]]).astype("float32") - print(x_data.shape) paddle.disable_static() - x = paddle.to_tensor(x_data, stop_gradient=False) + + x = paddle.to_tensor([[[0, 1, 0], + [ 1, 0, 1]]], dtype="float32", stop_gradient=False) + output = paddle.nn.functional.label_smooth(x) print(output) - - #[[[0.03333334 0.93333334 0.03333334] - # [0.93333334 0.03333334 0.93333334]]] + # Tensor(shape=[1, 2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[[0.03333334, 0.93333334, 0.03333334], + # [0.93333334, 0.03333334, 0.93333334]]]) """ - if epsilon > 1. or epsilon < 0.: + if epsilon > 1.0 or epsilon < 0.0: raise ValueError("The value of epsilon must be between 0 and 1.") if in_dygraph_mode(): return _C_ops.label_smooth(label, prior_dist, float(epsilon)) elif paddle.in_dynamic_mode(): - return _legacy_C_ops.label_smooth(label, prior_dist, 'epsilon', - float(epsilon)) + return _legacy_C_ops.label_smooth( + label, prior_dist, 'epsilon', float(epsilon) + ) - check_variable_and_dtype(label, 'label', ['float32', 'float64'], - 'label_smooth') + check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'label_smooth' + ) helper = LayerHelper("label_smooth", **locals()) label.stop_gradient = True smooth_label = helper.create_variable_for_type_inference(label.dtype) - helper.append_op(type="label_smooth", - inputs={ - "X": label, - "PriorDist": prior_dist - } if prior_dist else {"X": label}, - outputs={"Out": smooth_label}, - attrs={"epsilon": float(epsilon)}) + helper.append_op( + type="label_smooth", + inputs={"X": label, "PriorDist": prior_dist} + if prior_dist + else {"X": label}, + outputs={"Out": smooth_label}, + attrs={"epsilon": float(epsilon)}, + ) return smooth_label def class_center_sample(label, num_classes, num_samples, group=None): """ Class center sample method is proposed from the paper PartialFC that only sample a subset of the class centers. - The process of sampling subset class centers is straightforward: + The process of sampling subset class centers is straightforward: 1. First select the positive class centers; 2. Then randomly sample negative class centers. - Specifically, given a label tensor, shape [batch_size], select all the positive class centers and randomly + Specifically, given a label tensor, shape [batch_size], select all the positive class centers and randomly sample negative class centers, then remap the input label tensor using the sampled class centers. For more information, Partial FC: Training 10 Million Identities on a Single Machine arxiv: https://arxiv.org/abs/2010.05222 - + .. hint:: - If the number of the positive class centers is greater than the input num_samples, it keeps all the positive + If the number of the positive class centers is greater than the input num_samples, it keeps all the positive class centers and the shape of sampled_class_center will be [num_positive_class_centers]. The API supports CPU, single GPU and multi GPU. @@ -1886,7 +2041,7 @@ class centers and the shape of sampled_class_center will be [num_positive_class_ num_classes (int): A positive integer to specify the number of classes at local rank. Note that num_classes of each GPU can be different. num_samples (int): A positive integer to specify the number of class center to sample. 
- group (Group, optional): The group instance return by paddle.distributed.new_group + group (Group, optional): The group instance return by paddle.distributed.new_group or ``None`` for global default group or ``False`` for data parallel (do not communication cross ranks). Default is ``None``. @@ -1952,7 +2107,7 @@ class centers and the shape of sampled_class_center will be [num_positive_class_ # [6 , 11, 10, 7 , 4 , 8 , 12, 12, 11, 12, 13, 1 , 3 , 9 , 7 , 9 , 4 , 6 , 0 , 2 ]) #Tensor(shape=[6], dtype=int64, place=CUDAPlace(0), stop_gradient=True, # [0, 2, 4, 8, 9, 3]) - + # rank 1 output: #Tensor(shape=[20], dtype=int64, place=CUDAPlace(1), stop_gradient=True, # [10, 17, 15, 11, 9 , 12, 18, 18, 17, 18, 19, 2 , 8 , 13, 11, 13, 9 , 10, 0 , 4 ]) @@ -1964,7 +2119,10 @@ class centers and the shape of sampled_class_center will be [num_positive_class_ if not (group == False or group is None or hasattr(group, 'is_member')): raise ValueError( 'Expected group is False, None or instance of paddle.distributed.collective.Group \ - (got group: {})'.format(group)) + (got group: {})'.format( + group + ) + ) return if hasattr(group, 'is_member') and not group.is_member(): @@ -1977,76 +2135,112 @@ class centers and the shape of sampled_class_center will be [num_positive_class_ if core.is_compiled_with_dist(): parallel_env = paddle.distributed.ParallelEnv() global_rank = parallel_env.rank - rank = global_rank if group is None else group.get_group_rank( - global_rank) + rank = ( + global_rank + if group is None + else group.get_group_rank(global_rank) + ) nranks = parallel_env.world_size if group is None else group.nranks if num_samples > num_classes: raise ValueError( - 'Expected num_samples less than or equal to {}, got num_samples {}'. - format(num_classes, num_samples)) + 'Expected num_samples less than or equal to {}, got num_samples {}'.format( + num_classes, num_samples + ) + ) label_size = 1 for dim in list(label.shape): label_size *= dim if label_size != -1 and label_size < 1: - raise ValueError('Expected label_size > 0 \ - (got label_size: {})'.format(label_size)) + raise ValueError( + 'Expected label_size > 0 \ + (got label_size: {})'.format( + label_size + ) + ) label_dims = len(list(label.shape)) if label_dims != 1: - raise ValueError('Expected label_dims == 1 \ - (got label_dims: {})'.format(label_dims)) + raise ValueError( + 'Expected label_dims == 1 \ + (got label_dims: {})'.format( + label_dims + ) + ) seed = None if (seed is None or seed == 0) and default_main_program().random_seed != 0: seed = default_main_program().random_seed if in_dygraph_mode(): - return _C_ops.class_center_sample(label, num_classes, num_samples, - ring_id, rank, nranks, seed - is not None, - seed if seed is not None else 0) + return _C_ops.class_center_sample( + label, + num_classes, + num_samples, + ring_id, + rank, + nranks, + seed is not None, + seed if seed is not None else 0, + ) elif paddle.in_dynamic_mode(): - remapped_label, sampled_class_center = _legacy_C_ops.class_center_sample( - label, 'num_classes', num_classes, 'num_samples', num_samples, - 'ring_id', ring_id, 'nranks', nranks, 'rank', rank, 'fix_seed', seed - is not None, 'seed', seed if seed is not None else 0) + ( + remapped_label, + sampled_class_center, + ) = _legacy_C_ops.class_center_sample( + label, + 'num_classes', + num_classes, + 'num_samples', + num_samples, + 'ring_id', + ring_id, + 'nranks', + nranks, + 'rank', + rank, + 'fix_seed', + seed is not None, + 'seed', + seed if seed is not None else 0, + ) return remapped_label, 
sampled_class_center - check_variable_and_dtype(label, 'label', ['int64', 'int32'], - 'class_center_sample') + check_variable_and_dtype( + label, 'label', ['int64', 'int32'], 'class_center_sample' + ) op_type = 'class_center_sample' helper = LayerHelper(op_type, **locals()) remapped_label = helper.create_variable_for_type_inference( - dtype=label.dtype) + dtype=label.dtype + ) sampled_class_center = helper.create_variable_for_type_inference( - dtype=label.dtype) - helper.append_op(type=op_type, - inputs={'Label': label}, - outputs={ - 'RemappedLabel': remapped_label, - 'SampledLocalClassCenter': sampled_class_center - }, - attrs={ - 'num_classes': num_classes, - 'num_samples': num_samples, - 'ring_id': ring_id, - 'nranks': nranks, - 'rank': rank, - 'fix_seed': seed is not None, - 'seed': seed if seed is not None else 0 - }) + dtype=label.dtype + ) + helper.append_op( + type=op_type, + inputs={'Label': label}, + outputs={ + 'RemappedLabel': remapped_label, + 'SampledLocalClassCenter': sampled_class_center, + }, + attrs={ + 'num_classes': num_classes, + 'num_samples': num_samples, + 'ring_id': ring_id, + 'nranks': nranks, + 'rank': rank, + 'fix_seed': seed is not None, + 'seed': seed if seed is not None else 0, + }, + ) return remapped_label, sampled_class_center -def fold(x, - output_sizes, - kernel_sizes, - strides=1, - paddings=0, - dilations=1, - name=None): +def fold( + x, output_sizes, kernel_sizes, strides=1, paddings=0, dilations=1, name=None +): r""" Combines an array of sliding local blocks into a large containing @@ -2109,35 +2303,38 @@ def fold(x, check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'fold') - assert len(x.shape) == 3, \ - "input should be the format of [N, C, L]" + assert len(x.shape) == 3, "input should be the format of [N, C, L]" def _is_list_or_turple_(data): - return (isinstance(data, list) or isinstance(data, tuple)) + return isinstance(data, list) or isinstance(data, tuple) if isinstance(output_sizes, int): output_sizes = [output_sizes, output_sizes] else: - assert _is_list_or_turple_(output_sizes) and (len(output_sizes) == 2), \ - "output_sizes should either be an integer or a list/tuple of two integers" + assert _is_list_or_turple_(output_sizes) and ( + len(output_sizes) == 2 + ), "output_sizes should either be an integer or a list/tuple of two integers" if isinstance(kernel_sizes, int): kernel_sizes = [kernel_sizes, kernel_sizes] else: - assert _is_list_or_turple_(kernel_sizes) and (len(kernel_sizes) == 2), \ - "kernel_sizes should either be an integer or a list/tuple of two integers" + assert _is_list_or_turple_(kernel_sizes) and ( + len(kernel_sizes) == 2 + ), "kernel_sizes should either be an integer or a list/tuple of two integers" if isinstance(strides, int): strides = [strides, strides] else: - assert _is_list_or_turple_(strides) and (len(strides) == 2), \ - "strides should either be an integer or a list/tuple of two integers" + assert _is_list_or_turple_(strides) and ( + len(strides) == 2 + ), "strides should either be an integer or a list/tuple of two integers" if isinstance(dilations, int): dilations = [dilations, dilations] else: - assert _is_list_or_turple_(dilations) and (len(dilations) == 2), \ - "dilations should either be an integer or a list/tuple of two integers" + assert _is_list_or_turple_(dilations) and ( + len(dilations) == 2 + ), "dilations should either be an integer or a list/tuple of two integers" if isinstance(paddings, int): paddings = [paddings] * 4 @@ -2153,26 +2350,39 @@ def _is_list_or_turple_(data): else: raise 
ValueError( "Unexpected type of paddings, it should be either an integer or a list" - "of 2 or 4 integers") + "of 2 or 4 integers" + ) if in_dygraph_mode(): - out = _C_ops.fold(x, output_sizes, kernel_sizes, strides, paddings, - dilations) + out = _C_ops.fold( + x, output_sizes, kernel_sizes, strides, paddings, dilations + ) elif in_dynamic_mode(): - out = _legacy_C_ops.fold(x, "output_sizes", output_sizes, - "kernel_sizes", kernel_sizes, "strides", - strides, "paddings", paddings, "dilations", - dilations) + out = _legacy_C_ops.fold( + x, + "output_sizes", + output_sizes, + "kernel_sizes", + kernel_sizes, + "strides", + strides, + "paddings", + paddings, + "dilations", + dilations, + ) else: out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="fold", - inputs={"X": x}, - outputs={"Y": out}, - attrs={ - "output_sizes": output_sizes, - "kernel_sizes": kernel_sizes, - "strides": strides, - "paddings": paddings, - "dilations": dilations - }) + helper.append_op( + type="fold", + inputs={"X": x}, + outputs={"Y": out}, + attrs={ + "output_sizes": output_sizes, + "kernel_sizes": kernel_sizes, + "strides": strides, + "paddings": paddings, + "dilations": dilations, + }, + ) return out diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 06784f5d13c52a..d62ddc6a1ed588 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -17,7 +17,12 @@ from ...device import get_cudnn_version from ...static import Variable from ...fluid import dygraph_utils -from ...fluid.layers.utils import convert_to_list, _is_symmetric_padding, _contain_var, _convert_to_tensor_list +from ...fluid.layers.utils import ( + convert_to_list, + _is_symmetric_padding, + _contain_var, + _convert_to_tensor_list, +) from ...fluid.data_feeder import check_variable_and_dtype, check_dtype from ...framework import ParamAttr from ...fluid.layer_helper import LayerHelper @@ -62,8 +67,10 @@ def _update_padding_nd(padding, channel_last, num_dims): padding = padding.upper() if padding not in ["SAME", "VALID"]: raise ValueError( - "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.". - format(padding)) + "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".format( + padding + ) + ) if padding == "VALID": padding_algorithm = "VALID" padding = [0] * num_dims @@ -78,10 +85,12 @@ def _update_padding_nd(padding, channel_last, num_dims): if not _zero_padding_in_batch_and_channel(padding, channel_last): raise ValueError( "Non-zero padding({}) in the batch or channel dimensions " - "is not supported.".format(padding)) + "is not supported.".format(padding) + ) padding_algorithm = "EXPLICIT" padding = _exclude_padding_in_batch_and_channel( - padding, channel_last) + padding, channel_last + ) if _is_symmetric_padding(padding, num_dims): padding = padding[0::2] # for padding like [pad_before, pad_after, pad_before, pad_after, ...] 
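The symmetric-padding fast path in the hunk above collapses per-side
(pad_before, pad_after) pairs into one value per spatial dimension when all
pairs match. A minimal, self-contained sketch of that idea (hypothetical
helper name; not the actual `_is_symmetric_padding` implementation, whose
signature and checks differ):

.. code-block:: python

    def compress_symmetric_padding(padding):
        # padding is a flat list [before_0, after_0, before_1, after_1, ...];
        # if every (before, after) pair is equal, keep one value per dimension.
        pairs = list(zip(padding[0::2], padding[1::2]))
        if all(before == after for before, after in pairs):
            return [before for before, _ in pairs]
        return padding

    print(compress_symmetric_padding([1, 1, 2, 2]))  # [1, 2]
    print(compress_symmetric_padding([0, 1, 2, 2]))  # unchanged: [0, 1, 2, 2]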
@@ -102,44 +111,60 @@ def _update_padding_nd(padding, channel_last, num_dims): padding = convert_to_list(padding, num_dims, 'padding') if not all([p >= 0 for p in padding]): raise ValueError( - "Invalid padding, all value should be larger than or equal to 0, but received: {}" - .format(padding)) + "Invalid padding, all value should be larger than or equal to 0, but received: {}".format( + padding + ) + ) return padding, padding_algorithm -def _conv_nd(x, - weight, - bias=None, - stride=1, - padding=0, - padding_algorithm=None, - dilation=1, - groups=1, - data_format="NCHW", - channel_dim=1, - op_type="conv2d", - use_cudnn=True, - use_mkldnn=False, - name=None): +def _conv_nd( + x, + weight, + bias=None, + stride=1, + padding=0, + padding_algorithm=None, + dilation=1, + groups=1, + data_format="NCHW", + channel_dim=1, + op_type="conv2d", + use_cudnn=True, + use_mkldnn=False, + name=None, +): # Due to the poor performance of NHWC, we transpose the input to NCHW. if in_dygraph_mode() and op_type == "conv2d": - pre_bias = _C_ops.conv2d(x, weight, stride, padding, padding_algorithm, - groups, dilation, data_format, False, -1, - False) + pre_bias = _C_ops.conv2d( + x, + weight, + stride, + padding, + padding_algorithm, + groups, + dilation, + data_format, + False, + -1, + False, + ) if bias is not None: - channel_dim = channel_dim + len( - x.shape) if channel_dim < 0 else channel_dim - if pre_bias.layout == "NHWC": - channel_dim = 3 # last dim + channel_dim = ( + channel_dim + len(x.shape) if channel_dim < 0 else channel_dim + ) if isinstance(x, tuple): x = x[0] if isinstance(bias, tuple): bias = bias[0] if len(bias.shape) < len(x.shape): tmp_bias = _C_ops.reshape( - bias, bias.shape + - [1 for i in range(len(x.shape) - channel_dim - 1)]) + bias, + [1 for i in range(channel_dim)] + + bias.shape + + [1 for i in range(len(x.shape) - channel_dim - 1)], + ) return _C_ops.add(pre_bias, tmp_bias) else: return _C_ops.add(pre_bias, bias) @@ -147,40 +172,82 @@ def _conv_nd(x, return pre_bias if in_dygraph_mode() and op_type == "depthwise_conv2d": - pre_bias = _C_ops.depthwise_conv2d(x, weight, stride, padding, - padding_algorithm, groups, dilation, - data_format, False, -1, False, False, - use_cudnn) + pre_bias = _C_ops.depthwise_conv2d( + x, + weight, + stride, + padding, + padding_algorithm, + groups, + dilation, + data_format, + False, + -1, + False, + False, + use_cudnn, + ) if bias is not None: - channel_dim = channel_dim + len( - x.shape) if channel_dim < 0 else channel_dim + channel_dim = ( + channel_dim + len(x.shape) if channel_dim < 0 else channel_dim + ) tmp_bias = _C_ops.reshape( bias, - bias.shape + [1 for i in range(len(x.shape) - channel_dim - 1)]) + [1 for i in range(channel_dim)] + + bias.shape + + [1 for i in range(len(x.shape) - channel_dim - 1)], + ) return _C_ops.add(pre_bias, tmp_bias) else: return pre_bias if in_dygraph_mode() and op_type == "conv3d": - pre_bias = _C_ops.conv3d(x, weight, stride, padding, padding_algorithm, - groups, dilation, data_format, False, -1, - False) + pre_bias = _C_ops.conv3d( + x, + weight, + stride, + padding, + padding_algorithm, + groups, + dilation, + data_format, + False, + -1, + False, + ) if bias is not None: - channel_dim = channel_dim + len( - x.shape) if channel_dim < 0 else channel_dim + channel_dim = ( + channel_dim + len(x.shape) if channel_dim < 0 else channel_dim + ) tmp_bias = _C_ops.reshape( bias, - bias.shape + [1 for i in range(len(x.shape) - channel_dim - 1)]) + bias.shape + [1 for i in range(len(x.shape) - channel_dim - 1)], + ) 
return _C_ops.add(pre_bias, tmp_bias) else: return pre_bias if in_dynamic_mode(): - attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, - 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', - use_mkldnn, 'fuse_relu_before_depthwise_conv', False, - "padding_algorithm", padding_algorithm, "data_format", - data_format) + attrs = ( + 'strides', + stride, + 'paddings', + padding, + 'dilations', + dilation, + 'groups', + groups, + 'use_cudnn', + use_cudnn, + 'use_mkldnn', + use_mkldnn, + 'fuse_relu_before_depthwise_conv', + False, + "padding_algorithm", + padding_algorithm, + "data_format", + data_format, + ) pre_bias = getattr(_legacy_C_ops, op_type)(x, weight, *attrs) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) @@ -197,44 +264,42 @@ def _conv_nd(x, 'use_mkldnn': use_mkldnn, 'fuse_relu_before_depthwise_conv': False, "padding_algorithm": padding_algorithm, - "data_format": data_format + "data_format": data_format, } - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - op_type) + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], op_type + ) helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') pre_bias = helper.create_variable_for_type_inference(dtype) outputs = {"Output": [pre_bias]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) if bias is not None: out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='elementwise_add', - inputs={ - 'X': [pre_bias], - 'Y': [bias] - }, - outputs={'Out': [out]}, - attrs={ - 'axis': channel_dim, - 'use_mkldnn': use_mkldnn - }) + helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], 'Y': [bias]}, + outputs={'Out': [out]}, + attrs={'axis': channel_dim, 'use_mkldnn': use_mkldnn}, + ) else: out = pre_bias return out -def conv1d(x, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1, - data_format='NCL', - name=None): +def conv1d( + x, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + data_format='NCL', + name=None, +): r""" The convolution1D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input and @@ -281,10 +346,10 @@ def conv1d(x, L_{out} = \frac{(L_{in} + 2 * padding - (dilation * (L_f - 1) + 1))}{stride} + 1 Args: - x (Tensor): The input is 3-D Tensor with shape [N, C, L], the data type + x (Tensor): The input is 3-D Tensor with shape [N, C, L], the data type of input is float16 or float32 or float64. weight (Tensor): The convolution kernel with shape [M, C/g, K], where M is - the number of output channels, g is the number of groups, K is the kernel's size. + the number of output channels, g is the number of groups, K is the kernel's size. bias (Tensor, optional): The bias with shape [M,]. Default: None. stride (int|list|tuple, optional): The stride size. If stride is a list/tuple, it must contain one integers, (stride_size). Default: 1. @@ -302,55 +367,39 @@ def conv1d(x, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: 1. 
- data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCL"`, `"NLC"`. The default is `"NCL"`. When it is `"NCL"`, the data is stored in the order of: `[batch_size, input_channels, feature_length]`. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: - A tensor representing the conv1d, whose data type is the + A tensor representing the conv1d, whose data type is the same with input. - Raises: - ValueError: If the channel dimension of the input is less than or equal to zero. - ValueError: If `data_format` is not "NCL" or "NLC". - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 - or the element corresponding to the input's channel is not 0. - ShapeError: If the input is not 3-D Tensor. - ShapeError: If the input's dimension size and filter's dimension size not equal. - ShapeError: If the dimension size of input minus the size of `stride` is not 1. - ShapeError: If the number of input channels is not equal to filter's channels * groups. - ShapeError: If the number of output channels is not be divided by groups. - Examples: .. code-block:: python import paddle import paddle.nn.functional as F - import numpy as np - x = np.array([[[4, 8, 1, 9], - [7, 2, 0, 9], - [6, 9, 2, 6]]]).astype(np.float32) - w=np.array( - [[[9, 3, 4], - [0, 0, 7], - [2, 5, 6]], - [[0, 3, 4], - [2, 9, 7], - [5, 6, 8]]]).astype(np.float32) - - x_var = paddle.to_tensor(x) - w_var = paddle.to_tensor(w) - y_var = F.conv1d(x_var, w_var) - y_np = y_var.numpy() - print(y_np) - - # [[[133. 238.] - # [160. 211.]]] + + x = paddle.to_tensor([[[4, 8, 1, 9], + [7, 2, 0, 9], + [6, 9, 2, 6]]], dtype="float32") + w = paddle.to_tensor([[[9, 3, 4], + [0, 0, 7], + [2, 5, 6]], + [[0, 3, 4], + [2, 9, 7], + [5, 6, 8]]], dtype="float32") + + y = F.conv1d(x, w) + print(y) + # Tensor(shape=[1, 2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[133., 238.], + # [160., 211.]]]) """ cudnn_version = get_cudnn_version() if cudnn_version is not None: @@ -359,36 +408,45 @@ def conv1d(x, use_cudnn = False if data_format not in ["NCL", "NLC"]: - raise ValueError("Attr(data_format) should be 'NCL' or 'NLC'. " - "Received Attr(data_format): {}.".format(data_format)) + raise ValueError( + "Attr(data_format) should be 'NCL' or 'NLC'. " + "Received Attr(data_format): {}.".format(data_format) + ) - channel_last = (data_format == "NLC") + channel_last = data_format == "NLC" channel_dim = -1 if channel_last else 1 conv2d_data_format = "NHWC" if channel_last else "NCHW" if len(x.shape) != 3: raise ValueError( - "Input x should be 3D tensor, but received x with the shape of {}". - format(x.shape)) + "Input x should be 3D tensor, but received x with the shape of {}".format( + x.shape + ) + ) num_channels = x.shape[channel_dim] num_filters = weight.shape[0] if num_channels < 0: - raise ValueError("The channel dimension of the input({}) " - "should be defined. 
Received: {}.".format( - x.shape, num_channels)) + raise ValueError( + "The channel dimension of the input({}) " + "should be defined. Received: {}.".format(x.shape, num_channels) + ) if groups <= 0: raise ValueError( - "The groups of conv1d should be greater than 0. Received groups: {}" - .format(groups)) + "The groups of conv1d should be greater than 0. Received groups: {}".format( + groups + ) + ) if num_channels % groups != 0: raise ValueError( "the channel of input must be divisible by groups," "received: the channel of input is {}, the shape of input is {}" - ", the groups is {}".format(num_channels, x.shape, groups)) + ", the groups is {}".format(num_channels, x.shape, groups) + ) if num_filters % groups != 0: raise ValueError( "the number of filters must be divisible by groups," "received: the number of filters is {}, the shape of weight is {}" - ", the groups is {}".format(num_filters, weight.shape, groups)) + ", the groups is {}".format(num_filters, weight.shape, groups) + ) # update attrs padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1) @@ -399,8 +457,10 @@ def conv1d(x, padding = [0] + padding else: raise ValueError( - "The size of padding's dimension should be 1 or 2. But got padding={}" - .format(padding)) + "The size of padding's dimension should be 1 or 2. But got padding={}".format( + padding + ) + ) stride = [1] + convert_to_list(stride, 1, 'stride') dilation = [1] + convert_to_list(dilation, 1, 'dilation') weight = unsqueeze(weight, axis=[-2]) @@ -408,14 +468,18 @@ def conv1d(x, l_type = "conv2d" # When "groups==num_channels and num_filters% num_channels == 0" using depthwise_conv2d has better performance - if (is_compiled_with_cuda() and num_channels == groups and num_channels != 1 - and num_filters % num_channels == 0): + if ( + is_compiled_with_cuda() + and num_channels == groups + and num_channels != 1 + and num_filters % num_channels == 0 + ): l_type = 'depthwise_conv2d' use_cudnn = False # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" if is_compiled_with_npu(): - if (num_channels == groups and num_channels == num_filters): + if num_channels == groups and num_channels == num_filters: l_type = 'depthwise_conv2d' else: l_type = 'conv2d' @@ -424,17 +488,44 @@ def conv1d(x, x = unsqueeze(x, axis=[squeeze_aixs]) if in_dygraph_mode(): - out = getattr(_C_ops, - l_type)(x, weight, stride, padding, padding_algorithm, - groups, dilation, conv2d_data_format, False, -1, - False, False, use_cudnn) + out = getattr(_C_ops, l_type)( + x, + weight, + stride, + padding, + padding_algorithm, + groups, + dilation, + conv2d_data_format, + False, + -1, + False, + False, + use_cudnn, + ) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) elif _in_legacy_dygraph(): - attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, - 'groups', groups, 'use_cudnn', use_cudnn, 'use_mkldnn', False, - 'fuse_relu_before_depthwise_conv', False, "padding_algorithm", - padding_algorithm, "data_format", conv2d_data_format) + attrs = ( + 'strides', + stride, + 'paddings', + padding, + 'dilations', + dilation, + 'groups', + groups, + 'use_cudnn', + use_cudnn, + 'use_mkldnn', + False, + 'fuse_relu_before_depthwise_conv', + False, + "padding_algorithm", + padding_algorithm, + "data_format", + conv2d_data_format, + ) out = getattr(_legacy_C_ops, l_type)(x, weight, *attrs) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) @@ -449,33 +540,35 @@ def conv1d(x, 'use_mkldnn': False, 
'fuse_relu_before_depthwise_conv': False, "padding_algorithm": padding_algorithm, - "data_format": conv2d_data_format + "data_format": conv2d_data_format, } - check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], - 'conv2d') + check_variable_and_dtype( + x, 'input', ['float16', 'float32', 'float64'], 'conv2d' + ) helper = LayerHelper(l_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) outputs = {"Output": [out]} - helper.append_op(type=l_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=l_type, inputs=inputs, outputs=outputs, attrs=attrs + ) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) out = squeeze(out, axis=[squeeze_aixs]) return out -def conv2d(x, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1, - data_format="NCHW", - name=None): +def conv2d( + x, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + data_format="NCHW", + name=None, +): r""" The convolution2D layer calculates the output based on the input, filter @@ -567,18 +660,6 @@ def conv2d(x, Returns: A Tensor representing the conv2d result, whose data type is the same with input. - Raises: - ValueError: If `data_format` is not "NCHW" or "NHWC". - ValueError: If the channel dimension of the input is less than or equal to zero. - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 - or the element corresponding to the input's channel is not 0. - ShapeError: If the input is not 4-D Tensor. - ShapeError: If the input's dimension size and filter's dimension size not equal. - ShapeError: If the dimension size of input minus the size of `stride` is not 2. - ShapeError: If the number of input channels is not equal to filter's channels * groups. - ShapeError: If the number of output channels is not be divided by groups. - Examples: .. code-block:: python @@ -596,40 +677,52 @@ def conv2d(x, """ # entry checks if data_format not in ["NCHW", "NHWC"]: - raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. " - "Received Attr(data_format): {}.".format(data_format)) + raise ValueError( + "Attr(data_format) should be 'NCHW' or 'NHWC'. " + "Received Attr(data_format): {}.".format(data_format) + ) - channel_last = (data_format == "NHWC") + channel_last = data_format == "NHWC" channel_dim = -1 if channel_last else 1 if len(x.shape) != 4: raise ValueError( - "Input x should be 4D tensor, but received x with the shape of {}". - format(x.shape)) + "Input x should be 4D tensor, but received x with the shape of {}".format( + x.shape + ) + ) num_channels = x.shape[channel_dim] num_filters = weight.shape[0] if num_channels < 0: - raise ValueError("The channel dimension of the input({}) " - "should be defined. Received: {}.".format( - x.shape, num_channels)) + raise ValueError( + "The channel dimension of the input({}) " + "should be defined. Received: {}.".format(x.shape, num_channels) + ) if groups <= 0: raise ValueError( - "The groups of conv2d should be greater than 0. Received groups: {}" - .format(groups)) + "The groups of conv2d should be greater than 0. 
Received groups: {}".format( + groups + ) + ) if num_channels % groups != 0: raise ValueError( "the channel of input must be divisible by groups," "received: the channel of input is {}, the shape of input is {}" - ", the groups is {}".format(num_channels, x.shape, groups)) + ", the groups is {}".format(num_channels, x.shape, groups) + ) if num_filters % groups != 0: raise ValueError( "the number of filters must be divisible by groups," "received: the number of filters is {}, the shape of weight is {}" - ", the groups is {}".format(num_filters, weight.shape, groups)) + ", the groups is {}".format(num_filters, weight.shape, groups) + ) cudnn_version = get_cudnn_version() - use_cudnn = True if (is_compiled_with_cuda() - and cudnn_version is not None) else False + use_cudnn = ( + True + if (is_compiled_with_cuda() and cudnn_version is not None) + else False + ) # update attrs padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2) @@ -637,8 +730,11 @@ def conv2d(x, dilation = convert_to_list(dilation, 2, 'dilation') l_type = "conv2d" - if (num_channels == groups and num_channels != 1 - and num_filters % num_channels == 0): + if ( + num_channels == groups + and num_channels != 1 + and num_filters % num_channels == 0 + ): l_type = 'depthwise_conv2d' if is_compiled_with_rocm(): use_cudnn = True @@ -646,9 +742,19 @@ def conv2d(x, use_cudnn = False else: if in_dygraph_mode(): - pre_bias = _C_ops.conv2d(x, weight, stride, padding, - padding_algorithm, groups, dilation, - data_format, False, -1, False) + pre_bias = _C_ops.conv2d( + x, + weight, + stride, + padding, + padding_algorithm, + groups, + dilation, + data_format, + False, + -1, + False, + ) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) return out @@ -659,31 +765,50 @@ def conv2d(x, # NPU only supports depthwise_conv2d when "input_channel = output_channel = groups" if is_compiled_with_npu(): - if (num_channels == groups and num_channels == num_filters): + if num_channels == groups and num_channels == num_filters: l_type = 'depthwise_conv2d' else: l_type = 'conv2d' - if (is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn") - ["FLAGS_conv2d_disable_cudnn"]): + if ( + is_compiled_with_cuda() + and get_flags("FLAGS_conv2d_disable_cudnn")[ + "FLAGS_conv2d_disable_cudnn" + ] + ): use_cudnn = False - return _conv_nd(x, weight, bias, stride, padding, padding_algorithm, - dilation, groups, data_format, channel_dim, l_type, - use_cudnn, use_mkldnn, name) - - -def conv1d_transpose(x, - weight, - bias=None, - stride=1, - padding=0, - output_padding=0, - groups=1, - dilation=1, - output_size=None, - data_format="NCL", - name=None): + return _conv_nd( + x, + weight, + bias, + stride, + padding, + padding_algorithm, + dilation, + groups, + data_format, + channel_dim, + l_type, + use_cudnn, + use_mkldnn, + name, + ) + + +def conv1d_transpose( + x, + weight, + bias=None, + stride=1, + padding=0, + output_padding=0, + groups=1, + dilation=1, + output_size=None, + data_format="NCL", + name=None, +): r""" The 1-D convolution transpose layer calculates the output based on the input, filter, and dilation, stride, padding. Input(Input) and output(Output) @@ -780,40 +905,23 @@ def conv1d_transpose(x, when data_format is `"NCL"` and (num_batches, length, channels) when data_format is `"NLC"`. - Raises: - ValueError: If `data_format` is a string, but not "NCL" or "NLC". - ValueError: If `padding` is a string, but not "SAME" or "VALID". 
- ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 - or the element corresponding to the input's channel is not 0. - ValueError: If `output_size` and filter_size are None at the same time. - ValueError: If `output_padding` is greater than `stride`. - ShapeError: If the input is not 3-D Tensor. - ShapeError: If the input's dimension size and filter's dimension size not equal. - ShapeError: If the dimension size of input minus the size of `stride` is not 1. - ShapeError: If the number of input channels is not equal to filter's channels. - ShapeError: If the size of `output_size` is not equal to that of `stride`. - Examples: .. code-block:: python - - import paddle import paddle.nn.functional as F - import numpy as np - + # shape: (1, 2, 4) - x=np.array([[[4, 0, 9, 7], - [8, 0, 9, 2,]]]).astype(np.float32) + x = paddle.to_tensor([[[4, 0, 9, 7], + [8, 0, 9, 2,]]], dtype="float32") # shape: (2, 1, 2) - w=np.array([[[7, 0]], - [[4, 2]]]).astype(np.float32) - x_var = paddle.to_tensor(x) - w_var = paddle.to_tensor(w) - y_var = F.conv1d_transpose(x_var, w_var) - print(y_var) - - # [[[60. 16. 99. 75. 4.]]] + w = paddle.to_tensor([[[7, 0]], + [[4, 2]]], dtype="float32") + + y = F.conv1d_transpose(x, w) + print(y) + # Tensor(shape=[1, 1, 5], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[60., 16., 99., 75., 4. ]]]) """ cudnn_version = get_cudnn_version() if cudnn_version is not None: @@ -825,28 +933,36 @@ def conv1d_transpose(x, raise ValueError( "Attr(data_format) of conv2d_transpose got wrong value: " "received {}, but only 'NCL' or 'NLC' are supported.".format( - data_format)) - channel_last = (data_format == "NLC") + data_format + ) + ) + channel_last = data_format == "NLC" channel_dim = -1 if channel_last else 1 if len(x.shape) != 3: raise ValueError( - "Input x should be 3D tensor, but received x with the shape of {}". - format(x.shape)) + "Input x should be 3D tensor, but received x with the shape of {}".format( + x.shape + ) + ) num_channels = x.shape[channel_dim] if num_channels < 0: - raise ValueError("The channel dimension of the input({}) " - "should be defined. Received: {}.".format( - x.shape, num_channels)) + raise ValueError( + "The channel dimension of the input({}) " + "should be defined. Received: {}.".format(x.shape, num_channels) + ) if groups <= 0: raise ValueError( - "The groups of conv1d_transpose should be greater than 0. Received groups: {}" - .format(groups)) + "The groups of conv1d_transpose should be greater than 0. Received groups: {}".format( + groups + ) + ) if num_channels % groups != 0: raise ValueError( "the channel of input must be divisible by groups," "received: the channel of input is {}, the shape of input is {}" - ", the groups is {}".format(num_channels, x.shape, groups)) + ", the groups is {}".format(num_channels, x.shape, groups) + ) # update attrs padding, padding_algorithm = _update_padding_nd(padding, channel_last, 1) @@ -857,8 +973,10 @@ def conv1d_transpose(x, padding = padding + [0] else: raise ValueError( - "The size of padding's dimension should 1 or 2. But got padding={}". - format(padding)) + "The size of padding's dimension should 1 or 2. 
But got padding={}".format( + padding + ) + ) stride = convert_to_list(stride, 1, 'stride') + [1] dilation = convert_to_list(dilation, 1, 'dilation') + [1] @@ -867,30 +985,40 @@ def conv1d_transpose(x, output_size = [] else: if output_padding != 0: - raise ValueError('output_padding option is mutually exclusive with ' - 'output_size') + raise ValueError( + 'output_padding option is mutually exclusive with ' + 'output_size' + ) if isinstance(output_size, (list, tuple, int)): output_size = convert_to_list(output_size, 1, 'output_size') + [1] else: raise ValueError( - "output_size should be int, or list, tuple of ints") + "output_size should be int, or list, tuple of ints" + ) if output_padding == 0: output_padding = [] else: - output_padding = convert_to_list(output_padding, 1, - 'output_padding') + [0] + output_padding = convert_to_list( + output_padding, 1, 'output_padding' + ) + [0] if len(output_padding) > 0 and output_padding[0] > stride[0]: raise ValueError( "The size of output_padding should not be greater than stride." "But got output_padding={} and stride={}".format( - output_padding[0], stride[0])) + output_padding[0], stride[0] + ) + ) op_type = 'conv2d_transpose' num_filters = weight.shape[1] - if (num_channels == groups and num_channels != 1 and num_filters == 1 - and not use_cudnn): + if ( + num_channels == groups + and num_channels != 1 + and num_filters == 1 + and not use_cudnn + ): op_type = 'depthwise_conv2d_transpose' use_cudnn = False @@ -901,17 +1029,41 @@ def conv1d_transpose(x, weight = unsqueeze(weight, axis=[-1]) if in_dygraph_mode(): - out = getattr(_C_ops, - op_type)(x, weight, stride, padding, output_padding, - output_size, padding_algorithm, groups, dilation, - conv2d_data_format) + out = getattr(_C_ops, op_type)( + x, + weight, + stride, + padding, + output_padding, + output_size, + padding_algorithm, + groups, + dilation, + conv2d_data_format, + ) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) elif _in_legacy_dygraph(): - attrs = ('output_padding', output_padding, 'output_size', output_size, - 'strides', stride, 'paddings', padding, 'padding_algorithm', - padding_algorithm, 'dilations', dilation, 'groups', groups, - 'use_cudnn', use_cudnn, 'data_format', conv2d_data_format) + attrs = ( + 'output_padding', + output_padding, + 'output_size', + output_size, + 'strides', + stride, + 'paddings', + padding, + 'padding_algorithm', + padding_algorithm, + 'dilations', + dilation, + 'groups', + groups, + 'use_cudnn', + use_cudnn, + 'data_format', + conv2d_data_format, + ) out = getattr(_legacy_C_ops, op_type)(x, weight, *attrs) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) @@ -926,18 +1078,18 @@ def conv1d_transpose(x, 'dilations': dilation, 'groups': groups, 'use_cudnn': use_cudnn, - 'data_format': conv2d_data_format + 'data_format': conv2d_data_format, } - check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], - 'conv2d_transpose') + check_variable_and_dtype( + x, 'input', ['float16', 'float32', 'float64'], 'conv2d_transpose' + ) helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) outputs = {"Output": [out]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) if bias is not None: out = nn.elementwise_add(out, bias, axis=channel_dim) @@ -945,17 +1097,19 @@ def conv1d_transpose(x, return out 
-def conv2d_transpose(x, - weight, - bias=None, - stride=1, - padding=0, - output_padding=0, - dilation=1, - groups=1, - output_size=None, - data_format='NCHW', - name=None): +def conv2d_transpose( + x, + weight, + bias=None, + stride=1, + padding=0, + output_padding=0, + dilation=1, + groups=1, + output_size=None, + data_format='NCHW', + name=None, +): r""" The convolution2D transpose layer calculates the output based on the input, @@ -1064,18 +1218,6 @@ def conv2d_transpose(x, out_w) or (num_batches, out_h, out_w, channels). The tensor variable storing transposed convolution result. - Raises: - ValueError: If `data_format` is not "NCHW" or "NHWC". - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 - or the element corresponding to the input's channel is not 0. - ValueError: If `output_size` and kernel_size are None at the same time. - ShapeError: If the input is not 4-D Tensor. - ShapeError: If the input's dimension size and filter's dimension size not equal. - ShapeError: If the dimension size of input minus the size of `stride` is not 2. - ShapeError: If the number of input channels is not equal to filter's channels. - ShapeError: If the size of `output_size` is not equal to that of `stride`. - Examples: .. code-block:: python @@ -1096,32 +1238,43 @@ def conv2d_transpose(x, raise ValueError( "Attr(data_format) of conv2d_transpose got wrong value: " "received {}, but only 'NCHW' or 'NHWC' are supported.".format( - data_format)) - channel_last = (data_format == "NHWC") + data_format + ) + ) + channel_last = data_format == "NHWC" channel_dim = -1 if channel_last else 1 if len(x.shape) != 4: raise ValueError( - "Input x should be 4D tensor, but received x with the shape of {}". - format(x.shape)) + "Input x should be 4D tensor, but received x with the shape of {}".format( + x.shape + ) + ) num_channels = x.shape[channel_dim] if num_channels < 0: - raise ValueError("The channel dimension of the input({}) " - "should be defined. Received: {}.".format( - x.shape, num_channels)) + raise ValueError( + "The channel dimension of the input({}) " + "should be defined. Received: {}.".format(x.shape, num_channels) + ) if groups <= 0: raise ValueError( - "The groups of conv2d_transpose should be greater than 0. Received groups: {}" - .format(groups)) + "The groups of conv2d_transpose should be greater than 0. 
Received groups: {}".format( + groups + ) + ) if num_channels % groups != 0: raise ValueError( "the channel of input must be divisible by groups," "received: the channel of input is {}, the shape of input is {}" - ", the groups is {}".format(num_channels, x.shape, groups)) + ", the groups is {}".format(num_channels, x.shape, groups) + ) cudnn_version = get_cudnn_version() - use_cudnn = True if (is_compiled_with_cuda() - and cudnn_version is not None) else False + use_cudnn = ( + True + if (is_compiled_with_cuda() and cudnn_version is not None) + else False + ) # update attrs padding, padding_algorithm = _update_padding_nd(padding, channel_last, 2) @@ -1132,8 +1285,10 @@ def conv2d_transpose(x, output_size = [] else: if output_padding != 0: - raise ValueError('output_padding option is mutually exclusive with ' - 'output_size') + raise ValueError( + 'output_padding option is mutually exclusive with ' + 'output_size' + ) if isinstance(output_size, (list, tuple)): if _contain_var(output_size): output_size = _convert_to_tensor_list(output_size) @@ -1142,15 +1297,21 @@ def conv2d_transpose(x, elif isinstance(output_size, int): output_size = convert_to_list(output_size, 2, 'output_size') elif isinstance(output_size, Variable): - check_dtype(output_size.dtype, 'output_size', ['int32', 'int64'], - 'conv2d_transpose') - if len(output_size.shape) == 1 and (output_size.shape[0] == 1 - or output_size.shape[0] == 2): + check_dtype( + output_size.dtype, + 'output_size', + ['int32', 'int64'], + 'conv2d_transpose', + ) + if len(output_size.shape) == 1 and ( + output_size.shape[0] == 1 or output_size.shape[0] == 2 + ): if output_size.shape[0] == 1: output_size = [output_size, output_size] else: raise ValueError( - "output_size must contain one or two integers.") + "output_size must contain one or two integers." 
+ ) else: raise ValueError( "output_size should be int or Tensor or list, tuple of ints or Tensor" @@ -1163,24 +1324,54 @@ def conv2d_transpose(x, op_type = 'conv2d_transpose' num_filters = weight.shape[1] - if (num_channels == groups and num_channels != 1 and num_filters == 1): + if num_channels == groups and num_channels != 1 and num_filters == 1: op_type = 'depthwise_conv2d_transpose' use_cudnn = False if in_dygraph_mode(): - op = _C_ops.conv2d_transpose if op_type == 'conv2d_transpose' else _C_ops.depthwise_conv2d_transpose - pre_bias = op(x, weight, stride, padding, output_padding, output_size, - padding_algorithm, groups, dilation, data_format) + op = ( + _C_ops.conv2d_transpose + if op_type == 'conv2d_transpose' + else _C_ops.depthwise_conv2d_transpose + ) + pre_bias = op( + x, + weight, + stride, + padding, + output_padding, + output_size, + padding_algorithm, + groups, + dilation, + data_format, + ) if bias is not None: return nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: return pre_bias if _in_legacy_dygraph(): - attrs = ('output_padding', output_padding, 'output_size', output_size, - 'strides', stride, 'paddings', padding, 'padding_algorithm', - padding_algorithm, 'dilations', dilation, 'groups', groups, - 'use_cudnn', use_cudnn, 'data_format', data_format) + attrs = ( + 'output_padding', + output_padding, + 'output_size', + output_size, + 'strides', + stride, + 'paddings', + padding, + 'padding_algorithm', + padding_algorithm, + 'dilations', + dilation, + 'groups', + groups, + 'use_cudnn', + use_cudnn, + 'data_format', + data_format, + ) pre_bias = getattr(_legacy_C_ops, op_type)(x, weight, *attrs) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) @@ -1197,17 +1388,17 @@ def conv2d_transpose(x, 'dilations': dilation, 'groups': groups, 'use_cudnn': use_cudnn, - 'data_format': data_format + 'data_format': data_format, } - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'conv2d_transpose') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'conv2d_transpose' + ) helper = LayerHelper(op_type, **locals()) pre_bias = helper.create_variable_for_type_inference(x.dtype) outputs = {"Output": [pre_bias]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) @@ -1217,15 +1408,17 @@ def conv2d_transpose(x, return out -def conv3d(x, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1, - data_format="NCDHW", - name=None): +def conv3d( + x, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + data_format="NCDHW", + name=None, +): r""" The convolution3D layer calculates the output based on the input, filter @@ -1333,60 +1526,88 @@ def conv3d(x, if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " - "Attr(data_format): {}.".format(data_format)) + "Attr(data_format): {}.".format(data_format) + ) - channel_last = (data_format == "NDHWC") + channel_last = data_format == "NDHWC" channel_dim = -1 if channel_last else 1 if len(x.shape) != 5: raise ValueError( - "Input x should be 5D tensor, but received x with the shape of {}". 
- format(x.shape)) + "Input x should be 5D tensor, but received x with the shape of {}".format( + x.shape + ) + ) num_channels = x.shape[channel_dim] num_filters = weight.shape[0] if num_channels < 0: raise ValueError( "The channel dimension of the input({}) should be defined. " - "Received: {}.".format(x.shape, num_channels)) + "Received: {}.".format(x.shape, num_channels) + ) if groups <= 0: raise ValueError( - "The groups of conv3d should be greater than 0. Received groups: {}" - .format(groups)) + "The groups of conv3d should be greater than 0. Received groups: {}".format( + groups + ) + ) if num_channels % groups != 0: raise ValueError( "The number of input channels must be divisible by Attr(groups). " "Received: number of channels({}), groups({}).".format( - num_channels, groups)) + num_channels, groups + ) + ) if num_filters % groups != 0: raise ValueError( "The number of filters must be divisible by Attr(groups). " "Received: number of filters({}), groups({}).".format( - num_filters, groups)) + num_filters, groups + ) + ) cudnn_version = get_cudnn_version() - use_cudnn = True if (is_compiled_with_cuda() - and cudnn_version is not None) else False + use_cudnn = ( + True + if (is_compiled_with_cuda() and cudnn_version is not None) + else False + ) padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3) stride = convert_to_list(stride, 3, 'stride') dilation = convert_to_list(dilation, 3, 'dilation') op_type = "conv3d" - return _conv_nd(x, weight, bias, stride, padding, padding_algorithm, - dilation, groups, data_format, channel_dim, op_type, - use_cudnn, False, name) - - -def conv3d_transpose(x, - weight, - bias=None, - stride=1, - padding=0, - output_padding=0, - groups=1, - dilation=1, - output_size=None, - data_format='NCDHW', - name=None): + return _conv_nd( + x, + weight, + bias, + stride, + padding, + padding_algorithm, + dilation, + groups, + data_format, + channel_dim, + op_type, + use_cudnn, + False, + name, + ) + + +def conv3d_transpose( + x, + weight, + bias=None, + stride=1, + padding=0, + output_padding=0, + groups=1, + dilation=1, + output_size=None, + data_format='NCDHW', + name=None, +): r""" The convolution3d transpose layer calculates the output based on the input, filter, and dilations, strides, paddings. Input(Input) and output(Output) @@ -1501,18 +1722,6 @@ def conv3d_transpose(x, variable storing the transposed convolution result, and if act is not None, the tensor variable storing transposed convolution and non-linearity activation result. - Raises: - ValueError: If `data_format` is not "NCDHW" or "NDHWC". - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is a list/tuple, but the element corresponding to the input's batch size is not 0 - or the element corresponding to the input's channel is not 0. - ValueError: If `output_size` and kernel_size are None at the same time. - ShapeError: If the input is not 5-D Tensor. - ShapeError: If the input's dimension size and filter's dimension size not equal. - ShapeError: If the dimension size of input minus the size of `stride` is not 2. - ShapeError: If the number of input channels is not equal to filter's channels. - ShapeError: If the size of `output_size` is not equal to that of `stride`. - Examples: .. code-block:: python @@ -1532,29 +1741,37 @@ def conv3d_transpose(x, if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( "Attr(data_format) should be 'NCDHW' or 'NDHWC'. 
Received " - "Attr(data_format): {}.".format(data_format)) + "Attr(data_format): {}.".format(data_format) + ) - channel_last = (data_format == "NDHWC") + channel_last = data_format == "NDHWC" channel_dim = -1 if channel_last else 1 if len(x.shape) != 5: raise ValueError( - "Input x should be 5D tensor, but received x with the shape of {}". - format(x.shape)) + "Input x should be 5D tensor, but received x with the shape of {}".format( + x.shape + ) + ) num_channels = x.shape[channel_dim] num_filters = weight.shape[1] if num_channels < 0: raise ValueError( "The channel dimension of the input({}) should be defined. " - "Received: {}.".format(x.shape, num_channels)) + "Received: {}.".format(x.shape, num_channels) + ) if groups <= 0: raise ValueError( - "The groups of conv3d_transpose should be greater than 0. Received groups: {}" - .format(groups)) + "The groups of conv3d_transpose should be greater than 0. Received groups: {}".format( + groups + ) + ) if num_channels % groups != 0: raise ValueError( "The number of input channels must be divisible by Attr(groups). " "Received: number of channels({}), groups({}).".format( - num_channels, groups)) + num_channels, groups + ) + ) padding, padding_algorithm = _update_padding_nd(padding, channel_last, 3) stride = convert_to_list(stride, 3, 'stride') @@ -1563,13 +1780,16 @@ def conv3d_transpose(x, output_size = [] else: if output_padding != 0: - raise ValueError('output_padding option is mutually exclusive with ' - 'output_size') + raise ValueError( + 'output_padding option is mutually exclusive with ' + 'output_size' + ) if isinstance(output_size, (list, tuple, int)): output_size = convert_to_list(output_size, 3, 'output_size') else: raise ValueError( - "output_size should be int, or list, tuple of ints") + "output_size should be int, or list, tuple of ints" + ) if output_padding == 0: output_padding = [] @@ -1578,28 +1798,55 @@ def conv3d_transpose(x, cudnn_version = get_cudnn_version() - #TODO(LielinJiang): whether to use cudnn according to the version of cudnn - use_cudnn = True if (is_compiled_with_cuda() - and cudnn_version is not None) else False + # TODO(LielinJiang): whether to use cudnn according to the version of cudnn + use_cudnn = ( + True + if (is_compiled_with_cuda() and cudnn_version is not None) + else False + ) op_type = 'conv3d_transpose' data_format_ = "NHWC" if channel_last else "NCHW" if in_dygraph_mode(): - pre_bias = _C_ops.conv3d_transpose(x, weight, stride, padding, - output_padding, output_size, - padding_algorithm, groups, dilation, - data_format_) + pre_bias = _C_ops.conv3d_transpose( + x, + weight, + stride, + padding, + output_padding, + output_size, + padding_algorithm, + groups, + dilation, + data_format_, + ) if bias is not None: return nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: return pre_bias if _in_legacy_dygraph(): - attrs = ('output_padding', output_padding, 'output_size', output_size, - 'paddings', padding, "padding_algorithm", padding_algorithm, - 'strides', stride, 'dilations', dilation, 'groups', groups, - 'use_cudnn', use_cudnn, "data_format", data_format_) + attrs = ( + 'output_padding', + output_padding, + 'output_size', + output_size, + 'paddings', + padding, + "padding_algorithm", + padding_algorithm, + 'strides', + stride, + 'dilations', + dilation, + 'groups', + groups, + 'use_cudnn', + use_cudnn, + "data_format", + data_format_, + ) pre_bias = getattr(_legacy_C_ops, op_type)(x, weight, *attrs) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) @@ -1616,19 
+1863,19 @@ def conv3d_transpose(x, 'dilations': dilation, 'groups': groups, 'use_cudnn': use_cudnn, - "data_format": data_format_ + "data_format": data_format_, } helper = LayerHelper(op_type, **locals()) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'conv3d') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'conv3d' + ) pre_bias = helper.create_variable_for_type_inference(x.dtype) outputs = {"Output": [pre_bias]} - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=channel_dim) else: diff --git a/python/paddle/nn/functional/distance.py b/python/paddle/nn/functional/distance.py index 1c29d509741018..0c3a1a8b0d72a4 100644 --- a/python/paddle/nn/functional/distance.py +++ b/python/paddle/nn/functional/distance.py @@ -23,6 +23,7 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None): r""" + It computes the pairwise distance between two vectors. The distance is calculated by p-oreder norm: @@ -48,10 +49,11 @@ def pairwise_distance(x, y, p=2., epsilon=1e-6, keepdim=False, name=None): Returns: Tensor, the dtype is same as input tensor. + - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - depending on whether the input has data shaped as :math:`[N, D]`. + depending on whether the input has data shaped as :math:`[N, D]`. - If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - depending on whether the input has data shaped as :math:`[N, D]`. + depending on whether the input has data shaped as :math:`[N, D]`. Examples: .. code-block:: python diff --git a/python/paddle/nn/functional/extension.py b/python/paddle/nn/functional/extension.py index 7ae35666c86127..ce92e4aba200e8 100644 --- a/python/paddle/nn/functional/extension.py +++ b/python/paddle/nn/functional/extension.py @@ -23,7 +23,11 @@ from ...tensor.layer_function_generator import templatedoc from paddle import in_dynamic_mode from paddle import _C_ops, _legacy_C_ops -from ...fluid.framework import _non_static_mode, _in_legacy_dygraph, in_dygraph_mode +from ...fluid.framework import ( + _non_static_mode, + _in_legacy_dygraph, + in_dygraph_mode, +) from ...fluid.data_feeder import check_variable_and_dtype, check_type from ...framework import core from ...common_ops_import import convert_np_dtype_to_dtype_ @@ -33,8 +37,8 @@ def diag_embed(input, offset=0, dim1=-2, dim2=-1): """ - This OP creates a tensor whose diagonals of certain 2D planes (specified by dim1 and dim2) - are filled by ``input``. By default, a 2D plane formed by the last two dimensions + This OP creates a tensor whose diagonals of certain 2D planes (specified by dim1 and dim2) + are filled by ``input``. By default, a 2D plane formed by the last two dimensions of the returned tensor will be selected. The argument ``offset`` determines which diagonal is generated: @@ -48,16 +52,16 @@ def diag_embed(input, offset=0, dim1=-2, dim2=-1): offset(int, optional): Which diagonal to consider. Default: 0 (main diagonal). dim1(int, optional): The first dimension with respect to which to take diagonal. Default: -2. dim2(int, optional): The second dimension with respect to which to take diagonal. Default: -1. - + Returns: Tensor, the output data type is the same as input data type. - + Examples: .. 
code-block:: python import paddle.nn.functional as F import numpy as np - + diag_embed = np.random.randn(2, 3).astype('float32') # [[ 0.7545889 , -0.25074545, 0.5929117 ], # [-0.6097662 , -0.01753256, 0.619769 ]] @@ -104,50 +108,55 @@ def diag_embed(input, offset=0, dim1=-2, dim2=-1): if in_dygraph_mode(): return _C_ops.diag_embed(input, offset, dim1, dim2) elif in_dynamic_mode(): - return _legacy_C_ops.diag_embed(input, "offset", offset, "dim1", dim1, - "dim2", dim2) + return _legacy_C_ops.diag_embed( + input, "offset", offset, "dim1", dim1, "dim2", dim2 + ) inputs = {'Input': [input]} attrs = {'offset': offset, 'dim1': dim1, 'dim2': dim2} def __check_input(input, offset, dim1, dim2): - check_dtype(input.dtype, 'Input', - ['int32', 'int64', 'float16', 'float32', 'float64'], - 'diag_embed') + check_dtype( + input.dtype, + 'Input', + ['int32', 'int64', 'float16', 'float32', 'float64'], + 'diag_embed', + ) input_shape = list(input.shape) - assert len(input_shape) >= 1, \ - "Input must be at least 1-dimensional, " \ - "But received Input's dimensional: %s.\n" % \ - len(input_shape) + assert len(input_shape) >= 1, ( + "Input must be at least 1-dimensional, " + "But received Input's dimensional: %s.\n" % len(input_shape) + ) - assert np.abs(dim1) <= len(input_shape), \ - "Dim1 is out of range (expected to be in range of [%d, %d], but got %d).\n" \ + assert np.abs(dim1) <= len(input_shape), ( + "Dim1 is out of range (expected to be in range of [%d, %d], but got %d).\n" % (-(len(input_shape) + 1), len(input_shape), dim1) + ) - assert np.abs(dim2) <= len(input_shape), \ - "Dim2 is out of range (expected to be in range of [%d, %d], but got %d).\n" \ + assert np.abs(dim2) <= len(input_shape), ( + "Dim2 is out of range (expected to be in range of [%d, %d], but got %d).\n" % (-(len(input_shape) + 1), len(input_shape), dim2) + ) dim1_ = dim1 if dim1 >= 0 else len(input_shape) + dim1 + 1 dim2_ = dim2 if dim2 >= 0 else len(input_shape) + dim2 + 1 - assert dim1_ != dim2_, \ - "dim1 and dim2 cannot be the same dimension." \ - "But received dim1 = %d, dim2 = %d\n"%(dim1, dim2) + assert dim1_ != dim2_, ( + "dim1 and dim2 cannot be the same dimension." + "But received dim1 = %d, dim2 = %d\n" % (dim1, dim2) + ) __check_input(input, offset, dim1, dim2) helper = LayerHelper("diag_embed", **locals()) out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type='diag_embed', - inputs={'Input': [input]}, - attrs={ - 'offset': offset, - 'dim1': dim1, - 'dim2': dim2 - }, - outputs={'Out': [out]}) + helper.append_op( + type='diag_embed', + inputs={'Input': [input]}, + attrs={'offset': offset, 'dim1': dim1, 'dim2': dim2}, + outputs={'Out': [out]}, + ) out.stop_gradient = True return out @@ -235,10 +244,9 @@ def sequence_mask(x, maxlen=None, dtype='int64', name=None): else: attrs['maxlen'] = maxlen - helper.append_op(type='sequence_mask', - inputs=inputs, - outputs={'Y': out}, - attrs=attrs) + helper.append_op( + type='sequence_mask', inputs=inputs, outputs={'Y': out}, attrs=attrs + ) out.stop_gradient = True return out @@ -305,6 +313,13 @@ def gather_tree(ids, parents): # [[[2, 2], [1, 6]], [[3, 3], [6, 1]], [[0, 1], [9, 0]]] """ + if ids.ndim != 3: + raise ValueError( + "The input ids must be a 3D tensor with shape [length, batch_size, beam_size]" + ) + if ids.ndim != parents.ndim: + raise ValueError("The ids's shape must be the same as parents' shape. 
") + if in_dygraph_mode(): return _C_ops.gather_tree(ids, parents) else: @@ -312,18 +327,19 @@ def gather_tree(ids, parents): return _legacy_C_ops.gather_tree(ids, parents) else: helper = LayerHelper('gather_tree', **locals()) - check_variable_and_dtype(ids, 'ids', ['int32', 'int64'], - 'gather_tree') - check_variable_and_dtype(parents, 'parents', ['int32', 'int64'], - 'gather_tree') + check_variable_and_dtype( + ids, 'ids', ['int32', 'int64'], 'gather_tree' + ) + check_variable_and_dtype( + parents, 'parents', ['int32', 'int64'], 'gather_tree' + ) out = helper.create_variable_for_type_inference(dtype=ids.dtype) - helper.append_op(type="gather_tree", - inputs={ - "Ids": ids, - "Parents": parents - }, - outputs={"Out": out}) + helper.append_op( + type="gather_tree", + inputs={"Ids": ids, "Parents": parents}, + outputs={"Out": out}, + ) return out @@ -350,9 +366,6 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): out(Tensor): The temporal shifting result is a tensor with the same shape and same data type as the input. - Raises: - TypeError: seg_num must be int type. - Examples: .. code-block:: python @@ -363,14 +376,22 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): out = F.temporal_shift(x=input, seg_num=2, shift_ratio=0.2) """ if data_format not in ["NCHW", "NHWC"]: - raise ValueError("Attr(data_format) should be 'NCHW' or 'NHWC'. " - "Received Attr(data_format): {}.".format(data_format)) + raise ValueError( + "Attr(data_format) should be 'NCHW' or 'NHWC'. " + "Received Attr(data_format): {}.".format(data_format) + ) if in_dygraph_mode(): return _C_ops.temporal_shift(x, seg_num, shift_ratio, data_format) if _non_static_mode(): - return _legacy_C_ops.temporal_shift(x, 'seg_num', seg_num, - 'shift_ratio', shift_ratio, - 'data_format', data_format) + return _legacy_C_ops.temporal_shift( + x, + 'seg_num', + seg_num, + 'shift_ratio', + shift_ratio, + 'data_format', + data_format, + ) helper = LayerHelper("temporal_shift", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'temporal_shift') @@ -382,12 +403,14 @@ def temporal_shift(x, seg_num, shift_ratio=0.25, name=None, data_format="NCHW"): if not isinstance(seg_num, int): raise TypeError("seg_num must be int type.") - helper.append_op(type="temporal_shift", - inputs={"X": x}, - outputs={"Out": out}, - attrs={ - "seg_num": seg_num, - "shift_ratio": shift_ratio, - "data_format": data_format - }) + helper.append_op( + type="temporal_shift", + inputs={"X": x}, + outputs={"Out": out}, + attrs={ + "seg_num": seg_num, + "shift_ratio": shift_ratio, + "data_format": data_format, + }, + ) return out diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index c0742bdbf407f9..4bb19343c13a64 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -29,7 +29,12 @@ from paddle import _C_ops, _legacy_C_ops from paddle import in_dynamic_mode from paddle.framework import core, _non_static_mode -from ...fluid.framework import _in_legacy_dygraph, in_dygraph_mode, _non_static_mode, _current_expected_place +from ...fluid.framework import ( + _in_legacy_dygraph, + in_dygraph_mode, + _non_static_mode, + _current_expected_place, +) __all__ = [] @@ -78,25 +83,32 @@ def dice_loss(input, label, epsilon=0.00001, name=None): """ assert input.dtype in (paddle.float32, paddle.float64) assert label.dtype in (paddle.int32, paddle.int64) - assert len(input.shape) >= 2, \ - "The rank of input should be greater than or equal 
to 2." - assert len(input.shape) == len( - label.shape), ("The rank of input and label should be equal, " - "but received input: %d, label: %d." % - (len(input.shape), len(label.shape))) - assert label.shape[-1] == 1, ("The last dimension of label should be 1, " - "but received %d." % label.shape[-1]) - assert input.shape[:-1] == label.shape[:-1], ( - "All dimensions should be equal except the last one.") - assert input.numel() > 0 and label.numel() > 0, \ - "Any dimension of input and label cannot be equal to 0." + assert ( + len(input.shape) >= 2 + ), "The rank of input should be greater than or equal to 2." + assert len(input.shape) == len(label.shape), ( + "The rank of input and label should be equal, " + "but received input: %d, label: %d." + % (len(input.shape), len(label.shape)) + ) + assert label.shape[-1] == 1, ( + "The last dimension of label should be 1, " + "but received %d." % label.shape[-1] + ) + assert ( + input.shape[:-1] == label.shape[:-1] + ), "All dimensions should be equal except the last one." + assert ( + input.numel() > 0 and label.numel() > 0 + ), "Any dimension of input and label cannot be equal to 0." label = paddle.squeeze(label, [-1]) label = paddle.nn.functional.one_hot(label, input.shape[-1]) reduce_dim = list(range(1, len(input.shape))) inse = paddle.sum(input * label, axis=reduce_dim) dice_denominator = paddle.sum(input, axis=reduce_dim) + paddle.sum( - label, axis=reduce_dim) + label, axis=reduce_dim + ) dice_score = 1 - inse * 2 / (dice_denominator + epsilon) return paddle.mean(dice_score) @@ -147,23 +159,24 @@ def log_loss(input, label, epsilon=1e-4, name=None): loss = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type='log_loss', - inputs={ - 'Predicted': [input], - 'Labels': [label] - }, - outputs={'Loss': [loss]}, - attrs={'epsilon': epsilon}) + helper.append_op( + type='log_loss', + inputs={'Predicted': [input], 'Labels': [label]}, + outputs={'Loss': [loss]}, + attrs={'epsilon': epsilon}, + ) return loss -def fluid_softmax_with_cross_entropy(logits, - label, - soft_label=False, - ignore_index=-100, - numeric_stable_mode=True, - return_softmax=False, - axis=-1): +def fluid_softmax_with_cross_entropy( + logits, + label, + soft_label=False, + ignore_index=-100, + numeric_stable_mode=True, + return_softmax=False, + axis=-1, +): r""" This operator implements the cross entropy loss function with softmax. This function @@ -239,33 +252,53 @@ def fluid_softmax_with_cross_entropy(logits, .. 
code-block:: python import paddle - import numpy as np - - data = np.random.rand(128).astype("float32") - label = np.random.rand(1).astype("int64") - data = paddle.to_tensor(data) - label = paddle.to_tensor(label) - linear = paddle.nn.Linear(128, 100) - x = linear(data) - out = paddle.nn.functional.softmax_with_cross_entropy(logits=x, label=label) + + logits = paddle.to_tensor([0.4, 0.6, 0.9]) + label = paddle.randint(high=2, shape=[1], dtype="int64") + + out = paddle.nn.functional.softmax_with_cross_entropy(logits=logits, label=label) print(out) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [1.15328646]) """ if _non_static_mode(): if core.is_compiled_with_npu(): softmax, backprop, loss = _legacy_C_ops.softmax_with_cross_entropy( - logits, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', numeric_stable_mode, - 'axis', axis) + logits, + label, + 'soft_label', + soft_label, + 'ignore_index', + ignore_index, + 'numeric_stable_mode', + numeric_stable_mode, + 'axis', + axis, + ) else: if in_dygraph_mode(): softmax, loss = _C_ops.cross_entropy_with_softmax( - logits, label, soft_label, True, numeric_stable_mode, - ignore_index, axis) + logits, + label, + soft_label, + True, + numeric_stable_mode, + ignore_index, + axis, + ) if _in_legacy_dygraph(): softmax, loss = _legacy_C_ops.softmax_with_cross_entropy( - logits, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', numeric_stable_mode, - 'axis', axis) + logits, + label, + 'soft_label', + soft_label, + 'ignore_index', + ignore_index, + 'numeric_stable_mode', + numeric_stable_mode, + 'axis', + axis, + ) if not return_softmax: return loss else: @@ -275,7 +308,7 @@ def fluid_softmax_with_cross_entropy(logits, 'soft_label': soft_label, 'ignore_index': ignore_index, 'numeric_stable_mode': numeric_stable_mode, - 'axis': axis + 'axis': axis, } helper = LayerHelper('softmax_with_cross_entropy', **locals()) softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) @@ -285,13 +318,12 @@ def fluid_softmax_with_cross_entropy(logits, if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): backprop = helper.create_variable_for_type_inference(dtype=logits.dtype) outputs['Backprop'] = backprop - helper.append_op(type='softmax_with_cross_entropy', - inputs={ - 'Logits': logits, - 'Label': label - }, - outputs=outputs, - attrs=attrs) + helper.append_op( + type='softmax_with_cross_entropy', + inputs={'Logits': logits, 'Label': label}, + outputs=outputs, + attrs=attrs, + ) if return_softmax: return loss, softmax @@ -300,71 +332,74 @@ def fluid_softmax_with_cross_entropy(logits, def npair_loss(anchor, positive, labels, l2_reg=0.002): - """ - + """ + Npair loss requires paired data. Npair loss has two parts: the first part is L2 regularizer on the embedding vector; the second part is cross entropy loss which takes the similarity matrix of anchor and positive as logits. - + For more information, please refer to: `Improved Deep Metric Learning with Multi class N pair Loss Objective `_ - + Args: - anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], + anchor(Tensor): embedding vector for the anchor image. shape=[batch_size, embedding_dims], the data type is float32 or float64. - positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], + positive(Tensor): embedding vector for the positive image. shape=[batch_size, embedding_dims], the data type is float32 or float64. 
labels(Tensor): 1-D tensor. shape=[batch_size], the data type is float32 or float64 or int64. l2_reg(float32): L2 regularization term on embedding vector, default: 0.002. - + Returns: A Tensor representing the npair loss, the data type is the same as anchor, the shape is [1]. - + Examples: .. code-block:: python - + import paddle - + DATATYPE = "float32" - + anchor = paddle.rand(shape=(18, 6), dtype=DATATYPE) positive = paddle.rand(shape=(18, 6), dtype=DATATYPE) labels = paddle.rand(shape=(18,), dtype=DATATYPE) - + npair_loss = paddle.nn.functional.npair_loss(anchor, positive, labels, l2_reg = 0.002) print(npair_loss) - + """ - check_variable_and_dtype(anchor, 'anchor', ['float32', 'float64'], - 'npair_loss') - check_variable_and_dtype(positive, 'positive', ['float32', 'float64'], - 'positive') - check_variable_and_dtype(labels, 'labels', ['float32', 'float64', 'int64'], - 'labels') + check_variable_and_dtype( + anchor, 'anchor', ['float32', 'float64'], 'npair_loss' + ) + check_variable_and_dtype( + positive, 'positive', ['float32', 'float64'], 'positive' + ) + check_variable_and_dtype( + labels, 'labels', ['float32', 'float64', 'int64'], 'labels' + ) Beta = 0.25 batch_size = labels.shape[0] labels = paddle.reshape(labels, shape=[batch_size, 1]) labels = paddle.tile(labels, repeat_times=[1, batch_size]) - labels = paddle.equal(labels, paddle.transpose(labels, - perm=[1, - 0])).astype('float32') + labels = paddle.equal(labels, paddle.transpose(labels, perm=[1, 0])).astype( + 'float32' + ) labels = labels / paddle.sum(labels, axis=1, keepdim=True) - l2loss = paddle.mean(paddle.sum(paddle.square(anchor), 1)) \ - + paddle.mean(paddle.sum(paddle.square(positive), 1)) + l2loss = paddle.mean(paddle.sum(paddle.square(anchor), 1)) + paddle.mean( + paddle.sum(paddle.square(positive), 1) + ) l2loss = l2loss * Beta * l2_reg - similarity_matrix = paddle.matmul(anchor, - positive, - transpose_x=False, - transpose_y=True) - softmax_ce = fluid_softmax_with_cross_entropy(logits=similarity_matrix, - label=labels, - soft_label=True) + similarity_matrix = paddle.matmul( + anchor, positive, transpose_x=False, transpose_y=True + ) + softmax_ce = fluid_softmax_with_cross_entropy( + logits=similarity_matrix, label=labels, soft_label=True + ) cross_entropy = paddle.sum(labels * softmax_ce, 0) celoss = paddle.mean(cross_entropy) @@ -412,32 +447,35 @@ def square_error_cost(input, label): square_out = _legacy_C_ops.square(minus_out) return square_out - check_variable_and_dtype(input, "input", ['float32', 'float64'], - 'square_error_cost') - check_variable_and_dtype(label, "label", ['float32', 'float64'], - 'square_error_cost') + check_variable_and_dtype( + input, "input", ['float32', 'float64'], 'square_error_cost' + ) + check_variable_and_dtype( + label, "label", ['float32', 'float64'], 'square_error_cost' + ) helper = LayerHelper('square_error_cost', **locals()) minus_out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type='elementwise_sub', - inputs={ - 'X': [input], - 'Y': [label] - }, - outputs={'Out': [minus_out]}) + helper.append_op( + type='elementwise_sub', + inputs={'X': [input], 'Y': [label]}, + outputs={'Out': [minus_out]}, + ) square_out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type='square', - inputs={'X': [minus_out]}, - outputs={'Out': [square_out]}) + helper.append_op( + type='square', inputs={'X': [minus_out]}, outputs={'Out': [square_out]} + ) return square_out -def edit_distance(input, - label, - normalized=True, - 
ignored_tokens=None, - input_length=None, - label_length=None): +def edit_distance( + input, + label, + normalized=True, + ignored_tokens=None, + input_length=None, + label_length=None, +): """ This op computes the edit distances, also called Levenshtein distance, between a batch of hypothesis strings and their references. It measures how dissimilar two strings are by counting @@ -472,7 +510,7 @@ def edit_distance(input, NOTE: This Api is different from fluid.metrics.EditDistance Returns: - Tuple: + Tuple: distance(Tensor): edit distance result, its data type is float32, and its shape is (batch_size, 1). sequence_num(Tensor): sequence number, its data type is float32, and its shape is (1,). @@ -514,21 +552,26 @@ def edit_distance(input, erased_input = helper.create_variable_for_type_inference(dtype="int64") erased_label = helper.create_variable_for_type_inference(dtype="int64") - helper.append_op(type="sequence_erase", - inputs={"X": [input]}, - outputs={"Out": [erased_input]}, - attrs={"tokens": ignored_tokens}) + helper.append_op( + type="sequence_erase", + inputs={"X": [input]}, + outputs={"Out": [erased_input]}, + attrs={"tokens": ignored_tokens}, + ) input = erased_input - helper.append_op(type="sequence_erase", - inputs={"X": [label]}, - outputs={"Out": [erased_label]}, - attrs={"tokens": ignored_tokens}) + helper.append_op( + type="sequence_erase", + inputs={"X": [label]}, + outputs={"Out": [erased_label]}, + attrs={"tokens": ignored_tokens}, + ) label = erased_label if in_dygraph_mode(): - return _C_ops.edit_distance(input, label, input_length, label_length, - normalized) + return _C_ops.edit_distance( + input, label, input_length, label_length, normalized + ) this_inputs = {"Hyps": [input], "Refs": [label]} if input_length is not None and label_length is not None: @@ -538,22 +581,19 @@ def edit_distance(input, # edit distance op edit_distance_out = helper.create_variable_for_type_inference(dtype="int64") sequence_num = helper.create_variable_for_type_inference(dtype="int64") - helper.append_op(type="edit_distance", - inputs=this_inputs, - outputs={ - "Out": [edit_distance_out], - "SequenceNum": [sequence_num] - }, - attrs={"normalized": normalized}) + helper.append_op( + type="edit_distance", + inputs=this_inputs, + outputs={"Out": [edit_distance_out], "SequenceNum": [sequence_num]}, + attrs={"normalized": normalized}, + ) return edit_distance_out, sequence_num -def binary_cross_entropy(input, - label, - weight=None, - reduction='mean', - name=None): +def binary_cross_entropy( + input, label, weight=None, reduction='mean', name=None +): """ This op measures the binary_cross_entropy loss between input predictions ``input`` and target labels ``label`` . The binary_cross_entropy loss can be described as: @@ -621,8 +661,9 @@ def binary_cross_entropy(input, if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in binary_cross_entropy should be 'sum', " - "'mean' or 'none', but received %s, which is not allowed." % - reduction) + "'mean' or 'none', but received %s, which is not allowed." 
+ % reduction + ) if in_dygraph_mode(): out = _C_ops.bce_loss(input, label) @@ -642,27 +683,32 @@ def binary_cross_entropy(input, if weight is not None: out = _legacy_C_ops.elementwise_mul(out, weight, 'axis', -1) if reduction == 'sum': - return _legacy_C_ops.reduce_sum(out, 'dim', [0], 'keep_dim', - False, "reduce_all", True) + return _legacy_C_ops.reduce_sum( + out, 'dim', [0], 'keep_dim', False, "reduce_all", True + ) elif reduction == 'mean': return _legacy_C_ops.mean(out) else: return out else: - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'binary_cross_entropy') - check_variable_and_dtype(label, 'label', ['float32', 'float64'], - 'binary_cross_entropy') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'binary_cross_entropy' + ) + check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'binary_cross_entropy' + ) sub_name = name if weight is None and reduction == 'none' else None helper = LayerHelper("binary_cross_entropy", name=sub_name) out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type='bce_loss', - inputs={ - 'X': [input], - 'Label': [label], - }, - outputs={'Out': [out]}) + helper.append_op( + type='bce_loss', + inputs={ + 'X': [input], + 'Label': [label], + }, + outputs={'Out': [out]}, + ) if weight is not None: if isinstance(weight, paddle.static.Variable): @@ -670,7 +716,8 @@ def binary_cross_entropy(input, out = paddle.multiply(out, weight, name=weight_name) else: raise ValueError( - "The weight is not a Tensor, please convert to Tensor.") + "The weight is not a Tensor, please convert to Tensor." + ) if reduction == 'sum': return paddle.sum(out, name=name) @@ -680,12 +727,9 @@ def binary_cross_entropy(input, return out -def binary_cross_entropy_with_logits(logit, - label, - weight=None, - reduction='mean', - pos_weight=None, - name=None): +def binary_cross_entropy_with_logits( + logit, label, weight=None, reduction='mean', pos_weight=None, name=None +): r""" This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer. Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits`` @@ -767,16 +811,23 @@ def binary_cross_entropy_with_logits(logit, raise ValueError( "The value of 'reduction' in binary_cross_entropy_with_logits " "should be 'sum', 'mean' or 'none', but received %s, which is not allowed." 
- % reduction) + % reduction + ) if in_dygraph_mode(): - one = _C_ops.full([1], float(1.0), core.VarDesc.VarType.FP32, - _current_expected_place()) - out = _C_ops.sigmoid_cross_entropy_with_logits(logit, label, False, - -100) + one = _C_ops.full( + [1], + float(1.0), + core.VarDesc.VarType.FP32, + _current_expected_place(), + ) + out = _C_ops.sigmoid_cross_entropy_with_logits( + logit, label, False, -100 + ) if pos_weight is not None: log_weight = _C_ops.add( - _C_ops.multiply(label, _C_ops.subtract(pos_weight, one)), one) + _C_ops.multiply(label, _C_ops.subtract(pos_weight, one)), one + ) out = _C_ops.multiply(out, log_weight) if weight is not None: out = _C_ops.multiply(out, weight) @@ -789,14 +840,27 @@ def binary_cross_entropy_with_logits(logit, return out elif _in_legacy_dygraph(): one = _varbase_creator(dtype=logit.dtype) - _legacy_C_ops.fill_constant(one, 'value', float(1.0), 'force_cpu', - False, 'dtype', one.dtype, 'str_value', - '1.0', 'shape', [1]) + _legacy_C_ops.fill_constant( + one, + 'value', + float(1.0), + 'force_cpu', + False, + 'dtype', + one.dtype, + 'str_value', + '1.0', + 'shape', + [1], + ) out = _legacy_C_ops.sigmoid_cross_entropy_with_logits(logit, label) if pos_weight is not None: log_weight = _legacy_C_ops.elementwise_add( _legacy_C_ops.elementwise_mul( - label, _legacy_C_ops.elementwise_sub(pos_weight, one)), one) + label, _legacy_C_ops.elementwise_sub(pos_weight, one) + ), + one, + ) out = _legacy_C_ops.elementwise_mul(out, log_weight) if weight is not None: out = _legacy_C_ops.elementwise_mul(out, weight) @@ -808,30 +872,49 @@ def binary_cross_entropy_with_logits(logit, else: return out - check_variable_and_dtype(logit, 'logit', ['float32', 'float64'], - 'binary_cross_entropy_with_logits') - check_variable_and_dtype(label, 'label', ['float32', 'float64'], - 'binary_cross_entropy_with_logits') + check_variable_and_dtype( + logit, + 'logit', + ['float32', 'float64'], + 'binary_cross_entropy_with_logits', + ) + check_variable_and_dtype( + label, + 'label', + ['float32', 'float64'], + 'binary_cross_entropy_with_logits', + ) sigmoid_name = None if reduction == 'none' and pos_weight is None and weight is None: sigmoid_name = name out = paddle.fluid.layers.sigmoid_cross_entropy_with_logits( - logit, label, name=sigmoid_name) + logit, label, name=sigmoid_name + ) one = paddle.full(shape=[1], fill_value=1.0, dtype=logit.dtype) if pos_weight is not None: - check_variable_and_dtype(pos_weight, 'pos_weight', - ['float32', 'float64'], - 'binary_cross_entropy_with_logits') + check_variable_and_dtype( + pos_weight, + 'pos_weight', + ['float32', 'float64'], + 'binary_cross_entropy_with_logits', + ) log_weight = paddle.add( - paddle.multiply(label, paddle.subtract(pos_weight, one)), one) - pos_weight_name = name if reduction == 'none' and weight is None else None + paddle.multiply(label, paddle.subtract(pos_weight, one)), one + ) + pos_weight_name = ( + name if reduction == 'none' and weight is None else None + ) out = paddle.multiply(out, log_weight, name=pos_weight_name) if weight is not None: - check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], - 'binary_cross_entropy_with_logits') + check_variable_and_dtype( + weight, + 'weight', + ['float32', 'float64'], + 'binary_cross_entropy_with_logits', + ) weight_name = name if reduction == 'none' else None out = paddle.multiply(out, weight, name=weight_name) @@ -842,27 +925,32 @@ def binary_cross_entropy_with_logits(logit, return out -def hsigmoid_loss(input, - label, - num_classes, - weight, - bias=None, - 
path_table=None, - path_code=None, - is_sparse=False, - name=None): +def hsigmoid_loss( + input, + label, + num_classes, + weight, + bias=None, + path_table=None, + path_code=None, + is_sparse=False, + name=None, +): """ The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity and speed up the model training, especially the training of language model. + Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. For each class(word), there's a unique path from root to itself, hsigmoid calculate the cost for each non-leaf node on the path, and sum them to get a total cost. - Comparing to softmax, the OP can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` + + Comparing to softmax, hsigmoid can reduce the computational complexity from :math:`O(N)` to :math:`O(logN)`, where :math:`N` represents the number of classes or the size of word dict. - The OP supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural - Network Language Model `_. For the custom - tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): + The API supports default tree and custom tree. For the default tree, you can refer to `Hierarchical Probabilistic Neural + Network Language Model `_. + + For the custom tree, you need to set :attr:`is_custom` to True, and do the following steps (take the language model as an example): 1. Using a custom word dict to build a binary tree, each leaf node should be an word in the word dict. 2. Creating a dict map word_id -> path that from the word to the root node, we call it path_table. 
@@ -929,36 +1017,63 @@ def hsigmoid_loss(input, # [1.92374969]] """ if in_dygraph_mode(): - out, _, _ = _C_ops.hierarchical_sigmoid(input, weight, label, - path_table, path_code, bias, - num_classes, is_sparse, 0, [], - [], [], is_sparse) + out, _, _ = _C_ops.hierarchical_sigmoid( + input, + weight, + label, + path_table, + path_code, + bias, + num_classes, + is_sparse, + 0, + [], + [], + [], + is_sparse, + ) return out elif _in_legacy_dygraph(): out, _, _ = _legacy_C_ops.hierarchical_sigmoid( - input, weight, label, path_table, path_code, bias, 'num_classes', - num_classes, 'is_sparse', is_sparse, 'remote_prefetch', is_sparse) + input, + weight, + label, + path_table, + path_code, + bias, + 'num_classes', + num_classes, + 'is_sparse', + is_sparse, + 'remote_prefetch', + is_sparse, + ) return out - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'hsigmoid_loss') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'hsigmoid_loss' + ) check_variable_and_dtype(label, 'label', ['int64'], 'hsigmoid_loss') - check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], - 'hsigmoid_loss') + check_variable_and_dtype( + weight, 'weight', ['float32', 'float64'], 'hsigmoid_loss' + ) if bias is not None: - check_variable_and_dtype(bias, 'bias', ['float32', 'float64'], - 'hsigmoid_loss') + check_variable_and_dtype( + bias, 'bias', ['float32', 'float64'], 'hsigmoid_loss' + ) if path_table is not None: - check_variable_and_dtype(path_table, 'path_table', ['int64'], - 'hsigmoid_loss') + check_variable_and_dtype( + path_table, 'path_table', ['int64'], 'hsigmoid_loss' + ) if path_code is not None: - check_variable_and_dtype(path_code, 'path_code', ['int64'], - 'hsigmoid_loss') + check_variable_and_dtype( + path_code, 'path_code', ['int64'], 'hsigmoid_loss' + ) attrs = { "num_classes": num_classes, "is_sparse": is_sparse, - "remote_prefetch": is_sparse + "remote_prefetch": is_sparse, } inputs = { @@ -967,7 +1082,7 @@ def hsigmoid_loss(input, "Bias": bias, "PathTable": path_table, "PathCode": path_code, - "Label": label + "Label": label, } helper = LayerHelper('hsigmoid_loss', **locals()) @@ -975,10 +1090,9 @@ def hsigmoid_loss(input, pre_out = helper.create_variable_for_type_inference(input.dtype) outputs = {"Out": out, "PreOut": pre_out, "W_Out": weight} - helper.append_op(type="hierarchical_sigmoid", - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type="hierarchical_sigmoid", inputs=inputs, outputs=outputs, attrs=attrs + ) return out @@ -991,17 +1105,17 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): .. math:: - loss(x,y) = \frac{1}{n}\sum_{i}z_i + loss(x,y) = \frac{1}{n}\sum_{i}z_i - where z_i is given by: + where :math:`z_i` is given by: .. math:: \mathop{z_i} = \left\{\begin{array}{rcl} - 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\ - delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise} - \end{array} \right. + 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < \delta} \\ + \delta * |x_i - y_i| - 0.5 * \delta^2 & & {otherwise} + \end{array} \right. Parameters: input (Tensor): Input tensor, the data type is float32 or float64. Shape is @@ -1015,12 +1129,11 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned. Default is ``'mean'``. - delta (float, optional): Specifies the hyperparameter delta to be used. 
+ delta (float, optional): Specifies the hyperparameter :math:`\delta` to be used. The value determines how large the errors need to be to use L1. Errors smaller than delta are minimized with L2. Parameter is ignored for negative/zero values. Default = 1.0 - name (str, optional): Name for the operation (optional, default is - None). For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Returns: Tensor, The tensor variable storing the smooth_l1_loss of input and label. @@ -1029,43 +1142,42 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): .. code-block:: python import paddle - import numpy as np - input_data = np.random.rand(3,3).astype("float32") - label_data = np.random.rand(3,3).astype("float32") - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) + input = paddle.rand([3, 3]).astype('float32') + label = paddle.rand([3, 3]).astype('float32') output = paddle.nn.functional.smooth_l1_loss(input, label) print(output) + # [0.068004] """ - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'smooth_l1_loss') - check_variable_and_dtype(label, 'label', ['float32', 'float64'], - 'smooth_l1_loss') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'smooth_l1_loss' + ) + check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'smooth_l1_loss' + ) if in_dygraph_mode(): out, residual = _C_ops.huber_loss(input, label, delta) else: helper = LayerHelper('huber_loss', **locals()) residual = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype() + ) out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) - helper.append_op(type='huber_loss', - inputs={ - 'X': input, - 'Y': label - }, - outputs={ - 'Out': out, - 'Residual': residual - }, - attrs={'delta': delta}) + dtype=helper.input_dtype() + ) + helper.append_op( + type='huber_loss', + inputs={'X': input, 'Y': label}, + outputs={'Out': out, 'Residual': residual}, + attrs={'delta': delta}, + ) if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in smooth_l1_loss should be 'sum', 'mean' or" - " 'none', but received %s, which is not allowed." % reduction) + " 'none', but received %s, which is not allowed." % reduction + ) if reduction == 'none': return out elif reduction == 'mean': @@ -1074,12 +1186,9 @@ def smooth_l1_loss(input, label, reduction='mean', delta=1.0, name=None): return paddle.sum(out) -def margin_ranking_loss(input, - other, - label, - margin=0.0, - reduction='mean', - name=None): +def margin_ranking_loss( + input, other, label, margin=0.0, reduction='mean', name=None +): r""" Calcluate the margin rank loss between the input, other and label, use the math function as follows. @@ -1107,7 +1216,7 @@ def margin_ranking_loss(input, reduction (str, optional): Indicate the reduction to apply to the loss, the candicates are ``'none'``, ``'mean'``, ``'sum'``.If :attr:`reduction` is ``'none'``, the unreduced loss is returned; If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned. If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. Default is ``'mean'``. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. 
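# --- Illustrative usage sketch (not part of the patch above) ---
# A small, hypothetical example complementing the smooth_l1_loss hunks above:
# it shows how ``reduction`` and ``delta`` interact. The tensor values are
# arbitrary; with reduction='none' the element-wise loss keeps the input shape.
import paddle
import paddle.nn.functional as F

pred = paddle.to_tensor([[0.1, 0.9], [2.0, -1.0]], dtype='float32')
target = paddle.to_tensor([[0.0, 1.0], [0.0, 0.0]], dtype='float32')

per_elem = F.smooth_l1_loss(pred, target, reduction='none')
# a larger delta keeps a wider error range in the quadratic (L2) regime
mean_loss = F.smooth_l1_loss(pred, target, reduction='mean', delta=2.0)
print(per_elem.shape, float(mean_loss))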
- Returns: + Returns: Tensor, if :attr:`reduction` is ``'mean'`` or ``'sum'``, the out shape is :math:`[1]`, otherwise the shape is the same as `input` .The same dtype as input tensor. Examples: @@ -1125,7 +1234,8 @@ def margin_ranking_loss(input, if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but " - "received %s, which is not allowed." % reduction) + "received %s, which is not allowed." % reduction + ) if in_dygraph_mode(): out = _C_ops.subtract(other, input) out = _C_ops.multiply(out, label) @@ -1152,12 +1262,15 @@ def margin_ranking_loss(input, return out helper = LayerHelper("margin_ranking_loss", **locals()) - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'margin_rank_loss') - check_variable_and_dtype(other, 'other', ['float32', 'float64'], - 'margin_rank_loss') - check_variable_and_dtype(label, 'label', ['float32', 'float64'], - 'margin_rank_loss') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'margin_rank_loss' + ) + check_variable_and_dtype( + other, 'other', ['float32', 'float64'], 'margin_rank_loss' + ) + check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'margin_rank_loss' + ) out = paddle.subtract(other, input) out = paddle.multiply(out, label) @@ -1170,30 +1283,35 @@ def margin_ranking_loss(input, result_out = helper.create_variable_for_type_inference(input.dtype) if reduction == 'none': - helper.append_op(type="relu", - inputs={"X": out}, - outputs={"Out": result_out}) + helper.append_op( + type="relu", inputs={"X": out}, outputs={"Out": result_out} + ) return result_out elif reduction == 'sum': out = paddle.nn.functional.relu(out) attrs = {"dim": [0], "keep_dim": False, "reduce_all": True} - helper.append_op(type="reduce_sum", - inputs={"X": out}, - outputs={"Out": result_out}, - attrs=attrs) + helper.append_op( + type="reduce_sum", + inputs={"X": out}, + outputs={"Out": result_out}, + attrs=attrs, + ) return result_out elif reduction == 'mean': out = paddle.nn.functional.relu(out) - helper.append_op(type="mean", - inputs={"X": out}, - outputs={"Out": result_out}, - attrs={}) + helper.append_op( + type="mean", + inputs={"X": out}, + outputs={"Out": result_out}, + attrs={}, + ) return result_out def l1_loss(input, label, reduction='mean', name=None): r""" - This operator computes the L1 Loss of Tensor ``input`` and ``label`` as follows. + + Computes the L1 Loss of Tensor ``input`` and ``label`` as follows. If `reduction` set to ``'none'``, the loss is: @@ -1224,8 +1342,8 @@ def l1_loss(input, label, reduction='mean', name=None): Returns: Tensor, the L1 Loss of Tensor ``input`` and ``label``. - If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . - If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. + If `reduction` is ``'none'``, the shape of output loss is :math:`[N, *]`, the same as ``input`` . + If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. Examples: .. code-block:: python @@ -1247,11 +1365,13 @@ def l1_loss(input, label, reduction='mean', name=None): l1_loss = paddle.nn.functional.l1_loss(input, label, reduction='sum') print(l1_loss.numpy()) # [1.4] + """ if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but " - "received %s, which is not allowed." % reduction) + "received %s, which is not allowed." 
% reduction + ) if in_dygraph_mode(): unreduced = _C_ops.abs(_C_ops.subtract(input, label)) @@ -1263,25 +1383,24 @@ def l1_loss(input, label, reduction='mean', name=None): else: return unreduced elif _in_legacy_dygraph(): - unreduced = _elementwise_op_in_dygraph(input, - label, - axis=-1, - act='abs', - op_name='elementwise_sub') + unreduced = _elementwise_op_in_dygraph( + input, label, axis=-1, act='abs', op_name='elementwise_sub' + ) if reduction == 'mean': return _legacy_C_ops.mean(unreduced) elif reduction == 'sum': - return _legacy_C_ops.reduce_sum(unreduced, 'dim', [0], 'keep_dim', - False, 'reduce_all', True) + return _legacy_C_ops.reduce_sum( + unreduced, 'dim', [0], 'keep_dim', False, 'reduce_all', True + ) else: return unreduced - check_variable_and_dtype(input, 'input', - ['float32', 'float64', 'int32', 'int64'], - 'l1_loss') - check_variable_and_dtype(label, 'label', - ['float32', 'float64', 'int32', 'int64'], - 'l1_loss') + check_variable_and_dtype( + input, 'input', ['float32', 'float64', 'int32', 'int64'], 'l1_loss' + ) + check_variable_and_dtype( + label, 'label', ['float32', 'float64', 'int32', 'int64'], 'l1_loss' + ) if reduction == 'sum': unreduced = paddle.fluid.layers.elementwise_sub(input, label, act='abs') @@ -1290,18 +1409,14 @@ def l1_loss(input, label, reduction='mean', name=None): unreduced = paddle.fluid.layers.elementwise_sub(input, label, act='abs') return paddle.mean(unreduced, name=name) else: - return paddle.fluid.layers.elementwise_sub(input, - label, - act='abs', - name=name) - - -def nll_loss(input, - label, - weight=None, - ignore_index=-100, - reduction='mean', - name=None): + return paddle.fluid.layers.elementwise_sub( + input, label, act='abs', name=name + ) + + +def nll_loss( + input, label, weight=None, ignore_index=-100, reduction='mean', name=None +): """ This api returns negative log likelihood. See more detail in :ref:`api_nn_loss_NLLLoss` . @@ -1350,13 +1465,15 @@ def nll_loss(input, if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in nll_loss should be 'sum', 'mean' or " - "'none', but received %s, which is not allowed." % reduction) + "'none', but received %s, which is not allowed." 
% reduction + ) input_shape = list(input.shape) input_dims = len(input_shape) if input_dims < 2: raise ValueError( - 'Expected 2 or more dimensions (got {})'.format(input_dims)) + 'Expected 2 or more dimensions (got {})'.format(input_dims) + ) n = input_shape[0] c = input_shape[1] if in_dygraph_mode(): @@ -1364,21 +1481,29 @@ def nll_loss(input, input = _C_ops.reshape(input, [n, c, 1, -1]) label = _C_ops.reshape(label, [n, 1, -1]) out_shape = [n] + input_shape[2:] - out, total_weight = _C_ops.nll_loss(input, label, weight, ignore_index, - reduction) + out, total_weight = _C_ops.nll_loss( + input, label, weight, ignore_index, reduction + ) if input_dims != 2 and input_dims != 4 and reduction == 'none': out = _C_ops.reshape(out, out_shape) return out elif _in_legacy_dygraph(): if input_dims != 2 and input_dims != 4: - input, _ = _legacy_C_ops.reshape2(input, None, 'shape', - [n, c, 1, -1]) + input, _ = _legacy_C_ops.reshape2( + input, None, 'shape', [n, c, 1, -1] + ) label, _ = _legacy_C_ops.reshape2(label, None, 'shape', [n, 1, -1]) out_shape = [n] + input_shape[2:] - out, total_weight = _legacy_C_ops.nll_loss(input, label, weight, - 'ignore_index', ignore_index, - 'reduction', reduction) + out, total_weight = _legacy_C_ops.nll_loss( + input, + label, + weight, + 'ignore_index', + ignore_index, + 'reduction', + reduction, + ) if input_dims != 2 and input_dims != 4 and reduction == 'none': out, _ = _legacy_C_ops.reshape2(out, None, 'shape', out_shape) return out @@ -1402,10 +1527,9 @@ def nll_loss(input, total_weight = helper.create_variable_for_type_inference(dtype=input.dtype) outputs = {'Out': out, 'Total_weight': total_weight} - helper.append_op(type='nll_loss', - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type='nll_loss', inputs=inputs, outputs=outputs, attrs=attrs + ) if input_dims != 2 and input_dims != 4 and reduction == 'none': out = reshape(out, shape=out_shape) @@ -1414,7 +1538,7 @@ def nll_loss(input, def kl_div(input, label, reduction='mean', name=None): r""" - This operator calculates the Kullback-Leibler divergence loss + Calculate the Kullback-Leibler divergence loss between Input(X) and Input(Target). Notes that Input(X) is the log-probability and Input(Target) is the probability. @@ -1459,42 +1583,39 @@ def kl_div(input, label, reduction='mean', name=None): .. 
code-block:: python import paddle - import numpy as np import paddle.nn.functional as F shape = (5, 20) - input = np.random.uniform(-10, 10, shape).astype('float32') - target = np.random.uniform(-10, 10, shape).astype('float32') + x = paddle.uniform(shape, min=-10, max=10).astype('float32') + target = paddle.uniform(shape, min=-10, max=10).astype('float32') # 'batchmean' reduction, loss shape will be [1] - pred_loss = F.kl_div(paddle.to_tensor(input), - paddle.to_tensor(target), reduction='batchmean') + pred_loss = F.kl_div(x, target, reduction='batchmean') # shape=[1] # 'mean' reduction, loss shape will be [1] - pred_loss = F.kl_div(paddle.to_tensor(input), - paddle.to_tensor(target), reduction='mean') + pred_loss = F.kl_div(x, target, reduction='mean') # shape=[1] # 'sum' reduction, loss shape will be [1] - pred_loss = F.kl_div(paddle.to_tensor(input), - paddle.to_tensor(target), reduction='sum') + pred_loss = F.kl_div(x, target, reduction='sum') # shape=[1] # 'none' reduction, loss shape is same with input shape - pred_loss = F.kl_div(paddle.to_tensor(input), - paddle.to_tensor(target), reduction='none') + pred_loss = F.kl_div(x, target, reduction='none') # shape=[5, 20] """ # ugly type promotion - if fluid.data_feeder.convert_dtype( - input.dtype) == 'float32' and fluid.data_feeder.convert_dtype( - label.dtype) == 'float64': + if ( + fluid.data_feeder.convert_dtype(input.dtype) == 'float32' + and fluid.data_feeder.convert_dtype(label.dtype) == 'float64' + ): input = paddle.cast(input, 'float64') - elif fluid.data_feeder.convert_dtype( - input.dtype) == 'float64' and fluid.data_feeder.convert_dtype( - label.dtype) == 'float32': + elif ( + fluid.data_feeder.convert_dtype(input.dtype) == 'float64' + and fluid.data_feeder.convert_dtype(label.dtype) == 'float32' + ): label = paddle.cast(label, 'float64') if in_dygraph_mode(): @@ -1527,13 +1648,12 @@ def kl_div(input, label, reduction='mean', name=None): fluid.data_feeder.check_type(reduction, 'reduction', str, 'kl_div') loss = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type='kldiv_loss', - inputs={ - 'X': input, - 'Target': label - }, - outputs={'Loss': loss}, - attrs={'reduction': 'none'}) + helper.append_op( + type='kldiv_loss', + inputs={'X': input, 'Target': label}, + outputs={'Loss': loss}, + attrs={'reduction': 'none'}, + ) if reduction == 'mean': loss = paddle.mean(loss) @@ -1596,31 +1716,38 @@ def mse_loss(input, label, reduction='mean', name=None): if reduction not in ['sum', 'mean', 'none']: raise ValueError( "'reduction' in 'mse_loss' should be 'sum', 'mean' or 'none', " - "but received {}.".format(reduction)) + "but received {}.".format(reduction) + ) if not in_dynamic_mode(): - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'mse_loss') - check_variable_and_dtype(label, 'label', ['float32', 'float64'], - 'mse_loss') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'mse_loss' + ) + check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'mse_loss' + ) if reduction == 'none': return paddle.square(paddle.subtract(input, label), name=name) elif reduction == 'mean': - return paddle.mean(paddle.square(paddle.subtract(input, label)), - name=name) + return paddle.mean( + paddle.square(paddle.subtract(input, label)), name=name + ) else: - return paddle.sum(paddle.square(paddle.subtract(input, label)), - name=name) + return paddle.sum( + paddle.square(paddle.subtract(input, label)), name=name + ) -def ctc_loss(log_probs, - labels, - input_lengths, - 
label_lengths, - blank=0, - reduction='mean', - norm_by_times=False): +def ctc_loss( + log_probs, + labels, + input_lengths, + label_lengths, + blank=0, + reduction='mean', + norm_by_times=False, +): """ An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc) @@ -1646,7 +1773,6 @@ def ctc_loss(log_probs, # declarative mode import paddle.nn.functional as F - import numpy as np import paddle # length of the longest logit sequence @@ -1658,8 +1784,7 @@ def ctc_loss(log_probs, # class num class_num = 3 - np.random.seed(1) - log_probs = np.array([[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04], + log_probs = paddle.to_tensor([[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04], [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]], [[1.86260208e-01, 3.45560730e-01, 3.96767467e-01], @@ -1672,35 +1797,36 @@ def ctc_loss(log_probs, [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]], [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02], - [3.90547849e-02, 1.69830427e-01, 8.78142476e-01]]]).astype("float32") - labels = np.array([[1, 2, 2], - [1, 2, 2]]).astype("int32") - input_lengths = np.array([5, 5]).astype("int64") - label_lengths = np.array([3, 3]).astype("int64") - - log_probs = paddle.to_tensor(log_probs) - labels = paddle.to_tensor(labels) - input_lengths = paddle.to_tensor(input_lengths) - label_lengths = paddle.to_tensor(label_lengths) + [3.90547849e-02, 1.69830427e-01, 8.78142476e-01]]], + dtype="float32") + labels = paddle.to_tensor([[1, 2, 2], + [1, 2, 2]], dtype="int32") + input_lengths = paddle.to_tensor([5, 5], dtype="int64") + label_lengths = paddle.to_tensor([3, 3], dtype="int64") loss = F.ctc_loss(log_probs, labels, input_lengths, label_lengths, blank=0, reduction='none') - print(loss) #[3.9179852 2.9076521] + print(loss) + # Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [3.91798496, 2.90765190]) loss = F.ctc_loss(log_probs, labels, input_lengths, label_lengths, blank=0, reduction='mean') - print(loss) #[1.1376063] + print(loss) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [1.13760614]) """ - loss_out = fluid.layers.warpctc(log_probs, labels, blank, norm_by_times, - input_lengths, label_lengths) + loss_out = fluid.layers.warpctc( + log_probs, labels, blank, norm_by_times, input_lengths, label_lengths + ) loss_out = paddle.squeeze(loss_out, [-1]) assert reduction in ['mean', 'sum', 'none'] @@ -1711,15 +1837,17 @@ def ctc_loss(log_probs, return loss_out -def margin_cross_entropy(logits, - label, - margin1=1.0, - margin2=0.5, - margin3=0.0, - scale=64.0, - group=None, - return_softmax=False, - reduction='mean'): +def margin_cross_entropy( + logits, + label, + margin1=1.0, + margin2=0.5, + margin3=0.0, + scale=64.0, + group=None, + return_softmax=False, + reduction='mean', +): r""" .. math:: @@ -1731,9 +1859,7 @@ def margin_cross_entropy(logits, .. hint:: The API supports single GPU and multi GPU, and don't supports CPU. - For data parallel mode, set ``group=False``. - For model parallel mode, set ``group=None`` or the group instance return by paddle.distributed.new_group. And logits.shape[-1] can be different at each rank. @@ -1745,7 +1871,7 @@ def margin_cross_entropy(logits, margin2 (float, optional): m2 of margin loss, default value is `0.5`. margin3 (float, optional): m3 of margin loss, default value is `0.0`. scale (float, optional): s of margin loss, default value is `64.0`. 
- group (Group, optional): The group instance return by paddle.distributed.new_group + group (Group, optional): The group instance return by paddle.distributed.new_group or ``None`` for global default group or ``False`` for data parallel (do not communication cross ranks). Default is ``None``. return_softmax (bool, optional): Whether return softmax probability. Default value is `False`. @@ -1756,12 +1882,12 @@ def margin_cross_entropy(logits, Default value is `'mean'`. Returns: - ``Tensor`` or Tuple of two ``Tensor`` : Return the cross entropy loss if \ - `return_softmax` is False, otherwise the tuple \ - (loss, softmax), softmax is shard_softmax when \ - using model parallel, otherwise softmax is in \ - the same shape with input logits. If ``reduction == None``, \ - the shape of loss is ``[N, 1]``, otherwise the shape is ``[1]``. + Tensor|tuple[Tensor, Tensor], return the cross entropy loss if + `return_softmax` is False, otherwise the tuple (loss, softmax), + softmax is shard_softmax when using model parallel, otherwise + softmax is in the same shape with input logits. If + ``reduction == None``, the shape of loss is ``[N, 1]``, otherwise + the shape is ``[1]``. Examples: @@ -1801,7 +1927,7 @@ def margin_cross_entropy(logits, print(label) print(loss) print(softmax) - + #Tensor(shape=[2, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[ 0.85204151, -0.55557678, 0.04994566, 0.71986042], # [-0.20198586, -0.35270476, -0.55182702, 0.09749021]]) @@ -1862,7 +1988,7 @@ def margin_cross_entropy(logits, print(loss) print(softmax) - # python -m paddle.distributed.launch --gpus=0,1 test_margin_cross_entropy.py + # python -m paddle.distributed.launch --gpus=0,1 test_margin_cross_entropy.py ## for rank0 input #Tensor(shape=[4, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[ 0.32888934, 0.02408748, -0.02763289, 0.18173063], @@ -1909,7 +2035,10 @@ def margin_cross_entropy(logits, if not (group == False or group is None or hasattr(group, 'is_member')): raise ValueError( 'Expected group is False, None or instance of paddle.distributed.collective.Group \ - (got group: {})'.format(group)) + (got group: {})'.format( + group + ) + ) return if hasattr(group, 'is_member') and not group.is_member(): @@ -1923,8 +2052,11 @@ def margin_cross_entropy(logits, if core.is_compiled_with_dist(): parallel_env = paddle.distributed.ParallelEnv() global_rank = parallel_env.rank - rank = global_rank if group is None else group.get_group_rank( - global_rank) + rank = ( + global_rank + if group is None + else group.get_group_rank(global_rank) + ) nranks = parallel_env.world_size if group is None else group.nranks input_dims = len(list(logits.shape)) @@ -1932,15 +2064,26 @@ def margin_cross_entropy(logits, if input_dims - 1 != label_dims and input_dims != label_dims: raise ValueError( 'Expected input_dims - 1 = label_dims or input_dims == label_dims\ - (got nput_dims{}, label_dims{})'.format(input_dims, label_dims)) + (got nput_dims{}, label_dims{})'.format( + input_dims, label_dims + ) + ) if input_dims - 1 == label_dims: label = paddle.unsqueeze(label, axis=-1) if in_dygraph_mode(): - softmax, loss = _C_ops.margin_cross_entropy(logits, label, - return_softmax, ring_id, - rank, nranks, margin1, - margin2, margin3, scale) + softmax, loss = _C_ops.margin_cross_entropy( + logits, + label, + return_softmax, + ring_id, + rank, + nranks, + margin1, + margin2, + margin3, + scale, + ) if reduction == 'mean': loss = paddle.mean(loss) elif reduction == 'sum': @@ -1951,9 +2094,25 @@ def 
margin_cross_entropy(logits, return loss, softmax elif _in_legacy_dygraph(): softmax, loss = _legacy_C_ops.margin_cross_entropy( - logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks, - 'margin1', margin1, 'margin2', margin2, 'margin3', margin3, 'scale', - scale, 'return_softmax', return_softmax) + logits, + label, + 'ring_id', + ring_id, + 'rank', + rank, + 'nranks', + nranks, + 'margin1', + margin1, + 'margin2', + margin2, + 'margin3', + margin3, + 'scale', + scale, + 'return_softmax', + return_softmax, + ) if reduction == 'mean': loss = paddle.mean(loss) elif reduction == 'sum': @@ -1968,31 +2127,31 @@ def margin_cross_entropy(logits, softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) loss = helper.create_variable_for_type_inference(dtype=logits.dtype) - check_variable_and_dtype(logits, 'logits', - ['float16', 'float32', 'float64'], - 'margin_cross_entropy') - check_variable_and_dtype(label, 'label', ['int32', 'int64'], - 'margin_cross_entropy') - - helper.append_op(type=op_type, - inputs={ - 'Logits': logits, - 'Label': label - }, - outputs={ - 'Softmax': softmax, - 'Loss': loss - }, - attrs={ - 'return_softmax': return_softmax, - 'ring_id': ring_id, - 'rank': rank, - 'nranks': nranks, - 'margin1': margin1, - 'margin2': margin2, - 'margin3': margin3, - 'scale': scale, - }) + check_variable_and_dtype( + logits, + 'logits', + ['float16', 'float32', 'float64'], + 'margin_cross_entropy', + ) + check_variable_and_dtype( + label, 'label', ['int32', 'int64'], 'margin_cross_entropy' + ) + + helper.append_op( + type=op_type, + inputs={'Logits': logits, 'Label': label}, + outputs={'Softmax': softmax, 'Loss': loss}, + attrs={ + 'return_softmax': return_softmax, + 'ring_id': ring_id, + 'rank': rank, + 'nranks': nranks, + 'margin1': margin1, + 'margin2': margin2, + 'margin3': margin3, + 'scale': scale, + }, + ) if reduction == 'mean': loss = paddle.mean(loss) @@ -2009,16 +2168,20 @@ def margin_cross_entropy(logits, since="2.0.0", update_to="paddle.nn.functional.cross_entropy", level=1, - reason= - ('Please notice that behavior of "paddle.nn.functional.softmax_with_cross_entropy" ' - 'and "paddle.nn.functional.cross_entropy" is different.')) -def softmax_with_cross_entropy(logits, - label, - soft_label=False, - ignore_index=-100, - numeric_stable_mode=True, - return_softmax=False, - axis=-1): + reason=( + 'Please notice that behavior of "paddle.nn.functional.softmax_with_cross_entropy" ' + 'and "paddle.nn.functional.cross_entropy" is different.' + ), +) +def softmax_with_cross_entropy( + logits, + label, + soft_label=False, + ignore_index=-100, + numeric_stable_mode=True, + return_softmax=False, + axis=-1, +): r""" This operator implements the cross entropy loss function with softmax. This function combines the calculation of the softmax operation and the cross entropy loss function @@ -2093,44 +2256,51 @@ def softmax_with_cross_entropy(logits, .. 
code-block:: python import paddle - import numpy as np - - data = np.random.rand(128).astype("float32") - label = np.random.rand(1).astype("int64") - data = paddle.to_tensor(data) - label = paddle.to_tensor(label) - linear = paddle.nn.Linear(128, 100) - x = linear(data) - out = paddle.nn.functional.softmax_with_cross_entropy(logits=x, label=label) + + logits = paddle.to_tensor([0.4, 0.6, 0.9], dtype="float32") + label = paddle.to_tensor([1], dtype="int64") + + out = paddle.nn.functional.softmax_with_cross_entropy(logits=logits, label=label) print(out) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [1.15328646]) """ - return fluid_softmax_with_cross_entropy(logits, label, soft_label, - ignore_index, numeric_stable_mode, - return_softmax, axis) - - -def cross_entropy(input, - label, - weight=None, - ignore_index=-100, - reduction='mean', - soft_label=False, - axis=-1, - use_softmax=True, - name=None): + return fluid_softmax_with_cross_entropy( + logits, + label, + soft_label, + ignore_index, + numeric_stable_mode, + return_softmax, + axis, + ) + + +def cross_entropy( + input, + label, + weight=None, + ignore_index=-100, + reduction='mean', + soft_label=False, + axis=-1, + use_softmax=True, + name=None, +): r""" - By default, this operator implements the cross entropy loss function with softmax. This function - combines the calculation of the softmax operation and the cross entropy loss function - to provide a more numerically stable computing. + + By default, this operator implements the cross entropy loss function with softmax. This function + combines the calculation of the softmax operation and the cross entropy loss function + to provide a more numerically stable computing. This operator will calculate the cross entropy loss function without softmax when use_softmax=False. - By default, this operator will calculate the mean of the result, and you can also affect - the default behavior by using the reduction parameter. Please refer to the part of + By default, this operator will calculate the mean of the result, and you can also affect + the default behavior by using the reduction parameter. Please refer to the part of parameters for details. This operator can be used to calculate the softmax cross entropy loss with soft and hard labels. - Where, the hard labels mean the actual label value, 0, 1, 2, etc. And the soft labels + Where, the hard labels mean the actual label value, 0, 1, 2, etc. And the soft labels mean the probability of the actual label, 0.6, 0.8, 0.2, etc. The calculation of this operator includes the following two steps. @@ -2185,7 +2355,7 @@ def cross_entropy(input, 1.1. Hard labels (soft_label = False) .. math:: - \\loss_j=loss_j*weight[label_j] + \\loss_j=loss_j*weight[label_j] 1.2. Soft labels (soft_label = True) @@ -2195,21 +2365,21 @@ def cross_entropy(input, 2. reduction - 2.1 if the ``reduction`` parameter is ``none`` + 2.1 if the ``reduction`` parameter is ``none`` Return the previous result directly - 2.2 if the ``reduction`` parameter is ``sum`` + 2.2 if the ``reduction`` parameter is ``sum`` Return the sum of the previous results .. math:: \\loss=\sum_{j}loss_j - 2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to - the ``weight`` parameter as follows. + 2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to + the ``weight`` parameter as follows. - 2.3.1. If the ``weight`` parameter is ``None`` + 2.3.1. 
If the ``weight`` parameter is ``None`` Return the average value of the previous results @@ -2223,7 +2393,7 @@ def cross_entropy(input, 1. Hard labels (soft_label = False) .. math:: - \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] + \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] 2. Soft labels (soft_label = True) @@ -2232,70 +2402,41 @@ def cross_entropy(input, Parameters: + input (Tensor): the data type is float32, float64. Shape is :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes, ``k >= 1`` . - - **input** (Tensor) - - Input tensor, the data type is float32, float64. Shape is - :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . - - Note: - - 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the - output of softmax operator, which will produce incorrect results. - + Note: + 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results. 2. when use_softmax=False, it expects the output of softmax operator. - - **label** (Tensor) - + label (Tensor): 1. If soft_label=False, the shape is :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1. the data type is int32, int64, float32, float64, where each value is [0, C-1]. - 2. If soft_label=True, the shape and data type should be same with ``input`` , + 2. If soft_label=True, the shape and data type should be same with ``input`` , and the sum of the labels for each sample should be 1. - - **weight** (Tensor, optional) - - a manual rescaling weight given to each class. - If given, has to be a Tensor of size C and the data type is float32, float64. + weight (Tensor, optional): a manual rescaling weight given to each class. + If given, has to be a Tensor of size C and the data type is float32, float64. Default is ``'None'`` . - - - **ignore_index** (int64, optional) - - Specifies a target value that is ignored - and does not contribute to the loss. A negative value means that no label - value needs to be ignored. Only valid when soft_label = False. + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the loss. A negative value means that no label + value needs to be ignored. Only valid when soft_label = False. Default is ``-100`` . - - - **reduction** (str, optional) - - Indicate how to average the loss by batch_size, + reduction (str, optional): Indicate how to average the loss by batch_size, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned. Default is ``'mean'``. - - - **soft_label** (bool, optional) - - Indicate whether label is soft. - Default is ``False``. - - - **axis** (int, optional) - - The index of dimension to perform softmax calculations. - It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the - number of dimensions of input :attr:`input`. + soft_label (bool, optional): Indicate whether label is soft. Default is ``False``. + axis (int, optional):The index of dimension to perform softmax calculations. + It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the + number of dimensions of input :attr:`input`. Default is ``-1`` . - - - **use_softmax** (bool, optional) - - Indicate whether compute softmax before cross_entropy. 
+ use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy. Default is ``True``. - - - **name** (str, optional) - - The name of the operator. Default is ``None`` . + name (str, optional): The name of the operator. Default is ``None`` . For more information, please refer to :ref:`api_guide_Name` . Returns: @@ -2307,13 +2448,11 @@ def cross_entropy(input, If :attr:`reduction` is ``'none'``: - 1. If soft_label = False, the dimension of return value is the same with ``label`` . - - 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . + 1. If soft_label = False, the dimension of return value is the same with ``label`` . + 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . Examples: - .. code-block:: python # hard labels @@ -2322,10 +2461,10 @@ def cross_entropy(input, N=100 C=200 reduction='mean' - input = paddle.rand([N, C], dtype='float64') + input = paddle.rand([N, C], dtype='float64') label = paddle.randint(0, C, shape=[N], dtype='int64') - weight = paddle.rand([C], dtype='float64') - + weight = paddle.rand([C], dtype='float64') + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction=reduction) dy_ret = cross_entropy_loss( @@ -2349,9 +2488,9 @@ def cross_entropy(input, labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0) labels /= paddle.sum(labels, axis=axis, keepdim=True) paddle_loss_mean = paddle.nn.functional.cross_entropy( - logits, - labels, - soft_label=True, + logits, + labels, + soft_label=True, axis=axis, weight=weight, reduction=reduction) @@ -2363,12 +2502,14 @@ def cross_entropy(input, raise ValueError( "The value of 'reduction' in softmax_cross_entropy" "should be 'sum', 'mean' or 'none', but received %s, which is not allowed." - % reduction) + % reduction + ) if ignore_index > 0 and soft_label == True: raise ValueError( "When soft_label == True, the value of 'ignore_index' in softmax_cross_entropy" - "should be '-100', but received %s, which is not allowed." % - ignore_index) + "should be '-100', but received %s, which is not allowed." 
+ % ignore_index + ) input_dims = len(list(input.shape)) if input_dims == 0: @@ -2378,37 +2519,53 @@ def cross_entropy(input, if input_dims - 1 != label_dims and input_dims != label_dims: raise ValueError( 'Expected nput_dims - 1 = label_dims or input_dims == label_dims\ - (got nput_dims{}, label_dims{})'.format(input_dims, label_dims)) + (got nput_dims{}, label_dims{})'.format( + input_dims, label_dims + ) + ) if input_dims - 1 == label_dims: label = paddle.unsqueeze(label, axis=axis) if in_dygraph_mode(): if soft_label == False: - valid_label = paddle.cast(label != ignore_index, - dtype=label.dtype) * label - label_min = paddle.min(valid_label) - label_max = paddle.max(valid_label) - if label_min < 0: - raise ValueError("Target {} is out of lower bound.".format( - label_min.item())) - if label_max >= input.shape[axis]: - raise ValueError("Target {} is out of upper bound.".format( - label_max.item())) + valid_label = ( + paddle.cast(label != ignore_index, dtype=label.dtype) * label + ) if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): if soft_label == False: _, _, out = _legacy_C_ops.softmax_with_cross_entropy( - input, valid_label, 'soft_label', soft_label, - 'ignore_index', ignore_index, 'numeric_stable_mode', True, - 'axis', axis, 'use_softmax', use_softmax) + input, + valid_label, + 'soft_label', + soft_label, + 'ignore_index', + ignore_index, + 'numeric_stable_mode', + True, + 'axis', + axis, + 'use_softmax', + use_softmax, + ) else: _, _, out = _legacy_C_ops.softmax_with_cross_entropy( - input, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', True, 'axis', axis, - 'use_softmax', use_softmax) + input, + label, + 'soft_label', + soft_label, + 'ignore_index', + ignore_index, + 'numeric_stable_mode', + True, + 'axis', + axis, + 'use_softmax', + use_softmax, + ) else: - _, out = _C_ops.cross_entropy_with_softmax(input, label, soft_label, - use_softmax, True, - ignore_index, axis) + _, out = _C_ops.cross_entropy_with_softmax( + input, label, soft_label, use_softmax, True, ignore_index, axis + ) if weight is not None: @@ -2418,11 +2575,12 @@ def cross_entropy(input, # weight's shape is C, where C is class num. # for 1d case: label's shape is [N,C], weight_gather's shape is N. # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W]. 
- weight_gather = paddle.matmul(x=paddle.cast( - label, weight.dtype), - y=weight, - transpose_x=False, - transpose_y=True) + weight_gather = paddle.matmul( + x=paddle.cast(label, weight.dtype), + y=weight, + transpose_x=False, + transpose_y=True, + ) out_shape = list(out.shape) weight_gather_reshape = reshape(weight_gather, shape=out_shape) out = paddle.cast(out, weight_gather_reshape.dtype) @@ -2433,29 +2591,44 @@ def cross_entropy(input, raise ValueError( "input's class_dimension({}) must equal to " "weight's class_dimension({}) " - "when weight is provided" \ - .format(input.shape[axis], weight.shape[-1])) - - ignore_weight_mask = paddle.cast((label != ignore_index), - out.dtype) - if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ - axis] == 1: + "when weight is provided".format( + input.shape[axis], weight.shape[-1] + ) + ) + + ignore_weight_mask = paddle.cast( + (label != ignore_index), out.dtype + ) + if ( + ignore_weight_mask.ndim > 1 + and ignore_weight_mask.shape[axis] == 1 + ): # TODO: Temporarily use squeeze instead of squeeze_ - ignore_weight_mask = paddle.squeeze(ignore_weight_mask, - axis) + ignore_weight_mask = paddle.squeeze( + ignore_weight_mask, axis + ) if axis != -1 and axis != valid_label.ndim - 1: - temp_perm = list(range(axis % valid_label.ndim)) \ - + list(range((axis % valid_label.ndim + 1), valid_label.ndim)) \ - + [axis % valid_label.ndim] + temp_perm = ( + list(range(axis % valid_label.ndim)) + + list( + range( + (axis % valid_label.ndim + 1), valid_label.ndim + ) + ) + + [axis % valid_label.ndim] + ) weight_gather = _C_ops.gather_nd( - weight, valid_label.transpose(temp_perm)) + weight, valid_label.transpose(temp_perm) + ) else: weight_gather = _C_ops.gather_nd(weight, valid_label) - weight_gather = _C_ops.multiply(weight_gather, - ignore_weight_mask) + weight_gather = _C_ops.multiply( + weight_gather, ignore_weight_mask + ) input_shape = list(label.shape) - weight_gather_reshape = reshape(weight_gather, - shape=input_shape) + weight_gather_reshape = reshape( + weight_gather, shape=input_shape + ) out = paddle.cast(out, weight_gather_reshape.dtype) out = _C_ops.multiply(out, weight_gather_reshape) @@ -2476,22 +2649,24 @@ def cross_entropy(input, # for each label[i],set 1 or 0, according to ignore_index # mask[i]=0, if label[i]==ignore_index # mask[i]=1, otherwise - mask = (label != ignore_index) + mask = label != ignore_index if weight is None: mask = paddle.cast(mask, dtype=out_sum.dtype) count = _C_ops.sum(mask, [], None, False) ret = out_sum / (count + (count == 0.0)) else: mask = paddle.cast(mask, weight_gather_reshape.dtype) - weight_ignored = _C_ops.multiply(mask, - weight_gather_reshape) + weight_ignored = _C_ops.multiply( + mask, weight_gather_reshape + ) weight_sum = _C_ops.sum(weight_ignored, [], None, False) ret = out_sum / (weight_sum + (weight_sum == 0.0)) return ret elif weight is not None: out_sum = _C_ops.sum(out, [], None, False) - total_weight = _C_ops.sum(weight_gather_reshape, [], None, - False) + total_weight = _C_ops.sum( + weight_gather_reshape, [], None, False + ) return out_sum / (total_weight + (total_weight == 0.0)) else: return _C_ops.mean_all(out) @@ -2503,32 +2678,65 @@ def cross_entropy(input, elif _in_legacy_dygraph(): if soft_label == False: - valid_label = paddle.cast(label != ignore_index, - dtype=label.dtype) * label + valid_label = ( + paddle.cast(label != ignore_index, dtype=label.dtype) * label + ) label_min = paddle.min(valid_label) label_max = paddle.max(valid_label) if label_min < 0: - raise 
ValueError("Target {} is out of lower bound.".format( - label_min.item())) + raise ValueError( + "Target {} is out of lower bound.".format(label_min.item()) + ) if label_max >= input.shape[axis]: - raise ValueError("Target {} is out of upper bound.".format( - label_max.item())) + raise ValueError( + "Target {} is out of upper bound.".format(label_max.item()) + ) if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): if soft_label == False: _, _, out = _legacy_C_ops.softmax_with_cross_entropy( - input, valid_label, 'soft_label', soft_label, - 'ignore_index', ignore_index, 'numeric_stable_mode', True, - 'axis', axis, 'use_softmax', use_softmax) + input, + valid_label, + 'soft_label', + soft_label, + 'ignore_index', + ignore_index, + 'numeric_stable_mode', + True, + 'axis', + axis, + 'use_softmax', + use_softmax, + ) else: _, _, out = _legacy_C_ops.softmax_with_cross_entropy( - input, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', True, 'axis', axis, - 'use_softmax', use_softmax) + input, + label, + 'soft_label', + soft_label, + 'ignore_index', + ignore_index, + 'numeric_stable_mode', + True, + 'axis', + axis, + 'use_softmax', + use_softmax, + ) else: _, out = _legacy_C_ops.softmax_with_cross_entropy( - input, label, 'soft_label', soft_label, 'ignore_index', - ignore_index, 'numeric_stable_mode', True, 'axis', axis, - 'use_softmax', use_softmax) + input, + label, + 'soft_label', + soft_label, + 'ignore_index', + ignore_index, + 'numeric_stable_mode', + True, + 'axis', + axis, + 'use_softmax', + use_softmax, + ) if weight is not None: @@ -2538,11 +2746,12 @@ def cross_entropy(input, # weight's shape is C, where C is class num. # for 1d case: label's shape is [N,C], weight_gather's shape is N. # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W]. 
- weight_gather = paddle.matmul(x=paddle.cast( - label, weight.dtype), - y=weight, - transpose_x=False, - transpose_y=True) + weight_gather = paddle.matmul( + x=paddle.cast(label, weight.dtype), + y=weight, + transpose_x=False, + transpose_y=True, + ) out_shape = list(out.shape) weight_gather_reshape = reshape(weight_gather, shape=out_shape) out = paddle.cast(out, weight_gather_reshape.dtype) @@ -2554,29 +2763,44 @@ def cross_entropy(input, raise ValueError( "input's class_dimension({}) must equal to " "weight's class_dimension({}) " - "when weight is provided" \ - .format(input.shape[axis], weight.shape[-1])) - - ignore_weight_mask = paddle.cast((label != ignore_index), - out.dtype) - if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ - axis] == 1: + "when weight is provided".format( + input.shape[axis], weight.shape[-1] + ) + ) + + ignore_weight_mask = paddle.cast( + (label != ignore_index), out.dtype + ) + if ( + ignore_weight_mask.ndim > 1 + and ignore_weight_mask.shape[axis] == 1 + ): # TODO: Temporarily use squeeze instead of squeeze_ - ignore_weight_mask = paddle.squeeze(ignore_weight_mask, - axis) + ignore_weight_mask = paddle.squeeze( + ignore_weight_mask, axis + ) if axis != -1 and axis != valid_label.ndim - 1: - temp_perm = list(range(axis % valid_label.ndim)) \ - + list(range((axis % valid_label.ndim + 1), valid_label.ndim)) \ - + [axis % valid_label.ndim] + temp_perm = ( + list(range(axis % valid_label.ndim)) + + list( + range( + (axis % valid_label.ndim + 1), valid_label.ndim + ) + ) + + [axis % valid_label.ndim] + ) weight_gather = _legacy_C_ops.gather_nd( - weight, valid_label.transpose(temp_perm)) + weight, valid_label.transpose(temp_perm) + ) else: weight_gather = _legacy_C_ops.gather_nd(weight, valid_label) weight_gather = _legacy_C_ops.elementwise_mul( - weight_gather, ignore_weight_mask) + weight_gather, ignore_weight_mask + ) input_shape = list(label.shape) - weight_gather_reshape = reshape(weight_gather, - shape=input_shape) + weight_gather_reshape = reshape( + weight_gather, shape=input_shape + ) out = paddle.cast(out, weight_gather_reshape.dtype) out = _legacy_C_ops.elementwise_mul(out, weight_gather_reshape) @@ -2597,7 +2821,7 @@ def cross_entropy(input, # for each label[i],set 1 or 0, according to ignore_index # mask[i]=0, if label[i]==ignore_index # mask[i]=1, otherwise - mask = (label != ignore_index) + mask = label != ignore_index if weight is None: mask = paddle.cast(mask, dtype=out_sum.dtype) count = _legacy_C_ops.reduce_sum(mask, 'reduce_all', True) @@ -2605,15 +2829,18 @@ def cross_entropy(input, else: mask = paddle.cast(mask, weight_gather_reshape.dtype) weight_ignored = _legacy_C_ops.elementwise_mul( - mask, weight_gather_reshape) + mask, weight_gather_reshape + ) weight_sum = _legacy_C_ops.reduce_sum( - weight_ignored, 'reduce_all', True) + weight_ignored, 'reduce_all', True + ) ret = out_sum / (weight_sum + (weight_sum == 0.0)) return ret elif weight is not None: out_sum = _legacy_C_ops.reduce_sum(out, 'reduce_all', True) - total_weight = _legacy_C_ops.reduce_sum(weight_gather_reshape, - 'reduce_all', True) + total_weight = _legacy_C_ops.reduce_sum( + weight_gather_reshape, 'reduce_all', True + ) return out_sum / (total_weight + (total_weight == 0.0)) else: return _legacy_C_ops.mean(out) @@ -2622,18 +2849,24 @@ def cross_entropy(input, out = paddle.squeeze(out, axis=axis) return out - check_variable_and_dtype(input, 'input', ['float16', 'float32', 'float64'], - 'softmax_cross_entropy') check_variable_and_dtype( - label, 'label', + input, + 
'input', + ['float16', 'float32', 'float64'], + 'softmax_cross_entropy', + ) + check_variable_and_dtype( + label, + 'label', ['uint8', 'int8', 'int16', 'int32', 'int64', 'float32', 'float64'], - 'softmax_cross_entropy') + 'softmax_cross_entropy', + ) attrs = { 'soft_label': soft_label, 'ignore_index': ignore_index, 'numeric_stable_mode': True, 'axis': axis, - 'use_softmax': use_softmax + 'use_softmax': use_softmax, } helper = LayerHelper('softmax_with_cross_entropy', **locals()) softmax = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -2643,17 +2876,17 @@ def cross_entropy(input, if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): backprop = helper.create_variable_for_type_inference(dtype=input.dtype) outputs['Backprop'] = backprop - helper.append_op(type='softmax_with_cross_entropy', - inputs={ - 'Logits': input, - 'Label': label - }, - outputs=outputs, - attrs=attrs) + helper.append_op( + type='softmax_with_cross_entropy', + inputs={'Logits': input, 'Label': label}, + outputs=outputs, + attrs=attrs, + ) if weight is not None: - check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], - 'softmax_cross_entropy') + check_variable_and_dtype( + weight, 'weight', ['float32', 'float64'], 'softmax_cross_entropy' + ) weight_name = name if reduction == 'none' else None if soft_label == True: # chajchaj: @@ -2661,34 +2894,48 @@ def cross_entropy(input, # weight's shape is C, where C is class num. # for 1d case: label's shape is [N,C], weight_gather's shape is N. # for 2d case: label's shape is [N,H,W,C], weight_gather's shape is [N,H,W]. - weight_gather = paddle.matmul(x=paddle.cast(label, weight.dtype), - y=weight, - transpose_x=False, - transpose_y=True) + weight_gather = paddle.matmul( + x=paddle.cast(label, weight.dtype), + y=weight, + transpose_x=False, + transpose_y=True, + ) out_shape = list(out.shape) weight_gather_reshape = reshape(weight_gather, shape=out_shape) out = paddle.cast(out, weight_gather_reshape.dtype) else: if input.shape[axis] != weight.shape[-1]: - raise ValueError("input's class_dimension({}) must equal to " - "weight's class_dimension({}) " - "when weight is provided" \ - .format(input.shape[axis], weight.shape[-1])) + raise ValueError( + "input's class_dimension({}) must equal to " + "weight's class_dimension({}) " + "when weight is provided".format( + input.shape[axis], weight.shape[-1] + ) + ) valid_label = paddle.multiply( - paddle.cast(label != ignore_index, dtype=label.dtype), label) - ignore_weight_mask = paddle.cast((label != ignore_index), - input.dtype) - if ignore_weight_mask.ndim > 1 and ignore_weight_mask.shape[ - axis] == 1: + paddle.cast(label != ignore_index, dtype=label.dtype), label + ) + ignore_weight_mask = paddle.cast( + (label != ignore_index), input.dtype + ) + if ( + ignore_weight_mask.ndim > 1 + and ignore_weight_mask.shape[axis] == 1 + ): ignore_weight_mask = paddle.squeeze(ignore_weight_mask, axis) if axis != -1 and axis != valid_label.ndim - 1: - temp_perm = list(range(axis % valid_label.ndim)) \ - + list(range((axis % valid_label.ndim + 1), valid_label.ndim)) \ - + [axis % valid_label.ndim] + temp_perm = ( + list(range(axis % valid_label.ndim)) + + list( + range((axis % valid_label.ndim + 1), valid_label.ndim) + ) + + [axis % valid_label.ndim] + ) weight_gather = paddle.gather_nd( - weight, paddle.transpose(valid_label, temp_perm)) + weight, paddle.transpose(valid_label, temp_perm) + ) else: weight_gather = paddle.gather_nd(weight, valid_label) weight_gather = paddle.multiply(weight_gather, 
ignore_weight_mask) @@ -2705,8 +2952,8 @@ def cross_entropy(input, # for each label[i],set 1 or 0, according to ignore_index # mask[i]=0, if label[i]==ignore_index # mask[i]=1, otherwise - mask = (label != ignore_index) - if (weight is None): + mask = label != ignore_index + if weight is None: mask = paddle.cast(mask, dtype=out_sum.dtype) count = paddle.sum(mask, name=name) ret = out_sum / (count + (count == 0.0)) @@ -2730,13 +2977,15 @@ def cross_entropy(input, return out -def sigmoid_focal_loss(logit, - label, - normalizer=None, - alpha=0.25, - gamma=2.0, - reduction='sum', - name=None): +def sigmoid_focal_loss( + logit, + label, + normalizer=None, + alpha=0.25, + gamma=2.0, + reduction='sum', + name=None, +): r""" `Focal Loss `_ is proposed to address the foreground-background class imbalance for classification tasks. It down-weights @@ -2744,12 +2993,12 @@ def sigmoid_focal_loss(logit, it is used in one-stage object detection where the foreground-background class imbalance is extremely high. - This operator measures focal loss function as follows: + This operator measures focal loss function as follows: .. math:: Out = -Labels * alpha * {(1 - \sigma(Logit))}^{gamma}\log(\sigma(Logit)) - (1 - Labels) * (1 - alpha) * {\sigma(Logit)}^{gamma}\log(1 - \sigma(Logit)) - We know that :math:`\sigma(Logit) = \frac{1}{1 + \exp(-Logit)}`. + We know that :math:`\sigma(Logit) = \frac{1}{1 + \exp(-Logit)}`. Then, if :attr:`normalizer` is not None, this operator divides the normalizer tensor on the loss `Out`: @@ -2776,7 +3025,7 @@ def sigmoid_focal_loss(logit, For object detection task, it is the number of positive samples. If set to None, the focal loss will not be normalized. Default is None. alpha(int|float, optional): Hyper-parameter to balance the positive and negative example, - it should be between 0 and 1. Default value is set to 0.25. + it should be between 0 and 1. Default value is set to 0.25. gamma(int|float, optional): Hyper-parameter to modulate the easy and hard examples. Default value is set to 2.0. reduction (str, optional): Indicate how to average the loss by batch_size, @@ -2810,37 +3059,49 @@ def sigmoid_focal_loss(logit, raise ValueError( "The value of 'reduction' in sigmoid_focal_loss " "should be 'sum', 'mean' or 'none', but received %s, which is not allowed." - % reduction) + % reduction + ) if normalizer is not None: - check_variable_and_dtype(normalizer, 'normalizer', - ['float32', 'float64'], 'sigmoid_focal_loss') + check_variable_and_dtype( + normalizer, + 'normalizer', + ['float32', 'float64'], + 'sigmoid_focal_loss', + ) normalizer_shape = list(normalizer.shape) normalizer_dims = len(normalizer_shape) if normalizer_dims > 1: raise ValueError( - "Expected one dimension of normalizer in sigmoid_focal_loss but got {}." 
- .format(normalizer_dims)) + "Expected one dimension of normalizer in sigmoid_focal_loss but got {}.".format( + normalizer_dims + ) + ) if in_dygraph_mode(): place = _current_expected_place() one = _C_ops.full(logit.shape, float(1.0), logit.dtype, place) - loss = _C_ops.sigmoid_cross_entropy_with_logits(logit, label, False, - -100) + loss = _C_ops.sigmoid_cross_entropy_with_logits( + logit, label, False, -100 + ) pred = _C_ops.sigmoid(logit) p_t = _C_ops.add( _C_ops.multiply(pred, label), - _C_ops.multiply(_C_ops.subtract(one, pred), - _C_ops.subtract(one, label))) + _C_ops.multiply( + _C_ops.subtract(one, pred), _C_ops.subtract(one, label) + ), + ) alpha = fluid.dygraph.base.to_variable([alpha], dtype=loss.dtype) alpha_t = _C_ops.add( _C_ops.multiply(alpha, label), - _C_ops.multiply(_C_ops.subtract(one, alpha), - _C_ops.subtract(one, label))) + _C_ops.multiply( + _C_ops.subtract(one, alpha), _C_ops.subtract(one, label) + ), + ) loss = _C_ops.multiply(alpha_t, loss) gamma = fluid.dygraph.base.to_variable([gamma], dtype=loss.dtype) @@ -2859,9 +3120,19 @@ def sigmoid_focal_loss(logit, elif _in_legacy_dygraph(): one = _varbase_creator(dtype=logit.dtype) - _legacy_C_ops.fill_constant(one, 'value', float(1.0), 'force_cpu', - False, 'dtype', one.dtype, 'str_value', - '1.0', 'shape', logit.shape) + _legacy_C_ops.fill_constant( + one, + 'value', + float(1.0), + 'force_cpu', + False, + 'dtype', + one.dtype, + 'str_value', + '1.0', + 'shape', + logit.shape, + ) loss = _legacy_C_ops.sigmoid_cross_entropy_with_logits(logit, label) pred = _legacy_C_ops.sigmoid(logit) @@ -2870,19 +3141,24 @@ def sigmoid_focal_loss(logit, _legacy_C_ops.elementwise_mul(pred, label), _legacy_C_ops.elementwise_mul( _legacy_C_ops.elementwise_sub(one, pred), - _legacy_C_ops.elementwise_sub(one, label))) + _legacy_C_ops.elementwise_sub(one, label), + ), + ) alpha = fluid.dygraph.base.to_variable([alpha], dtype=loss.dtype) alpha_t = _legacy_C_ops.elementwise_add( _legacy_C_ops.elementwise_mul(alpha, label), _legacy_C_ops.elementwise_mul( _legacy_C_ops.elementwise_sub(one, alpha), - _legacy_C_ops.elementwise_sub(one, label))) + _legacy_C_ops.elementwise_sub(one, label), + ), + ) loss = _legacy_C_ops.elementwise_mul(alpha_t, loss) gamma = fluid.dygraph.base.to_variable([gamma], dtype=loss.dtype) gamma_t = _legacy_C_ops.elementwise_pow( - _legacy_C_ops.elementwise_sub(one, p_t), gamma) + _legacy_C_ops.elementwise_sub(one, p_t), gamma + ) loss = _legacy_C_ops.elementwise_mul(gamma_t, loss) if normalizer is not None: @@ -2895,16 +3171,19 @@ def sigmoid_focal_loss(logit, return loss - check_variable_and_dtype(logit, 'logit', ['float32', 'float64'], - 'sigmoid_focal_loss') - check_variable_and_dtype(label, 'label', ['float32', 'float64'], - 'sigmoid_focal_loss') + check_variable_and_dtype( + logit, 'logit', ['float32', 'float64'], 'sigmoid_focal_loss' + ) + check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'sigmoid_focal_loss' + ) bce_name = None if reduction == 'none' and normalizer is None: bce_name = name loss = paddle.nn.functional.binary_cross_entropy_with_logits( - logit, label, reduction='none', name=bce_name) + logit, label, reduction='none', name=bce_name + ) pred = paddle.nn.functional.sigmoid(logit) p_t = pred * label + (1 - pred) * (1 - label) @@ -2927,74 +3206,102 @@ def sigmoid_focal_loss(logit, return loss -def multi_label_soft_margin_loss(input, - label, - weight=None, - reduction="mean", - name=None): +def multi_label_soft_margin_loss( + input, label, weight=None, reduction="mean", name=None +): 
r""" + Calculate a multi-class multi-classification + hinge loss (margin-based loss) between input :math:`x` (a 2D mini-batch `Tensor`) + and output :math:`y` (which is a 2D `Tensor` of target class indices). + For each sample in the mini-batch: - Parameters: - input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C), where C is number of classes, and if shape is more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1. - label (Tensor): Label tensor, the data type is float32 or float64. The shape of label is the same as the shape of input. - weight (Tensor,optional): a manual rescaling weight given to each class. - If given, has to be a Tensor of size C and the data type is float32, float64. - Default is ``'None'`` . - reduction (str, optional): Indicate how to average the loss by batch_size, - the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. - If :attr:`reduction` is ``'none'``, the unreduced loss is returned; - If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; - If :attr:`reduction` is ``'sum'``, the summed loss is returned. - Default: ``'mean'`` - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. - - Shape: - input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means number of classes, available dtype is float32, float64. The sum operationoperates over all the elements. - label: N-D Tensor, same shape as the input. - weight:N-D Tensor, the shape is [N,1] - output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input. - - Returns: - Tensor, The tensor variable storing the multi_label_soft_margin_loss of input and label. - - Examples: - .. code-block:: python + .. math:: + \text{loss}(x, y) = \sum_{ij}\frac{\max(0, 1 - (x[y[j]] - x[i]))}{\text{x.size}(0)} - import paddle - import paddle.nn.functional as F - input = paddle.to_tensor([[1, -2, 3], [0, -1, 2], [1, 0, 1]], dtype=paddle.float32) - # label elements in {1., -1.} - label = paddle.to_tensor([[-1, 1, -1], [1, 1, 1], [1, -1, 1]], dtype=paddle.float32) - loss = F.multi_label_soft_margin_loss(input, label, reduction='none') - print(loss) - # Tensor([3.49625897, 0.71111226, 0.43989015]) - loss = F.multi_label_soft_margin_loss(input, label, reduction='mean') - print(loss) - # Tensor([1.54908717]) + where :math:`x \in \left\{0, \; \cdots , \; \text{x.size}(0) - 1\right\}`, \ + :math:`y \in \left\{0, \; \cdots , \; \text{y.size}(0) - 1\right\}`, \ + :math:`0 \leq y[j] \leq \text{x.size}(0)-1`, \ + and :math:`i \neq y[j]` for all :math:`i` and :math:`j`. + :math:`y` and :math:`x` must have the same size. + + Parameters: + input (Tensor): Input tensor, the data type is float32 or float64. Shape is (N, C), where C is number of classes, and if shape is more than 2D, this is (N, C, D1, D2,..., Dk), k >= 1. + label (Tensor): Label tensor, the data type is float32 or float64. The shape of label is the same as the shape of input. + weight (Tensor,optional): a manual rescaling weight given to each class. + If given, has to be a Tensor of size C and the data type is float32, float64. + Default is ``'None'`` . + reduction (str, optional): Indicate how to average the loss by batch_size, + the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. + If :attr:`reduction` is ``'none'``, the unreduced loss is returned; + If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; + If :attr:`reduction` is ``'sum'``, the summed loss is returned. 
+ Default: ``'mean'`` + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Shape: + input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means number of classes, available dtype is float32, float64. The sum operationoperates over all the elements. + label: N-D Tensor, same shape as the input. + weight:N-D Tensor, the shape is [N,1] + output: scalar. If :attr:`reduction` is ``'none'``, then same shape as the input. + + Returns: + Tensor, The tensor variable storing the multi_label_soft_margin_loss of input and label. + + Examples: + .. code-block:: python + + import paddle + import paddle.nn.functional as F + input = paddle.to_tensor([[1, -2, 3], [0, -1, 2], [1, 0, 1]], dtype=paddle.float32) + # label elements in {1., -1.} + label = paddle.to_tensor([[-1, 1, -1], [1, 1, 1], [1, -1, 1]], dtype=paddle.float32) + loss = F.multi_label_soft_margin_loss(input, label, reduction='none') + print(loss) + # Tensor([3.49625897, 0.71111226, 0.43989015]) + loss = F.multi_label_soft_margin_loss(input, label, reduction='mean') + print(loss) + # Tensor([1.54908717]) """ if reduction not in ['sum', 'mean', 'none']: raise ValueError( "'reduction' in 'multi_label_soft_margin_loss' should be 'sum', 'mean' or 'none', " - "but received {}.".format(reduction)) + "but received {}.".format(reduction) + ) if not (input.shape == label.shape): - raise ValueError("The input and label should have same dimension," - "but received {}!={}".format(input.shape, label.shape)) + raise ValueError( + "The input and label should have same dimension," + "but received {}!={}".format(input.shape, label.shape) + ) if not _non_static_mode(): - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'multilabel_soft_margin_loss') - check_variable_and_dtype(label, 'label', ['float32', 'float64'], - 'multilabel_soft_margin_loss') + check_variable_and_dtype( + input, + 'input', + ['float32', 'float64'], + 'multilabel_soft_margin_loss', + ) + check_variable_and_dtype( + label, + 'label', + ['float32', 'float64'], + 'multilabel_soft_margin_loss', + ) - loss = -(label * paddle.nn.functional.log_sigmoid(input) + - (1 - label) * paddle.nn.functional.log_sigmoid(-input)) + loss = -( + label * paddle.nn.functional.log_sigmoid(input) + + (1 - label) * paddle.nn.functional.log_sigmoid(-input) + ) if weight is not None: if not _non_static_mode(): - check_variable_and_dtype(weight, 'weight', ['float32', 'float64'], - 'multilabel_soft_margin_loss') + check_variable_and_dtype( + weight, + 'weight', + ['float32', 'float64'], + 'multilabel_soft_margin_loss', + ) loss = loss * weight loss = loss.mean(axis=-1) # only return N loss values @@ -3009,7 +3316,7 @@ def multi_label_soft_margin_loss(input, def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None): r""" - This operator calculates hinge_embedding_loss. Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`(containing 1 or -1). + Calculates hinge_embedding_loss. Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`(containing 1 or -1). This is usually used for measuring whether two inputs are similar or dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically used for learning nonlinear embeddings or semi-supervised learning. 
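For readers tracing the reformatted loss body above: the value `multi_label_soft_margin_loss` returns with `reduction='none'` can be reproduced directly from the log-sigmoid identity the implementation uses. A minimal sketch, assuming 0/1 multi-hot labels; the tensor values are illustrative only and are not taken from the patch:

    import paddle
    import paddle.nn.functional as F

    input = paddle.to_tensor([[1.0, -2.0, 3.0], [0.0, -1.0, 2.0]])
    label = paddle.to_tensor([[0.0, 1.0, 0.0], [1.0, 1.0, 1.0]])

    # negative log-sigmoid terms, as in the functional body
    loss = -(label * F.log_sigmoid(input)
             + (1 - label) * F.log_sigmoid(-input))
    manual = loss.mean(axis=-1)   # reduce over classes, one value per sample

    builtin = F.multi_label_soft_margin_loss(input, label, reduction='none')
    print(paddle.allclose(manual, builtin))  # expected: True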
@@ -3084,17 +3391,21 @@ def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None): if reduction not in ['sum', 'mean', 'none']: raise ValueError( "'reduction' in 'hinge_embedding_loss' should be 'sum', 'mean' or 'none', " - "but received {}.".format(reduction)) + "but received {}.".format(reduction) + ) if not _non_static_mode(): - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'hinge_embedding_loss') - check_variable_and_dtype(label, 'label', ['float32', 'float64'], - 'hinge_embedding_loss') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'hinge_embedding_loss' + ) + check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'hinge_embedding_loss' + ) zero_ = paddle.zeros([1], dtype=input.dtype) - loss = paddle.where(label == 1., input, zero_) + \ - paddle.where(label == -1., paddle.nn.functional.relu(margin - input), zero_) + loss = paddle.where(label == 1.0, input, zero_) + paddle.where( + label == -1.0, paddle.nn.functional.relu(margin - input), zero_ + ) if reduction == 'mean': return paddle.mean(loss, name=name) @@ -3104,12 +3415,9 @@ def hinge_embedding_loss(input, label, margin=1.0, reduction='mean', name=None): return loss -def cosine_embedding_loss(input1, - input2, - label, - margin=0, - reduction='mean', - name=None): +def cosine_embedding_loss( + input1, input2, label, margin=0, reduction='mean', name=None +): r""" This operator computes the cosine embedding loss of Tensor ``input1``, ``input2`` and ``label`` as follows. @@ -3170,12 +3478,14 @@ def cosine_embedding_loss(input1, """ if len(label.shape) != 1: raise ValueError( - "1D target tensor expected, multi-target not supported") + "1D target tensor expected, multi-target not supported" + ) if input1.shape != input2.shape: raise ValueError( "the shape of input tensor 1 should be equal to input tensor 2, but found inputs with " - "different sizes") + "different sizes" + ) if len(input1.shape) > 2: raise ValueError( @@ -3184,9 +3494,13 @@ def cosine_embedding_loss(input1, if input1.dtype not in [paddle.float32, paddle.float64]: raise ValueError( - "The data type of input Variable must be 'float32' or 'float64'") + "The data type of input Variable must be 'float32' or 'float64'" + ) if label.dtype not in [ - paddle.int32, paddle.int64, paddle.float32, paddle.float64 + paddle.int32, + paddle.int64, + paddle.float32, + paddle.float64, ]: raise ValueError( "The data type of label Variable must be 'int32', 'int64', 'float32', 'float64'" @@ -3212,14 +3526,16 @@ def cosine_embedding_loss(input1, return paddle.sum(out, name=name) -def triplet_margin_with_distance_loss(input, - positive, - negative, - distance_function=None, - margin=1.0, - swap=False, - reduction='mean', - name=None): +def triplet_margin_with_distance_loss( + input, + positive, + negative, + distance_function=None, + margin=1.0, + swap=False, + reduction='mean', + name=None, +): r""" Measures the triplet loss given an input tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`. @@ -3239,7 +3555,7 @@ def triplet_margin_with_distance_loss(input, .. math:: d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_p - or user can defined their own distance functions. `margin` is a nonnegative margin representing the minimum difference + or user can defined their own distance functions. `margin` is a nonnegative margin representing the minimum difference between the positive and negative distances that is required for the loss to be 0. 
If `swap` is true, it will compare distance of (input, negative) with distance of (negative, positive) and change it to the smaller one. For more details see http://www.bmva.org/bmvc/2016/papers/paper119/paper119.pdf. @@ -3255,10 +3571,10 @@ def triplet_margin_with_distance_loss(input, The shape of label is the same as the shape of input. distance_function (callable, optional): Quantifies the distance between two tensors. if not specified, 2 norm functions will be used. - - margin (float, optional):Default: :math:`1`.A nonnegative margin representing the minimum difference + + margin (float, optional):Default: :math:`1`.A nonnegative margin representing the minimum difference between the positive and negative distances required for the loss to be 0. - + swap (bool, optional):The distance swap changes the negative distance to the swap distance (distance between positive samples and negative samples) if swap distance smaller than negative distance. Default: ``False``. @@ -3270,7 +3586,7 @@ def triplet_margin_with_distance_loss(input, Default: ``'mean'`` name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Output: Tensor. The tensor variable storing the triplet_margin_with_distance_loss of input and positive and negative. @@ -3294,28 +3610,47 @@ def triplet_margin_with_distance_loss(input, """ if reduction not in ['sum', 'mean', 'none']: - raise ValueError("'reduction' in 'triplet_margin_with_distance_loss' " - "should be 'sum', 'mean' or 'none', " - "but received {}.".format(reduction)) + raise ValueError( + "'reduction' in 'triplet_margin_with_distance_loss' " + "should be 'sum', 'mean' or 'none', " + "but received {}.".format(reduction) + ) if margin < 0: raise ValueError( "The margin between positive samples and negative samples should be greater than 0." ) if not _non_static_mode(): - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'triplet_margin_with_distance_loss') - check_variable_and_dtype(positive, 'positive', ['float32', 'float64'], - 'triplet_margin_with_distance_loss') - check_variable_and_dtype(negative, 'negative', ['float32', 'float64'], - 'triplet_margin_with_distance_loss') + check_variable_and_dtype( + input, + 'input', + ['float32', 'float64'], + 'triplet_margin_with_distance_loss', + ) + check_variable_and_dtype( + positive, + 'positive', + ['float32', 'float64'], + 'triplet_margin_with_distance_loss', + ) + check_variable_and_dtype( + negative, + 'negative', + ['float32', 'float64'], + 'triplet_margin_with_distance_loss', + ) if not (input.shape == positive.shape == negative.shape): - raise ValueError("input's shape must equal to " - "positive's shape and " - "negative's shape") + raise ValueError( + "input's shape must equal to " + "positive's shape and " + "negative's shape" + ) - distance_function = distance_function if distance_function is not None \ + distance_function = ( + distance_function + if distance_function is not None else paddle.nn.PairwiseDistance(2) + ) positive_dist = distance_function(input, positive) negative_dist = distance_function(input, negative) @@ -3327,7 +3662,8 @@ def triplet_margin_with_distance_loss(input, if not paddle.all(positive_dist > 0) or not paddle.all(negative_dist > 0): raise ValueError( "The positive distance or negative distance should be greater than 0, " - "The distance functions should be checked.") + "The distance functions should be checked." 
+ ) loss = paddle.clip(positive_dist - negative_dist + margin, min=0.0) @@ -3339,15 +3675,17 @@ def triplet_margin_with_distance_loss(input, return loss -def triplet_margin_loss(input, - positive, - negative, - margin=1.0, - p=2, - epsilon=1e-6, - swap=False, - reduction='mean', - name=None): +def triplet_margin_loss( + input, + positive, + negative, + margin=1.0, + p=2, + epsilon=1e-6, + swap=False, + reduction='mean', + name=None, +): r""" Measures the triplet loss given an input tensors :math:`x1`, :math:`x2`, :math:`x3` and a margin with a value greater than :math:`0`. @@ -3424,23 +3762,29 @@ def triplet_margin_loss(input, if reduction not in ['sum', 'mean', 'none']: raise ValueError( "'reduction' in 'triplet_margin_loss' should be 'sum', 'mean' or 'none', " - "but received {}.".format(reduction)) + "but received {}.".format(reduction) + ) if margin < 0: raise ValueError( "The margin between positive samples and negative samples should be greater than 0." ) if not _non_static_mode(): - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'triplet_margin_loss') - check_variable_and_dtype(positive, 'positive', ['float32', 'float64'], - 'triplet_margin_loss') - check_variable_and_dtype(negative, 'negative', ['float32', 'float64'], - 'triplet_margin_loss') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'triplet_margin_loss' + ) + check_variable_and_dtype( + positive, 'positive', ['float32', 'float64'], 'triplet_margin_loss' + ) + check_variable_and_dtype( + negative, 'negative', ['float32', 'float64'], 'triplet_margin_loss' + ) if not (input.shape == positive.shape == negative.shape): - raise ValueError("input's shape must equal to " - "positive's shape and " - "negative's shape") + raise ValueError( + "input's shape must equal to " + "positive's shape and " + "negative's shape" + ) distance_function = paddle.nn.PairwiseDistance(p, epsilon=epsilon) positive_dist = distance_function(input, positive) @@ -3462,6 +3806,7 @@ def triplet_margin_loss(input, def soft_margin_loss(input, label, reduction='mean', name=None): """ + The API measures the soft margin loss between input predictions ``input`` and target labels ``label`` . It can be described as: @@ -3470,9 +3815,9 @@ def soft_margin_loss(input, label, reduction='mean', name=None): Parameters: - input (Tensor): The input predications tensor with shape: [N, *], + input (Tensor): The input predications tensor with shape: ``[N, *]``, N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf. - Available dtype is float32, float64. + Available dtype is float32, float64. label (Tensor): The target labels tensor with the same shape as ``input``. The target labels which values should be numbers -1 or 1. @@ -3490,43 +3835,54 @@ def soft_margin_loss(input, label, reduction='mean', name=None): Returns: - Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``input`` , else the shape of output is [1]. + Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is same as ``input`` , else the shape of output is [1]. Examples: .. 
code-block:: python import paddle - import numpy as np input = paddle.to_tensor([[0.5, 0.6, 0.7],[0.3, 0.5, 0.2]], 'float32') label = paddle.to_tensor([[1.0, -1.0, 1.0],[-1.0, 1.0, 1.0]], 'float32') output = paddle.nn.functional.soft_margin_loss(input, label) + print(output) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [0.64022040]) + + input = paddle.uniform(shape=(5, 5), dtype="float32", min=0.1, max=0.8) + label = paddle.randint(0, 2, shape=(5, 5), dtype="int64") + label[label==0]=-1 - input_np = np.random.uniform(0.1, 0.8, size=(5, 5)).astype(np.float64) - label_np = np.random.randint(0, 2, size=(5, 5)).astype(np.int64) - label_np[label_np==0]=-1 - input = paddle.to_tensor(input_np) - label = paddle.to_tensor(label_np) output = paddle.nn.functional.soft_margin_loss(input, label, reduction='none') + print(output) + # Tensor(shape=[5, 5], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[1.09917796, 0.52613139, 0.56263304, 0.82736146, 0.38776723], + # [1.07179427, 1.11924267, 0.49877715, 1.10026348, 0.46184641], + # [0.84367639, 0.74795729, 0.44629076, 0.55123353, 0.77659678], + # [0.39465919, 0.76651484, 0.54485321, 0.76609844, 0.77166790], + # [0.51283568, 0.84757161, 0.78913331, 1.05268764, 0.45318675]]) + """ if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in soft_margin_loss should be 'sum', " - "'mean' or 'none', but received %s, which is not allowed." % - reduction) + "'mean' or 'none', but received %s, which is not allowed." + % reduction + ) if not _non_static_mode(): - fluid.data_feeder.check_variable_and_dtype(input, 'input', - ['float32', 'float64'], - 'soft_margin_loss') fluid.data_feeder.check_variable_and_dtype( - label, 'label', ['int32', 'int64', 'float32', 'float64'], - 'soft_margin_loss') + input, 'input', ['float32', 'float64'], 'soft_margin_loss' + ) + fluid.data_feeder.check_variable_and_dtype( + label, + 'label', + ['int32', 'int64', 'float32', 'float64'], + 'soft_margin_loss', + ) if not (input.shape == label.shape): - raise ValueError("input's shape must equal to " - "label's shape") + raise ValueError("input's shape must equal to " "label's shape") label = fluid.layers.cast(label, input.dtype) out = paddle.log(1 + paddle.exp(-label * input)) diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 03ba72fdda344e..ac5829ea0dbdea 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -24,14 +24,19 @@ import numbers from paddle import _C_ops, _legacy_C_ops from paddle import in_dynamic_mode -from paddle.fluid.framework import core, _non_static_mode, in_dygraph_mode, _in_legacy_dygraph +from paddle.fluid.framework import ( + core, + _non_static_mode, + in_dygraph_mode, + _in_legacy_dygraph, +) __all__ = [] def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): r""" - This op normalizes ``x`` along dimension ``axis`` using :math:`L_p` norm. This layer computes + Normalize ``x`` along dimension ``axis`` using :math:`L_p` norm. This layer computes .. math:: @@ -45,7 +50,7 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): Parameters: x (Tensor): The input tensor could be N-D tensor, and the input data type could be float32 or float64. - p (float|int, optional): The exponent value in the norm formulation. Default: 2 + p (float|int, optional): The exponent value in the norm formulation. Default: 2. axis (int, optional): The axis on which to apply normalization. 
If `axis < 0`, the dimension to normalization is `x.ndim + axis`. -1 is the last dimension. epsilon (float, optional): Small float added to denominator to avoid dividing by zero. Default is 1e-12. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -57,27 +62,28 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): .. code-block:: python - import numpy as np import paddle import paddle.nn.functional as F paddle.disable_static() - x = np.arange(6, dtype=np.float32).reshape(2,3) - x = paddle.to_tensor(x) + x = paddle.arange(6, dtype="float32").reshape([2,3]) y = F.normalize(x) - print(y.numpy()) - # [[0. 0.4472136 0.8944272 ] - # [0.42426404 0.5656854 0.7071067 ]] + print(y) + # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[0. , 0.44721359, 0.89442718], + # [0.42426404, 0.56568539, 0.70710671]]) y = F.normalize(x, p=1.5) - print(y.numpy()) - # [[0. 0.40862012 0.81724024] - # [0.35684016 0.4757869 0.5947336 ]] + print(y) + # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[0. , 0.40862012, 0.81724024], + # [0.35684016, 0.47578689, 0.59473360]]) y = F.normalize(x, axis=0) - print(y.numpy()) - # [[0. 0.24253564 0.37139067] - # [1. 0.97014254 0.9284767 ]] + print(y) + # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[0. , 0.24253564, 0.37139067], + # [1. , 0.97014254, 0.92847669]]) """ if in_dygraph_mode(): eps = fluid.dygraph.base.to_variable([epsilon], dtype=x.dtype) @@ -86,18 +92,30 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): if _in_legacy_dygraph(): eps = fluid.dygraph.base.to_variable([epsilon], dtype=x.dtype) - out = _legacy_C_ops.p_norm(x, 'axis', axis, 'porder', float(p), - 'keepdim', True, 'epsilon', epsilon) + out = _legacy_C_ops.p_norm( + x, + 'axis', + axis, + 'porder', + float(p), + 'keepdim', + True, + 'epsilon', + epsilon, + ) return x / _legacy_C_ops.elementwise_max(out, eps) check_type(p, 'p', (float, int), 'normalize') check_type(axis, 'axis', (int), 'normalize') - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'normalize') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'normalize' + ) if len(x.shape) == 1 and axis != 0 and axis != -1: raise ValueError( - "Axis must be 0 or -1 when x is a 1-D tensor, but received axis = {}" - .format(axis)) + "Axis must be 0 or -1 when x is a 1-D tensor, but received axis = {}".format( + axis + ) + ) attrs = { 'axis': axis, @@ -107,26 +125,27 @@ def normalize(x, p=2, axis=1, epsilon=1e-12, name=None): } helper = LayerHelper('p_norm', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='p_norm', - inputs={'X': x}, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='p_norm', inputs={'X': x}, outputs={'Out': out}, attrs=attrs + ) eps = out.block.create_var(dtype=out.dtype) eps = paddle.full(shape=[1], fill_value=epsilon, dtype=out.dtype) return paddle.divide(x, paddle.maximum(out, eps), name=name) -def batch_norm(x, - running_mean, - running_var, - weight, - bias, - training=False, - momentum=0.9, - epsilon=1e-05, - data_format="NCHW", - use_global_stats=None, - name=None): +def batch_norm( + x, + running_mean, + running_var, + weight, + bias, + training=False, + momentum=0.9, + epsilon=1e-05, + data_format="NCHW", + use_global_stats=None, + name=None, +): """ Applies Batch Normalization as described in the paper Batch 
Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . @@ -140,8 +159,8 @@ def batch_norm(x, bias(Tensor): The bias tensor of batch_norm can not be None. epsilon(float, optional): The small value added to the variance to prevent division by zero. Default: 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - training(bool, optional): True means train mode which compute by batch data and track global mean and var during train period. False means inference mode which compute by global mean and var which calculated by train period. Defalut False. - data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Defalut "NCHW". + training(bool, optional): True means train mode which compute by batch data and track global mean and var during train period. False means inference mode which compute by global mean and var which calculated by train period. Default False. + data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW", "NCDHW", "NLC", "NHWC" or "NDHWC". Default "NCHW". use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None. name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. @@ -151,22 +170,31 @@ def batch_norm(x, Examples: .. code-block:: python - import paddle - import numpy as np - - x = np.random.seed(123) - x = np.random.random(size=(2, 1, 2, 3)).astype('float32') - running_mean = np.random.random(size=1).astype('float32') - running_variance = np.random.random(size=1).astype('float32') - weight_data = np.random.random(size=1).astype('float32') - bias_data = np.random.random(size=1).astype('float32') - x = paddle.to_tensor(x) - rm = paddle.to_tensor(running_mean) - rv = paddle.to_tensor(running_variance) - w = paddle.to_tensor(weight_data) - b = paddle.to_tensor(bias_data) - batch_norm_out = paddle.nn.functional.batch_norm(x, rm, rv, w, b) - print(batch_norm_out) + import paddle + + x = paddle.arange(12, dtype="float32").reshape([2, 1, 2, 3]) + print(x) + # Tensor(shape=[2, 1, 2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[[0. , 1. , 2. ], + # [3. , 4. , 5. ]]], + + # [[[6. , 7. , 8. ], + # [9. , 10., 11.]]]]) + + running_mean = paddle.to_tensor([0], dtype="float32") + running_variance = paddle.to_tensor([1], dtype="float32") + weight = paddle.to_tensor([2], dtype="float32") + bias = paddle.to_tensor([1], dtype="float32") + + batch_norm_out = paddle.nn.functional.batch_norm(x, running_mean, + running_variance, weight, bias) + print(batch_norm_out) + # Tensor(shape=[2, 1, 2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[[1. 
, 2.99998999 , 4.99997997 ], + # [6.99996948 , 8.99995995 , 10.99994946]]], + + # [[[12.99993896, 14.99992943, 16.99991989], + # [18.99990845, 20.99989891, 22.99988937]]]]) """ assert len(x.shape) >= 2, "input dim must be larger than 1" @@ -178,7 +206,8 @@ def batch_norm(x, if data_format not in true_data_format: raise ValueError( "data_format must be one of 'NC', 'NCL', 'NCHW', 'NCDHW', " - "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format)) + "'NLC', 'NHWC', 'NDHWC' but receive {}".format(data_format) + ) data_format = 'NCHW' if data_format[1] == 'C' else 'NHWC' @@ -190,29 +219,64 @@ def batch_norm(x, if in_dygraph_mode(): batch_norm_out, _, _, _, _, _ = _C_ops.batch_norm( - x, weight, bias, running_mean, running_var, momentum, epsilon, - data_format, not training, use_global_stats, trainable_statistics, - False) - - return dygraph_utils._append_activation_in_dygraph(batch_norm_out, - act=None) + x, + weight, + bias, + running_mean, + running_var, + momentum, + epsilon, + data_format, + not training, + use_global_stats, + trainable_statistics, + False, + ) + + return dygraph_utils._append_activation_in_dygraph( + batch_norm_out, act=None + ) elif _in_legacy_dygraph(): # for dygraph need tuple - attrs = ("momentum", momentum, "epsilon", epsilon, "is_test", - not training, "data_layout", data_format, "use_mkldnn", False, - "fuse_with_relu", False, "use_global_stats", use_global_stats, - "trainable_statistics", trainable_statistics) + attrs = ( + "momentum", + momentum, + "epsilon", + epsilon, + "is_test", + not training, + "data_layout", + data_format, + "use_mkldnn", + False, + "fuse_with_relu", + False, + "use_global_stats", + use_global_stats, + "trainable_statistics", + trainable_statistics, + ) batch_norm_out, _, _, _, _, _ = _legacy_C_ops.batch_norm( - x, weight, bias, running_mean, running_var, None, mean_out, - variance_out, *attrs) - - return dygraph_utils._append_activation_in_dygraph(batch_norm_out, - act=None) - - check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], - 'BatchNorm') + x, + weight, + bias, + running_mean, + running_var, + None, + mean_out, + variance_out, + *attrs + ) + + return dygraph_utils._append_activation_in_dygraph( + batch_norm_out, act=None + ) + + check_variable_and_dtype( + x, 'input', ['float16', 'float32', 'float64'], 'BatchNorm' + ) # for static need dict attrs = { @@ -231,16 +295,18 @@ def batch_norm(x, "Scale": [weight], "Bias": [bias], "Mean": [running_mean], - "Variance": [running_var] + "Variance": [running_var], } helper = LayerHelper('batch_norm', **locals()) param_dtype = x.dtype if x.dtype != 'float16' else 'float32' - saved_mean = helper.create_variable_for_type_inference(dtype=param_dtype, - stop_gradient=True) + saved_mean = helper.create_variable_for_type_inference( + dtype=param_dtype, stop_gradient=True + ) saved_variance = helper.create_variable_for_type_inference( - dtype=param_dtype, stop_gradient=True) + dtype=param_dtype, stop_gradient=True + ) batch_norm_out = helper.create_variable_for_type_inference(x.dtype) outputs = { @@ -248,29 +314,26 @@ def batch_norm(x, "MeanOut": [running_mean], "VarianceOut": [running_var], "SavedMean": [saved_mean], - "SavedVariance": [saved_variance] + "SavedVariance": [saved_variance], } if training or trainable_statistics: # reserve_space is only used for training. 
reserve_space = helper.create_variable_for_type_inference( - dtype=x.dtype, stop_gradient=True) + dtype=x.dtype, stop_gradient=True + ) outputs["ReserveSpace"] = [reserve_space] - helper.append_op(type="batch_norm", - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type="batch_norm", inputs=inputs, outputs=outputs, attrs=attrs + ) return helper.append_activation(batch_norm_out) -def layer_norm(x, - normalized_shape, - weight=None, - bias=None, - epsilon=1e-05, - name=None): +def layer_norm( + x, normalized_shape, weight=None, bias=None, epsilon=1e-05, name=None +): """ see more detail in paddle.nn.LayerNorm @@ -294,11 +357,8 @@ def layer_norm(x, .. code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 2, 2, 3)) layer_norm_out = paddle.nn.functional.layer_norm(x, x.shape[1:]) print(layer_norm_out) """ @@ -310,32 +370,49 @@ def layer_norm(x, normalized_shape = list(normalized_shape) elif not isinstance(normalized_shape, list): raise ValueError( - "`normalized_shape` should be int, list of ints or tuple of ints.") + "`normalized_shape` should be int, list of ints or tuple of ints." + ) normalized_ndim = len(normalized_shape) begin_norm_axis = input_ndim - normalized_ndim - if input_ndim < normalized_ndim or input_shape[ - begin_norm_axis:] != normalized_shape: + if ( + input_ndim < normalized_ndim + or input_shape[begin_norm_axis:] != normalized_shape + ): str_normalized_shape = str(normalized_shape) - raise ValueError('Given normalized_shape is ' + str_normalized_shape + - ', expected input with shape [*, ' + - str_normalized_shape[1:] + ', but got input shape ' + - str(input_shape)) + raise ValueError( + 'Given normalized_shape is ' + + str_normalized_shape + + ', expected input with shape [*, ' + + str_normalized_shape[1:] + + ', but got input shape ' + + str(input_shape) + ) if in_dygraph_mode(): - pre_act, _, _, = _C_ops.layer_norm(x, weight, bias, epsilon, - begin_norm_axis, False) + ( + pre_act, + _, + _, + ) = _C_ops.layer_norm(x, weight, bias, epsilon, begin_norm_axis, False) return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) if _in_legacy_dygraph(): - pre_act, _, _ = _legacy_C_ops.layer_norm(x, weight, bias, 'epsilon', - epsilon, 'begin_norm_axis', - begin_norm_axis) + pre_act, _, _ = _legacy_C_ops.layer_norm( + x, + weight, + bias, + 'epsilon', + epsilon, + 'begin_norm_axis', + begin_norm_axis, + ) return dygraph_utils._append_activation_in_dygraph(pre_act, act=None) - check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], - 'LayerNorm') + check_variable_and_dtype( + x, 'input', ['float16', 'float32', 'float64'], 'LayerNorm' + ) inputs = dict() inputs['X'] = [x] @@ -349,49 +426,52 @@ def layer_norm(x, helper = LayerHelper('layer_norm', **locals()) dtype = x.dtype - mean_out = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) - variance_out = helper.create_variable_for_type_inference(dtype=dtype, - stop_gradient=True) + mean_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + variance_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) layer_norm_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="layer_norm", - inputs=inputs, - outputs={ - "Y": layer_norm_out, - "Mean": mean_out, - "Variance": variance_out, - }, - attrs={ - "epsilon": epsilon, - 
"begin_norm_axis": begin_norm_axis - }) + helper.append_op( + type="layer_norm", + inputs=inputs, + outputs={ + "Y": layer_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": epsilon, "begin_norm_axis": begin_norm_axis}, + ) return helper.append_activation(layer_norm_out) -def instance_norm(x, - running_mean=None, - running_var=None, - weight=None, - bias=None, - use_input_stats=True, - momentum=0.9, - eps=1e-05, - data_format="NCHW", - name=None): +def instance_norm( + x, + running_mean=None, + running_var=None, + weight=None, + bias=None, + use_input_stats=True, + momentum=0.9, + eps=1e-05, + data_format="NCHW", + name=None, +): """ See more detail in nn.layer.InstanceNorm2D. Parameters: x(Tensor): Input Tensor. It's data type should be float32, float64. - running_mean(Tensor): running mean. Default None. - running_var(Tensor): running variance. Default None. + running_mean(Tensor, optional): running mean. Default None. + running_var(Tensor, optional): running variance. Default None. weight(Tensor, optional): The weight tensor of instance_norm. Default: None. bias(Tensor, optional): The bias tensor of instance_norm. Default: None. eps(float, optional): A value added to the denominator for numerical stability. Default is 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - use_input_stats(bool): Default True. + use_input_stats(bool, optional): Default True. data_format(str, optional): Specify the input data format, may be "NC", "NCL", "NCHW" or "NCDHW". Defalut "NCHW". name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. @@ -403,11 +483,8 @@ def instance_norm(x, .. code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 2, 2, 3)) instance_norm_out = paddle.nn.functional.instance_norm(x) print(instance_norm_out) @@ -417,9 +494,17 @@ def instance_norm(x, out = _C_ops.instance_norm(x, weight, bias, eps) return out if _in_legacy_dygraph(): - out, _, _ = _legacy_C_ops.instance_norm(x, weight, bias, "epsilon", eps, - "momentum", momentum, - "data_format", data_format) + out, _, _ = _legacy_C_ops.instance_norm( + x, + weight, + bias, + "epsilon", + eps, + "momentum", + momentum, + "data_format", + data_format, + ) return out check_variable_and_dtype(x, 'input', ['float32', 'float64'], "InstanceNorm") @@ -432,104 +517,106 @@ def instance_norm(x, inputs = {"X": [x]} helper = LayerHelper('instance_norm', **locals()) - saved_mean = helper.create_variable_for_type_inference(dtype=x.dtype, - stop_gradient=True) + saved_mean = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) saved_variance = helper.create_variable_for_type_inference( - dtype=x.dtype, stop_gradient=True) + dtype=x.dtype, stop_gradient=True + ) instance_norm_out = helper.create_variable_for_type_inference(x.dtype) outputs = { "Y": [instance_norm_out], "SavedMean": [saved_mean], - "SavedVariance": [saved_variance] + "SavedVariance": [saved_variance], } - helper.append_op(type="instance_norm", - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type="instance_norm", inputs=inputs, outputs=outputs, attrs=attrs + ) return instance_norm_out -def local_response_norm(x, - size, - alpha=1e-4, - beta=0.75, - k=1., - data_format="NCHW", - name=None): +def local_response_norm( + x, size, 
alpha=1e-4, beta=0.75, k=1.0, data_format="NCHW", name=None +): r""" - Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions. - For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks `_ - - The formula is as follows: - - .. math:: + Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions. + For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks `_ - Output(i, x, y) = Input(i, x, y) / \left(k + \alpha \sum\limits^{\min(C-1, i + size/2)}_{j = \max(0, i - size/2)}(Input(j, x, y))^2\right)^{\beta} + The formula is as follows: - In the above equation: - - - :math:`size` : The number of channels to sum over. - - :math:`k` : The offset (avoid being divided by 0). - - :math:`\\alpha` : The scaling parameter. - - :math:`\\beta` : The exponent parameter. + .. math:: + Output(i, x, y) = Input(i, x, y) / \left(k + \alpha \sum\limits^{\min(C-1, i + size/2)}_{j = \max(0, i - size/2)}(Input(j, x, y))^2\right)^{\beta} + + In the above equation: + + - :math:`size` : The number of channels to sum over. + - :math:`k` : The offset (avoid being divided by 0). + - :math:`\\alpha` : The scaling parameter. + - :math:`\\beta` : The exponent parameter. + + + Args: + x (Tensor): The input 3-D/4-D/5-D tensor. The data type is float32. + size (int): The number of channels to sum over. + alpha (float, optional): The scaling parameter, positive. Default:1e-4 + beta (float, optional): The exponent, positive. Default:0.75 + k (float, optional): An offset, positive. Default: 1.0 + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: + If x is 3-D Tensor, the string could be `"NCL"` or `"NLC"` . When it is `"NCL"`, + the data is stored in the order of: `[batch_size, input_channels, feature_length]`. + If x is 4-D Tensor, the string could be `"NCHW"`, `"NHWC"`. When it is `"NCHW"`, + the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. + If x is 5-D Tensor, the string could be `"NCDHW"`, `"NDHWC"` . When it is `"NCDHW"`, + the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. + name (str, optional): Name for the operation (optional, default is None). For more information, + please refer to :ref:`api_guide_Name`. - Args: - x (Tensor): The input 3-D/4-D/5-D tensor. The data type is float32. - size (int): The number of channels to sum over. - alpha (float, optional): The scaling parameter, positive. Default:1e-4 - beta (float, optional): The exponent, positive. Default:0.75 - k (float, optional): An offset, positive. Default: 1.0 - data_format (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from: - If x is 3-D Tensor, the string could be `"NCL"` or `"NLC"` . When it is `"NCL"`, - the data is stored in the order of: `[batch_size, input_channels, feature_length]`. - If x is 4-D Tensor, the string could be `"NCHW"`, `"NHWC"`. When it is `"NCHW"`, - the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. - If x is 5-D Tensor, the string could be `"NCDHW"`, `"NDHWC"` . 
When it is `"NCDHW"`, - the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. - name (str, optional): Name for the operation (optional, default is None). For more information, - please refer to :ref:`api_guide_Name`. + Returns: + A tensor storing the transformation result with the same shape and data type as input. - Returns: - A tensor storing the transformation result with the same shape and data type as input. + Examples: - Examples: + .. code-block:: python - .. code-block:: python + import paddle - import paddle - - x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32") - y = paddle.nn.functional.local_response_norm(x, size=5) - print(y.shape) # [3, 3, 112, 112] - """ + x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32") + y = paddle.nn.functional.local_response_norm(x, size=5) + print(y.shape) # [3, 3, 112, 112] + """ if not in_dynamic_mode(): check_variable_and_dtype(x, 'x', ['float32'], 'local_response_norm') if data_format not in ['NCL', 'NLC', 'NCHW', 'NHWC', 'NCDHW', 'NDHWC']: raise ValueError( - "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], " \ - "but got {}".format(data_format)) + "data_format should be in one of [NCL, NCHW, NCDHW, NLC, NHWC, NDHWC], " + "but got {}".format(data_format) + ) sizes = x.shape dim = len(sizes) if dim < 3: raise ValueError( - 'Expected 3D or higher dimensionality input, but got {} dimensions'. - format(dim)) + 'Expected 3D or higher dimensionality input, but got {} dimensions'.format( + dim + ) + ) for i, sz in enumerate(sizes): if not sz > 0 and i > 0: - raise ValueError("Expected every dim's size to be larger than 0, " - "but the size of the {}-th dim is {}".format( - i, sz)) + raise ValueError( + "Expected every dim's size to be larger than 0, " + "but the size of the {}-th dim is {}".format(i, sz) + ) channel_last = True if data_format[-1] == "C" else False from functools import reduce + sum_sizes = reduce(lambda x, y: x * y, sizes[1:]) div = paddle.unsqueeze(paddle.multiply(x, x), axis=1) @@ -537,8 +624,11 @@ def local_response_norm(x, pad4d_shape = [0, 0, size // 2, (size - 1) // 2] pool2d_shape = (size, 1) reshape_shape = [ - sizes[0], 1, sizes[1], sizes[2], - int(sum_sizes / (sizes[1] * sizes[2])) + sizes[0], + 1, + sizes[1], + sizes[2], + int(sum_sizes / (sizes[1] * sizes[2])), ] pad5d_shape = [0, 0, 0, 0, size // 2, (size - 1) // 2] pool3d_shape = (size, 1, 1) @@ -546,26 +636,29 @@ def local_response_norm(x, pad4d_shape = [size // 2, (size - 1) // 2, 0, 0] pool2d_shape = (1, size) reshape_shape = [ - sizes[0], 1, sizes[1], - int(sum_sizes / (sizes[1] * sizes[-1])), sizes[-1] + sizes[0], + 1, + sizes[1], + int(sum_sizes / (sizes[1] * sizes[-1])), + sizes[-1], ] pad5d_shape = [size // 2, (size - 1) // 2, 0, 0, 0, 0] pool3d_shape = (1, 1, size) if dim == 3: div = paddle.nn.functional.pad(div, pad=pad4d_shape) - div = paddle.nn.functional.avg_pool2d(div, - kernel_size=pool2d_shape, - stride=1) + div = paddle.nn.functional.avg_pool2d( + div, kernel_size=pool2d_shape, stride=1 + ) div = paddle.squeeze(div, axis=1) else: div = paddle.reshape(div, shape=reshape_shape) - div = paddle.nn.functional.pad(div, - pad=pad5d_shape, - data_format='NCDHW') - div = paddle.nn.functional.avg_pool3d(div, - kernel_size=pool3d_shape, - stride=1) + div = paddle.nn.functional.pad( + div, pad=pad5d_shape, data_format='NCDHW' + ) + div = paddle.nn.functional.avg_pool3d( + div, kernel_size=pool3d_shape, stride=1 + ) div = paddle.reshape(paddle.squeeze(div, axis=1), sizes) div = 
paddle.scale(div, scale=alpha, bias=k) diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 637b192207eed1..388ab4c6944cc0 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -33,7 +33,9 @@ def _check_input(x, dimension): if len(x.shape) != dimension: raise ValueError( "Excepted Input X is {}-D tensor, but received {}-D {}".format( - dimension, len(x.shape), type(x))) + dimension, len(x.shape), type(x) + ) + ) def _check_instance(x, x_name, types=(int, float)): @@ -41,16 +43,19 @@ def _check_instance(x, x_name, types=(int, float)): if not isinstance(x, types): raise ValueError( "Excepted {} type for {} but received type: {}. ".format( - types, x_name, type(x))) + types, x_name, type(x) + ) + ) def _check_value_limitation(x, x_name, min_limit=1e-3): - def _check_value(x, x_name, min_limit=1e-3): if isinstance(x, int) and min_limit is not None and x < min_limit: raise ValueError( - "Excepted the input {} to be greater than {} but received x: {}. " - .format(x_name, min_limit, x)) + "Excepted the input {} to be greater than {} but received x: {}. ".format( + x_name, min_limit, x + ) + ) for ele in x: _check_value(ele, x_name) @@ -74,21 +79,24 @@ def _channel_last(data_format, num_dims): if data_format not in ['NCL', 'NLC']: raise ValueError( "Attr(data_format) should be 'NCL' or 'NLC'. Received " - "Attr(data_format): %s" % str(data_format)) + "Attr(data_format): %s" % str(data_format) + ) else: return True if data_format == "NLC" else False if num_dims == 2: if data_format not in ['NCHW', 'NHWC']: raise ValueError( "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " - "Attr(data_format): %s" % str(data_format)) + "Attr(data_format): %s" % str(data_format) + ) else: return True if data_format == "NHWC" else False if num_dims == 3: if data_format not in ['NCDHW', 'NDHWC']: raise ValueError( "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " - "Attr(data_format): %s" % str(data_format)) + "Attr(data_format): %s" % str(data_format) + ) else: return True if data_format == "NDHWC" else False @@ -98,13 +106,16 @@ def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False): padding = padding.upper() if padding not in ["SAME", "VALID"]: raise ValueError( - "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.". - format(padding)) + "Unknown padding: '{}'. It can only be 'SAME' or 'VALID'.".format( + padding + ) + ) if padding == "VALID": if ceil_mode != False: raise ValueError( "When Attr(padding) is \"VALID\", Attr(ceil_mode) must be False. " - "Received ceil_mode: True.") + "Received ceil_mode: True." + ) padding_algorithm = "VALID" padding = [0] * num_dims @@ -119,10 +130,12 @@ def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False): if not _zero_padding_in_batch_and_channel(padding, channel_last): raise ValueError( "Non-zero padding({}) in the batch or channel dimensions " - "is not supported.".format(padding)) + "is not supported.".format(padding) + ) padding_algorithm = "EXPLICIT" padding = _exclude_padding_in_batch_and_channel( - padding, channel_last) + padding, channel_last + ) if utils._is_symmetric_padding(padding, num_dims): padding = padding[0::2] # for padding like [pad_before, pad_after, pad_before, pad_after, ...] 
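The `_update_padding_nd` helper above is what lets the pooling functionals accept `padding` either as the strings "SAME"/"VALID" or as explicit integers. A minimal sketch of those spellings through the public API; the kernel size, stride, input shape and expected output shapes below are illustrative assumptions, not values from the patch:

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand((1, 3, 32, 32), dtype="float32")

    same = F.avg_pool2d(x, kernel_size=3, stride=2, padding="SAME")
    valid = F.avg_pool2d(x, kernel_size=3, stride=2, padding="VALID")
    explicit = F.avg_pool2d(x, kernel_size=3, stride=2, padding=[1, 1])

    print(same.shape, valid.shape, explicit.shape)
    # expected: [1, 3, 16, 16] [1, 3, 15, 15] [1, 3, 16, 16]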
@@ -145,25 +158,29 @@ def _update_padding_nd(padding, num_dims, channel_last=False, ceil_mode=False): def _expand_low_nd_padding(padding): - #1d to 2d fake input + # 1d to 2d fake input if len(padding) == 2: padding = [0] * 2 + padding elif len(padding) == 1: padding = [0] + padding else: raise ValueError( - "The size of padding's dimmention should be 1 or 2. But got padding={}" - .format(padding)) + "The size of padding's dimmention should be 1 or 2. But got padding={}".format( + padding + ) + ) return padding -def avg_pool1d(x, - kernel_size, - stride=None, - padding=0, - exclusive=True, - ceil_mode=False, - name=None): +def avg_pool1d( + x, + kernel_size, + stride=None, + padding=0, + exclusive=True, + ceil_mode=False, + name=None, +): """ This API implements average pooling 1d operation, See more details in :ref:`api_nn_pooling_AvgPool1d` . @@ -195,7 +212,7 @@ def avg_pool1d(x, Examples: .. code-block:: python - + import paddle import paddle.nn as nn @@ -222,28 +239,56 @@ def avg_pool1d(x, _check_value_limitation(stride, "stride", min_limit=1e-3) channel_last = _channel_last("NCL", 1) - padding, padding_algorithm = _update_padding_nd(padding, - 1, - channel_last=channel_last, - ceil_mode=ceil_mode) + padding, padding_algorithm = _update_padding_nd( + padding, 1, channel_last=channel_last, ceil_mode=ceil_mode + ) # use 2d to implenment 1d should expand padding in advance. padding = _expand_low_nd_padding(padding) if in_dygraph_mode(): - output = _C_ops.pool2d(x, kernel_size, stride, padding, ceil_mode, - exclusive, data_format, 'avg', False, False, - padding_algorithm, True) + output = _C_ops.pool2d( + x, + kernel_size, + stride, + padding, + ceil_mode, + exclusive, + data_format, + 'avg', + False, + False, + padding_algorithm, + True, + ) return squeeze(output, [2]) if _in_legacy_dygraph(): - output = _legacy_C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize', - kernel_size, 'global_pooling', False, - 'strides', stride, 'paddings', padding, - 'padding_algorithm', padding_algorithm, - 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', - exclusive, 'data_format', data_format) + output = _legacy_C_ops.pool2d( + x, + 'pooling_type', + 'avg', + 'ksize', + kernel_size, + 'global_pooling', + False, + 'strides', + stride, + 'paddings', + padding, + 'padding_algorithm', + padding_algorithm, + 'use_cudnn', + True, + 'ceil_mode', + ceil_mode, + 'use_mkldnn', + False, + 'exclusive', + exclusive, + 'data_format', + data_format, + ) return squeeze(output, [2]) op_type = 'pool2d' @@ -251,35 +296,39 @@ def avg_pool1d(x, dtype = helper.input_dtype(input_param_name='x') pool_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type=op_type, - inputs={"X": x}, - outputs={"Out": pool_out}, - attrs={ - "pooling_type": 'avg', - "ksize": kernel_size, - "global_pooling": False, - "strides": stride, - "paddings": padding, - "padding_algorithm": padding_algorithm, - "use_cudnn": True, - "ceil_mode": ceil_mode, - "use_mkldnn": False, - "exclusive": exclusive, - "data_format": data_format, - }) + helper.append_op( + type=op_type, + inputs={"X": x}, + outputs={"Out": pool_out}, + attrs={ + "pooling_type": 'avg', + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": exclusive, + "data_format": data_format, + }, + ) return squeeze(pool_out, [2]) -def avg_pool2d(x, - kernel_size, - stride=None, - padding=0, 
- ceil_mode=False, - exclusive=True, - divisor_override=None, - data_format="NCHW", - name=None): +def avg_pool2d( + x, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + exclusive=True, + divisor_override=None, + data_format="NCHW", + name=None, +): """ This API implements average pooling 2d operation. See more details in :ref:`api_nn_pooling_AvgPool2d` . @@ -314,16 +363,16 @@ def avg_pool2d(x, name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - + Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - + Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F - + # avg pool2d x = paddle.uniform([1, 3, 32, 32], paddle.float32) out = F.avg_pool2d(x, @@ -341,23 +390,52 @@ def avg_pool2d(x, _check_value_limitation(stride, "stride", min_limit=1e-3) channel_last = _channel_last(data_format, 2) - padding, padding_algorithm = _update_padding_nd(padding, - 2, - channel_last, - ceil_mode=ceil_mode) + padding, padding_algorithm = _update_padding_nd( + padding, 2, channel_last, ceil_mode=ceil_mode + ) if _non_static_mode(): if in_dygraph_mode(): - output = _C_ops.pool2d(x, kernel_size, stride, padding, ceil_mode, - exclusive, data_format, 'avg', False, False, - padding_algorithm, True) + output = _C_ops.pool2d( + x, + kernel_size, + stride, + padding, + ceil_mode, + exclusive, + data_format, + 'avg', + False, + False, + padding_algorithm, + True, + ) else: output = _legacy_C_ops.pool2d( - x, 'pooling_type', 'avg', 'ksize', kernel_size, - 'global_pooling', False, 'padding_algorithm', padding_algorithm, - 'strides', stride, 'paddings', padding, 'use_cudnn', True, - 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', - exclusive, 'data_format', data_format) + x, + 'pooling_type', + 'avg', + 'ksize', + kernel_size, + 'global_pooling', + False, + 'padding_algorithm', + padding_algorithm, + 'strides', + stride, + 'paddings', + padding, + 'use_cudnn', + True, + 'ceil_mode', + ceil_mode, + 'use_mkldnn', + False, + 'exclusive', + exclusive, + 'data_format', + data_format, + ) if divisor_override is None: return output else: @@ -370,22 +448,24 @@ def avg_pool2d(x, dtype = helper.input_dtype(input_param_name='x') pool_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type=op_type, - inputs={"X": x}, - outputs={"Out": pool_out}, - attrs={ - "pooling_type": "avg", - "ksize": kernel_size, - "global_pooling": False, - "strides": stride, - "paddings": padding, - "padding_algorithm": padding_algorithm, - "use_cudnn": True, - "ceil_mode": ceil_mode, - "use_mkldnn": False, - "exclusive": exclusive, - "data_format": data_format, - }) + helper.append_op( + type=op_type, + inputs={"X": x}, + outputs={"Out": pool_out}, + attrs={ + "pooling_type": "avg", + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": exclusive, + "data_format": data_format, + }, + ) if divisor_override is None: return pool_out @@ -394,15 +474,17 @@ def avg_pool2d(x, return pool_out * (kernel_size[0] * kernel_size[1]) / divisor_override -def avg_pool3d(x, - kernel_size, - stride=None, - padding=0, - ceil_mode=False, - exclusive=True, - divisor_override=None, - data_format="NCDHW", - name=None): +def avg_pool3d( + x, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + 
exclusive=True, + divisor_override=None, + data_format="NCDHW", + name=None, +): """ This API implements average pooling 3d operation. See more details in :ref:`api_nn_pooling_AvgPool3d` . @@ -435,13 +517,13 @@ def avg_pool3d(x, name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - + Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. Examples: .. code-block:: python - + import paddle x = paddle.uniform([1, 3, 32, 32, 32], paddle.float32) @@ -460,25 +542,54 @@ def avg_pool3d(x, stride = utils.convert_to_list(stride, 3, 'pool_stride') channel_last = _channel_last(data_format, 3) - padding, padding_algorithm = _update_padding_nd(padding, - 3, - channel_last=channel_last, - ceil_mode=ceil_mode) + padding, padding_algorithm = _update_padding_nd( + padding, 3, channel_last=channel_last, ceil_mode=ceil_mode + ) _check_value_limitation(kernel_size, "kernel_size", min_limit=1e-3) _check_value_limitation(stride, "stride", min_limit=1e-3) if in_dygraph_mode(): - pool_out = _C_ops.pool3d(x, kernel_size, stride, padding, ceil_mode, - exclusive, data_format, 'avg', False, False, - padding_algorithm, True) + pool_out = _C_ops.pool3d( + x, + kernel_size, + stride, + padding, + ceil_mode, + exclusive, + data_format, + 'avg', + False, + False, + padding_algorithm, + True, + ) elif _in_legacy_dygraph(): pool_out = _legacy_C_ops.pool3d( - x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride, - 'paddings', padding, 'global_pooling', False, 'padding_algorithm', - padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', exclusive, 'data_format', - data_format) + x, + 'pooling_type', + 'avg', + 'ksize', + kernel_size, + 'strides', + stride, + 'paddings', + padding, + 'global_pooling', + False, + 'padding_algorithm', + padding_algorithm, + 'use_cudnn', + True, + 'ceil_mode', + ceil_mode, + 'use_mkldnn', + False, + 'exclusive', + exclusive, + 'data_format', + data_format, + ) else: op_type = "pool3d" helper = LayerHelper(op_type, **locals()) @@ -487,38 +598,45 @@ def avg_pool3d(x, pool_out = helper.create_variable_for_type_inference(dtype) outputs = {"Out": pool_out} - helper.append_op(type=op_type, - inputs={"X": x}, - outputs=outputs, - attrs={ - "pooling_type": 'avg', - "ksize": kernel_size, - "global_pooling": False, - "strides": stride, - "paddings": padding, - "padding_algorithm": padding_algorithm, - "use_cudnn": True, - "ceil_mode": ceil_mode, - "use_mkldnn": False, - "exclusive": exclusive, - "data_format": data_format, - }) + helper.append_op( + type=op_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'avg', + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": exclusive, + "data_format": data_format, + }, + ) if divisor_override is None: return pool_out else: _check_instance(divisor_override, "divisor_override") - return pool_out * (kernel_size[0] * kernel_size[1] * - kernel_size[2]) / divisor_override + return ( + pool_out + * (kernel_size[0] * kernel_size[1] * kernel_size[2]) + / divisor_override + ) -def max_pool1d(x, - kernel_size, - stride=None, - padding=0, - return_mask=False, - ceil_mode=False, - name=None): +def max_pool1d( + x, + kernel_size, + stride=None, + padding=0, + return_mask=False, + ceil_mode=False, + name=None, +): 
""" This API implements max pooling 1d opereation. See more details in :ref:`api_nn_pooling_MaxPool1d` . @@ -547,12 +665,6 @@ def max_pool1d(x, Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the input is not a 3-D tensor. - ShapeError: If the output's shape calculated is not greater than 0. - Examples: .. code-block:: python @@ -577,44 +689,96 @@ def max_pool1d(x, else: stride = [1] + utils.convert_to_list(stride, 1, 'pool_stride') - padding, padding_algorithm = _update_padding_nd(padding, - 1, - ceil_mode=ceil_mode) + padding, padding_algorithm = _update_padding_nd( + padding, 1, ceil_mode=ceil_mode + ) # use 2d to implenment 1d should expand padding in advance. padding = _expand_low_nd_padding(padding) if in_dygraph_mode(): if return_mask: - pool_out = _C_ops.max_pool2d_with_index(x, kernel_size, stride, - padding, False, False) - return (squeeze(pool_out[0], [2]), - squeeze(pool_out[1], [2])) if return_mask else squeeze( - pool_out[0], [2]) + pool_out = _C_ops.max_pool2d_with_index( + x, kernel_size, stride, padding, False, False + ) + return ( + (squeeze(pool_out[0], [2]), squeeze(pool_out[1], [2])) + if return_mask + else squeeze(pool_out[0], [2]) + ) else: - pool_out = _C_ops.pool2d(x, kernel_size, stride, padding, ceil_mode, - True, data_format, 'max', False, False, - padding_algorithm, True) + pool_out = _C_ops.pool2d( + x, + kernel_size, + stride, + padding, + ceil_mode, + True, + data_format, + 'max', + False, + False, + padding_algorithm, + True, + ) return squeeze(pool_out, [2]) if _in_legacy_dygraph(): if return_mask: pool_out = _legacy_C_ops.max_pool2d_with_index( - x, 'ksize', kernel_size, 'global_pooling', False, 'strides', - stride, 'paddings', padding, 'padding_algorithm', - padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', True, 'data_format', - data_format) - return (squeeze(pool_out[0], [2]), - squeeze(pool_out[1], [2])) if return_mask else squeeze( - pool_out[0], [2]) + x, + 'ksize', + kernel_size, + 'global_pooling', + False, + 'strides', + stride, + 'paddings', + padding, + 'padding_algorithm', + padding_algorithm, + 'use_cudnn', + True, + 'ceil_mode', + ceil_mode, + 'use_mkldnn', + False, + 'exclusive', + True, + 'data_format', + data_format, + ) + return ( + (squeeze(pool_out[0], [2]), squeeze(pool_out[1], [2])) + if return_mask + else squeeze(pool_out[0], [2]) + ) else: pool_out = _legacy_C_ops.pool2d( - x, 'pooling_type', 'max', 'ksize', kernel_size, - 'global_pooling', False, 'padding_algorithm', padding_algorithm, - 'strides', stride, 'paddings', padding, 'use_cudnn', True, - 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, - 'data_format', data_format) + x, + 'pooling_type', + 'max', + 'ksize', + kernel_size, + 'global_pooling', + False, + 'padding_algorithm', + padding_algorithm, + 'strides', + stride, + 'paddings', + padding, + 'use_cudnn', + True, + 'ceil_mode', + ceil_mode, + 'use_mkldnn', + False, + 'exclusive', + True, + 'data_format', + data_format, + ) return squeeze(pool_out, [2]) op_type = 'max_pool2d_with_index' if return_mask else "pool2d" @@ -624,37 +788,48 @@ def max_pool1d(x, mask = helper.create_variable_for_type_inference('int32') outputs = {"Out": pool_out, "Mask": mask} - helper.append_op(type=op_type, - inputs={"X": x}, - outputs=outputs, - attrs={ - "pooling_type": 'max', - 
"ksize": kernel_size, - "global_pooling": False, - "strides": stride, - "paddings": padding, - "padding_algorithm": padding_algorithm, - "use_cudnn": True, - "ceil_mode": ceil_mode, - "use_mkldnn": False, - "exclusive": True, - "data_format": data_format, - }) - - return (squeeze(pool_out, [2]), - squeeze(mask, [2])) if return_mask else squeeze(pool_out, [2]) + helper.append_op( + type=op_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'max', + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": True, + "data_format": data_format, + }, + ) + + return ( + (squeeze(pool_out, [2]), squeeze(mask, [2])) + if return_mask + else squeeze(pool_out, [2]) + ) def _unpool_output_size(x, kernel_size, stride, padding, output_size): + assert output_size is None or isinstance(output_size, (list, tuple)), ( + "Required output_size is None|list|tuple, but received %s" % output_size + ) input_size = x.shape default_size = [] for d in range(len(kernel_size)): - default_size.append((input_size[-len(kernel_size) + d] - 1) * - stride[d] + kernel_size[d] - 2 * padding[d]) + default_size.append( + (input_size[-len(kernel_size) + d] - 1) * stride[d] + + kernel_size[d] + - 2 * padding[d] + ) has_static_var = False if output_size is None: - ret = default_size + return default_size elif utils._contain_var(output_size): if not _non_static_mode(): has_static_var = True @@ -663,46 +838,49 @@ def _unpool_output_size(x, kernel_size, stride, padding, output_size): for i, var in enumerate(output_size): if isinstance(var, Variable): output_size[i] = var.numpy()[0] - ret = output_size - else: - if len(output_size) == len(kernel_size) + 2: - output_size = output_size[2:] - if len(output_size) != len(kernel_size): - raise ValueError( - "output_size should be a sequence containing " - "{} or {} elements, but it has a length of '{}'".format( - len(kernel_size), - len(kernel_size) + 2, len(output_size))) - if not has_static_var: - for d in range(len(kernel_size)): - min_size = default_size[d] - stride[d] - max_size = default_size[d] + stride[d] - if not (min_size < output_size[d] < max_size): - raise ValueError( - 'invalid output_size "{}" (dim {} must be between {} and {})' - .format(output_size, d, min_size, max_size)) - - ret = output_size - return ret - - -def max_unpool1d(x, - indices, - kernel_size, - stride=None, - padding=0, - data_format="NCL", - output_size=None, - name=None): + + if len(output_size) == len(kernel_size) + 2: + output_size = output_size[2:] + if len(output_size) != len(kernel_size): + raise ValueError( + "output_size should be a sequence containing " + "{} or {} elements, but it has a length of '{}'".format( + len(kernel_size), len(kernel_size) + 2, len(output_size) + ) + ) + if not has_static_var: + for d in range(len(kernel_size)): + min_size = default_size[d] - stride[d] + max_size = default_size[d] + stride[d] + if not (min_size < output_size[d] < max_size): + raise ValueError( + 'invalid output_size "{}" (dim {} must be between {} and {})'.format( + output_size, d, min_size, max_size + ) + ) + + return output_size + + +def max_unpool1d( + x, + indices, + kernel_size, + stride=None, + padding=0, + data_format="NCL", + output_size=None, + name=None, +): r""" This API implements max unpooling 1d opereation. 
- `max_unpool1d` accepts the output of `max_pool1d` as input, - including the indices of the maximum value and calculate the partial inverse. + `max_unpool1d` accepts the output of `max_pool1d` as input, + including the indices of the maximum value and calculate the partial inverse. All non-maximum values ​​are set to zero. - Input: :math:`(N, C, L_{in})` - Output: :math:`(N, C, L_{out})`, where - + .. math:: L_{out} = (L_{in} - 1) * stride - 2 * padding + kernel\_size @@ -711,11 +889,11 @@ def max_unpool1d(x, Args: x (Tensor): The input tensor of unpooling operator which is a 3-D tensor with - shape [N, C, L]. The format of input tensor is `"NCL"`, + shape [N, C, L]. The format of input tensor is `"NCL"`, where `N` is batch size, `C` is the number of channels, `L` is the length of the feature. The data type is float32 or float64. indices (Tensor): The indices given out by maxpooling1d which is a 3-D tensor with - shape [N, C, L]. The format of input tensor is `"NCL"` , + shape [N, C, L]. The format of input tensor is `"NCL"` , where `N` is batch size, `C` is the number of channels, `L` is the length of the featuree. The data type is float32 or float64. kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, @@ -723,7 +901,7 @@ def max_unpool1d(x, stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, it must contain an integer. padding (int | tuple): Padding that was added to the input. - output_size(list|tuple, optional): The target output size. If output_size is not specified, + output_size(list|tuple, optional): The target output size. If output_size is not specified, the actual output shape will be automatically calculated by (input_shape, kernel_size, stride, padding). data_format (string): The data format of the input and output data. @@ -734,11 +912,11 @@ def max_unpool1d(x, None by default. Returns: - Tensor: The output tensor of unpooling result. + Tensor: The output tensor of unpooling result. Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F @@ -751,8 +929,10 @@ def max_unpool1d(x, """ """NCL to NCHW""" if data_format not in ["NCL"]: - raise ValueError("Attr(data_format) should be 'NCL'. Received " - "Attr(data_format): %s." % str(data_format)) + raise ValueError( + "Attr(data_format) should be 'NCL'. Received " + "Attr(data_format): %s." % str(data_format) + ) data_format = "NCHW" x = unsqueeze(x, [2]) indices = unsqueeze(indices, [2]) @@ -765,18 +945,32 @@ def max_unpool1d(x, # use 2d to implenment 1d should expand padding in advance. 
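# A minimal round-trip sketch for the formula above (illustrative values only):
# with kernel_size=2, stride=2 and padding=0, a length-16 input pools to length 8
# and unpools back to (8 - 1) * 2 - 2 * 0 + 2 = 16.
import paddle
import paddle.nn.functional as F

data = paddle.rand(shape=[1, 3, 16])
pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2,
                                 padding=0, return_mask=True)
# pool_out.shape == indices.shape == [1, 3, 8]
unpool_out = F.max_unpool1d(pool_out, indices, kernel_size=2, padding=0)
# unpool_out.shape == [1, 3, 16]; non-maximum positions are filled with zeros.
# An explicit output_size is also accepted, but _unpool_output_size above only
# allows lengths strictly between default - stride and default + stride
# (here: 15, 16 or 17).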
padding = _expand_low_nd_padding(padding) - output_size = _unpool_output_size(x, kernel_size, stride, padding, - output_size) + output_size = _unpool_output_size( + x, kernel_size, stride, padding, output_size + ) if in_dygraph_mode(): - output = _C_ops.unpool(x, indices, kernel_size, stride, padding, - output_size, data_format) + output = _C_ops.unpool( + x, indices, kernel_size, stride, padding, output_size, data_format + ) return squeeze(output, [2]) elif in_dynamic_mode(): - output = _legacy_C_ops.unpool(x, indices, 'unpooling_type', 'max', - 'ksize', kernel_size, 'strides', stride, - 'paddings', padding, "output_size", - output_size, "data_format", data_format) + output = _legacy_C_ops.unpool( + x, + indices, + 'unpooling_type', + 'max', + 'ksize', + kernel_size, + 'strides', + stride, + 'paddings', + padding, + "output_size", + output_size, + "data_format", + data_format, + ) return squeeze(output, [2]) op_type = "unpool" @@ -784,30 +978,31 @@ def max_unpool1d(x, dtype = helper.input_dtype(input_param_name="x") unpool_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type=op_type, - inputs={ - "X": x, - "Indices": indices - }, - outputs={"Out": unpool_out}, - attrs={ - "unpooling_type": "max", - "ksize": kernel_size, - "strides": stride, - "paddings": padding, - "output_size": output_size - }) + helper.append_op( + type=op_type, + inputs={"X": x, "Indices": indices}, + outputs={"Out": unpool_out}, + attrs={ + "unpooling_type": "max", + "ksize": kernel_size, + "strides": stride, + "paddings": padding, + "output_size": output_size, + }, + ) return squeeze(unpool_out, [2]) -def max_unpool2d(x, - indices, - kernel_size, - stride=None, - padding=0, - data_format="NCHW", - output_size=None, - name=None): +def max_unpool2d( + x, + indices, + kernel_size, + stride=None, + padding=0, + data_format="NCHW", + output_size=None, + name=None, +): r""" This API implements max unpooling 2d opereation. See more details in :ref:`api_nn_pooling_MaxUnPool2D` . @@ -815,12 +1010,12 @@ def max_unpool2d(x, Args: x (Tensor): The input tensor of unpooling operator which is a 4-D tensor with - shape [N, C, H, W]. The format of input tensor is `"NCHW"`, + shape [N, C, H, W]. The format of input tensor is `"NCHW"`, where `N` is batch size, `C` is the number of channels, `H` is the height of the feature, and `W` is the width of the feature. The data type if float32 or float64. indices (Tensor): The indices given out by maxpooling2d which is a 4-D tensor with - shape [N, C, H, W]. The format of input tensor is `"NCHW"` , + shape [N, C, H, W]. The format of input tensor is `"NCHW"` , where `N` is batch size, `C` is the number of channels, `H` is the height of the feature, and `W` is the width of the feature. The data type if float32 or float64. @@ -829,7 +1024,7 @@ def max_unpool2d(x, stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, it must contain an integer. padding (int | tuple): Padding that was added to the input. - output_size(list|tuple, optional): The target output size. If output_size is not specified, + output_size(list|tuple, optional): The target output size. If output_size is not specified, the actual output shape will be automatically calculated by (input_shape, kernel_size, padding). name(str, optional): For detailed information, please refer @@ -849,16 +1044,16 @@ def max_unpool2d(x, or as given by :attr:`output_size` in the call operator Returns: - Tensor: The output tensor of unpooling result. 
+ Tensor: The output tensor of unpooling result. Raises: ValueError: If the input is not a 4-D tensor. ValueError: If indeces shape is not equal input shape. - + Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F @@ -868,9 +1063,9 @@ def max_unpool2d(x, unpool_out = F.max_unpool2d(pool_out, indices, kernel_size=2, padding=0) # unpool_out shape: [1, 1, 6, 6] - # specify a different output size than input size + # specify a different output size than input size unpool_out = F.max_unpool2d(pool_out, indices, kernel_size=2, padding=0, output_size=[7,7]) - # unpool_out shape: [1, 1, 7, 7] + # unpool_out shape: [1, 1, 7, 7] """ kernel_size = utils.convert_to_list(kernel_size, 2, 'pool_size') @@ -881,21 +1076,37 @@ def max_unpool2d(x, padding = utils.convert_to_list(padding, 2, 'padding') if data_format not in ["NCHW"]: - raise ValueError("Attr(data_format) should be 'NCHW'. Received " - "Attr(data_format): %s." % str(data_format)) + raise ValueError( + "Attr(data_format) should be 'NCHW'. Received " + "Attr(data_format): %s." % str(data_format) + ) - output_size = _unpool_output_size(x, kernel_size, stride, padding, - output_size) + output_size = _unpool_output_size( + x, kernel_size, stride, padding, output_size + ) if in_dygraph_mode(): - output = _C_ops.unpool(x, indices, kernel_size, stride, padding, - output_size, data_format) + output = _C_ops.unpool( + x, indices, kernel_size, stride, padding, output_size, data_format + ) return output elif in_dynamic_mode(): - output = _legacy_C_ops.unpool(x, indices, 'unpooling_type', 'max', - 'ksize', kernel_size, 'strides', stride, - 'paddings', padding, "output_size", - output_size, "data_format", data_format) + output = _legacy_C_ops.unpool( + x, + indices, + 'unpooling_type', + 'max', + 'ksize', + kernel_size, + 'strides', + stride, + 'paddings', + padding, + "output_size", + output_size, + "data_format", + data_format, + ) return output op_type = "unpool" @@ -903,39 +1114,40 @@ def max_unpool2d(x, dtype = helper.input_dtype(input_param_name="x") unpool_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type=op_type, - inputs={ - "X": x, - "Indices": indices - }, - outputs={"Out": unpool_out}, - attrs={ - "unpooling_type": "max", - "ksize": kernel_size, - "strides": stride, - "paddings": padding, - "output_size": output_size - }) + helper.append_op( + type=op_type, + inputs={"X": x, "Indices": indices}, + outputs={"Out": unpool_out}, + attrs={ + "unpooling_type": "max", + "ksize": kernel_size, + "strides": stride, + "paddings": padding, + "output_size": output_size, + }, + ) return unpool_out -def max_unpool3d(x, - indices, - kernel_size, - stride=None, - padding=0, - data_format="NCDHW", - output_size=None, - name=None): +def max_unpool3d( + x, + indices, + kernel_size, + stride=None, + padding=0, + data_format="NCDHW", + output_size=None, + name=None, +): r""" This API implements max unpooling 3d opereation. - `max_unpool3d` accepts the output of `max_pool3d` as input, - including the indices of the maximum value and calculate the partial inverse. + `max_unpool3d` accepts the output of `max_pool3d` as input, + including the indices of the maximum value and calculate the partial inverse. All non-maximum values ​​are set to zero. - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where - + .. 
math:: D_{out} = (D_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] @@ -950,21 +1162,21 @@ def max_unpool3d(x, Args: x (Tensor): The input tensor of unpooling operator which is a 5-D tensor with - shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"`, + shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"`, where `N` is batch size, `C` is the number of channels, `D` is - the depth of the feature, `H` is the height of the feature, + the depth of the feature, `H` is the height of the feature, and `W` is the width of the feature. The data type is float32 or float64. indices (Tensor): The indices given out by maxpooling3d which is a 5-D tensor with - shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` , + shape [N, C, D, H, W]. The format of input tensor is `"NCDHW"` , where `N` is batch size, `C` is the number of channels, `D` is - the depth of the feature, `H` is the height of the feature, + the depth of the feature, `H` is the height of the feature, and `W` is the width of the feature. The data type is float32 or float64. kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, it must contain an integer. stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, it must contain an integer. padding (int | tuple): Padding that was added to the input. - output_size(list|tuple, optional): The target output size. If output_size is not specified, + output_size(list|tuple, optional): The target output size. If output_size is not specified, the actual output shape will be automatically calculated by (input_shape, kernel_size, stride, padding). data_format (string): The data format of the input and output data. @@ -975,11 +1187,11 @@ def max_unpool3d(x, None by default. Returns: - Tensor: The output tensor of unpooling result. + Tensor: The output tensor of unpooling result. Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F @@ -998,21 +1210,37 @@ def max_unpool3d(x, padding = utils.convert_to_list(padding, 3, 'padding') if data_format not in ["NCDHW"]: - raise ValueError("Attr(data_format) should be 'NCDHW'. Received " - "Attr(data_format): %s." % str(data_format)) + raise ValueError( + "Attr(data_format) should be 'NCDHW'. Received " + "Attr(data_format): %s." 
% str(data_format) + ) - output_size = _unpool_output_size(x, kernel_size, stride, padding, - output_size) + output_size = _unpool_output_size( + x, kernel_size, stride, padding, output_size + ) if in_dygraph_mode(): - output = _C_ops.unpool3d(x, indices, kernel_size, stride, padding, - output_size, data_format) + output = _C_ops.unpool3d( + x, indices, kernel_size, stride, padding, output_size, data_format + ) return output elif in_dynamic_mode(): - output = _legacy_C_ops.unpool3d(x, indices, 'unpooling_type', 'max', - 'ksize', kernel_size, 'strides', stride, - 'paddings', padding, "output_size", - output_size, "data_format", data_format) + output = _legacy_C_ops.unpool3d( + x, + indices, + 'unpooling_type', + 'max', + 'ksize', + kernel_size, + 'strides', + stride, + 'paddings', + padding, + "output_size", + output_size, + "data_format", + data_format, + ) return output op_type = "unpool3d" @@ -1020,30 +1248,31 @@ def max_unpool3d(x, dtype = helper.input_dtype(input_param_name="x") unpool_out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type=op_type, - inputs={ - "X": x, - "Indices": indices - }, - outputs={"Out": unpool_out}, - attrs={ - "unpooling_type": "max", - "ksize": kernel_size, - "strides": stride, - "paddings": padding, - "output_size": output_size - }) + helper.append_op( + type=op_type, + inputs={"X": x, "Indices": indices}, + outputs={"Out": unpool_out}, + attrs={ + "unpooling_type": "max", + "ksize": kernel_size, + "strides": stride, + "paddings": padding, + "output_size": output_size, + }, + ) return unpool_out -def max_pool2d(x, - kernel_size, - stride=None, - padding=0, - return_mask=False, - ceil_mode=False, - data_format="NCHW", - name=None): +def max_pool2d( + x, + kernel_size, + stride=None, + padding=0, + return_mask=False, + ceil_mode=False, + data_format="NCHW", + name=None, +): """ This API implements max pooling 2d operation. See more details in :ref:`api_nn_pooling_MaxPool2d` . @@ -1078,11 +1307,6 @@ def max_pool2d(x, Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. - Examples: .. code-block:: python @@ -1107,14 +1331,14 @@ def max_pool2d(x, if data_format not in ["NCHW", "NHWC"]: raise ValueError( "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " - "Attr(data_format): %s." % str(data_format)) + "Attr(data_format): %s." 
% str(data_format) + ) channel_last = True if data_format == "NHWC" else False - padding, padding_algorithm = _update_padding_nd(padding, - num_dims=2, - channel_last=channel_last, - ceil_mode=ceil_mode) + padding, padding_algorithm = _update_padding_nd( + padding, num_dims=2, channel_last=channel_last, ceil_mode=ceil_mode + ) if data_format == "NHWC" and return_mask: raise ValueError( @@ -1123,69 +1347,122 @@ def max_pool2d(x, if in_dygraph_mode(): if return_mask: - output = _C_ops.max_pool2d_with_index(x, kernel_size, stride, - padding, False, False) + output = _C_ops.max_pool2d_with_index( + x, kernel_size, stride, padding, False, False + ) return output if return_mask else output[0] else: - return _C_ops.pool2d(x, kernel_size, stride, padding, ceil_mode, - True, data_format, 'max', False, False, - padding_algorithm, True) + return _C_ops.pool2d( + x, + kernel_size, + stride, + padding, + ceil_mode, + True, + data_format, + 'max', + False, + False, + padding_algorithm, + True, + ) if _in_legacy_dygraph(): if return_mask: output = _legacy_C_ops.max_pool2d_with_index( - x, 'ksize', kernel_size, 'global_pooling', False, 'strides', - stride, 'paddings', padding, 'padding_algorithm', - padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', True, 'data_format', - data_format) + x, + 'ksize', + kernel_size, + 'global_pooling', + False, + 'strides', + stride, + 'paddings', + padding, + 'padding_algorithm', + padding_algorithm, + 'use_cudnn', + True, + 'ceil_mode', + ceil_mode, + 'use_mkldnn', + False, + 'exclusive', + True, + 'data_format', + data_format, + ) return output if return_mask else output[0] else: output = _legacy_C_ops.pool2d( - x, 'pooling_type', 'max', 'ksize', kernel_size, - 'global_pooling', False, 'padding_algorithm', padding_algorithm, - 'strides', stride, 'paddings', padding, 'use_cudnn', True, - 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, - 'data_format', data_format) + x, + 'pooling_type', + 'max', + 'ksize', + kernel_size, + 'global_pooling', + False, + 'padding_algorithm', + padding_algorithm, + 'strides', + stride, + 'paddings', + padding, + 'use_cudnn', + True, + 'ceil_mode', + ceil_mode, + 'use_mkldnn', + False, + 'exclusive', + True, + 'data_format', + data_format, + ) return output op_type = 'max_pool2d_with_index' if return_mask else "pool2d" helper = LayerHelper(op_type, **locals()) - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'max_pool2d') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'max_pool2d' + ) dtype = helper.input_dtype(input_param_name='x') pool_out = helper.create_variable_for_type_inference(dtype) mask = helper.create_variable_for_type_inference("int32") outputs = {"Out": pool_out, "Mask": mask} - helper.append_op(type=op_type, - inputs={"X": x}, - outputs=outputs, - attrs={ - "pooling_type": 'max', - "ksize": kernel_size, - "global_pooling": False, - "strides": stride, - "paddings": padding, - "padding_algorithm": padding_algorithm, - "use_cudnn": True, - "ceil_mode": ceil_mode, - "use_mkldnn": False, - "exclusive": True, - "data_format": data_format, - }) + helper.append_op( + type=op_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'max', + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": True, + "data_format": data_format, + }, + ) return 
(pool_out, mask) if return_mask else pool_out -def max_pool3d(x, - kernel_size, - stride=None, - padding=0, - return_mask=False, - ceil_mode=False, - data_format="NCDHW", - name=None): +def max_pool3d( + x, + kernel_size, + stride=None, + padding=0, + return_mask=False, + ceil_mode=False, + data_format="NCDHW", + name=None, +): """ This API implements max pooling 2d operation. See more details in :ref:`api_nn_pooling_MaxPool3d` . @@ -1215,15 +1492,10 @@ def max_pool3d(x, name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - + Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. - Examples: .. code-block:: python @@ -1255,10 +1527,9 @@ def max_pool3d(x, channel_last = _channel_last(data_format, 3) - padding, padding_algorithm = _update_padding_nd(padding, - 3, - channel_last=channel_last, - ceil_mode=ceil_mode) + padding, padding_algorithm = _update_padding_nd( + padding, 3, channel_last=channel_last, ceil_mode=ceil_mode + ) if data_format == "NDHWC" and return_mask: raise ValueError( @@ -1267,30 +1538,80 @@ def max_pool3d(x, if in_dygraph_mode(): if return_mask: - output = _C_ops.max_pool3d_with_index(x, kernel_size, stride, - padding, False, False) + output = _C_ops.max_pool3d_with_index( + x, kernel_size, stride, padding, False, False + ) return output if return_mask else output[0] else: - return _C_ops.pool3d(x, kernel_size, stride, padding, ceil_mode, - True, data_format, 'max', False, False, - padding_algorithm, True) + return _C_ops.pool3d( + x, + kernel_size, + stride, + padding, + ceil_mode, + True, + data_format, + 'max', + False, + False, + padding_algorithm, + True, + ) if _in_legacy_dygraph(): if return_mask: output = _legacy_C_ops.max_pool3d_with_index( - x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', - stride, 'paddings', padding, 'global_pooling', False, - 'padding_algorithm', padding_algorithm, 'use_cudnn', True, - 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, - 'data_format', data_format) + x, + 'pooling_type', + 'max', + 'ksize', + kernel_size, + 'strides', + stride, + 'paddings', + padding, + 'global_pooling', + False, + 'padding_algorithm', + padding_algorithm, + 'use_cudnn', + True, + 'ceil_mode', + ceil_mode, + 'use_mkldnn', + False, + 'exclusive', + True, + 'data_format', + data_format, + ) return output if return_mask else output[0] else: output = _legacy_C_ops.pool3d( - x, 'pooling_type', 'max', 'ksize', kernel_size, - 'global_pooling', False, 'padding_algorithm', padding_algorithm, - 'strides', stride, 'paddings', padding, 'use_cudnn', True, - 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, - 'data_format', data_format) + x, + 'pooling_type', + 'max', + 'ksize', + kernel_size, + 'global_pooling', + False, + 'padding_algorithm', + padding_algorithm, + 'strides', + stride, + 'paddings', + padding, + 'use_cudnn', + True, + 'ceil_mode', + ceil_mode, + 'use_mkldnn', + False, + 'exclusive', + True, + 'data_format', + data_format, + ) return output op_type = "max_pool3d_with_index" if return_mask else "pool3d" @@ -1301,30 +1622,32 @@ def max_pool3d(x, mask = helper.create_variable_for_type_inference('int32') outputs = {"Out": pool_out, "Mask": mask} - helper.append_op(type=op_type, - 
inputs={"X": x}, - outputs=outputs, - attrs={ - "pooling_type": 'max', - "ksize": kernel_size, - "global_pooling": False, - "strides": stride, - "paddings": padding, - "padding_algorithm": padding_algorithm, - "use_cudnn": True, - "ceil_mode": ceil_mode, - "use_mkldnn": False, - "exclusive": False, - "data_format": data_format, - }) + helper.append_op( + type=op_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'max', + "ksize": kernel_size, + "global_pooling": False, + "strides": stride, + "paddings": padding, + "padding_algorithm": padding_algorithm, + "use_cudnn": True, + "ceil_mode": ceil_mode, + "use_mkldnn": False, + "exclusive": False, + "data_format": data_format, + }, + ) return (pool_out, mask) if return_mask else pool_out def adaptive_avg_pool1d(x, output_size, name=None): """ - Adaptive average pooling 1d operation on :attr:`x` according to :attr:`output_size`. - + Adaptive average pooling 1d operation on :attr:`x` according to :attr:`output_size`. + Notes: See more details in :ref:`api_nn_pooling_AdaptiveAvgPool1d` . @@ -1332,10 +1655,10 @@ def adaptive_avg_pool1d(x, output_size, name=None): x (Tensor): The input Tensor of pooling, which is a 3-D tensor with shape :math:`[N, C, L]`, where :math:`N` is batch size, :math:`C` is the number of channels and :math:`L` is the length of the feature. The data type is float32 or float64. output_size (int): The target output size. Its data type must be int. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor: The result of 1D adaptive average pooling. Its data type is same as input. - + Examples: .. code-block:: python @@ -1360,21 +1683,34 @@ def adaptive_avg_pool1d(x, output_size, name=None): """ pool_type = 'avg' if not in_dynamic_mode(): - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'adaptive_pool2d') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'adaptive_pool2d' + ) check_type(output_size, 'pool_size', (int), 'adaptive_pool1d') _check_input(x, 3) pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') x = unsqueeze(x, [2]) if in_dygraph_mode(): - pool_out = _C_ops.pool2d(x, pool_size, [1, 1], [0, 0], False, True, - "NCHW", pool_type, False, True, "EXPLICIT", - False) + pool_out = _C_ops.pool2d( + x, + pool_size, + [1, 1], + [0, 0], + False, + True, + "NCHW", + pool_type, + False, + True, + "EXPLICIT", + False, + ) return squeeze(pool_out, [2]) if _in_legacy_dygraph(): - pool_out = _legacy_C_ops.pool2d(x, 'pooling_type', pool_type, 'ksize', - pool_size, 'adaptive', True) + pool_out = _legacy_C_ops.pool2d( + x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True + ) return squeeze(pool_out, [2]) l_type = "pool2d" @@ -1384,29 +1720,33 @@ def adaptive_avg_pool1d(x, output_size, name=None): pool_out = helper.create_variable_for_type_inference(dtype) outputs = {"Out": pool_out} - helper.append_op(type=l_type, - inputs={"X": x}, - outputs=outputs, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "adaptive": True, - }) + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, + }, + ) return squeeze(pool_out, [2]) def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): - """ + r""" + Applies 2D adaptive avg pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. 
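# A small arithmetic sketch of the adaptive bin boundaries used by the adaptive
# pooling APIs in this file: output index i covers the half-open window
# [floor(i * L_in / L_out), ceil((i + 1) * L_in / L_out)). Purely illustrative.
import math

L_in, L_out = 32, 16
bins = [(math.floor(i * L_in / L_out), math.ceil((i + 1) * L_in / L_out))
        for i in range(L_out)]
# bins == [(0, 2), (2, 4), ..., (30, 32)]: every window has length 2, so
# adaptive_avg_pool1d(x, output_size=16) on a [1, 3, 32] input behaves like a
# fixed avg pool with kernel_size=2 and stride=2 whenever L_in is divisible by L_out.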
- + For avg adaptive pool2d: + .. math:: - hstart &= floor(i * H_{in} / H_{out}) - hend &= ceil((i + 1) * H_{in} / H_{out}) - wstart &= floor(j * W_{in} / W_{out}) - wend &= ceil((j + 1) * W_{in} / W_{out}) + hstart &= floor(i * H_{in} / H_{out}) \\ + hend &= ceil((i + 1) * H_{in} / H_{out}) \\ + wstart &= floor(j * W_{in} / W_{out}) \\ + wend &= ceil((j + 1) * W_{in} / W_{out}) \\ Output(i ,j) &= \frac{\sum Input[hstart:hend, wstart:wend]}{(hend - hstart) * (wend - wstart)} Args: @@ -1415,14 +1755,15 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two element, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. - data_format (str): The data format of the input and output data. An optional string + data_format (str, optional): The data format of the input and output data. An optional string from: "NCHW", "NHWC". The default is "NCHW". When it is "NCHW", the data is stored in the order of: [batch_size, input_channels, input_height, input_width]. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Returns: - Tensor: The output tensor of avg adaptive pool2d result. The data type is same as input tensor. + Tensor, The output tensor of avg adaptive pool2d result. The data type is same as input tensor. Examples: .. code-block:: python @@ -1443,25 +1784,26 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) # import paddle - import numpy as np - input_data = np.random.rand(2, 3, 32, 32) - x = paddle.to_tensor(input_data) + x = paddle.rand([2, 3, 32, 32]) # x.shape is [2, 3, 32, 32] out = paddle.nn.functional.adaptive_avg_pool2d( x = x, output_size=[3, 3]) # out.shape is [2, 3, 3, 3] + """ if not in_dynamic_mode(): - check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], - 'adaptive_avg_pool2d') + check_variable_and_dtype( + x, 'x', ['float16', 'float32', 'float64'], 'adaptive_avg_pool2d' + ) check_type(data_format, 'data_format', str, 'adaptive_avg_pool2d') if data_format not in ["NCHW", "NHWC"]: raise ValueError( "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " - "Attr(data_format): %s." % str(data_format)) + "Attr(data_format): %s." 
% str(data_format) + ) if data_format == "NCHW": in_h, in_w = x.shape[2:4] @@ -1487,14 +1829,35 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): output_size = utils._convert_to_tensor_list(output_size) if in_dygraph_mode(): - return _C_ops.pool2d(x, output_size, [1, 1], [0, 0], False, True, - data_format, 'avg', False, True, "EXPLICIT", False) + return _C_ops.pool2d( + x, + output_size, + [1, 1], + [0, 0], + False, + True, + data_format, + 'avg', + False, + True, + "EXPLICIT", + False, + ) if _in_legacy_dygraph(): - return _legacy_C_ops.pool2d(x, 'pooling_type', 'avg', 'ksize', - output_size, 'global_pooling', False, - 'adaptive', True, 'data_format', - data_format) + return _legacy_C_ops.pool2d( + x, + 'pooling_type', + 'avg', + 'ksize', + output_size, + 'global_pooling', + False, + 'adaptive', + True, + 'data_format', + data_format, + ) l_type = 'pool2d' @@ -1504,49 +1867,53 @@ def adaptive_avg_pool2d(x, output_size, data_format='NCHW', name=None): outputs = {"Out": pool_out} - helper.append_op(type=l_type, - inputs={"X": x}, - outputs=outputs, - attrs={ - "pooling_type": "avg", - "ksize": output_size, - "adaptive": True, - "data_format": data_format, - }) + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": "avg", + "ksize": output_size, + "adaptive": True, + "data_format": data_format, + }, + ) return pool_out def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): - """ + r""" + This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. - + For avg adaptive pool3d: + .. math:: - dstart &= floor(i * D_{in} / D_{out}) - dend &= ceil((i + 1) * D_{in} / D_{out}) - hstart &= floor(j * H_{in} / H_{out}) - hend &= ceil((j + 1) * H_{in} / H_{out}) - wstart &= floor(k * W_{in} / W_{out}) - wend &= ceil((k + 1) * W_{in} / W_{out}) + dstart &= floor(i * D_{in} / D_{out}) \\ + dend &= ceil((i + 1) * D_{in} / D_{out}) \\ + hstart &= floor(j * H_{in} / H_{out}) \\ + hend &= ceil((j + 1) * H_{in} / H_{out}) \\ + wstart &= floor(k * W_{in} / W_{out}) \\ + wend &= ceil((k + 1) * W_{in} / W_{out}) \\ Output(i ,j, k) &= \frac{\sum Input[dstart:dend, hstart:hend, wstart:wend]} {(dend - dstart) * (hend - hstart) * (wend - wstart)} Args: x (Tensor): The input tensor of adaptive avg pool3d operator, which is a 5-D tensor. - The data type can be float32, float64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, - it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means - the size will be the same as that of the input. - data_format (str): The data format of the input and output data. An optional string + The data type can be float32, float64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or + list, it must contain three elements, (D, H, W). D, H and W can be either a int, + or None which means the size will be the same as that of the input. + data_format (str, optional): The data format of the input and output data. An optional string from: "NCDHW", "NDHWC". The default is "NCDHW". When it is "NCDHW", the data is stored in the order of: [batch_size, input_channels, input_depth, input_height, input_width]. - name(str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. 
+ name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. + Usually name is no need to set and None by default. + Returns: - Tensor: The output tensor of avg adaptive pool3d result. The data type is same as input tensor. + Tensor, The output tensor of avg adaptive pool3d result. The data type is same as input tensor. Examples: .. code-block:: python @@ -1576,16 +1943,19 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): x = input_data, output_size=[3, 3, 3]) # out.shape is [2, 3, 3, 3, 3] + """ if not in_dynamic_mode(): - check_variable_and_dtype(x, 'x', ['float32', 'float64'], - 'adaptive_avg_pool3d') + check_variable_and_dtype( + x, 'x', ['float32', 'float64'], 'adaptive_avg_pool3d' + ) check_type(data_format, 'data_format', str, 'adaptive_avg_pool3d') if data_format not in ["NCDHW", "NDHWC"]: raise ValueError( "Attr(data_format) should be 'NCDHW' or 'NDHWC'. Received " - "Attr(data_format): %s." % str(data_format)) + "Attr(data_format): %s." % str(data_format) + ) if data_format == "NCDHW": in_l, in_h, in_w = x.shape[2:5] @@ -1604,13 +1974,34 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): output_size[2] = in_w if in_dygraph_mode(): - return _C_ops.pool3d(x, output_size, [1, 1, 1], [0, 0, 0], False, True, - data_format, 'avg', False, True, "EXPLICIT", False) + return _C_ops.pool3d( + x, + output_size, + [1, 1, 1], + [0, 0, 0], + False, + True, + data_format, + 'avg', + False, + True, + "EXPLICIT", + False, + ) elif _in_legacy_dygraph(): - return _legacy_C_ops.pool3d(x, 'pooling_type', 'avg', 'ksize', - output_size, 'global_pooling', False, - 'adaptive', True, 'data_format', - data_format) + return _legacy_C_ops.pool3d( + x, + 'pooling_type', + 'avg', + 'ksize', + output_size, + 'global_pooling', + False, + 'adaptive', + True, + 'data_format', + data_format, + ) l_type = 'pool3d' @@ -1619,15 +2010,17 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): pool_out = helper.create_variable_for_type_inference(dtype) outputs = {"Out": pool_out} - helper.append_op(type=l_type, - inputs={"X": x}, - outputs=outputs, - attrs={ - "pooling_type": "avg", - "ksize": output_size, - "adaptive": True, - "data_format": data_format, - }) + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": "avg", + "ksize": output_size, + "adaptive": True, + "data_format": data_format, + }, + ) return pool_out @@ -1651,8 +2044,7 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): Returns: Tensor: The output tensor of adaptive pooling result. The data type is same as input tensor. - Raises: - ValueError: 'output_size' should be an integer. + Examples: .. 
code-block:: python @@ -1679,8 +2071,9 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): """ pool_type = 'max' if not in_dynamic_mode(): - check_variable_and_dtype(x, 'x', ['float32', 'float64'], - 'adaptive_max_pool1d') + check_variable_and_dtype( + x, 'x', ['float32', 'float64'], 'adaptive_max_pool1d' + ) check_type(output_size, 'pool_size', int, 'adaptive_max_pool1d') check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool1d') _check_input(x, 3) @@ -1689,17 +2082,23 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): x = unsqueeze(x, [2]) if in_dygraph_mode(): - pool_out = _C_ops.max_pool2d_with_index(x, pool_size, [1, 1], [0, 0], - False, True) - return (squeeze(pool_out[0], [2]), squeeze( - pool_out[1], [2])) if return_mask else squeeze(pool_out[0], [2]) + pool_out = _C_ops.max_pool2d_with_index( + x, pool_size, [1, 1], [0, 0], False, True + ) + return ( + (squeeze(pool_out[0], [2]), squeeze(pool_out[1], [2])) + if return_mask + else squeeze(pool_out[0], [2]) + ) if _in_legacy_dygraph(): - pool_out = _legacy_C_ops.max_pool2d_with_index(x, 'pooling_type', - pool_type, 'ksize', - pool_size, 'adaptive', - True) - return (squeeze(pool_out[0], [2]), squeeze( - pool_out[1], [2])) if return_mask else squeeze(pool_out[0], [2]) + pool_out = _legacy_C_ops.max_pool2d_with_index( + x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True + ) + return ( + (squeeze(pool_out[0], [2]), squeeze(pool_out[1], [2])) + if return_mask + else squeeze(pool_out[0], [2]) + ) l_type = 'max_pool2d_with_index' @@ -1710,64 +2109,70 @@ def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): mask = helper.create_variable_for_type_inference('int32') outputs = {"Out": pool_out, "Mask": mask} - helper.append_op(type=l_type, - inputs={"X": x}, - outputs=outputs, - attrs={ - "pooling_type": pool_type, - "ksize": pool_size, - "adaptive": True, - }) + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, + }, + ) - return (squeeze(pool_out, [2]), - squeeze(mask, [2])) if return_mask else squeeze(pool_out, [2]) + return ( + (squeeze(pool_out, [2]), squeeze(mask, [2])) + if return_mask + else squeeze(pool_out, [2]) + ) def adaptive_max_pool2d(x, output_size, return_mask=False, name=None): """ - This operation applies a 2D adaptive max pooling on input tensor. - See more details in :ref:`api_nn_pooling_AdaptiveMaxPool2d` . + This operation applies a 2D adaptive max pooling on input tensor. + See more details in :ref:`api_nn_pooling_AdaptiveMaxPool2d` . - Args: - x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float16, float32, float64, int32 or int64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two elements, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. - return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False. - name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Args: + x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float16, float32, float64, int32 or int64. + output_size (int|list|tuple): The pool kernel size. 
If pool kernel size is a tuple or list, it must contain two elements, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. + return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: - Tensor: The output tensor of adaptive max pool2d result. The data type is same as input tensor. + Returns: + Tensor: The output tensor of adaptive max pool2d result. The data type is same as input tensor. - Examples: - .. code-block:: python + Examples: + .. code-block:: python - # max adaptive pool2d - # suppose input data in the shape of [N, C, H, W], `output_size` is [m, n] - # output shape is [N, C, m, n], adaptive pool divide H and W dimensions - # of input data into m*n grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(m): - # for j in range(n): - # hstart = floor(i * H / m) - # hend = ceil((i + 1) * H / m) - # wstart = floor(i * W / n) - # wend = ceil((i + 1) * W / n) - # output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend]) - # - import paddle + # max adaptive pool2d + # suppose input data in the shape of [N, C, H, W], `output_size` is [m, n] + # output shape is [N, C, m, n], adaptive pool divide H and W dimensions + # of input data into m*n grids averagely and performs poolings in each + # grid to get output. + # adaptive max pool performs calculations as follow: + # + # for i in range(m): + # for j in range(n): + # hstart = floor(i * H / m) + # hend = ceil((i + 1) * H / m) + # wstart = floor(i * W / n) + # wend = ceil((i + 1) * W / n) + # output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend]) + # + import paddle - input_data = paddle.randn(shape=(2, 3, 32, 32)) - out = paddle.nn.functional.adaptive_max_pool2d( - x = input_data, - output_size=[3, 3]) - # out.shape is [2, 3, 3, 3] + input_data = paddle.randn(shape=(2, 3, 32, 32)) + out = paddle.nn.functional.adaptive_max_pool2d( + x = input_data, + output_size=[3, 3]) + # out.shape is [2, 3, 3, 3] """ if not in_dynamic_mode(): - check_variable_and_dtype(x, 'x', ['float32', 'float64'], - 'adaptive_max_pool2d') + check_variable_and_dtype( + x, 'x', ['float32', 'float64'], 'adaptive_max_pool2d' + ) check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool2d') - #check_type(output_size, 'pool_size', (int), 'adaptive_max_pool2d') + # check_type(output_size, 'pool_size', (int), 'adaptive_max_pool2d') _check_input(x, 4) in_h, in_w = x.shape[2:4] @@ -1780,13 +2185,14 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None): if output_size[1] == None: output_size[1] = in_w if in_dygraph_mode(): - pool_out = _C_ops.max_pool2d_with_index(x, output_size, [1, 1], [0, 0], - False, True) + pool_out = _C_ops.max_pool2d_with_index( + x, output_size, [1, 1], [0, 0], False, True + ) return pool_out if return_mask else pool_out[0] if _in_legacy_dygraph(): - pool_out = _legacy_C_ops.max_pool2d_with_index(x, 'pooling_type', 'max', - 'ksize', output_size, - 'adaptive', True) + pool_out = _legacy_C_ops.max_pool2d_with_index( + x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True + ) return pool_out if return_mask else pool_out[0] l_type = 'max_pool2d_with_index' @@ -1798,67 +2204,70 @@ def adaptive_max_pool2d(x, output_size, return_mask=False, name=None): mask = 
helper.create_variable_for_type_inference('int32') outputs = {"Out": pool_out, "Mask": mask} - helper.append_op(type=l_type, - inputs={"X": x}, - outputs=outputs, - attrs={ - "pooling_type": 'max', - "ksize": output_size, - "adaptive": True, - }) - #return (pool_out, mask) if return_mask else pool_out + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'max', + "ksize": output_size, + "adaptive": True, + }, + ) + # return (pool_out, mask) if return_mask else pool_out return pool_out def adaptive_max_pool3d(x, output_size, return_mask=False, name=None): """ - This operation applies a 3D adaptive max pooling on input tensor. - See more details in :ref:`api_nn_pooling_AdaptiveMaxPool3d` . + This operation applies a 3D adaptive max pooling on input tensor. + See more details in :ref:`api_nn_pooling_AdaptiveMaxPool3d` . - Args: - x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64. - output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input. - return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False. - name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. + Args: + x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64. + output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input. + return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False. + name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - Returns: - Tensor: The output tensor of adaptive max pool3d result. The data type is same as input tensor. + Returns: + Tensor: The output tensor of adaptive max pool3d result. The data type is same as input tensor. - Examples: - .. code-block:: python + Examples: + .. code-block:: python - # adaptive max pool3d - # suppose input data in the shape of [N, C, D, H, W], `output_size` is [l, m, n] - # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions - # of input data into m*n grids averagely and performs poolings in each - # grid to get output. - # adaptive max pool performs calculations as follow: - # - # for i in range(l): - # for j in range(m): - # for k in range(n): - # dstart = floor(i * D / l) - # dend = ceil((i + 1) * D / l) - # hstart = floor(i * H / m) - # hend = ceil((i + 1) * H / m) - # wstart = floor(i * W / n) - # wend = ceil((i + 1) * W / n) - # output[:, :, i, j, k] = max(input[:, :, dstart: dend, hstart: hend, wstart: wend]) - # - import paddle + # adaptive max pool3d + # suppose input data in the shape of [N, C, D, H, W], `output_size` is [l, m, n] + # output shape is [N, C, l, m, n], adaptive pool divide D, H and W dimensions + # of input data into m*n grids averagely and performs poolings in each + # grid to get output. 
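# A minimal usage sketch for the adaptive max pooling variants above, requesting
# the argmax mask as well (dynamic-graph behaviour; shapes illustrative only).
import paddle
import paddle.nn.functional as F

x = paddle.randn(shape=(2, 3, 32, 32))
out, mask = F.adaptive_max_pool2d(x, output_size=[3, 3], return_mask=True)
# out.shape == mask.shape == [2, 3, 3, 3]; mask stores, for each output cell,
# the flattened position of the selected maximum within the input H*W plane.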
+ # adaptive max pool performs calculations as follow: + # + # for i in range(l): + # for j in range(m): + # for k in range(n): + # dstart = floor(i * D / l) + # dend = ceil((i + 1) * D / l) + # hstart = floor(i * H / m) + # hend = ceil((i + 1) * H / m) + # wstart = floor(i * W / n) + # wend = ceil((i + 1) * W / n) + # output[:, :, i, j, k] = max(input[:, :, dstart: dend, hstart: hend, wstart: wend]) + # + import paddle - input_data = paddle.randn(shape=(2, 3, 8, 32, 32)) - out = paddle.nn.functional.adaptive_max_pool3d( - x = input_data, - output_size=[3, 3, 3]) - # out.shape is [2, 3, 3, 3, 3] + input_data = paddle.randn(shape=(2, 3, 8, 32, 32)) + out = paddle.nn.functional.adaptive_max_pool3d( + x = input_data, + output_size=[3, 3, 3]) + # out.shape is [2, 3, 3, 3, 3] """ if not in_dynamic_mode(): - check_variable_and_dtype(x, 'x', ['float32', 'float64'], - 'adaptive_max_pool3d') + check_variable_and_dtype( + x, 'x', ['float32', 'float64'], 'adaptive_max_pool3d' + ) check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool3d') - #check_type(output_size, 'pool_size', (int), 'adaptive_max_pool3d') + # check_type(output_size, 'pool_size', (int), 'adaptive_max_pool3d') _check_input(x, 5) in_l, in_h, in_w = x.shape[2:5] @@ -1876,12 +2285,13 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None): if in_dynamic_mode(): if in_dygraph_mode(): # By default, strides is [1,1,1] and paddings is [0, 0, 0] - pool_out = _C_ops.max_pool3d_with_index(x, output_size, [1, 1, 1], - [0, 0, 0], False, True) + pool_out = _C_ops.max_pool3d_with_index( + x, output_size, [1, 1, 1], [0, 0, 0], False, True + ) elif _in_legacy_dygraph(): pool_out = _legacy_C_ops.max_pool3d_with_index( - x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', - True) + x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True + ) return pool_out if return_mask else pool_out[0] l_type = 'max_pool3d_with_index' @@ -1893,13 +2303,15 @@ def adaptive_max_pool3d(x, output_size, return_mask=False, name=None): mask = helper.create_variable_for_type_inference('int32') outputs = {"Out": pool_out, "Mask": mask} - helper.append_op(type=l_type, - inputs={"X": x}, - outputs=outputs, - attrs={ - "pooling_type": 'max', - "ksize": output_size, - "adaptive": True, - }) + helper.append_op( + type=l_type, + inputs={"X": x}, + outputs=outputs, + attrs={ + "pooling_type": 'max', + "ksize": output_size, + "adaptive": True, + }, + ) return (pool_out, mask) if return_mask else pool_out diff --git a/python/paddle/nn/functional/sparse_attention.py b/python/paddle/nn/functional/sparse_attention.py index 77327bae5204f5..b4f6c8464ead51 100644 --- a/python/paddle/nn/functional/sparse_attention.py +++ b/python/paddle/nn/functional/sparse_attention.py @@ -20,62 +20,64 @@ from paddle import in_dynamic_mode -def sparse_attention(query, - key, - value, - sparse_csr_offset, - sparse_csr_columns, - key_padding_mask=None, - attn_mask=None, - name=None): +def sparse_attention( + query, + key, + value, + sparse_csr_offset, + sparse_csr_columns, + key_padding_mask=None, + attn_mask=None, + name=None, +): r""" This operator sparsify the Attention matrix in Transformer module - to achieve the effect of reducing memory consumption and computation. - The sparse layout is expressed in CSR format and contains two parameters, - ``offset`` and ``columns``. The equation is: + to achieve the effect of reducing memory consumption and computation. + The sparse layout is expressed in CSR format and contains two parameters, + ``offset`` and ``columns``. 
The equation is: .. math:: result=softmax(\frac{ Q * K^T }{\sqrt{d}}) * V - where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module. - The dimensions of the three parameters are the same. + where : ``Q``, ``K``, and ``V`` represent the three input parameters of the attention module. + The dimensions of the three parameters are the same. ``d`` represents the size of the last dimension of the three parameters. - Warning: + Warning: This API is only used in ``CUDA 11.3`` and above versions. Args: - query(Tensor): The query tensor in the Attention module. - 4-D tensor with shape: - [batch_size, num_heads, seq_len, head_dim]. + query(Tensor): The query tensor in the Attention module. + 4-D tensor with shape: + [batch_size, num_heads, seq_len, head_dim]. The dtype can be float32 and float64. - key(Tensor): The key tensor in the Attention module. - 4-D tensor with shape: - [batch_size, num_heads, seq_len, head_dim]. + key(Tensor): The key tensor in the Attention module. + 4-D tensor with shape: + [batch_size, num_heads, seq_len, head_dim]. The dtype can be float32 and float64. - value(Tensor): The value tensor in the Attention module. - 4-D tensor with shape: - [batch_size, num_heads, seq_len, head_dim]. + value(Tensor): The value tensor in the Attention module. + 4-D tensor with shape: + [batch_size, num_heads, seq_len, head_dim]. The dtype can be float32 and float64. - sparse_csr_offset(Tensor): The sparsity feature in the Attention module - is expressed in the CSR format, and the offset represents + sparse_csr_offset(Tensor): The sparsity feature in the Attention module + is expressed in the CSR format, and the offset represents the number of non-zero elements in each row of the matrix. - 3-D tensor with shape: - [batch_size, num_heads, seq_len + 1]. + 3-D tensor with shape: + [batch_size, num_heads, seq_len + 1]. The dtype should be int32. - sparse_csr_columns(Tensor): The sparsity feature in the Attention module - is expressed in the CSR format, and the columns represent + sparse_csr_columns(Tensor): The sparsity feature in the Attention module + is expressed in the CSR format, and the columns represent the column index values of non-zero elements in the matrix. - 3-D tensor with shape: - [batch_size, num_heads, sparse_nnz]. + 3-D tensor with shape: + [batch_size, num_heads, sparse_nnz]. The dtype should be int32. - key_padding_mask(Tensor, optional):The key padding mask tensor in the Attention module. - 2-D tensor with shape: [batch_size, seq_len]. + key_padding_mask(Tensor, optional):The key padding mask tensor in the Attention module. + 2-D tensor with shape: [batch_size, seq_len]. The dtype can be float32 and float64. A value of 0 means that the position is masked. - attn_mask(Tensor, optional):The attention mask tensor in the Attention module. - 2-D tensor with shape: [seq_len, seq_len]. + attn_mask(Tensor, optional):The attention mask tensor in the Attention module. + 2-D tensor with shape: [seq_len, seq_len]. The dtype can be float32 and float64. A value of 0 means that the position is masked. name(str, optional): The default value is None. Normally there is no need for user @@ -84,7 +86,7 @@ def sparse_attention(query, Returns: 4-D tensor with shape: - [batch_size, num_heads, seq_len, head_dim]. + [batch_size, num_heads, seq_len, head_dim]. The dtype can be float32 or float64. 
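# A hedged sketch of how a dense 0/1 attention mask maps onto the
# sparse_csr_offset / sparse_csr_columns layout described above.
# `dense_mask_to_csr` is a local illustrative helper, not a Paddle API.
import numpy as np

def dense_mask_to_csr(mask):
    # mask: [seq_len, seq_len] array of 0/1. offset has seq_len + 1 entries
    # (running count of non-zeros per row); columns lists the column index of
    # every non-zero element, row by row.
    offset = [0]
    columns = []
    for row in mask:
        cols = np.nonzero(row)[0].tolist()
        columns.extend(cols)
        offset.append(offset[-1] + len(cols))
    return offset, columns

mask = np.array([[1, 1, 0, 0],
                 [1, 1, 0, 0],
                 [0, 0, 1, 1],
                 [0, 0, 1, 1]])
offset, columns = dense_mask_to_csr(mask)
# offset  == [0, 2, 4, 6, 8]
# columns == [0, 1, 0, 1, 2, 3, 2, 3]
# Wrapped as [[offset]] / [[columns]] int32 tensors ([batch_size, num_heads, ...]),
# these match the CSR inputs used in the example below.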
Examples: @@ -92,61 +94,63 @@ def sparse_attention(query, # required: skiptest import paddle - import numpy as np - - query_data = np.array([[[[0, 1,], [2, 3], - [ 0, 1], [2, 3]]]]).astype("float32") - key_data = np.array([[[[0, 1,], [2, 3], - [ 0, 1], [2, 3]]]]).astype("float32") - value_data = np.array([[[[0, 1,], [2, 3], - [ 0, 1], [2, 3]]]]).astype("float32") - sparse_csr_offset_data = np.array([[[0, 2, - 4, 6, 8]]]).astype("int32") - sparse_csr_columns_data = np.array([[[0, 1, - 0, 1, 2, 3, 2, 3]]]).astype("int32") - key_padding_mask_data = np.array([[1,1,1,0]]).astype("float32") - attention_mask_data = np.array([[1,0,1,1],[1,1,1,1],[1,1,1,1],[1,1,1,1]]).astype("float32") - print(query_data.shape) - # (1, 1, 4, 2) - print(sparse_csr_offset_data.shape) - # (1, 1, 5) - print(sparse_csr_columns_data.shape) - # (1, 1, 8) + paddle.disable_static() - query = paddle.to_tensor(query_data, stop_gradient=False, - place=paddle.CUDAPlace(0)) - key = paddle.to_tensor(key_data, stop_gradient=False, - place=paddle.CUDAPlace(0)) - value = paddle.to_tensor(value_data, stop_gradient=False, - place=paddle.CUDAPlace(0)) - offset = paddle.to_tensor(sparse_csr_offset_data, stop_gradient=False, - place=paddle.CUDAPlace(0)) - columns = paddle.to_tensor(sparse_csr_columns_data, stop_gradient=False, - place=paddle.CUDAPlace(0)) - key_padding_mask = paddle.to_tensor(key_padding_mask_data, stop_gradient=False, - place=paddle.CUDAPlace(0)) - attention_mask = paddle.to_tensor(attention_mask_data, stop_gradient=False, - place=paddle.CUDAPlace(0)) - output_mask = paddle.nn.functional.sparse_attention(query, key, - value, offset, columns, - key_padding_mask=key_padding_mask, attn_mask=attention_mask) + + # `query`, `key` and `value` all have shape [1, 1, 4, 2] + query = paddle.to_tensor([[[[0, 1, ], [2, 3], + [0, 1], [2, 3]]]], dtype="float32") + key = paddle.to_tensor([[[[0, 1], [2, 3], + [0, 1], [2, 3]]]], dtype="float32") + value = paddle.to_tensor([[[[0, 1], [2, 3], + [0, 1], [2, 3]]]], dtype="float32") + + + offset = paddle.to_tensor([[[0, 2, 4, 6, 8]]], dtype="int32") + columns = paddle.to_tensor([[[0, 1, 0, 1, 2, 3, 2, 3]]], dtype="int32") + + print(offset.shape) # (1, 1, 5) + print(columns.shape) # (1, 1, 8) + + key_padding_mask = paddle.to_tensor([[1, 1, 1, 0]], dtype="float32") + attention_mask = paddle.to_tensor([[1, 0, 1, 1], + [1, 1, 1, 1], + [1, 1, 1, 1], + [1, 1, 1, 1]], dtype="float32") + output_mask = paddle.nn.functional.sparse_attention(query, key, + value, offset, columns, + key_padding_mask=key_padding_mask, + attn_mask=attention_mask) print(output_mask) - # [[[[0. , 1. ], - # [1.99830270, 2.99830270], - # [0. , 1. ], - # [0. , 1. ]]]] - output = paddle.nn.functional.sparse_attention(query, key, - value, offset, columns) - print(output) - # [[[[1.60885942, 2.60885954], - # [1.99830270, 2.99830270], - # [1.60885942, 2.60885954], - # [1.99830270, 2.99830270]]]] + # Tensor(shape=[1, 1, 4, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[[[0. , 1. ], + # [1.99830270, 2.99830270], + # [0. , 1. ], + # [0. , 1. 
]]]]) + + output = paddle.nn.functional.sparse_attention(query, key, + value, offset, columns) + print(output) + # Tensor(shape=[1, 1, 4, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[[[1.60885942, 2.60885954], + # [1.99830270, 2.99830270], + # [1.60885942, 2.60885954], + # [1.99830270, 2.99830270]]]]) """ if in_dynamic_mode(): - result_attention, result_sdd, result_softmax = _legacy_C_ops.sparse_attention( - query, key, value, sparse_csr_offset, sparse_csr_columns, - key_padding_mask, attn_mask) + ( + result_attention, + result_sdd, + result_softmax, + ) = _legacy_C_ops.sparse_attention( + query, + key, + value, + sparse_csr_offset, + sparse_csr_columns, + key_padding_mask, + attn_mask, + ) return result_attention helper = LayerHelper('sparse_attention', **locals()) @@ -166,7 +170,7 @@ def sparse_attention(query, outputs = { 'Out': out, 'SparseDotSdd': result_sdd, - 'Softmax': result_softmax + 'Softmax': result_softmax, } helper.append_op(type='sparse_attention', inputs=inputs, outputs=outputs) return out diff --git a/python/paddle/nn/functional/vision.py b/python/paddle/nn/functional/vision.py index 7b014204416498..c6c425a3a295fd 100644 --- a/python/paddle/nn/functional/vision.py +++ b/python/paddle/nn/functional/vision.py @@ -37,7 +37,7 @@ def affine_grid(theta, out_shape, align_corners=True, name=None): Args: theta (Tensor) - A tensor with shape [N, 2, 3] or [N, 3, 4]. It contains a batch of affine transform parameters. The data type can be float32 or float64. - out_shape (Tensor | list | tuple): Type can be a 1-D Tensor, list, or tuple. It is used to represent the shape of the output in an affine transformation, in the format ``[N, C, H, W]`` or ``[N, C, D, H, W]``. + out_shape (Tensor | list | tuple): Type can be a 1-D Tensor, list, or tuple. It is used to represent the shape of the output in an affine transformation, in the format ``[N, C, H, W]`` or ``[N, C, D, H, W]``. When the format is ``[N, C, H, W]``, it represents the batch size, number of channels, height and width. When the format is ``[N, C, D, H, W]``, it represents the batch size, number of channels, depth, height and width. The data type must be int32. align_corners(bool, optional): if True, aligns the centers of the 4 (4D) or 8 (5D) corner pixels of the input and output tensors, and preserves the value of the corner pixels. 
Default: True @@ -60,7 +60,7 @@ def affine_grid(theta, out_shape, align_corners=True, name=None): [1, 2, 3, 3], align_corners=False) print(y_t) - + #[[[[ 1.0333333 0.76666665] # [ 0.76666665 1.0999999 ] # [ 0.5 1.4333333 ]] @@ -84,62 +84,82 @@ def affine_grid(theta, out_shape, align_corners=True, name=None): if theta.shape[1] == 3: use_cudnn = False if is_compiled_with_rocm(): - use_cudnn = False # ROCM platform do not have MIOPEN kernel for affine_grid + use_cudnn = ( + False # ROCM platform do not have MIOPEN kernel for affine_grid + ) if in_dygraph_mode(): - _out_shape = out_shape.numpy().tolist() if isinstance( - out_shape, Variable) else out_shape + _out_shape = ( + out_shape.numpy().tolist() + if isinstance(out_shape, Variable) + else out_shape + ) return _C_ops.affine_grid(theta, _out_shape, use_cudnn, align_corners) elif in_dynamic_mode(): - _out_shape = out_shape.numpy().tolist() if isinstance( - out_shape, Variable) else out_shape - return _legacy_C_ops.affine_grid(theta, "output_shape", _out_shape, - "align_corners", align_corners, - "use_cudnn", use_cudnn) + _out_shape = ( + out_shape.numpy().tolist() + if isinstance(out_shape, Variable) + else out_shape + ) + return _legacy_C_ops.affine_grid( + theta, + "output_shape", + _out_shape, + "align_corners", + align_corners, + "use_cudnn", + use_cudnn, + ) helper = LayerHelper('affine_grid') - check_variable_and_dtype(theta, 'theta', ['float32', 'float64'], - 'affine_grid') + check_variable_and_dtype( + theta, 'theta', ['float32', 'float64'], 'affine_grid' + ) out = helper.create_variable_for_type_inference(theta.dtype) ipts = {'Theta': theta} attrs = {"align_corners": align_corners, "use_cudnn": use_cudnn} if isinstance(out_shape, Variable): ipts['OutputShape'] = out_shape - check_variable_and_dtype(out_shape, 'out_shape', ['int32'], - 'affine_grid') + check_variable_and_dtype( + out_shape, 'out_shape', ['int32'], 'affine_grid' + ) else: attrs['output_shape'] = out_shape - helper.append_op(type='affine_grid', - inputs=ipts, - outputs={'Output': out}, - attrs=None if len(attrs) == 0 else attrs) + helper.append_op( + type='affine_grid', + inputs=ipts, + outputs={'Output': out}, + attrs=None if len(attrs) == 0 else attrs, + ) return out -def grid_sample(x, - grid, - mode='bilinear', - padding_mode='zeros', - align_corners=True, - name=None): +def grid_sample( + x, + grid, + mode='bilinear', + padding_mode='zeros', + align_corners=True, + name=None, +): """ - This operation samples input X by using bilinear interpolation or + Sample input X by using bilinear interpolation or nearest interpolation based on flow field grid, which is usually - generated by :code:`affine_grid` . When the input X is 4-D Tensor, - the grid of shape [N, H, W, 2] is the concatenation of (x, y) - coordinates with shape [N, H, W] each, where x is indexing the 4th - dimension (in width dimension) of input data x and y is indexing - the 3rd dimension (in height dimension), finally results is the + generated by :code:`affine_grid` . When the input X is 4-D Tensor, + the grid of shape [N, H, W, 2] is the concatenation of (x, y) + coordinates with shape [N, H, W] each, where x is indexing the 4th + dimension (in width dimension) of input data x and y is indexing + the 3rd dimension (in height dimension), finally results is the bilinear interpolation or nearest value of 4 nearest corner - points. The output tensor shape will be [N, C, H, W]. 
When the input X - is 5-D Tensor, the grid of shape [N, D, H, W, 3] is the concatenation - of (x, y, z) coordinates with shape [N, D, H, W] each, where x is - indexing the 5th dimension (in width dimension) of input data x, y is - indexing the 4th dimension (in height dimension) and z is indexing the - 3rd dimension (in depth dimension) finally results is the bilinear - interpolation or nearest value of 8 nearest cornerpoints. The output - tensor shape will be [N, C, D, H, W]. + points. The output tensor shape will be [N, C, H, W]. When the input X + is 5-D Tensor, the grid of shape [N, D, H, W, 3] is the concatenation + of (x, y, z) coordinates with shape [N, D, H, W] each, where x is + indexing the 5th dimension (in width dimension) of input data x, y is + indexing the 4th dimension (in height dimension) and z is indexing the + 3rd dimension (in depth dimension) finally results is the bilinear + interpolation or nearest value of 8 nearest cornerpoints. The output + tensor shape will be [N, C, D, H, W]. @@ -153,7 +173,7 @@ def grid_sample(x, grid_y = 0.5 * (grid[:, :, :, 1] + 1) * (H - 1) Step 2: - + Indices input data X with grid (x, y) in each [H, W] area, and bilinear interpolate point value by 4 nearest points or nearest interpolate point value by nearest point. @@ -189,12 +209,12 @@ def grid_sample(x, Args: x(Tensor): The input tensor, which is a 4-d tensor with shape - [N, C, H, W] or a 5-d tensor with shape [N, C, D, H, W], - N is the batch size, C is the channel number, + [N, C, H, W] or a 5-d tensor with shape [N, C, D, H, W], + N is the batch size, C is the channel number, D, H and W is the feature depth, height and width. The data type is float32 or float64. - grid(Tensor): Input grid tensor, which is a 4-d tensor with shape [N, grid_H, - grid_W, 2] or a 5-d tensor with shape [N, grid_D, grid_H, + grid(Tensor): Input grid tensor, which is a 4-d tensor with shape [N, grid_H, + grid_W, 2] or a 5-d tensor with shape [N, grid_D, grid_H, grid_W, 3]. The data type is float32 or float64. mode(str, optional): The interpolation method which can be 'bilinear' or 'nearest'. Default: 'bilinear'. @@ -209,17 +229,18 @@ def grid_sample(x, None by default. Returns: - Tensor, The shape of output is [N, C, grid_H, grid_W] or [N, C, grid_D, grid_H, grid_W] in which `grid_D` is the depth of grid, + + Tensor, The shape of output is [N, C, grid_H, grid_W] or [N, C, grid_D, grid_H, grid_W] in which `grid_D` is the depth of grid, `grid_H` is the height of grid and `grid_W` is the width of grid. The data type is same as input tensor. Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F - - # x shape=[1, 1, 3, 3] + + # x shape=[1, 1, 3, 3] x = paddle.to_tensor([[[[-0.6, 0.8, -0.5], [-0.5, 0.2, 1.2], [ 1.4, 0.3, -0.2]]]],dtype='float64') @@ -243,7 +264,7 @@ def grid_sample(x, padding_mode='border', align_corners=True) print(y_t) - + # output shape = [1, 1, 3, 4] # [[[[ 0.34 0.016 0.086 -0.448] # [ 0.55 -0.076 0.35 0.59 ] @@ -254,22 +275,33 @@ def grid_sample(x, _padding_modes = ['zeros', 'reflection', 'border'] if mode not in _modes: raise ValueError( - "The mode of grid sample function should be in {}, but got: {}". 
- format(_modes, mode)) + "The mode of grid sample function should be in {}, but got: {}".format( + _modes, mode + ) + ) if padding_mode not in _padding_modes: raise ValueError( - "The padding mode of grid sample function should be in {}, but got: {}" - .format(_padding_modes, padding_mode)) + "The padding mode of grid sample function should be in {}, but got: {}".format( + _padding_modes, padding_mode + ) + ) if not isinstance(align_corners, bool): - raise ValueError("The align corners should be bool, but got: {}".format( - align_corners)) + raise ValueError( + "The align corners should be bool, but got: {}".format( + align_corners + ) + ) cudnn_version = get_cudnn_version() use_cudnn = False - if not is_compiled_with_rocm() and ( - cudnn_version is not None - ) and align_corners and mode == 'bilinear' and padding_mode == 'zeros': + if ( + not is_compiled_with_rocm() + and (cudnn_version is not None) + and align_corners + and mode == 'bilinear' + and padding_mode == 'zeros' + ): use_cudnn = True # CUDNN always computes gradients for all inputs x.stop_gradient = False @@ -281,26 +313,37 @@ def grid_sample(x, if in_dygraph_mode(): return _C_ops.grid_sample(x, grid, mode, padding_mode, align_corners) elif in_dynamic_mode(): - attrs = ('mode', mode, 'padding_mode', padding_mode, 'align_corners', - align_corners, 'use_cudnn', use_cudnn) + attrs = ( + 'mode', + mode, + 'padding_mode', + padding_mode, + 'align_corners', + align_corners, + 'use_cudnn', + use_cudnn, + ) out = getattr(_legacy_C_ops, 'grid_sampler')(x, grid, *attrs) else: helper = LayerHelper("grid_sample", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'grid_sample') - check_variable_and_dtype(grid, 'grid', ['float32', 'float64'], - 'grid_sample') + check_variable_and_dtype( + grid, 'grid', ['float32', 'float64'], 'grid_sample' + ) ipts = {'X': x, 'Grid': grid} attrs = { 'mode': mode, 'padding_mode': padding_mode, 'align_corners': align_corners, - 'use_cudnn': use_cudnn + 'use_cudnn': use_cudnn, } out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='grid_sampler', - inputs=ipts, - attrs=attrs, - outputs={'Output': out}) + helper.append_op( + type='grid_sampler', + inputs=ipts, + attrs=attrs, + outputs={'Output': out}, + ) return out @@ -337,24 +380,25 @@ def pixel_shuffle(x, upscale_factor, data_format="NCHW", name=None): if data_format not in ["NCHW", "NHWC"]: raise ValueError( "Attr(data_format) should be 'NCHW' or 'NHWC'." 
- "But recevie Attr(data_format): {} ".format(data_format)) + "But recevie Attr(data_format): {} ".format(data_format) + ) if in_dygraph_mode(): return _C_ops.pixel_shuffle(x, upscale_factor, data_format) if _in_legacy_dygraph(): - return _legacy_C_ops.pixel_shuffle(x, "upscale_factor", upscale_factor, - "data_format", data_format) + return _legacy_C_ops.pixel_shuffle( + x, "upscale_factor", upscale_factor, "data_format", data_format + ) helper = LayerHelper("pixel_shuffle", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'pixel_shuffle') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="pixel_shuffle", - inputs={"X": x}, - outputs={"Out": out}, - attrs={ - "upscale_factor": upscale_factor, - "data_format": data_format - }) + helper.append_op( + type="pixel_shuffle", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"upscale_factor": upscale_factor, "data_format": data_format}, + ) return out @@ -384,8 +428,10 @@ def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None): """ if len(x.shape) != 4: raise ValueError( - "Input x should be 4D tensor, but received x with the shape of {}". - format(x.shape)) + "Input x should be 4D tensor, but received x with the shape of {}".format( + x.shape + ) + ) if not isinstance(downscale_factor, int): raise TypeError("Downscale factor must be int type") @@ -396,23 +442,26 @@ def pixel_unshuffle(x, downscale_factor, data_format="NCHW", name=None): if data_format not in ["NCHW", "NHWC"]: raise ValueError( "Attr(data_format) should be 'NCHW' or 'NHWC'." - "But recevie Attr(data_format): {} ".format(data_format)) + "But recevie Attr(data_format): {} ".format(data_format) + ) if _non_static_mode(): - return _legacy_C_ops.pixel_unshuffle(x, "downscale_factor", - downscale_factor, "data_format", - data_format) + return _legacy_C_ops.pixel_unshuffle( + x, "downscale_factor", downscale_factor, "data_format", data_format + ) helper = LayerHelper("pixel_unshuffle", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'pixel_unshuffle') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="pixel_unshuffle", - inputs={"X": x}, - outputs={"Out": out}, - attrs={ - "downscale_factor": downscale_factor, - "data_format": data_format - }) + helper.append_op( + type="pixel_unshuffle", + inputs={"X": x}, + outputs={"Out": out}, + attrs={ + "downscale_factor": downscale_factor, + "data_format": data_format, + }, + ) return out @@ -453,8 +502,10 @@ def channel_shuffle(x, groups, data_format="NCHW", name=None): """ if len(x.shape) != 4: raise ValueError( - "Input x should be 4D tensor, but received x with the shape of {}". - format(x.shape)) + "Input x should be 4D tensor, but received x with the shape of {}".format( + x.shape + ) + ) if not isinstance(groups, int): raise TypeError("groups must be int type") @@ -465,20 +516,21 @@ def channel_shuffle(x, groups, data_format="NCHW", name=None): if data_format not in ["NCHW", "NHWC"]: raise ValueError( "Attr(data_format) should be 'NCHW' or 'NHWC'." 
- "But recevie Attr(data_format): {} ".format(data_format)) + "But recevie Attr(data_format): {} ".format(data_format) + ) if _non_static_mode(): - return _legacy_C_ops.channel_shuffle(x, "groups", groups, "data_format", - data_format) + return _legacy_C_ops.channel_shuffle( + x, "groups", groups, "data_format", data_format + ) helper = LayerHelper("channel_shuffle", **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'channel_shuffle') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="channel_shuffle", - inputs={"X": x}, - outputs={"Out": out}, - attrs={ - "groups": groups, - "data_format": data_format - }) + helper.append_op( + type="channel_shuffle", + inputs={"X": x}, + outputs={"Out": out}, + attrs={"groups": groups, "data_format": data_format}, + ) return out diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index dc80743de51c5e..6736a9a6128627 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -28,7 +28,7 @@ class CELU(Layer): CELU Activation. .. math:: - + CELU(x) = max(0, x) + min(0, \alpha * (e^{x/\alpha}-1)) Parameters: @@ -44,7 +44,7 @@ class CELU(Layer): .. code-block:: python import paddle - + x = paddle.to_tensor([[-1. ,6.], [1., 15.6]]) m = paddle.nn.CELU(0.2) out = m(x) @@ -140,7 +140,7 @@ class GELU(Layer): Examples: .. code-block:: python - + import paddle x = paddle.to_tensor([[-1, 0.5],[1, 1.5]]) @@ -215,9 +215,8 @@ def extra_repr(self): class Hardswish(Layer): r""" - Hardswish activation - - Hardswish is proposed in MobileNetV3, and performs better in computational stability + Hardswish activation. Create a callable object of `Hardswish`. Hardswish + is proposed in MobileNetV3, and performs better in computational stability and efficiency compared to swish function. For more details please refer to: https://arxiv.org/pdf/1905.02244.pdf @@ -284,13 +283,13 @@ class Tanh(Layer): .. code-block:: python import paddle - import numpy as np - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) m = paddle.nn.Tanh() out = m(x) print(out) - # [-0.37994896 -0.19737532 0.09966799 0.29131261] + # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [-0.37994894, -0.19737533, 0.09966800, 0.29131261]) """ def __init__(self, name=None): @@ -307,7 +306,7 @@ def extra_repr(self): class Hardtanh(Layer): r""" - Hardtanh Activation + Hardtanh Activation. Create a callable object of `Hardtanh`. .. math:: @@ -404,12 +403,14 @@ class PReLU(Layer): # [ 6. , 7. , 8. , 9. 
]]]] """ - def __init__(self, - num_parameters=1, - init=0.25, - weight_attr=None, - data_format="NCHW", - name=None): + def __init__( + self, + num_parameters=1, + init=0.25, + weight_attr=None, + data_format="NCHW", + name=None, + ): super(PReLU, self).__init__() self._num_parameters = num_parameters self._init = init @@ -417,12 +418,13 @@ def __init__(self, self._name = name self._data_format = data_format - self._weight = self.create_parameter(attr=self._weight_attr, - shape=[self._num_parameters], - dtype=get_default_dtype(), - is_bias=False, - default_initializer=Constant( - self._init)) + self._weight = self.create_parameter( + attr=self._weight_attr, + shape=[self._num_parameters], + dtype=get_default_dtype(), + is_bias=False, + default_initializer=Constant(self._init), + ) def forward(self, x): return F.prelu(x, self._weight, data_format=self._data_format) @@ -430,8 +432,12 @@ def forward(self, x): def extra_repr(self): name_str = ', name={}'.format(self._name) if self._name else '' return 'num_parameters={}, data_format={}, init={}, dtype={}{}'.format( - self._num_parameters, self._data_format, self._init, self._dtype, - name_str) + self._num_parameters, + self._data_format, + self._init, + self._dtype, + name_str, + ) class RReLU(Layer): @@ -505,22 +511,22 @@ class RReLU(Layer): # [ 6. 7. 8. 9. ]]]] """ - def __init__(self, lower=1. / 8., upper=1. / 3., name=None): + def __init__(self, lower=1.0 / 8.0, upper=1.0 / 3.0, name=None): super(RReLU, self).__init__() self._lower = lower self._upper = upper self._name = name def forward(self, x): - return F.rrelu(x, - lower=self._lower, - upper=self._upper, - training=self.training) + return F.rrelu( + x, lower=self._lower, upper=self._upper, training=self.training + ) def extra_repr(self): name_str = ', name={}'.format(self._name) if self._name else '' return 'lower={}, upper={}, training={}, dtype={}{}'.format( - self._lower, self._upper, self.training, self._dtype, name_str) + self._lower, self._upper, self.training, self._dtype, name_str + ) class ReLU(Layer): @@ -639,10 +645,12 @@ class SELU(Layer): # [[0, 1.050701],[2.101402, 3.152103]] """ - def __init__(self, - scale=1.0507009873554804934193349852946, - alpha=1.6732632423543772848170429916717, - name=None): + def __init__( + self, + scale=1.0507009873554804934193349852946, + alpha=1.6732632423543772848170429916717, + name=None, + ): super(SELU, self).__init__() self._scale = scale self._alpha = alpha @@ -653,13 +661,15 @@ def forward(self, x): def extra_repr(self): name_str = ', name={}'.format(self._name) if self._name else '' - return 'scale={:.16f}, alpha={:.16f}{}'.format(self._scale, self._alpha, - name_str) + return 'scale={:.16f}, alpha={:.16f}{}'.format( + self._scale, self._alpha, name_str + ) class LeakyReLU(Layer): r""" - Leaky ReLU Activation. + Leaky ReLU Activation. Create a callable object of `LeakyReLU` to calculate + the `LeakyReLU` of input `x`. .. math:: @@ -686,10 +696,9 @@ class LeakyReLU(Layer): .. code-block:: python import paddle - import numpy as np m = paddle.nn.LeakyReLU() - x = paddle.to_tensor(np.array([-2, 0, 1], 'float32')) + x = paddle.to_tensor([-2.0, 0, 1]) out = m(x) # [-0.02, 0., 1.] """ @@ -707,15 +716,15 @@ def extra_repr(self): class Sigmoid(Layer): - """ + r""" this interface is used to construct a callable object of the ``Sigmoid`` class. This layer calcluate the `sigmoid` of input x. .. 
math:: - Sigmoid(x) = \\frac{1}{1 + e^{-x}} + sigmoid(x) = \frac{1}{1 + e^{-x}} Parameters: - name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Shape: x: N-D tensor, available dtype is float16, float32, float64. @@ -727,11 +736,11 @@ class Sigmoid(Layer): .. code-block:: python - import paddle + import paddle - m = paddle.nn.Sigmoid() - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) - out = m(x) # [0.7310586, 0.880797, 0.95257413, 0.98201376] + m = paddle.nn.Sigmoid() + x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) + out = m(x) # [0.7310586, 0.880797, 0.95257413, 0.98201376] """ def __init__(self, name=None): @@ -748,8 +757,8 @@ def extra_repr(self): class Hardsigmoid(Layer): r""" - This interface is used to construct a callable object of the ``Hardsigmoid`` class. - This layer calcluate the `hardsigmoid` of input x. + ``Hardsigmoid`` Activiation Layers, Construct a callable object of + the ``Hardsigmoid`` class. This layer calcluate the `hardsigmoid` of input x. A 3-part piecewise linear approximation of sigmoid(https://arxiv.org/abs/1603.00391), which is much faster than sigmoid. @@ -765,7 +774,6 @@ class Hardsigmoid(Layer): \end{array} \right. - Parameters: name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -803,15 +811,15 @@ class Softplus(Layer): Softplus Activation .. math:: - - Softplus(x) = \frac{1}{beta} * \log(1 + e^{beta * x}) \\ - \text{For numerical stability, the implementation reverts to the linear function when: beta * x > threshold.} + softplus(x)=\begin{cases} + \frac{1}{\beta} * \log(1 + e^{\beta * x}),&x\leqslant\frac{\varepsilon}{\beta};\\ + x,&x>\frac{\varepsilon}{\beta}. + \end{cases} Parameters: - beta (float, optional): The value of beta for Softplus. Default is 1 - threshold (float, optional): The value of threshold for Softplus. Default is 20 - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + beta (float, optional): The value of :math:`\beta` for Softplus. Default is 1 + threshold (float, optional): The value of :math:`\varepsilon` for Softplus. Default is 20 + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Shape: - input: Tensor with any shape. @@ -821,9 +829,8 @@ class Softplus(Layer): .. code-block:: python import paddle - import numpy as np - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3], dtype='float32') m = paddle.nn.Softplus() out = m(x) # [0.513015, 0.598139, 0.744397, 0.854355] """ @@ -839,8 +846,9 @@ def forward(self, x): def extra_repr(self): name_str = ', name={}'.format(self._name) if self._name else '' - return 'beta={}, threshold={}{}'.format(self._beta, self._threshold, - name_str) + return 'beta={}, threshold={}{}'.format( + self._beta, self._threshold, name_str + ) class Softshrink(Layer): @@ -872,11 +880,13 @@ class Softshrink(Layer): .. 
code-block:: python import paddle - import numpy as np - x = paddle.to_tensor(np.array([-0.9, -0.2, 0.1, 0.8])) + x = paddle.to_tensor([-0.9, -0.2, 0.1, 0.8]) m = paddle.nn.Softshrink() - out = m(x) # [-0.4, 0, 0, 0.3] + out = m(x) + print(out) + # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [-0.39999998, 0. , 0. , 0.30000001]) """ def __init__(self, threshold=0.5, name=None): @@ -912,11 +922,13 @@ class Softsign(Layer): .. code-block:: python import paddle - import numpy as np - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) m = paddle.nn.Softsign() - out = m(x) # [-0.285714, -0.166667, 0.0909091, 0.230769] + out = m(x) + print(out) + # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [-0.28571430, -0.16666666, 0.09090909, 0.23076925]) """ def __init__(self, name=None): @@ -951,11 +963,13 @@ class Swish(Layer): .. code-block:: python import paddle - import numpy as np - x = paddle.to_tensor(np.array([-2., 0., 1.])) + x = paddle.to_tensor([-2., 0., 1.]) m = paddle.nn.Swish() - out = m(x) # [-0.238406, 0., 0.731059] + out = m(x) + print(out) + # Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [-0.23840584, 0. , 0.73105854]) """ def __init__(self, name=None): @@ -1035,11 +1049,13 @@ class Tanhshrink(Layer): .. code-block:: python import paddle - import numpy as np - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) + x = paddle.to_tensor([-0.4, -0.2, 0.1, 0.3]) m = paddle.nn.Tanhshrink() - out = m(x) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] + out = m(x) + print(out) + # Tensor(shape=[4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [-0.02005106, -0.00262468, 0.00033200, 0.00868741]) """ def __init__(self, name=None): @@ -1082,11 +1098,13 @@ class ThresholdedReLU(Layer): .. code-block:: python import paddle - import numpy as np - x = paddle.to_tensor(np.array([2., 0., 1.])) + x = paddle.to_tensor([2., 0., 1.]) m = paddle.nn.ThresholdedReLU() - out = m(x) # [2., 0., 0.] + out = m(x) + print(out) + # Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [2., 0., 0.]) """ def __init__(self, threshold=1.0, name=None): @@ -1103,16 +1121,17 @@ def extra_repr(self): class Silu(Layer): - """ - Silu Activation. + r""" + Silu Activation + .. math:: - Silu(x) = \frac{x}{1 + e^{-x}} + silu(x) = \frac{x}{1 + \mathrm{e}^{-x}} + + Where :math:`x` is the input Tensor. Parameters: - x (Tensor): The input Tensor with data type float32, or float64. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Shape: - input: Tensor with any shape. @@ -1273,15 +1292,13 @@ class Softmax(Layer): .. code-block:: python import paddle - import numpy as np - x = np.array([[[2.0, 3.0, 4.0, 5.0], + x = paddle.to_tensor([[[2.0, 3.0, 4.0, 5.0], [3.0, 4.0, 5.0, 6.0], [7.0, 8.0, 8.0, 9.0]], [[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], - [6.0, 7.0, 8.0, 9.0]]], 'float32') - x = paddle.to_tensor(x) + [6.0, 7.0, 8.0, 9.0]]], dtype='float32') m = paddle.nn.Softmax() out = m(x) # [[[0.0320586 , 0.08714432, 0.23688282, 0.64391426], @@ -1366,7 +1383,7 @@ def extra_repr(self): class Maxout(Layer): r""" - Maxout Activation. + Maxout Activation. Create a callable object of `Maxout`. Assumed the input shape is (N, Ci, H, W). 
The output shape is (N, Co, H, W). @@ -1433,15 +1450,16 @@ def extra_repr(self): class Softmax2D(Layer): r""" + Softmax2D Activation. Given a Tensor with shape (B, C, H, W) or (C, H, W), it will apply Softmax to each location (C, h_i, w_j). The sum of result in each location (C, H_i, W_j) will be one. Shape: - Input: :math:`(B, C, H, W)` or :math:`(C, H, W)` - - Output: :math:`(B, C, H, W)` or :math:`(C, H, W)`(same as input) + - Output: :math:`(B, C, H, W)` or :math:`(C, H, W)` (same as input) - Return: + Returns: A Tensor of the same shape and dtype as input with value in range [0, 1]. Examples: @@ -1466,6 +1484,7 @@ class Softmax2D(Layer): # [[0.42368975 0.51082766 0.47752273 0.5258871 ] # [0.66754097 0.47182566 0.5187628 0.5402329 ] # [0.49014282 0.46369177 0.50340754 0.5289428 ]]]] + """ def __init__(self, name=None): @@ -1474,8 +1493,11 @@ def __init__(self, name=None): self._name = name def forward(self, x): - assert x.ndim == 3 or x.ndim == 4, "Softmax2D requires a 3D or 4D tensor as input. Received: {}D.".format( - x.ndim) + assert ( + x.ndim == 3 or x.ndim == 4 + ), "Softmax2D requires a 3D or 4D tensor as input. Received: {}D.".format( + x.ndim + ) return F.softmax(x, axis=-3, dtype=self._dtype, name=self._name) def extra_repr(self): diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index ee5641f5d1257e..45c08bf2b4d10e 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -145,37 +145,43 @@ class Linear(Layer): # [2.1077576 2.1077576 2.1077576 2.1077576 ]] """ - def __init__(self, - in_features, - out_features, - weight_attr=None, - bias_attr=None, - name=None): + def __init__( + self, + in_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None, + ): super(Linear, self).__init__() self._dtype = self._helper.get_default_dtype() self._weight_attr = weight_attr self._bias_attr = bias_attr - self.weight = self.create_parameter(shape=[in_features, out_features], - attr=self._weight_attr, - dtype=self._dtype, - is_bias=False) - self.bias = self.create_parameter(shape=[out_features], - attr=self._bias_attr, - dtype=self._dtype, - is_bias=True) + self.weight = self.create_parameter( + shape=[in_features, out_features], + attr=self._weight_attr, + dtype=self._dtype, + is_bias=False, + ) + self.bias = self.create_parameter( + shape=[out_features], + attr=self._bias_attr, + dtype=self._dtype, + is_bias=True, + ) self.name = name def forward(self, input): - out = F.linear(x=input, - weight=self.weight, - bias=self.bias, - name=self.name) + out = F.linear( + x=input, weight=self.weight, bias=self.bias, name=self.name + ) return out def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' return 'in_features={}, out_features={}, dtype={}{}'.format( - self.weight.shape[0], self.weight.shape[1], self._dtype, name_str) + self.weight.shape[0], self.weight.shape[1], self._dtype, name_str + ) class Upsample(Layer): @@ -325,8 +331,8 @@ class Upsample(Layer): x (Tensor): 3-D, 4-D or 5-D Tensor, its data type is float32, float64, or uint8, its data format is specified by :attr:`data_format`. size (list|tuple|Tensor|None): Output shape of image resize - layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) - when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. 
+ layer, the shape is (out_w, ) when input is a 3-D Tensor, the shape is (out_h, out_w) + when input is a 4-D Tensor and is (out_d, out_h, out_w) when input is a 5-D Tensor. Default: None. If a list/tuple, each element can be an integer or a Tensor of shape: [1]. If a Tensor , its dimensions size should be a 1. scale_factor (float|Tensor|list|tuple|None): The multiplier for the input height or width. At @@ -354,48 +360,31 @@ class Upsample(Layer): A 3-D Tensor of the shape (num_batches, channels, out_w) or (num_batches, out_w, channels), A 4-D Tensor of the shape (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels), or 5-D Tensor of the shape (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). - Raises: - TypeError: size should be a list or tuple or Tensor. - ValueError: The 'mode' of image_resize can only be 'linear', 'bilinear', - 'trilinear', 'bicubic', or 'nearest' currently. - ValueError: 'linear' only support 3-D tensor. - ValueError: 'bilinear' and 'bicubic' only support 4-D tensor. - ValueError: 'trilinear' only support 5-D tensor. - ValueError: 'nearest' only support 4-D or 5-D tensor. - ValueError: One of size and scale_factor must not be None. - ValueError: size length should be 1 for input 3-D tensor. - ValueError: size length should be 2 for input 4-D tensor. - ValueError: size length should be 3 for input 5-D tensor. - ValueError: scale_factor should be greater than zero. - TypeError: align_corners should be a bool value - ValueError: align_mode can only be '0' or '1' - ValueError: data_format can only be 'NCW', 'NWC', 'NCHW', 'NHWC', 'NCDHW' or 'NDHWC'. Examples: .. code-block:: python - + import paddle - import paddle.nn as nn - import numpy as np - input_data = np.random.rand(2,3,6,10).astype("float32") - upsample_out = paddle.nn.Upsample(size=[12,12]) + input = paddle.rand([2,3,6,10], dtype="float32") + upsample_out = paddle.nn.Upsample(size=[12,12]) - input = paddle.to_tensor(input_data) output = upsample_out(x=input) print(output.shape) - # [2L, 3L, 12L, 12L] + # [2, 3, 12, 12] """ - def __init__(self, - size=None, - scale_factor=None, - mode='nearest', - align_corners=False, - align_mode=0, - data_format='NCHW', - name=None): + def __init__( + self, + size=None, + scale_factor=None, + mode='nearest', + align_corners=False, + align_mode=0, + data_format='NCHW', + name=None, + ): super(Upsample, self).__init__() self.size = size self.scale_factor = scale_factor @@ -406,14 +395,16 @@ def __init__(self, self.name = name def forward(self, x): - out = F.interpolate(x, - size=self.size, - scale_factor=self.scale_factor, - mode=self.mode, - align_corners=self.align_corners, - align_mode=self.align_mode, - data_format=self.data_format, - name=self.name) + out = F.interpolate( + x, + size=self.size, + scale_factor=self.scale_factor, + mode=self.mode, + align_corners=self.align_corners, + align_mode=self.align_mode, + data_format=self.data_format, + name=self.name, + ) return out @@ -424,8 +415,13 @@ def extra_repr(self): main_str = 'size={}'.format(self.size) name_str = ', name={}'.format(self.name) if self.name else '' return '{}, mode={}, align_corners={}, align_mode={}, data_format={}{}'.format( - main_str, self.mode, self.align_corners, self.align_mode, - self.data_format, name_str) + main_str, + self.mode, + self.align_corners, + self.align_mode, + self.data_format, + name_str, + ) class UpsamplingNearest2D(Layer): @@ -479,11 +475,9 @@ class UpsamplingNearest2D(Layer): # [2L, 3L, 12L, 12L] """ - def 
__init__(self, - size=None, - scale_factor=None, - data_format='NCHW', - name=None): + def __init__( + self, size=None, scale_factor=None, data_format='NCHW', name=None + ): super(UpsamplingNearest2D, self).__init__() self.size = size self.scale_factor = scale_factor @@ -491,14 +485,16 @@ def __init__(self, self.name = name def forward(self, x): - out = F.interpolate(x, - size=self.size, - scale_factor=self.scale_factor, - mode='nearest', - align_corners=False, - align_mode=0, - data_format=self.data_format, - name=self.name) + out = F.interpolate( + x, + size=self.size, + scale_factor=self.scale_factor, + mode='nearest', + align_corners=False, + align_mode=0, + data_format=self.data_format, + name=self.name, + ) return out @@ -508,8 +504,9 @@ def extra_repr(self): else: main_str = 'size={}'.format(self.size) name_str = ', name={}'.format(self.name) if self.name else '' - return '{}, data_format={}{}'.format(main_str, self.data_format, - name_str) + return '{}, data_format={}{}'.format( + main_str, self.data_format, name_str + ) class UpsamplingBilinear2D(Layer): @@ -564,11 +561,9 @@ class UpsamplingBilinear2D(Layer): # [2L, 3L, 12L, 12L] """ - def __init__(self, - size=None, - scale_factor=None, - data_format='NCHW', - name=None): + def __init__( + self, size=None, scale_factor=None, data_format='NCHW', name=None + ): super(UpsamplingBilinear2D, self).__init__() self.size = size self.scale_factor = scale_factor @@ -576,14 +571,16 @@ def __init__(self, self.name = name def forward(self, x): - out = F.interpolate(x, - size=self.size, - scale_factor=self.scale_factor, - mode='bilinear', - align_corners=True, - align_mode=0, - data_format=self.data_format, - name=self.name) + out = F.interpolate( + x, + size=self.size, + scale_factor=self.scale_factor, + mode='bilinear', + align_corners=True, + align_mode=0, + data_format=self.data_format, + name=self.name, + ) return out @@ -593,8 +590,9 @@ def extra_repr(self): else: main_str = 'size={}'.format(self.size) name_str = ', name={}'.format(self.name) if self.name else '' - return '{}, data_format={}{}'.format(main_str, self.data_format, - name_str) + return '{}, data_format={}{}'.format( + main_str, self.data_format, name_str + ) class Bilinear(Layer): @@ -640,24 +638,24 @@ class Bilinear(Layer): .. 
code-block:: python import paddle - import numpy - layer1 = numpy.random.random((5, 5)).astype('float32') - layer2 = numpy.random.random((5, 4)).astype('float32') + layer1 = paddle.rand((5, 5)).astype('float32') + layer2 = paddle.rand((5, 4)).astype('float32') bilinear = paddle.nn.Bilinear( in1_features=5, in2_features=4, out_features=1000) - result = bilinear(paddle.to_tensor(layer1), - paddle.to_tensor(layer2)) # result shape [5, 1000] + result = bilinear(layer1,layer2) # result shape [5, 1000] """ - def __init__(self, - in1_features, - in2_features, - out_features, - weight_attr=None, - bias_attr=None, - name=None): + def __init__( + self, + in1_features, + in2_features, + out_features, + weight_attr=None, + bias_attr=None, + name=None, + ): super(Bilinear, self).__init__() self._weight_attr = weight_attr self._bias_attr = bias_attr @@ -668,17 +666,23 @@ def __init__(self, self._dtype = self._helper.get_default_dtype() weight_shape = [ - self._out_features, self._in1_features, self._in2_features + self._out_features, + self._in1_features, + self._in2_features, ] - self.weight = self.create_parameter(attr=self._weight_attr, - shape=weight_shape, - dtype=self._dtype, - is_bias=False) + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=weight_shape, + dtype=self._dtype, + is_bias=False, + ) bias_shape = [1, self._out_features] - self.bias = self.create_parameter(attr=self._bias_attr, - shape=bias_shape, - dtype=self._dtype, - is_bias=True) + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=bias_shape, + dtype=self._dtype, + is_bias=True, + ) def forward(self, x1, x2): return F.bilinear(x1, x2, self.weight, self.bias, self._name) @@ -686,8 +690,12 @@ def forward(self, x1, x2): def extra_repr(self): name_str = ', name={}'.format(self._name) if self._name else '' return 'in1_features={}, in2_features={}, out_features={}, dtype={}{}'.format( - self._in1_features, self._in2_features, self._out_features, - self._dtype, name_str) + self._in1_features, + self._in2_features, + self._out_features, + self._dtype, + name_str, + ) class Dropout(Layer): @@ -727,18 +735,23 @@ class Dropout(Layer): .. 
code-block:: python import paddle - import numpy as np - x = np.array([[1,2,3], [4,5,6]]).astype('float32') - x = paddle.to_tensor(x) + x = paddle.to_tensor([[1,2,3], [4,5,6]], dtype="float32") m = paddle.nn.Dropout(p=0.5) + y_train = m(x) + print(y_train) + # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[2., 0., 6.], + # [0., 0., 0.]]) + m.eval() # switch the model to test phase y_test = m(x) - print(x) - print(y_train) print(y_test) - """ + # Tensor(shape=[2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[1., 2., 3.], + # [4., 5., 6.]]) + """ def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None): super(Dropout, self).__init__() @@ -749,18 +762,21 @@ def __init__(self, p=0.5, axis=None, mode="upscale_in_train", name=None): self.name = name def forward(self, input): - out = F.dropout(input, - p=self.p, - axis=self.axis, - training=self.training, - mode=self.mode, - name=self.name) + out = F.dropout( + input, + p=self.p, + axis=self.axis, + training=self.training, + mode=self.mode, + name=self.name, + ) return out def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' - return 'p={}, axis={}, mode={}{}'.format(self.p, self.axis, self.mode, - name_str) + return 'p={}, axis={}, mode={}{}'.format( + self.p, self.axis, self.mode, name_str + ) class Dropout2D(Layer): @@ -789,18 +805,36 @@ class Dropout2D(Layer): .. code-block:: python import paddle - import numpy as np - x = np.random.random(size=(2, 3, 4, 5)).astype('float32') - x = paddle.to_tensor(x) + x = paddle.rand([2, 2, 1, 3], dtype="float32") + print(x) + # Tensor(shape=[2, 2, 1, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[[0.10052059, 0.93890846, 0.45351565]], + # [[0.47507706, 0.45021373, 0.11331241]]], + + # [[[0.53358698, 0.97375143, 0.34997326]], + # [[0.24758087, 0.52628899, 0.17970420]]]]) + m = paddle.nn.Dropout2D(p=0.5) y_train = m(x) + print(y_train) + # Tensor(shape=[2, 2, 1, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[[0. , 0. , 0. ]], + # [[0.95015413, 0.90042746, 0.22662482]]], + + # [[[1.06717396, 1.94750285, 0.69994652]], + # [[0. , 0. , 0. ]]]]) + m.eval() # switch the model to test phase y_test = m(x) - print(x) - print(y_train) print(y_test) - """ + # Tensor(shape=[2, 2, 1, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[[0.10052059, 0.93890846, 0.45351565]], + # [[0.47507706, 0.45021373, 0.11331241]]], + + # [[[0.53358698, 0.97375143, 0.34997326]], + # [[0.24758087, 0.52628899, 0.17970420]]]]) + """ def __init__(self, p=0.5, data_format='NCHW', name=None): super(Dropout2D, self).__init__() @@ -810,17 +844,20 @@ def __init__(self, p=0.5, data_format='NCHW', name=None): self.name = name def forward(self, input): - out = F.dropout2d(input, - p=self.p, - training=self.training, - data_format=self.data_format, - name=self.name) + out = F.dropout2d( + input, + p=self.p, + training=self.training, + data_format=self.data_format, + name=self.name, + ) return out def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' - return 'p={}, data_format={}{}'.format(self.p, self.data_format, - name_str) + return 'p={}, data_format={}{}'.format( + self.p, self.data_format, name_str + ) class Dropout3D(Layer): @@ -849,18 +886,48 @@ class Dropout3D(Layer): .. 
code-block:: python import paddle - import numpy as np - x = np.random.random(size=(2, 3, 4, 5, 6)).astype('float32') - x = paddle.to_tensor(x) + x = paddle.arange(24, dtype="float32").reshape((1, 2, 2, 2, 3)) + print(x) + # Tensor(shape=[1, 2, 2, 2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[[[0. , 1. , 2. ], + # [3. , 4. , 5. ]], + # [[6. , 7. , 8. ], + # [9. , 10., 11.]]], + + # [[[12., 13., 14.], + # [15., 16., 17.]], + # [[18., 19., 20.], + # [21., 22., 23.]]]]]) + m = paddle.nn.Dropout3D(p=0.5) y_train = m(x) + print(y_train) + # Tensor(shape=[1, 2, 2, 2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[[[0. , 2. , 4. ], + # [6. , 8. , 10.]], + # [[12., 14., 16.], + # [18., 20., 22.]]], + + # [[[0. , 0. , 0. ], + # [0. , 0. , 0. ]], + # [[0. , 0. , 0. ], + # [0. , 0. , 0. ]]]]]) + m.eval() # switch the model to test phase y_test = m(x) - print(x) - print(y_train) print(y_test) - """ + # Tensor(shape=[1, 2, 2, 2, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[[[0. , 1. , 2. ], + # [3. , 4. , 5. ]], + # [[6. , 7. , 8. ], + # [9. , 10., 11.]]], + + # [[[12., 13., 14.], + # [15., 16., 17.]], + # [[18., 19., 20.], + # [21., 22., 23.]]]]]) + """ def __init__(self, p=0.5, data_format='NCDHW', name=None): super(Dropout3D, self).__init__() @@ -870,17 +937,20 @@ def __init__(self, p=0.5, data_format='NCDHW', name=None): self.name = name def forward(self, input): - out = F.dropout3d(input, - p=self.p, - training=self.training, - data_format=self.data_format, - name=self.name) + out = F.dropout3d( + input, + p=self.p, + training=self.training, + data_format=self.data_format, + name=self.name, + ) return out def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' - return 'p={}, data_format={}{}'.format(self.p, self.data_format, - name_str) + return 'p={}, data_format={}{}'.format( + self.p, self.data_format, name_str + ) class AlphaDropout(Layer): @@ -907,19 +977,22 @@ class AlphaDropout(Layer): .. code-block:: python import paddle - import numpy as np - x = np.array([[-1, 1], [-1, 1]]).astype('float32') - x = paddle.to_tensor(x) + x = paddle.to_tensor([[-1, 1], [-1, 1]], dtype="float32") m = paddle.nn.AlphaDropout(p=0.5) y_train = m(x) + print(y_train) + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[-0.77919382, 1.66559887], + # [-0.77919382, -0.77919382]]) + m.eval() # switch the model to test phase y_test = m(x) - print(x) - print(y_train) - # [[-0.10721093, 1.6655989 ], [-0.7791938, -0.7791938]] (randomly) print(y_test) - """ + # Tensor(shape=[2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[-1., 1.], + # [-1., 1.]]) + """ def __init__(self, p=0.5, name=None): super(AlphaDropout, self).__init__() @@ -927,10 +1000,9 @@ def __init__(self, p=0.5, name=None): self.name = name def forward(self, input): - out = F.alpha_dropout(input, - p=self.p, - training=self.training, - name=self.name) + out = F.alpha_dropout( + input, p=self.p, training=self.training, name=self.name + ) return out def extra_repr(self): @@ -980,12 +1052,9 @@ class Pad1D(Layer): # [0. 4. 5. 6. 0. 
0.]]] """ - def __init__(self, - padding, - mode='constant', - value=0.0, - data_format="NCL", - name=None): + def __init__( + self, padding, mode='constant', value=0.0, data_format="NCL", name=None + ): super(Pad1D, self).__init__() self._pad = _npairs(padding, 1) self._mode = mode @@ -994,17 +1063,20 @@ def __init__(self, self._name = name def forward(self, x): - return F.pad(x, - pad=self._pad, - mode=self._mode, - value=self._value, - data_format=self._data_format, - name=self._name) + return F.pad( + x, + pad=self._pad, + mode=self._mode, + value=self._value, + data_format=self._data_format, + name=self._name, + ) def extra_repr(self): name_str = ', name={}'.format(self._name) if self._name else '' return 'padding={}, mode={}, value={}, data_format={}{}'.format( - self._pad, self._mode, self._value, self._data_format, name_str) + self._pad, self._mode, self._value, self._data_format, name_str + ) class Pad2D(Layer): @@ -1016,8 +1088,8 @@ class Pad2D(Layer): Parameters: padding (Tensor|list[int]|int): The padding size with data type int. If is int, use the - same padding in all dimensions. Else [len(padding)/2] dimensions of input will be padded. - The pad has the form (pad_left, pad_right, pad_top, pad_bottom). + same padding in all dimensions. Else [len(padding)/2] dimensions of input will be padded. + The pad has the form (pad_left, pad_right, pad_top, pad_bottom). mode (str, optional): Four modes: 'constant' (default), 'reflect', 'replicate', 'circular'. Default is 'constant'. - 'constant' mode, uses a constant value to pad the input tensor. @@ -1053,12 +1125,9 @@ class Pad2D(Layer): # [0. 0. 0. 0.]]]] """ - def __init__(self, - padding, - mode='constant', - value=0.0, - data_format="NCHW", - name=None): + def __init__( + self, padding, mode='constant', value=0.0, data_format="NCHW", name=None + ): super(Pad2D, self).__init__() self._pad = _npairs(padding, 2) self._mode = mode @@ -1067,17 +1136,20 @@ def __init__(self, self._name = name def forward(self, x): - return F.pad(x, - pad=self._pad, - mode=self._mode, - value=self._value, - data_format=self._data_format, - name=self._name) + return F.pad( + x, + pad=self._pad, + mode=self._mode, + value=self._value, + data_format=self._data_format, + name=self._name, + ) def extra_repr(self): name_str = ', name={}'.format(self._name) if self._name else '' return 'padding={}, mode={}, value={}, data_format={}{}'.format( - self._pad, self._mode, self._value, self._data_format, name_str) + self._pad, self._mode, self._value, self._data_format, name_str + ) class ZeroPad2D(Layer): @@ -1128,23 +1200,25 @@ def __init__(self, padding, data_format="NCHW", name=None): super(ZeroPad2D, self).__init__() self._pad = _npairs(padding, 2) self._mode = 'constant' - self._value = 0. + self._value = 0.0 self._data_format = data_format self._name = name def forward(self, x): - return F.pad(x, - pad=self._pad, - mode=self._mode, - value=self._value, - data_format=self._data_format, - name=self._name) + return F.pad( + x, + pad=self._pad, + mode=self._mode, + value=self._value, + data_format=self._data_format, + name=self._name, + ) def extra_repr(self): name_str = ', name={}'.format(self._name) if self._name else '' - return 'padding={}, data_format={}{}'.format(self._pad, - self._data_format, - name_str) + return 'padding={}, data_format={}{}'.format( + self._pad, self._data_format, name_str + ) class Pad3D(Layer): @@ -1193,12 +1267,14 @@ class Pad3D(Layer): # [0. 0. 0. 
0.]]]]] """ - def __init__(self, - padding, - mode='constant', - value=0.0, - data_format="NCDHW", - name=None): + def __init__( + self, + padding, + mode='constant', + value=0.0, + data_format="NCDHW", + name=None, + ): super(Pad3D, self).__init__() self._pad = _npairs(padding, 3) self._mode = mode @@ -1207,17 +1283,20 @@ def __init__(self, self._name = name def forward(self, x): - return F.pad(x, - pad=self._pad, - mode=self._mode, - value=self._value, - data_format=self._data_format, - name=self._name) + return F.pad( + x, + pad=self._pad, + mode=self._mode, + value=self._value, + data_format=self._data_format, + name=self._name, + ) def extra_repr(self): name_str = ', name={}'.format(self._name) if self._name else '' return 'padding={}, mode={}, value={}, data_format={}{}'.format( - self._pad, self._mode, self._value, self._data_format, name_str) + self._pad, self._mode, self._value, self._data_format, name_str + ) class CosineSimilarity(Layer): @@ -1251,18 +1330,17 @@ class CosineSimilarity(Layer): import paddle import paddle.nn as nn - import numpy as np - np.random.seed(0) - x1 = np.random.rand(2,3) - x2 = np.random.rand(2,3) - x1 = paddle.to_tensor(x1) - x2 = paddle.to_tensor(x2) + x1 = paddle.to_tensor([[1., 2., 3.], + [2., 3., 4.]], dtype="float32") + x2 = paddle.to_tensor([[8., 3., 3.], + [2., 3., 4.]], dtype="float32") cos_sim_func = nn.CosineSimilarity(axis=0) result = cos_sim_func(x1, x2) print(result) - # [0.99806249 0.9817672 0.94987036] + # Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [0.65079135, 0.98058069, 1. ]) """ def __init__(self, axis=1, eps=1e-8): @@ -1279,7 +1357,7 @@ def extra_repr(self): class Embedding(Layer): r""" - + Embedding Layer, used to construct a callable object of the ``Embedding`` class. For specific usage, refer to code examples. It implements the function of the Embedding Layer. This layer is used to lookup embeddings vector of ids provided by :attr:`x` . @@ -1349,42 +1427,47 @@ class Embedding(Layer): .. 
code-block:: python import paddle - import numpy as np - - x_data = np.arange(3, 6).reshape((3, 1)).astype(np.int64) - y_data = np.arange(6, 12).reshape((3, 2)).astype(np.float32) - x = paddle.to_tensor(x_data, stop_gradient=False) - y = paddle.to_tensor(y_data, stop_gradient=False) + x = paddle.to_tensor([[0], [1], [3]], dtype="int64", stop_gradient=False) + embedding = paddle.nn.Embedding(4, 3, sparse=True) - embedding = paddle.nn.Embedding(10, 3, sparse=True) - - w0=np.full(shape=(10, 3), fill_value=2).astype(np.float32) + w0 = paddle.to_tensor([[0., 0., 0.], + [1., 1., 1.], + [2., 2., 2.], + [3., 3., 3.]], dtype="float32") embedding.weight.set_value(w0) + print(embedding.weight) + # Tensor(shape=[4, 3], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[0., 0., 0.], + # [1., 1., 1.], + # [2., 2., 2.], + # [3., 3., 3.]]) adam = paddle.optimizer.Adam(parameters=[embedding.weight], learning_rate=0.01) adam.clear_grad() - # weight.shape = [10, 3] - # x.data = [[3],[4],[5]] - # x.shape = [3, 1] + out = embedding(x) + print(out) + # Tensor(shape=[3, 1, 3], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[[0., 0., 0.]], + # [[1., 1., 1.]], + # [[3., 3., 3.]]]) - # out.data = [[2,2,2], [2,2,2], [2,2,2]] - # out.shape = [3, 1, 3] - out=embedding(x) out.backward() adam.step() """ - def __init__(self, - num_embeddings, - embedding_dim, - padding_idx=None, - sparse=False, - weight_attr=None, - name=None): + def __init__( + self, + num_embeddings, + embedding_dim, + padding_idx=None, + sparse=False, + weight_attr=None, + name=None, + ): super(Embedding, self).__init__() self._num_embeddings = num_embeddings self._embedding_dim = embedding_dim @@ -1398,12 +1481,20 @@ def __init__(self, if self._embedding_dim <= 0: raise ValueError("embedding_dim must be gather than 0") - padding_idx = -1 if padding_idx is None else padding_idx if padding_idx >= 0 else ( - num_embeddings + padding_idx) + padding_idx = ( + -1 + if padding_idx is None + else padding_idx + if padding_idx >= 0 + else (num_embeddings + padding_idx) + ) if padding_idx >= num_embeddings or padding_idx < -num_embeddings: - raise ValueError("padding_idx must be within [-{}, {})".format( - num_embeddings, num_embeddings)) + raise ValueError( + "padding_idx must be within [-{}, {})".format( + num_embeddings, num_embeddings + ) + ) self._dtype = self._helper.get_default_dtype() self._size = [self._num_embeddings, self._embedding_dim] @@ -1411,21 +1502,25 @@ def __init__(self, self._weight_attr = weight_attr self._remote_prefetch = False self._name = name - self.weight = self.create_parameter(attr=self._weight_attr, - shape=self._size, - dtype=self._dtype, - is_bias=False) + self.weight = self.create_parameter( + attr=self._weight_attr, + shape=self._size, + dtype=self._dtype, + is_bias=False, + ) if in_dynamic_mode() and padding_idx != -1: with paddle.no_grad(): self.weight[padding_idx] = 0.0 def forward(self, x): - return F.embedding(x, - weight=self.weight, - padding_idx=self._padding_idx, - sparse=self._sparse, - name=self._name) + return F.embedding( + x, + weight=self.weight, + padding_idx=self._padding_idx, + sparse=self._sparse, + name=self._name, + ) def extra_repr(self): main_str = '{_num_embeddings}, {_embedding_dim}' @@ -1449,7 +1544,7 @@ class Unfold(Layer): See ``paddle.nn.functional.unfold`` for more details. - + Parameters: kernel_sizes(int|list): The size of convolution kernel, should be [k_h, k_w] or an integer k treated as [k, k]. 
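For readers unfamiliar with the operation, the following is a minimal NumPy sketch of the sliding-block ("im2col") extraction that an unfold pass performs, restricted to stride 1, zero padding and no dilation. The [N, C*kh*kw, L] output layout is an assumption made for the illustration; see ``paddle.nn.functional.unfold`` for the authoritative behaviour.

    import numpy as np

    def unfold_reference(x, kh, kw):
        # x: [N, C, H, W]; returns [N, C*kh*kw, out_h*out_w]
        n, c, h, w = x.shape
        out_h, out_w = h - kh + 1, w - kw + 1
        cols = np.empty((n, c * kh * kw, out_h * out_w), dtype=x.dtype)
        for i in range(out_h):
            for j in range(out_w):
                patch = x[:, :, i:i + kh, j:j + kw]  # one kh x kw block per channel
                cols[:, :, i * out_w + j] = patch.reshape(n, -1)
        return cols

    x = np.arange(16, dtype="float32").reshape(1, 1, 4, 4)
    print(unfold_reference(x, 2, 2).shape)  # (1, 4, 9): 4 values per 2x2 block, 9 positions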
@@ -1483,12 +1578,9 @@ class Unfold(Layer): print(result) """ - def __init__(self, - kernel_sizes, - dilations=1, - paddings=0, - strides=1, - name=None): + def __init__( + self, kernel_sizes, dilations=1, paddings=0, strides=1, name=None + ): super(Unfold, self).__init__() self.kernel_sizes = kernel_sizes @@ -1498,17 +1590,24 @@ def __init__(self, self.name = name def forward(self, input): - return F.unfold(input, - kernel_sizes=self.kernel_sizes, - strides=self.strides, - paddings=self.paddings, - dilations=self.dilations, - name=self.name) + return F.unfold( + input, + kernel_sizes=self.kernel_sizes, + strides=self.strides, + paddings=self.paddings, + dilations=self.dilations, + name=self.name, + ) def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' - return 'kernel_size={}, dilation={}, padding={}, stride={}{}'.\ - format(self.kernel_sizes, self.dilations, self.paddings, self.strides, name_str) + return 'kernel_size={}, dilation={}, padding={}, stride={}{}'.format( + self.kernel_sizes, + self.dilations, + self.paddings, + self.strides, + name_str, + ) class Fold(Layer): @@ -1568,13 +1667,15 @@ class Fold(Layer): # y.shape = [2,3,4,5] """ - def __init__(self, - output_sizes, - kernel_sizes, - dilations=1, - paddings=0, - strides=1, - name=None): + def __init__( + self, + output_sizes, + kernel_sizes, + dilations=1, + paddings=0, + strides=1, + name=None, + ): super(Fold, self).__init__() self.output_sizes = output_sizes @@ -1585,15 +1686,22 @@ def __init__(self, self.name = name def forward(self, input): - return F.fold(input, - output_sizes=self.output_sizes, - kernel_sizes=self.kernel_sizes, - strides=self.strides, - paddings=self.paddings, - dilations=self.dilations, - name=self.name) + return F.fold( + input, + output_sizes=self.output_sizes, + kernel_sizes=self.kernel_sizes, + strides=self.strides, + paddings=self.paddings, + dilations=self.dilations, + name=self.name, + ) def extra_repr(self): name_str = ', name={}'.format(self.name) if self.name else '' - return 'kernel_size={}, dilation={}, padding={}, stride={}{}'.\ - format(self.kernel_sizes, self.dilations, self.paddings, self.strides, name_str) + return 'kernel_size={}, dilation={}, padding={}, stride={}{}'.format( + self.kernel_sizes, + self.dilations, + self.paddings, + self.strides, + name_str, + ) diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 4ef987eccf2a4a..08056508f9170b 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -31,7 +31,7 @@ def _get_default_param_initializer(num_channels, filter_size): filter_elem_num = num_channels * np.prod(filter_size) - std = (2.0 / filter_elem_num)**0.5 + std = (2.0 / filter_elem_num) ** 0.5 return Normal(0.0, std) @@ -44,24 +44,27 @@ def _reverse_repeat_list(t, n): class _ConvNd(Layer): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - transposed, - dims, - stride=1, - padding=0, - padding_mode='zeros', - output_padding=0, - dilation=1, - groups=1, - weight_attr=None, - bias_attr=None, - data_format="NCHW"): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + transposed, + dims, + stride=1, + padding=0, + padding_mode='zeros', + output_padding=0, + dilation=1, + groups=1, + weight_attr=None, + bias_attr=None, + data_format="NCHW", + ): super(_ConvNd, self).__init__() - assert weight_attr is not False, "weight_attr should not be False in Conv." + assert ( + weight_attr is not False + ), "weight_attr should not be False in Conv." 
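    # Editor's aside: the default initializer defined above uses
    # std = (2.0 / filter_elem_num) ** 0.5, a He-style scale where
    # fan_in = in_channels * prod(kernel_size). A quick sketch of that
    # arithmetic with assumed example values (3x3 kernel, 64 input channels):
    fan_in = 64 * 3 * 3
    std = (2.0 / fan_in) ** 0.5
    print(round(std, 4))  # 0.0589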
self._param_attr = weight_attr self._bias_attr = bias_attr self._groups = groups @@ -72,11 +75,16 @@ def __init__(self, valid_padding_modes = {'zeros', 'reflect', 'replicate', 'circular'} if padding_mode not in valid_padding_modes: raise ValueError( - "padding_mode must be one of {}, but got padding_mode='{}'". - format(valid_padding_modes, padding_mode)) + "padding_mode must be one of {}, but got padding_mode='{}'".format( + valid_padding_modes, padding_mode + ) + ) - if padding_mode in {'reflect', 'replicate', 'circular' - } and not isinstance(padding, int): + if padding_mode in { + 'reflect', + 'replicate', + 'circular', + } and not isinstance(padding, int): raise TypeError( "when padding_mode in ['reflect', 'replicate', 'circular'], type of padding must be int" ) @@ -84,12 +92,16 @@ def __init__(self, valid_format = {'NHWC', 'NCHW', 'NDHWC', 'NCDHW', 'NLC', 'NCL'} if data_format not in valid_format: raise ValueError( - "data_format must be one of {}, but got data_format='{}'". - format(valid_format, data_format)) + "data_format must be one of {}, but got data_format='{}'".format( + valid_format, data_format + ) + ) - channel_last = (data_format == "NHWC") or (data_format - == "NDHWC") or (data_format - == "NLC") + channel_last = ( + (data_format == "NHWC") + or (data_format == "NDHWC") + or (data_format == "NLC") + ) if channel_last: self._channel_dim = len(data_format) - 1 else: @@ -97,66 +109,86 @@ def __init__(self, self._stride = utils.convert_to_list(stride, dims, 'stride') self._dilation = utils.convert_to_list(dilation, dims, 'dilation') - self._kernel_size = utils.convert_to_list(kernel_size, dims, - 'kernel_size') + self._kernel_size = utils.convert_to_list( + kernel_size, dims, 'kernel_size' + ) self._padding = padding self._padding_mode = padding_mode self.output_padding = output_padding if dims != 1: self._updated_padding, self._padding_algorithm = _update_padding_nd( - padding, channel_last, dims) + padding, channel_last, dims + ) if transposed: - filter_shape = [self._in_channels, out_channels // groups - ] + self._kernel_size + filter_shape = [ + self._in_channels, + out_channels // groups, + ] + self._kernel_size else: if in_channels % groups != 0: raise ValueError("in_channels must be divisible by groups.") if padding_mode in {'reflect', 'replicate', 'circular'}: - _paired_padding = utils.convert_to_list(padding, dims, - 'padding') + _paired_padding = utils.convert_to_list( + padding, dims, 'padding' + ) self._reversed_padding_repeated_twice = _reverse_repeat_list( - _paired_padding, 2) + _paired_padding, 2 + ) - self._updated_padding, self._padding_algorithm = _update_padding_nd( - 0, channel_last, dims) + ( + self._updated_padding, + self._padding_algorithm, + ) = _update_padding_nd(0, channel_last, dims) - filter_shape = [out_channels, in_channels // groups - ] + self._kernel_size + filter_shape = [ + out_channels, + in_channels // groups, + ] + self._kernel_size def _get_default_param_initializer(): if transposed: return None filter_elem_num = np.prod(self._kernel_size) * self._in_channels - std = (2.0 / filter_elem_num)**0.5 + std = (2.0 / filter_elem_num) ** 0.5 return Normal(0.0, std) self.weight = self.create_parameter( shape=filter_shape, attr=self._param_attr, - default_initializer=_get_default_param_initializer()) - self.bias = self.create_parameter(attr=self._bias_attr, - shape=[self._out_channels], - is_bias=True) + default_initializer=_get_default_param_initializer(), + ) + self.bias = self.create_parameter( + attr=self._bias_attr, 
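The default filter initializer built above is a He/MSRA-style normal with std = sqrt(2 / fan_in), where fan_in is ``in_channels * prod(kernel_size)``; a quick check of the value it produces (numbers are illustrative):

.. code-block:: python

    import numpy as np

    in_channels, kernel_size = 4, (3, 3)
    filter_elem_num = in_channels * np.prod(kernel_size)   # fan_in = 36
    std = (2.0 / filter_elem_num) ** 0.5
    print(std)   # ~0.2357, the std of the Normal(0, std) used for the conv weight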
shape=[self._out_channels], is_bias=True + ) cudnn_version = get_cudnn_version() - self._use_cudnn = True if (is_compiled_with_cuda() - and cudnn_version is not None) else False + self._use_cudnn = ( + True + if (is_compiled_with_cuda() and cudnn_version is not None) + else False + ) self._op_type = "conv" + str(dims) + 'd' - if self._op_type == 'conv2d' and (in_channels == groups - and in_channels != 1 - and out_channels % in_channels == 0): + if self._op_type == 'conv2d' and ( + in_channels == groups + and in_channels != 1 + and out_channels % in_channels == 0 + ): self._op_type = 'depthwise_conv2d' if is_compiled_with_rocm(): self._use_cudnn = True else: self._use_cudnn = False - if (is_compiled_with_cuda() and get_flags("FLAGS_conv2d_disable_cudnn") - ["FLAGS_conv2d_disable_cudnn"]): + if ( + is_compiled_with_cuda() + and get_flags("FLAGS_conv2d_disable_cudnn")[ + "FLAGS_conv2d_disable_cudnn" + ] + ): self._use_cudnn = False def extra_repr(self): @@ -273,79 +305,84 @@ class Conv1D(_ConvNd): - weight: 3-D tensor with shape: (out_channels, in_channels, kernel_size) - bias: 1-D tensor with shape: (out_channels) - output: 3-D tensor with same shape as input x. - - Raises: - None Examples: .. code-block:: python - import paddle - from paddle.nn import Conv1D - import numpy as np - x = np.array([[[4, 8, 1, 9], - [7, 2, 0, 9], - [6, 9, 2, 6]]]).astype(np.float32) - w=np.array( - [[[9, 3, 4], - [0, 0, 7], - [2, 5, 6]], - [[0, 3, 4], - [2, 9, 7], - [5, 6, 8]]]).astype(np.float32) - x_t = paddle.to_tensor(x) - conv = Conv1D(3, 2, 3) - conv.weight.set_value(w) - y_t = conv(x_t) - print(y_t) - # [[[133. 238.] - # [160. 211.]]] + import paddle + from paddle.nn import Conv1D + + x = paddle.to_tensor([[[4, 8, 1, 9], + [7, 2, 0, 9], + [6, 9, 2, 6]]], dtype="float32") + w = paddle.to_tensor([[[9, 3, 4], + [0, 0, 7], + [2, 5, 6]], + [[0, 3, 4], + [2, 9, 7], + [5, 6, 8]]], dtype="float32") + + conv = Conv1D(3, 2, 3) + conv.weight.set_value(w) + y = conv(x) + print(y) + # Tensor(shape=[1, 2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[[133., 238.], + # [160., 211.]]]) """ - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - padding_mode='zeros', - weight_attr=None, - bias_attr=None, - data_format="NCL"): - super(Conv1D, self).__init__(in_channels, - out_channels, - kernel_size, - False, - 1, - stride=stride, - padding=padding, - padding_mode=padding_mode, - dilation=dilation, - groups=groups, - weight_attr=weight_attr, - bias_attr=bias_attr, - data_format=data_format) + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format="NCL", + ): + super(Conv1D, self).__init__( + in_channels, + out_channels, + kernel_size, + False, + 1, + stride=stride, + padding=padding, + padding_mode=padding_mode, + dilation=dilation, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format, + ) def forward(self, x): padding = 0 if self._padding_mode != "zeros": - x = F.pad(x, - self._reversed_padding_repeated_twice, - mode=self._padding_mode, - data_format=self._data_format) + x = F.pad( + x, + self._reversed_padding_repeated_twice, + mode=self._padding_mode, + data_format=self._data_format, + ) else: padding = self._padding - out = F.conv1d(x, - self.weight, - bias=self.bias, - padding=padding, - stride=self._stride, - dilation=self._dilation, - groups=self._groups, - 
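As the ``filter_shape`` logic above implies, a grouped convolution stores a weight of shape ``[out_channels, in_channels // groups, *kernel_size]``, and the ``in_channels == groups`` case is routed to the depthwise kernel; a small sketch with illustrative sizes:

.. code-block:: python

    import paddle

    # depthwise case: groups == in_channels, out_channels a multiple of in_channels
    conv = paddle.nn.Conv2D(in_channels=4, out_channels=4, kernel_size=3, groups=4)
    print(conv.weight.shape)    # [4, 1, 3, 3]

    x = paddle.rand([2, 4, 8, 8])
    print(conv(x).shape)        # [2, 4, 6, 6] with the default padding=0, stride=1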
data_format=self._data_format) + out = F.conv1d( + x, + self.weight, + bias=self.bias, + padding=padding, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + data_format=self._data_format, + ) return out @@ -458,62 +495,67 @@ class Conv1DTranspose(_ConvNd): Examples: .. code-block:: python - import paddle - from paddle.nn import Conv1DTranspose - import numpy as np - - # shape: (1, 2, 4) - x=np.array([[[4, 0, 9, 7], - [8, 0, 9, 2]]]).astype(np.float32) - # shape: (2, 1, 2) - y=np.array([[[7, 0]], - [[4, 2]]]).astype(np.float32) - x_t = paddle.to_tensor(x) - conv = Conv1DTranspose(2, 1, 2) - conv.weight.set_value(y) - y_t = conv(x_t) - print(y_t) - - # [[[60. 16. 99. 75. 4.]]] + import paddle + from paddle.nn import Conv1DTranspose + + # shape: (1, 2, 4) + x = paddle.to_tensor([[[4, 0, 9, 7], + [8, 0, 9, 2]]], dtype="float32") + # shape: (2, 1, 2) + w = paddle.to_tensor([[[7, 0]], + [[4, 2]]], dtype="float32") + + conv = Conv1DTranspose(2, 1, 2) + conv.weight.set_value(w) + y = conv(x) + print(y) + # Tensor(shape=[1, 1, 5], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[[60., 16., 99., 75., 4. ]]]) """ - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - output_padding=0, - groups=1, - dilation=1, - weight_attr=None, - bias_attr=None, - data_format="NCL"): - super(Conv1DTranspose, self).__init__(in_channels, - out_channels, - kernel_size, - True, - 1, - stride=stride, - padding=padding, - dilation=dilation, - output_padding=output_padding, - groups=groups, - weight_attr=weight_attr, - bias_attr=bias_attr, - data_format=data_format) + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + groups=1, + dilation=1, + weight_attr=None, + bias_attr=None, + data_format="NCL", + ): + super(Conv1DTranspose, self).__init__( + in_channels, + out_channels, + kernel_size, + True, + 1, + stride=stride, + padding=padding, + dilation=dilation, + output_padding=output_padding, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format, + ) def forward(self, x, output_size=None): - out = F.conv1d_transpose(x, - self.weight, - bias=self.bias, - output_size=output_size, - output_padding=self.output_padding, - padding=self._padding, - stride=self._stride, - dilation=self._dilation, - groups=self._groups, - data_format=self._data_format) + out = F.conv1d_transpose( + x, + self.weight, + bias=self.bias, + output_size=output_size, + output_padding=self.output_padding, + padding=self._padding, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + data_format=self._data_format, + ) return out @@ -549,7 +591,7 @@ class Conv2D(_ConvNd): * :math:`b`: Bias value, a 1-D ``Tensor`` with shape [M]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - + Parameters: in_channels(int): The number of input channels in the input image. out_channels(int): The number of output channels produced by the convolution. @@ -559,7 +601,7 @@ class Conv2D(_ConvNd): stride_H = stride_W = stride. The default value is 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 3. 
a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). @@ -616,11 +658,11 @@ class Conv2D(_ConvNd): import paddle import paddle.nn as nn - + paddle.disable_static() - + x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) - + conv = nn.Conv2D(4, 6, (3, 3)) y_var = conv(x_var) y_np = y_var.numpy() @@ -628,51 +670,59 @@ class Conv2D(_ConvNd): # (2, 6, 6, 6) """ - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - padding_mode='zeros', - weight_attr=None, - bias_attr=None, - data_format="NCHW"): - super(Conv2D, self).__init__(in_channels, - out_channels, - kernel_size, - False, - 2, - stride=stride, - padding=padding, - padding_mode=padding_mode, - dilation=dilation, - groups=groups, - weight_attr=weight_attr, - bias_attr=bias_attr, - data_format=data_format) + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format="NCHW", + ): + super(Conv2D, self).__init__( + in_channels, + out_channels, + kernel_size, + False, + 2, + stride=stride, + padding=padding, + padding_mode=padding_mode, + dilation=dilation, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format, + ) def forward(self, x): if self._padding_mode != 'zeros': - x = F.pad(x, - self._reversed_padding_repeated_twice, - mode=self._padding_mode, - data_format=self._data_format) - - out = F.conv._conv_nd(x, - self.weight, - bias=self.bias, - stride=self._stride, - padding=self._updated_padding, - padding_algorithm=self._padding_algorithm, - dilation=self._dilation, - groups=self._groups, - data_format=self._data_format, - channel_dim=self._channel_dim, - op_type=self._op_type, - use_cudnn=self._use_cudnn) + x = F.pad( + x, + self._reversed_padding_repeated_twice, + mode=self._padding_mode, + data_format=self._data_format, + ) + + out = F.conv._conv_nd( + x, + self.weight, + bias=self.bias, + stride=self._stride, + padding=self._updated_padding, + padding_algorithm=self._padding_algorithm, + dilation=self._dilation, + groups=self._groups, + data_format=self._data_format, + channel_dim=self._channel_dim, + op_type=self._op_type, + use_cudnn=self._use_cudnn, + ) return out @@ -707,7 +757,7 @@ class Conv2DTranspose(_ConvNd): * :math:`b`: Bias value, a 1-D ``Tensor`` with shape [M]. * :math:`\\sigma`: Activation function. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - + Parameters: in_channels(int): The number of channels in the input image. out_channels(int): The number of channels produced by the convolution. @@ -719,7 +769,7 @@ class Conv2DTranspose(_ConvNd): stride_H = stride_W = stride. Default: 1. padding(int|str|tuple|list, optional): The padding size. 
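The shape in the Conv2D example above follows from the usual formula H_out = (H_in + 2*padding - (dilation*(kernel - 1) + 1)) // stride + 1; a quick check (the helper is illustrative):

.. code-block:: python

    def conv_out_size(h_in, kernel, stride=1, padding=0, dilation=1):
        return (h_in + 2 * padding - (dilation * (kernel - 1) + 1)) // stride + 1

    # the docstring example: x is (2, 4, 8, 8) fed to Conv2D(4, 6, (3, 3))
    print(conv_out_size(8, 3))                          # 6 -> output (2, 6, 6, 6)
    print(conv_out_size(8, 3, stride=2, padding=1))     # 4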
Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` on both sides + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` on both sides 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). @@ -781,7 +831,7 @@ class Conv2DTranspose(_ConvNd): import paddle import paddle.nn as nn - + paddle.disable_static() x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) @@ -793,31 +843,35 @@ class Conv2DTranspose(_ConvNd): # (2, 6, 10, 10) """ - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - output_padding=0, - dilation=1, - groups=1, - weight_attr=None, - bias_attr=None, - data_format="NCHW"): - super(Conv2DTranspose, self).__init__(in_channels, - out_channels, - kernel_size, - True, - 2, - stride=stride, - padding=padding, - dilation=dilation, - output_padding=output_padding, - groups=groups, - weight_attr=weight_attr, - bias_attr=bias_attr, - data_format=data_format) + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + dilation=1, + groups=1, + weight_attr=None, + bias_attr=None, + data_format="NCHW", + ): + super(Conv2DTranspose, self).__init__( + in_channels, + out_channels, + kernel_size, + True, + 2, + stride=stride, + padding=padding, + dilation=dilation, + output_padding=output_padding, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format, + ) def forward(self, x, output_size=None): if output_size is None: @@ -825,16 +879,18 @@ def forward(self, x, output_size=None): else: output_padding = 0 - out = F.conv2d_transpose(x, - self.weight, - bias=self.bias, - padding=self._padding, - output_padding=output_padding, - stride=self._stride, - dilation=self._dilation, - groups=self._groups, - output_size=output_size, - data_format=self._data_format) + out = F.conv2d_transpose( + x, + self.weight, + bias=self.bias, + padding=self._padding, + output_padding=output_padding, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + output_size=output_size, + data_format=self._data_format, + ) return out @@ -843,7 +899,7 @@ class Conv3D(_ConvNd): **Convlution3d Layer** The convolution3d layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are multidimensional tensors with a shape of + Output(Output) are multidimensional tensors with a shape of :math:`[N, C, D, H, W]` . Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, and W is the width of the feature. 
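Because a transposed convolution maps one input size to a small range of valid output sizes ((H_in - 1) * stride - 2 * padding + dilation * (kernel - 1) + 1, plus up to stride - 1 of output padding), ``forward`` accepts an explicit ``output_size`` to pick one; a sketch with sizes chosen for illustration:

.. code-block:: python

    import paddle

    x = paddle.rand([2, 4, 8, 8])
    conv = paddle.nn.Conv2DTranspose(4, 6, kernel_size=3, stride=2, padding=1)

    y = conv(x)
    print(y.shape)                        # [2, 6, 15, 15]: (8-1)*2 - 2*1 + (3-1) + 1

    y = conv(x, output_size=[16, 16])     # also valid: within 15 + (stride - 1)
    print(y.shape)                        # [2, 6, 16, 16]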
Convlution3D is similar with Convlution2D @@ -874,7 +930,7 @@ class Conv3D(_ConvNd): stride_D = stride_H = stride_W = stride. The default value is 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). @@ -927,21 +983,17 @@ class Conv3D(_ConvNd): W_{out}&= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (kernel\_size[2] - 1) + 1))}{strides[2]} + 1 - Raises: - ValueError: If the shapes of input, filter_size, stride, padding and - groups mismatch. - Examples: .. code-block:: python import paddle import paddle.nn as nn - + paddle.disable_static() x_var = paddle.uniform((2, 4, 8, 8, 8), dtype='float32', min=-1., max=1.) - + conv = nn.Conv3D(4, 6, (3, 3, 3)) y_var = conv(x_var) y_np = y_var.numpy() @@ -949,51 +1001,59 @@ class Conv3D(_ConvNd): # (2, 6, 6, 6, 6) """ - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - padding_mode='zeros', - weight_attr=None, - bias_attr=None, - data_format="NCDHW"): - super(Conv3D, self).__init__(in_channels, - out_channels, - kernel_size, - False, - 3, - stride=stride, - padding=padding, - padding_mode=padding_mode, - dilation=dilation, - groups=groups, - weight_attr=weight_attr, - bias_attr=bias_attr, - data_format=data_format) + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format="NCDHW", + ): + super(Conv3D, self).__init__( + in_channels, + out_channels, + kernel_size, + False, + 3, + stride=stride, + padding=padding, + padding_mode=padding_mode, + dilation=dilation, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format, + ) def forward(self, x): if self._padding_mode != 'zeros': - x = F.pad(x, - self._reversed_padding_repeated_twice, - mode=self._padding_mode, - data_format=self._data_format) - - out = F.conv._conv_nd(x, - self.weight, - bias=self.bias, - stride=self._stride, - padding=self._updated_padding, - padding_algorithm=self._padding_algorithm, - dilation=self._dilation, - groups=self._groups, - data_format=self._data_format, - channel_dim=self._channel_dim, - op_type=self._op_type, - use_cudnn=self._use_cudnn) + x = F.pad( + x, + self._reversed_padding_repeated_twice, + mode=self._padding_mode, + data_format=self._data_format, + ) + + out = F.conv._conv_nd( + x, + self.weight, + bias=self.bias, + stride=self._stride, + padding=self._updated_padding, + padding_algorithm=self._padding_algorithm, + 
dilation=self._dilation, + groups=self._groups, + data_format=self._data_format, + channel_dim=self._channel_dim, + op_type=self._op_type, + use_cudnn=self._use_cudnn, + ) return out @@ -1104,10 +1164,6 @@ class Conv3DTranspose(_ConvNd): H^\prime_{out} &= (H_{in} - 1) * strides[1] - 2 * paddings[1] + dilations[1] * (kernel\_size[1] - 1) + 1 W^\prime_{out} &= (W_{in} - 1) * strides[2] - 2 * paddings[2] + dilations[2] * (kernel\_size[2] - 1) + 1 - - Raises: - ValueError: If the shapes of input, filter_size, stride, padding and - groups mismatch. Examples: .. code-block:: python @@ -1126,31 +1182,35 @@ class Conv3DTranspose(_ConvNd): # (2, 6, 10, 10, 10) """ - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - output_padding=0, - dilation=1, - groups=1, - weight_attr=None, - bias_attr=None, - data_format="NCDHW"): - super(Conv3DTranspose, self).__init__(in_channels, - out_channels, - kernel_size, - True, - 3, - stride=stride, - padding=padding, - dilation=dilation, - output_padding=output_padding, - groups=groups, - weight_attr=weight_attr, - bias_attr=bias_attr, - data_format=data_format) + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + output_padding=0, + dilation=1, + groups=1, + weight_attr=None, + bias_attr=None, + data_format="NCDHW", + ): + super(Conv3DTranspose, self).__init__( + in_channels, + out_channels, + kernel_size, + True, + 3, + stride=stride, + padding=padding, + dilation=dilation, + output_padding=output_padding, + groups=groups, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format, + ) def forward(self, x, output_size=None): if output_size is None: @@ -1158,14 +1218,16 @@ def forward(self, x, output_size=None): else: output_padding = 0 - out = F.conv3d_transpose(x, - self.weight, - bias=self.bias, - padding=self._padding, - output_padding=output_padding, - stride=self._stride, - dilation=self._dilation, - groups=self._groups, - output_size=output_size, - data_format=self._data_format) + out = F.conv3d_transpose( + x, + self.weight, + bias=self.bias, + padding=self._padding, + output_padding=output_padding, + stride=self._stride, + dilation=self._dilation, + groups=self._groups, + output_size=output_size, + data_format=self._data_format, + ) return out diff --git a/python/paddle/nn/layer/distance.py b/python/paddle/nn/layer/distance.py index a7a488c833d7ff..98381b471d6f34 100644 --- a/python/paddle/nn/layer/distance.py +++ b/python/paddle/nn/layer/distance.py @@ -20,6 +20,7 @@ class PairwiseDistance(Layer): r""" + It computes the pairwise distance between two vectors. The distance is calculated by p-oreder norm: @@ -38,14 +39,14 @@ class PairwiseDistance(Layer): Generally, no setting is required. Default: None. Shape: - x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D` - is the dimension of the data. Available data type is float32, float64. - y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x. - output: The same dtype as input tensor. + - x: :math:`[N, D]` or :math:`[D]`, where :math:`N` is batch size, :math:`D` + is the dimension of the data. Available data type is float32, float64. + - y: :math:`[N, D]` or :math:`[D]`, y have the same dtype as x. + - output: The same dtype as input tensor. - If :attr:`keepdim` is True, the output shape is :math:`[N, 1]` or :math:`[1]`, - depending on whether the input has data shaped as :math:`[N, D]`. + depending on whether the input has data shaped as :math:`[N, D]`. 
- If :attr:`keepdim` is False, the output shape is :math:`[N]` or :math:`[]`, - depending on whether the input has data shaped as :math:`[N, D]`. + depending on whether the input has data shaped as :math:`[N, D]`. Examples: .. code-block:: python diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 1ff37afa1412e1..51706ee336f685 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -18,7 +18,11 @@ import paddle.fluid as fluid import paddle from .. import functional as F -from paddle.fluid.framework import _varbase_creator, in_dygraph_mode, _in_legacy_dygraph +from paddle.fluid.framework import ( + _varbase_creator, + in_dygraph_mode, + _in_legacy_dygraph, +) from .. import Layer from paddle import in_dynamic_mode @@ -27,7 +31,8 @@ class BCEWithLogitsLoss(Layer): r""" - This operator combines the sigmoid layer and the :ref:`api_nn_loss_BCELoss` layer. + + This operator combines the sigmoid layer and the :ref:`api_paddle_nn_BCELoss` layer. Also, we can see it as the combine of ``sigmoid_cross_entropy_with_logits`` layer and some reduce operations. @@ -50,7 +55,7 @@ class BCEWithLogitsLoss(Layer): For stability and to prevent overflow of :math:`e^{-Logit}` when Logit < 0, we reformulate the loss as follows: - .. math:: + .. math:: Out = \max(Logit, 0) - Logit * Labels + \log(1 + e^{-\|Logit\|}) Then, if ``weight`` or ``pos_weight`` is not None, this operator multiply the @@ -82,21 +87,21 @@ class BCEWithLogitsLoss(Layer): For more information, please refer to :ref:`api_guide_Name`. Shapes: - logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, *], - N is batch_size, `*` means number of additional dimensions. The ``logit`` - is usually the output of Linear layer. Available dtype is float32, float64. - label (Tensor): The target labels tensor. 2-D tensor with the same shape as - ``logit``. The target labels which values should be numbers between 0 and 1. - Available dtype is float32, float64. - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``logit`` , else the shape of output is scalar. + - logit (Tensor): The input predications tensor. 2-D tensor with shape: [N, `*`], + N is batch_size, `*` means number of additional dimensions. The ``logit`` + is usually the output of Linear layer. Available dtype is float32, float64. + - label (Tensor): The target labels tensor. 2-D tensor with the same shape as + ``logit``. The target labels which values should be numbers between 0 and 1. + Available dtype is float32, float64. + - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + same as ``logit`` , else the shape of output is scalar. Returns: A callable object of BCEWithLogitsLoss. Examples: - .. code-block:: python + import paddle logit = paddle.to_tensor([5.0, 1.0, 3.0], dtype="float32") label = paddle.to_tensor([1.0, 0.0, 1.0], dtype="float32") @@ -106,15 +111,14 @@ class BCEWithLogitsLoss(Layer): """ - def __init__(self, - weight=None, - reduction='mean', - pos_weight=None, - name=None): + def __init__( + self, weight=None, reduction='mean', pos_weight=None, name=None + ): if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in BCEWithLogitsLoss should be 'sum', 'mean' or 'none', but " - "received %s, which is not allowed." % reduction) + "received %s, which is not allowed." 
% reduction + ) super(BCEWithLogitsLoss, self).__init__() self.weight = weight @@ -124,30 +128,36 @@ def __init__(self, def forward(self, logit, label): out = paddle.nn.functional.binary_cross_entropy_with_logits( - logit, label, self.weight, self.reduction, self.pos_weight, - self.name) + logit, + label, + self.weight, + self.reduction, + self.pos_weight, + self.name, + ) return out class CrossEntropyLoss(Layer): r""" - By default, this operator implements the cross entropy loss function with softmax. This function - combines the calculation of the softmax operation and the cross entropy loss function + + By default, this operator implements the cross entropy loss function with softmax. This function + combines the calculation of the softmax operation and the cross entropy loss function to provide a more numerically stable computing. This operator will calculate the cross entropy loss function without softmax when use_softmax=False. - By default, this operator will calculate the mean of the result, and you can also affect - the default behavior by using the reduction parameter. Please refer to the part of + By default, this operator will calculate the mean of the result, and you can also affect + the default behavior by using the reduction parameter. Please refer to the part of parameters for details. This operator can be used to calculate the softmax cross entropy loss with soft and hard labels. - Where, the hard labels mean the actual label value, 0, 1, 2, etc. And the soft labels + Where, the hard labels mean the actual label value, 0, 1, 2, etc. And the soft labels mean the probability of the actual label, 0.6, 0.8, 0.2, etc. The calculation of this operator includes the following two steps. - - **I.softmax cross entropy** + - **I.softmax cross entropy** 1. Hard label (each sample can only be assigned into one category) @@ -184,7 +194,7 @@ class CrossEntropyLoss(Layer): - - **II.Weight and reduction processing** + - **II.Weight and reduction processing** 1. Weight @@ -196,7 +206,7 @@ class CrossEntropyLoss(Layer): 1.1. Hard labels (soft_label = False) .. math:: - \\loss_j=loss_j*weight[label_j] + \\loss_j=loss_j*weight[label_j] 1.2. Soft labels (soft_label = True) @@ -206,21 +216,21 @@ class CrossEntropyLoss(Layer): 2. reduction - 2.1 if the ``reduction`` parameter is ``none`` + 2.1 if the ``reduction`` parameter is ``none`` Return the previous result directly - 2.2 if the ``reduction`` parameter is ``sum`` + 2.2 if the ``reduction`` parameter is ``sum`` Return the sum of the previous results .. math:: \\loss=\sum_{j}loss_j - 2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to - the ``weight`` parameter as follows. + 2.3 if the ``reduction`` parameter is ``mean`` , it will be processed according to + the ``weight`` parameter as follows. - 2.3.1. If the ``weight`` parameter is ``None`` + 2.3.1. If the ``weight`` parameter is ``None`` Return the average value of the previous results @@ -234,114 +244,83 @@ class CrossEntropyLoss(Layer): 1. Hard labels (soft_label = False) .. math:: - \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] + \\loss=\sum_{j}loss_j/\sum_{j}weight[label_j] 2. Soft labels (soft_label = True) .. math:: \\loss=\sum_{j}loss_j/\sum_{j}\left(\sum_{i}weight[label_i]\right) - - - Parameters: - - **weight** (Tensor, optional) - a manual rescaling weight given to each class. - If given, has to be a Tensor of size C and the data type is float32, float64. + Parameters: + weight (Tensor, optional): a manual rescaling weight given to each class. 
+ If given, has to be a Tensor of size C and the data type is float32, float64. Default is ``'None'`` . - - - **ignore_index** (int64, optional) - - Specifies a target value that is ignored - and does not contribute to the loss. A negative value means that no label - value needs to be ignored. Only valid when soft_label = False. + ignore_index (int64, optional): Specifies a target value that is ignored + and does not contribute to the loss. A negative value means that no label + value needs to be ignored. Only valid when soft_label = False. Default is ``-100`` . - - - **reduction** (str, optional) - - Indicate how to average the loss by batch_size, + reduction (str, optional): Indicate how to average the loss by batch_size, the candicates are ``'none'`` | ``'mean'`` | ``'sum'``. If :attr:`reduction` is ``'mean'``, the reduced mean loss is returned; If :attr:`size_average` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned. Default is ``'mean'``. - - - **soft_label** (bool, optional) - - Indicate whether label is soft. + soft_label (bool, optional): Indicate whether label is soft. If soft_label=False, the label is hard. If soft_label=True, the label is soft. Default is ``False``. - - - **axis** (int, optional) - - The index of dimension to perform softmax calculations. - It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number - of dimensions of input :attr:`input`. + axis (int, optional): The index of dimension to perform softmax calculations. + It should be in range :math:`[-1, rank - 1]`, where :math:`rank` is the number + of dimensions of input :attr:`input`. Default is ``-1`` . - - - **use_softmax** (bool, optional) - - Indicate whether compute softmax before cross_entropy. + use_softmax (bool, optional): Indicate whether compute softmax before cross_entropy. Default is ``True``. - - - **name** (str, optional) - - The name of the operator. Default is ``None`` . + name (str, optional): The name of the operator. Default is ``None`` . For more information, please refer to :ref:`api_guide_Name` . Shape: + - **input** (Tensor), the data type is float32, float64. Shape is + :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . + Note: - - **input** (Tensor) - - Input tensor, the data type is float32, float64. Shape is - :math:`[N_1, N_2, ..., N_k, C]`, where C is number of classes , ``k >= 1`` . - - Note: - - 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the + 1. when use_softmax=True, it expects unscaled logits. This operator should not be used with the output of softmax operator, which will produce incorrect results. 2. when use_softmax=False, it expects the output of softmax operator. - - **label** (Tensor) - 1. If soft_label=False, the shape is + 1. If soft_label=False, the shape is :math:`[N_1, N_2, ..., N_k]` or :math:`[N_1, N_2, ..., N_k, 1]`, k >= 1. the data type is int32, int64, float32, float64, where each value is [0, C-1]. - 2. If soft_label=True, the shape and data type should be same with ``input`` , + 2. If soft_label=True, the shape and data type should be same with ``input`` , and the sum of the labels for each sample should be 1. - - **output** (Tensor) + - **output** (Tensor), Return the softmax cross_entropy loss of ``input`` and ``label``. + The data type is the same as input. + If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``. 
+ If :attr:`reduction` is ``'none'``: - Return the softmax cross_entropy loss of ``input`` and ``label``. + 1. If soft_label = False, the dimension of return value is the same with ``label`` . - The data type is the same as input. - - If :attr:`reduction` is ``'mean'`` or ``'sum'`` , the dimension of return value is ``1``. - - If :attr:`reduction` is ``'none'``: - - 1. If soft_label = False, the dimension of return value is the same with ``label`` . - - 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . + 2. if soft_label = True, the dimension of return value is :math:`[N_1, N_2, ..., N_k, 1]` . Examples: .. code-block:: python - + # hard labels import paddle paddle.seed(99999) N=100 C=200 reduction='mean' - input = paddle.rand([N, C], dtype='float64') + input = paddle.rand([N, C], dtype='float64') label = paddle.randint(0, C, shape=[N], dtype='int64') - weight = paddle.rand([C], dtype='float64') - + weight = paddle.rand([C], dtype='float64') + cross_entropy_loss = paddle.nn.loss.CrossEntropyLoss( weight=weight, reduction=reduction) dy_ret = cross_entropy_loss( @@ -365,9 +344,9 @@ class CrossEntropyLoss(Layer): labels = paddle.uniform(shape, dtype='float64', min=0.1, max=1.0) labels /= paddle.sum(labels, axis=axis, keepdim=True) paddle_loss_mean = paddle.nn.functional.cross_entropy( - logits, - labels, - soft_label=True, + logits, + labels, + soft_label=True, axis=axis, weight=weight, reduction=reduction) @@ -375,14 +354,16 @@ class CrossEntropyLoss(Layer): """ - def __init__(self, - weight=None, - ignore_index=-100, - reduction='mean', - soft_label=False, - axis=-1, - use_softmax=True, - name=None): + def __init__( + self, + weight=None, + ignore_index=-100, + reduction='mean', + soft_label=False, + axis=-1, + use_softmax=True, + name=None, + ): super(CrossEntropyLoss, self).__init__() self.weight = weight self.reduction = reduction @@ -393,15 +374,17 @@ def __init__(self, self.name = name def forward(self, input, label): - ret = paddle.nn.functional.cross_entropy(input, - label, - weight=self.weight, - ignore_index=self.ignore_index, - reduction=self.reduction, - soft_label=self.soft_label, - axis=self.axis, - use_softmax=self.use_softmax, - name=self.name) + ret = paddle.nn.functional.cross_entropy( + input, + label, + weight=self.weight, + ignore_index=self.ignore_index, + reduction=self.reduction, + soft_label=self.soft_label, + axis=self.axis, + use_softmax=self.use_softmax, + name=self.name, + ) return ret @@ -409,7 +392,7 @@ def forward(self, input, label): class HSigmoidLoss(Layer): """ Hierarchical Sigmoid Layer. - + The hierarchical sigmoid organizes the classes into a complete binary tree to reduce the computational complexity and speed up the model training, especially the training of language model. Each leaf node of the complete binary tree represents a class(word) and each non-leaf node acts as a binary classifier. @@ -444,7 +427,7 @@ class HSigmoidLoss(Layer): is set to False, no bias will be added. If it is set to None or one attribute of ParamAttr, hsigmoid will create a ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default is None. - is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and + is_custom (bool, optional): Whether use custom binary tree. If it's True, `path_table` and `path_code` should be passed to its forward method, otherwise `path_table` and `path_code` should not be passed to its forward method. Default is False. 
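Since ``use_softmax=False`` expects probabilities that already sum to one, the default path on logits and the pre-softmaxed path should agree; a minimal sketch (shapes are illustrative):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    paddle.seed(0)
    logits = paddle.rand([4, 10])
    labels = paddle.randint(0, 10, shape=[4], dtype='int64')

    ce = paddle.nn.CrossEntropyLoss()                     # softmax applied internally
    ce_no_softmax = paddle.nn.CrossEntropyLoss(use_softmax=False)

    loss_a = ce(logits, labels)
    loss_b = ce_no_softmax(F.softmax(logits), labels)     # feed probabilities instead
    print(float(loss_a), float(loss_b))                   # expected to match closely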
is_sparse (bool, optional): Whether use sparse updating instead of dense updating, if it's True, @@ -477,18 +460,21 @@ class HSigmoidLoss(Layer): # [2.34564662]] """ - def __init__(self, - feature_size, - num_classes, - weight_attr=None, - bias_attr=None, - is_custom=False, - is_sparse=False, - name=None): + def __init__( + self, + feature_size, + num_classes, + weight_attr=None, + bias_attr=None, + is_custom=False, + is_sparse=False, + name=None, + ): super(HSigmoidLoss, self).__init__() if (num_classes < 2) and (not is_custom): raise ValueError( - "num_classes must not be less than 2 with default tree") + "num_classes must not be less than 2 with default tree" + ) if (not is_custom) and (is_sparse): print("Sparse mode should not be used without custom tree") @@ -506,29 +492,34 @@ def __init__(self, self._dtype = paddle.get_default_dtype() remote_prefetch = is_sparse - print("With sparse mode, if your models has only" - " small parameter prefetch may cause speed down") + print( + "With sparse mode, if your models has only" + " small parameter prefetch may cause speed down" + ) C = self._num_classes if is_custom else self._num_classes - 1 - self.weight = self.create_parameter([C, self._feature_size], - attr=self._weight_attr, - is_bias=False, - dtype=self._dtype) - self.bias = self.create_parameter([C, 1], - attr=self._bias_attr, - is_bias=True, - dtype=self._dtype) + self.weight = self.create_parameter( + [C, self._feature_size], + attr=self._weight_attr, + is_bias=False, + dtype=self._dtype, + ) + self.bias = self.create_parameter( + [C, 1], attr=self._bias_attr, is_bias=True, dtype=self._dtype + ) def forward(self, input, label, path_table=None, path_code=None): - out = F.hsigmoid_loss(input, - label, - self._num_classes, - self.weight, - self.bias, - path_table=path_table, - path_code=path_code, - is_sparse=self._is_sparse, - name=self._name) + out = F.hsigmoid_loss( + input, + label, + self._num_classes, + self.weight, + self.bias, + path_table=path_table, + path_code=path_code, + is_sparse=self._is_sparse, + name=self._name, + ) return out @@ -570,15 +561,11 @@ class MSELoss(Layer): Examples: .. code-block:: python - import numpy as np import paddle - input_data = np.array([1.5]).astype("float32") - label_data = np.array([1.7]).astype("float32") - mse_loss = paddle.nn.loss.MSELoss() - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) + input = paddle.to_tensor([1.5]) + label = paddle.to_tensor([1.7]) output = mse_loss(input, label) print(output) # [0.04000002] @@ -589,17 +576,18 @@ def __init__(self, reduction='mean'): if reduction not in ['sum', 'mean', 'none']: raise ValueError( "'reduction' in 'MSELoss' should be 'sum', 'mean' or 'none', " - "but received {}.".format(reduction)) + "but received {}.".format(reduction) + ) self.reduction = reduction def forward(self, input, label): if not in_dynamic_mode(): - fluid.data_feeder.check_variable_and_dtype(input, 'input', - ['float32', 'float64'], - 'MSELoss') - fluid.data_feeder.check_variable_and_dtype(label, 'label', - ['float32', 'float64'], - 'MSELoss') + fluid.data_feeder.check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'MSELoss' + ) + fluid.data_feeder.check_variable_and_dtype( + label, 'label', ['float32', 'float64'], 'MSELoss' + ) if in_dygraph_mode(): square_out = paddle._C_ops.square(paddle.subtract(input, label)) @@ -617,10 +605,11 @@ def forward(self, input, label): class L1Loss(Layer): r""" - This interface is used to construct a callable object of the ``L1Loss`` class. 
+ + Construct a callable object of the ``L1Loss`` class. The L1Loss layer calculates the L1 Loss of ``input`` and ``label`` as follows. - If `reduction` set to ``'none'``, the loss is: + If `reduction` set to ``'none'``, the loss is: .. math:: Out = \lvert input - label\rvert @@ -646,22 +635,19 @@ class L1Loss(Layer): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Shape: - input (Tensor): The input tensor. The shapes is [N, *], where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64. - label (Tensor): label. The shapes is [N, *], same shape as ``input`` . It's data type should be float32, float64, int32, int64. - output (Tensor): The L1 Loss of ``input`` and ``label``. - If `reduction` is ``'none'``, the shape of output loss is [N, *], the same as ``input`` . - If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. + - input (Tensor): The input tensor. The shapes is ``[N, *]``, where N is batch size and `*` means any number of additional dimensions. It's data type should be float32, float64, int32, int64. + - label (Tensor): label. The shapes is ``[N, *]``, same shape as ``input`` . It's data type should be float32, float64, int32, int64. + - output (Tensor): The L1 Loss of ``input`` and ``label``. + If `reduction` is ``'none'``, the shape of output loss is ``[N, *]``, the same as ``input`` . + If `reduction` is ``'mean'`` or ``'sum'``, the shape of output loss is [1]. Examples: .. code-block:: python - + import paddle - import numpy as np - input_data = np.array([[1.5, 0.8], [0.2, 1.3]]).astype("float32") - label_data = np.array([[1.7, 1], [0.4, 0.5]]).astype("float32") - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) + input = paddle.to_tensor([[1.5, 0.8], [0.2, 1.3]]) + label = paddle.to_tensor([[1.7, 1], [0.4, 0.5]]) l1_loss = paddle.nn.L1Loss() output = l1_loss(input, label) @@ -678,26 +664,28 @@ class L1Loss(Layer): print(output) # [[0.20000005 0.19999999] # [0.2 0.79999995]] + """ def __init__(self, reduction='mean', name=None): if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in L1Loss should be 'sum', 'mean' or 'none', but " - "received %s, which is not allowed." % reduction) + "received %s, which is not allowed." % reduction + ) super(L1Loss, self).__init__() self.reduction = reduction self.name = name def forward(self, input, label): - return paddle.nn.functional.l1_loss(input, - label, - self.reduction, - name=self.name) + return paddle.nn.functional.l1_loss( + input, label, self.reduction, name=self.name + ) class BCELoss(Layer): """ + This interface is used to construct a callable object of the ``BCELoss`` class. The BCELoss layer measures the binary_cross_entropy loss between input predictions ``input`` and target labels ``label`` . The binary_cross_entropy loss can be described as: @@ -741,14 +729,14 @@ class BCELoss(Layer): For more information, please refer to :ref:`api_guide_Name`. Shape: - input (Tensor): 2-D tensor with shape: [N, *], N is batch_size, `*` means - number of additional dimensions. The input ``input`` should always - be the output of sigmod. Available dtype is float32, float64. - label (Tensor): 2-D tensor with the same shape as ``input``. The target - labels which values should be numbers between 0 and 1. Available - dtype is float32, float64. 
- output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``input`` , else the shape of output is scalar. + - input (Tensor): 2-D tensor with shape: ``[N, *]``, N is batch_size, `*` means + number of additional dimensions. The input ``input`` should always + be the output of sigmod. Available dtype is float32, float64. + - label (Tensor): 2-D tensor with the same shape as ``input``. The target + labels which values should be numbers between 0 and 1. Available + dtype is float32, float64. + - output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + same as ``input`` , else the shape of output is scalar. Returns: A callable object of BCELoss. @@ -756,16 +744,15 @@ class BCELoss(Layer): Examples: .. code-block:: python - import numpy as np import paddle - input_data = np.array([0.5, 0.6, 0.7]).astype("float32") - label_data = np.array([1.0, 0.0, 1.0]).astype("float32") - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) + input = paddle.to_tensor([0.5, 0.6, 0.7]) + label = paddle.to_tensor([1.0, 0.0, 1.0]) bce_loss = paddle.nn.BCELoss() output = bce_loss(input, label) - print(output) # [0.65537095] + print(output) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [0.65537101]) """ @@ -773,7 +760,8 @@ def __init__(self, weight=None, reduction='mean', name=None): if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in bce_loss should be 'sum', 'mean' or 'none', but " - "received %s, which is not allowed." % reduction) + "received %s, which is not allowed." % reduction + ) super(BCELoss, self).__init__() self.weight = weight @@ -781,10 +769,9 @@ def __init__(self, weight=None, reduction='mean', name=None): self.name = name def forward(self, input, label): - out = paddle.nn.functional.binary_cross_entropy(input, label, - self.weight, - self.reduction, - self.name) + out = paddle.nn.functional.binary_cross_entropy( + input, label, self.weight, self.reduction, self.name + ) return out @@ -842,7 +829,7 @@ class NLLLoss(Layer): if `reduction` is ``'sum'``, the reduced sum loss is returned; if `reduction` is ``'none'``, no reduction will be apllied. Default is ``'mean'``. - name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Shape: - input (Tensor): Input tensor, the shape is :math:`[N, C]`, `C` is the number of classes. @@ -874,15 +861,14 @@ class NLLLoss(Layer): """ - def __init__(self, - weight=None, - ignore_index=-100, - reduction='mean', - name=None): + def __init__( + self, weight=None, ignore_index=-100, reduction='mean', name=None + ): if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in nll_loss should be 'sum', 'mean' or " - "'none', but received %s, which is not allowed." % reduction) + "'none', but received %s, which is not allowed." 
% reduction + ) super(NLLLoss, self).__init__() self._weight = weight self._ignore_index = ignore_index @@ -890,19 +876,23 @@ def __init__(self, self._name = name def forward(self, input, label): - return F.nll_loss(input, - label, - weight=self._weight, - ignore_index=self._ignore_index, - reduction=self._reduction, - name=self._name) + return F.nll_loss( + input, + label, + weight=self._weight, + ignore_index=self._ignore_index, + reduction=self._reduction, + name=self._name, + ) class KLDivLoss(Layer): r""" - This interface calculates the Kullback-Leibler divergence loss - between Input(X) and Input(Target). Notes that Input(X) is the - log-probability and Input(Target) is the probability. + + Generate a callable object of 'KLDivLoss' to calculate the + Kullback-Leibler divergence loss between Input(X) and + Input(Target). Notes that Input(X) is the log-probability + and Input(Target) is the probability. KL divergence loss is calculated as follows: @@ -918,48 +908,40 @@ class KLDivLoss(Layer): Default is ``'mean'``. Shape: - - - input (Tensor): (N, *), where * means, any number of additional dimensions. - - - label (Tensor): (N, *), same shape as input. - + - input (Tensor): ``(N, *)``, where ``*`` means, any number of additional dimensions. + - label (Tensor): ``(N, *)``, same shape as input. - output (Tensor): tensor with shape: [1] by default. - Examples: .. code-block:: python import paddle - import numpy as np import paddle.nn as nn shape = (5, 20) - x = np.random.uniform(-10, 10, shape).astype('float32') - target = np.random.uniform(-10, 10, shape).astype('float32') + x = paddle.uniform(shape, min=-10, max=10).astype('float32') + target = paddle.uniform(shape, min=-10, max=10).astype('float32') # 'batchmean' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='batchmean') - pred_loss = kldiv_criterion(paddle.to_tensor(x), - paddle.to_tensor(target)) + pred_loss = kldiv_criterion(x, target) # shape=[1] # 'mean' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='mean') - pred_loss = kldiv_criterion(paddle.to_tensor(x), - paddle.to_tensor(target)) + pred_loss = kldiv_criterion(x, target) # shape=[1] # 'sum' reduction, loss shape will be [1] kldiv_criterion = nn.KLDivLoss(reduction='sum') - pred_loss = kldiv_criterion(paddle.to_tensor(x), - paddle.to_tensor(target)) + pred_loss = kldiv_criterion(x, target) # shape=[1] # 'none' reduction, loss shape is same with X shape kldiv_criterion = nn.KLDivLoss(reduction='none') - pred_loss = kldiv_criterion(paddle.to_tensor(x), - paddle.to_tensor(target)) + pred_loss = kldiv_criterion(x, target) # shape=[5, 20] + """ def __init__(self, reduction='mean'): @@ -999,7 +981,7 @@ class MarginRankingLoss(Layer): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Shape: - + input: N-D Tensor, the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64. other: N-D Tensor, `other` have the same shape and dtype as `input`. @@ -1031,17 +1013,17 @@ def __init__(self, margin=0.0, reduction='mean', name=None): if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in MarginRankingLoss should be 'sum', 'mean' or 'none', but " - "received %s, which is not allowed." % reduction) + "received %s, which is not allowed." 
% reduction + ) super(MarginRankingLoss, self).__init__() self.margin = margin self.reduction = reduction self.name = name def forward(self, input, other, label): - out = paddle.nn.functional.margin_ranking_loss(input, other, label, - self.margin, - self.reduction, - self.name) + out = paddle.nn.functional.margin_ranking_loss( + input, other, label, self.margin, self.reduction, self.name + ) return out @@ -1072,7 +1054,6 @@ class CTCLoss(Layer): .. code-block:: python # declarative mode - import numpy as np import paddle # length of the longest logit sequence @@ -1084,8 +1065,7 @@ class CTCLoss(Layer): # class num class_num = 3 - np.random.seed(1) - log_probs = np.array([[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04], + log_probs = paddle.to_tensor([[[4.17021990e-01, 7.20324516e-01, 1.14374816e-04], [3.02332580e-01, 1.46755889e-01, 9.23385918e-02]], [[1.86260208e-01, 3.45560730e-01, 3.96767467e-01], @@ -1098,26 +1078,25 @@ class CTCLoss(Layer): [9.68261600e-01, 3.13424170e-01, 6.92322612e-01]], [[8.76389146e-01, 8.94606650e-01, 8.50442126e-02], - [3.90547849e-02, 1.69830427e-01, 8.78142476e-01]]]).astype("float32") - labels = np.array([[1, 2, 2], - [1, 2, 2]]).astype("int32") - input_lengths = np.array([5, 5]).astype("int64") - label_lengths = np.array([3, 3]).astype("int64") - - log_probs = paddle.to_tensor(log_probs) - labels = paddle.to_tensor(labels) - input_lengths = paddle.to_tensor(input_lengths) - label_lengths = paddle.to_tensor(label_lengths) + [3.90547849e-02, 1.69830427e-01, 8.78142476e-01]]], dtype="float32") + labels = paddle.to_tensor([[1, 2, 2], + [1, 2, 2]], dtype="int32") + input_lengths = paddle.to_tensor([5, 5], dtype="int64") + label_lengths = paddle.to_tensor([3, 3], dtype="int64") loss = paddle.nn.CTCLoss(blank=0, reduction='none')(log_probs, labels, input_lengths, label_lengths) - print(loss) #[3.9179852 2.9076521] + print(loss) + # Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [3.91798496, 2.90765190]) loss = paddle.nn.CTCLoss(blank=0, reduction='mean')(log_probs, labels, input_lengths, label_lengths) - print(loss) #[1.1376063] + print(loss) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [1.13760614]) """ def __init__(self, blank=0, reduction='mean'): @@ -1125,19 +1104,23 @@ def __init__(self, blank=0, reduction='mean'): self.blank = blank self.reduction = reduction - def forward(self, - log_probs, - labels, - input_lengths, - label_lengths, - norm_by_times=False): - return paddle.nn.functional.ctc_loss(log_probs, - labels, - input_lengths, - label_lengths, - self.blank, - self.reduction, - norm_by_times=norm_by_times) + def forward( + self, + log_probs, + labels, + input_lengths, + label_lengths, + norm_by_times=False, + ): + return paddle.nn.functional.ctc_loss( + log_probs, + labels, + input_lengths, + label_lengths, + self.blank, + self.reduction, + norm_by_times=norm_by_times, + ) class SmoothL1Loss(Layer): @@ -1149,16 +1132,16 @@ class SmoothL1Loss(Layer): .. math:: - loss(x,y) = \frac{1}{n}\sum_{i}z_i + loss(x, y) = \frac{1}{n}\sum_{i}z_i - where z_i is given by: + where :math:`z_i` is given by: .. math:: \mathop{z_i} = \left\{\begin{array}{rcl} - 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < delta} \\ - delta * |x_i - y_i| - 0.5 * delta^2 & & {otherwise} - \end{array} \right. + 0.5(x_i - y_i)^2 & & {if |x_i - y_i| < \delta} \\ + \delta * |x_i - y_i| - 0.5 * \delta^2 & & {otherwise} + \end{array} \right. 
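With the default :math:`\delta = 1.0` the two branches above are easy to check by hand; a sketch that mirrors the formula with elementary ops (the comparison value assumes this reading of the definition):

.. code-block:: python

    import paddle

    input = paddle.to_tensor([0.1, 3.0])
    label = paddle.to_tensor([0.0, 0.0])
    delta = 1.0

    diff = paddle.abs(input - label)
    per_elem = paddle.where(diff < delta,
                            0.5 * diff * diff,              # 0.5 * 0.1**2 = 0.005
                            delta * diff - 0.5 * delta**2)  # 1 * 3 - 0.5   = 2.5
    print(float(per_elem.mean()))                           # 1.2525

    loss = paddle.nn.SmoothL1Loss()                          # reduction='mean', delta=1.0
    print(float(loss(input, label)))                         # expected to match 1.2525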
Parameters: reduction (str, optional): Indicate how to average the loss by batch_size, @@ -1167,12 +1150,11 @@ class SmoothL1Loss(Layer): If :attr:`reduction` is ``'sum'``, the reduced sum loss is returned. If :attr:`reduction` is ``'none'``, the unreduced loss is returned. Default is ``'mean'``. - delta (float, optional): Specifies the hyperparameter delta to be used. + delta (float, optional): Specifies the hyperparameter :math:`\delta` to be used. The value determines how large the errors need to be to use L1. Errors smaller than delta are minimized with L2. Parameter is ignored for - negative/zero values. Default = 1.0 - name (str, optional): Name for the operation (optional, default is - None). For more information, please refer to :ref:`api_guide_Name`. + negative/zero values. Default value is :math:`1.0`. + name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. Call Parameters: @@ -1190,14 +1172,12 @@ class SmoothL1Loss(Layer): .. code-block:: python import paddle - import numpy as np - input_data = np.random.rand(3,3).astype("float32") - label_data = np.random.rand(3,3).astype("float32") - input = paddle.to_tensor(input_data) - label = paddle.to_tensor(label_data) + input = paddle.rand([3, 3]).astype("float32") + label = paddle.rand([3, 3]).astype("float32") loss = paddle.nn.SmoothL1Loss() output = loss(input, label) print(output) + # [0.049606] """ def __init__(self, reduction='mean', delta=1.0, name=None): @@ -1207,11 +1187,13 @@ def __init__(self, reduction='mean', delta=1.0, name=None): self.name = name def forward(self, input, label): - return F.smooth_l1_loss(input, - label, - reduction=self.reduction, - delta=self.delta, - name=self.name) + return F.smooth_l1_loss( + input, + label, + reduction=self.reduction, + delta=self.delta, + name=self.name, + ) class MultiLabelSoftMarginLoss(Layer): @@ -1279,22 +1261,25 @@ def __init__(self, weight=None, reduction="mean", name=None): if reduction not in ['sum', 'mean', 'none']: raise ValueError( "'reduction' in 'MultiLabelSoftMarginloss' should be 'sum', 'mean' or 'none', " - "but received {}.".format(reduction)) + "but received {}.".format(reduction) + ) self.weight = weight self.reduction = reduction self.name = name def forward(self, input, label): - return F.multi_label_soft_margin_loss(input, - label, - weight=self.weight, - reduction=self.reduction, - name=self.name) + return F.multi_label_soft_margin_loss( + input, + label, + weight=self.weight, + reduction=self.reduction, + name=self.name, + ) class HingeEmbeddingLoss(Layer): r""" - This operator calculates hinge_embedding_loss. Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`(containing 1 or -1). + Create a callable object of `HingeEmbeddingLoss` to calculates hinge_embedding_loss. Measures the loss given an input tensor :math:`x` and a labels tensor :math:`y`(containing 1 or -1). This is usually used for measuring whether two inputs are similar or dissimilar, e.g. using the L1 pairwise distance as :math:`x`, and is typically used for learning nonlinear embeddings or semi-supervised learning. 
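A minimal sketch with hand-checkable numbers, assuming the usual hinge-embedding definition (per-element loss is :math:`x_n` for labels of 1 and :math:`\max(0, margin - x_n)` for labels of -1):

.. code-block:: python

    import paddle

    # e.g. L1 pairwise distances between embeddings, with +1 (similar) / -1 (dissimilar) labels
    distance = paddle.to_tensor([0.3, 2.0, 0.5])
    label = paddle.to_tensor([1.0, -1.0, -1.0])

    loss = paddle.nn.HingeEmbeddingLoss(margin=1.0, reduction='none')
    print(loss(distance, label))
    # under that definition: [0.3, max(0, 1 - 2.0), max(0, 1 - 0.5)] = [0.3, 0.0, 0.5]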
@@ -1379,11 +1364,13 @@ def __init__(self, margin=1.0, reduction="mean", name=None): self.name = name def forward(self, input, label): - return F.hinge_embedding_loss(input, - label, - reduction=self.reduction, - margin=self.margin, - name=self.name) + return F.hinge_embedding_loss( + input, + label, + reduction=self.reduction, + margin=self.margin, + name=self.name, + ) class CosineEmbeddingLoss(Layer): @@ -1457,23 +1444,27 @@ def __init__(self, margin=0, reduction='mean', name=None): if margin > 1 or margin < -1: raise ValueError( "The value of 'margin' should be in the interval of [-1, 1], but received %f, which is not allowed." - % margin) + % margin + ) if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' should be 'sum', 'mean' or " - "'none', but received %s, which is not allowed." % reduction) + "'none', but received %s, which is not allowed." % reduction + ) super(CosineEmbeddingLoss, self).__init__() self.margin = margin self.reduction = reduction self.name = name def forward(self, input1, input2, label): - return F.cosine_embedding_loss(input1, - input2, - label, - margin=self.margin, - reduction=self.reduction, - name=self.name) + return F.cosine_embedding_loss( + input1, + input2, + label, + margin=self.margin, + reduction=self.reduction, + name=self.name, + ) class TripletMarginWithDistanceLoss(Layer): @@ -1491,22 +1482,22 @@ class TripletMarginWithDistanceLoss(Layer): L(input, pos, neg) = \max \{d(input_i, pos_i) - d(input_i, neg_i) + {\rm margin}, 0\} where the default `distance_function` - + .. math:: - d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_2 - - or user can define their own distance function. `margin` is a nonnegative margin representing the minimum difference + d(x_i, y_i) = \left\lVert {\bf x}_i - {\bf y}_i \right\rVert_2 + + or user can define their own distance function. `margin` is a nonnegative margin representing the minimum difference between the positive and negative distances that is required for the loss to be 0. If `swap` is true, it will compare distance of (input, negative) with distance of (negative, positive) and change it to the smaller one. For more details see http://www.bmva.org/bmvc/2016/papers/paper119/paper119.pdf. Parameters: distance_function (Callable, Optional): Quantifies the distance between two tensors. if not specified, 2 norm functions will be used. - + margin (float, Optional):Default: :math:`1`.A nonnegative margin representing the minimum difference between the positive and negative distances required for the loss to be 0. Larger margins penalize cases where the negative examples are not distant enough from the anchors, relative to the positives. - + swap (bool, Optional):The distance swap changes the negative distance to the swap distance (distance between positive samples and negative samples) if swap distance smaller than negative distance. Default: ``False``. @@ -1518,18 +1509,18 @@ class TripletMarginWithDistanceLoss(Layer): Default: ``'mean'`` name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Shapes: input (Tensor):Input tensor, the data type is float32 or float64. - the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64. + the shape is [N, \*], N is batch size and `\*` means any number of additional dimensions, available dtype is float32, float64. 
positive (Tensor):Positive tensor, the data type is float32 or float64. - The shape of label is the same as the shape of input. + The shape of label is the same as the shape of input. negative (Tensor):Negative tensor, the data type is float32 or float64. - The shape of label is the same as the shape of input. - - output(Tensor): The tensor variable storing the triplet_margin_with_distance_loss of input and positive and negative. + The shape of label is the same as the shape of input. + + output(Tensor): The tensor variable storing the triplet_margin_with_distance_loss of input and positive and negative. Return: A callable object of TripletMarginWithDistanceLoss @@ -1555,18 +1546,21 @@ class TripletMarginWithDistanceLoss(Layer): """ - def __init__(self, - distance_function=None, - margin=1.0, - swap=False, - reduction: str = 'mean', - name=None): + def __init__( + self, + distance_function=None, + margin=1.0, + swap=False, + reduction: str = 'mean', + name=None, + ): super(TripletMarginWithDistanceLoss, self).__init__() if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in TripletMarginWithDistanceLoss " "should be 'sum', 'mean' or 'none', but " - "received %s, which is not allowed." % reduction) + "received %s, which is not allowed." % reduction + ) self.margin = margin self.swap = swap self.reduction = reduction @@ -1574,13 +1568,15 @@ def __init__(self, self.name = name def forward(self, input, positive, negative): - return F.triplet_margin_with_distance_loss(input, - positive, - negative, - margin=self.margin, - swap=self.swap, - reduction=self.reduction, - name=self.name) + return F.triplet_margin_with_distance_loss( + input, + positive, + negative, + margin=self.margin, + swap=self.swap, + reduction=self.reduction, + name=self.name, + ) class TripletMarginLoss(Layer): @@ -1650,7 +1646,7 @@ class TripletMarginLoss(Layer): loss = triplet_margin_loss(input, positive, negative) print(loss) # Tensor([0. , 0.57496738, 0. ]) - + triplet_margin_loss = paddle.nn.TripletMarginLoss(margin=1.0, swap=True, reduction='mean', ) loss = triplet_margin_loss(input, positive, negative,) print(loss) @@ -1658,18 +1654,21 @@ class TripletMarginLoss(Layer): """ - def __init__(self, - margin=1.0, - p=2., - epsilon=1e-6, - swap=False, - reduction='mean', - name=None): + def __init__( + self, + margin=1.0, + p=2.0, + epsilon=1e-6, + swap=False, + reduction='mean', + name=None, + ): super(TripletMarginLoss, self).__init__() if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in TripletMarginLoss should be 'sum', 'mean' or 'none', but " - "received %s, which is not allowed." % reduction) + "received %s, which is not allowed." % reduction + ) self.margin = margin self.p = p self.epsilon = epsilon @@ -1678,19 +1677,22 @@ def __init__(self, self.name = name def forward(self, input, positive, negative): - return F.triplet_margin_loss(input, - positive, - negative, - margin=self.margin, - p=self.p, - epsilon=self.epsilon, - swap=self.swap, - reduction=self.reduction, - name=self.name) + return F.triplet_margin_loss( + input, + positive, + negative, + margin=self.margin, + p=self.p, + epsilon=self.epsilon, + swap=self.swap, + reduction=self.reduction, + name=self.name, + ) class SoftMarginLoss(Layer): r""" + Creates a criterion that measures a two-class soft margin loss between input predictions ``input`` and target labels ``label`` . 
It can be described as: @@ -1709,17 +1711,14 @@ class SoftMarginLoss(Layer): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Shapes: - - Input (Tensor): The input tensor with shape: [N, *], - N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf - Available dtype is float32, float64. - - Label (Tensor): The target labels tensor with the same shape as - ``input``. The target labels which values should be numbers -1 or 1. - Available dtype is int32, int64, float32, float64. - - Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is - same as ``input`` , else the shape of output is [1]. + - Input (Tensor): The input tensor with shape: ``[N, *]``, + N is batch_size, `*` means any number of additional dimensions. The ``input`` ranges from -inf to inf + Available dtype is float32, float64. + - Label (Tensor): The target labels tensor with the same shape as + ``input``. The target labels which values should be numbers -1 or 1. + Available dtype is int32, int64, float32, float64. + - Output (Tensor): If ``reduction`` is ``'none'``, the shape of output is + same as ``input`` , else the shape of output is [1]. Returns: A callable object of SoftMarginLoss. @@ -1728,33 +1727,45 @@ class SoftMarginLoss(Layer): .. code-block:: python import paddle - import numpy as np input = paddle.to_tensor([[0.5, 0.6, 0.7],[0.3, 0.5, 0.2]], 'float32') label = paddle.to_tensor([[1.0, -1.0, 1.0],[-1.0, 1.0, 1.0]], 'float32') soft_margin_loss = paddle.nn.SoftMarginLoss() output = soft_margin_loss(input, label) + print(output) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [0.64022040]) - input_np = np.random.uniform(0.1, 0.8, size=(5, 5)).astype(np.float64) - label_np = np.random.randint(0, 2, size=(5, 5)).astype(np.int64) + input_np = paddle.uniform(shape=(5, 5), min=0.1, max=0.8, dtype="float64") + label_np = paddle.randint(high=2, shape=(5, 5), dtype="int64") label_np[label_np==0]=-1 input = paddle.to_tensor(input_np) label = paddle.to_tensor(label_np) soft_margin_loss = paddle.nn.SoftMarginLoss(reduction='none') output = soft_margin_loss(input, label) + print(output) + # Tensor(shape=[5, 5], dtype=float64, place=Place(gpu:0), stop_gradient=True, + # [[0.61739663, 0.51405668, 1.09346100, 0.42385561, 0.91602303], + # [0.76997038, 1.01977148, 0.98971722, 1.13976032, 0.88152088], + # [0.55476735, 1.10505384, 0.89923519, 0.45018155, 1.06587511], + # [0.37998142, 0.48067240, 0.47791212, 0.55664053, 0.98581399], + # [0.78571653, 0.59319711, 0.39701841, 0.76172109, 0.83781742]]) + """ def __init__(self, reduction='mean', name=None): if reduction not in ['sum', 'mean', 'none']: raise ValueError( "The value of 'reduction' in SoftMarginLoss should be 'sum', 'mean' or 'none', but " - "received %s, which is not allowed." % reduction) + "received %s, which is not allowed." 
% reduction + ) super(SoftMarginLoss, self).__init__() self.reduction = reduction self.name = name def forward(self, input, label): - out = paddle.nn.functional.soft_margin_loss(input, label, - self.reduction, self.name) + out = paddle.nn.functional.soft_margin_loss( + input, label, self.reduction, self.name + ) return out diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index 93d6b21c13f548..d359f576dd6a9a 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -56,23 +56,27 @@ class _InstanceNormBase(Layer): """ - This class is based class for InstanceNorm1D, 2d, 3d. + This class is based class for InstanceNorm1D, 2d, 3d. See InstaceNorm1D, InstanceNorm2D or InstanceNorm3D for more details. """ - def __init__(self, - num_features, - epsilon=1e-5, - momentum=0.9, - weight_attr=None, - bias_attr=None, - data_format="NCHW", - name=None): + def __init__( + self, + num_features, + epsilon=1e-5, + momentum=0.9, + weight_attr=None, + bias_attr=None, + data_format="NCHW", + name=None, + ): super(_InstanceNormBase, self).__init__() if weight_attr == False or bias_attr == False: - assert weight_attr == bias_attr, "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm" + assert ( + weight_attr == bias_attr + ), "weight_attr and bias_attr must be set to Fasle at the same time in InstanceNorm" self._epsilon = epsilon self._weight_attr = weight_attr self._bias_attr = bias_attr @@ -83,11 +87,14 @@ def __init__(self, attr=self._weight_attr, shape=[num_features], default_initializer=Constant(1.0), - is_bias=False) - self.bias = self.create_parameter(attr=self._bias_attr, - shape=[num_features], - default_initializer=Constant(0.0), - is_bias=True) + is_bias=False, + ) + self.bias = self.create_parameter( + attr=self._bias_attr, + shape=[num_features], + default_initializer=Constant(0.0), + is_bias=True, + ) else: self.scale = None self.bias = None @@ -98,19 +105,19 @@ def _check_input_dim(self, input): def forward(self, input): self._check_input_dim(input) - return instance_norm(input, - weight=self.scale, - bias=self.bias, - eps=self._epsilon) + return instance_norm( + input, weight=self.scale, bias=self.bias, eps=self._epsilon + ) def extra_repr(self): - return 'num_features={}, epsilon={}'.format(self._num_features, - self._epsilon) + return 'num_features={}, epsilon={}'.format( + self._num_features, self._epsilon + ) class InstanceNorm1D(_InstanceNormBase): r""" - Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + Create a callable object of `InstanceNorm1D`. Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . DataLayout: NCL `[batch, in_channels, length]` @@ -126,8 +133,7 @@ class InstanceNorm1D(_InstanceNormBase): \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - Note: - `H` means height of feature map, `W` means width of feature map. +Where `H` means height of feature map, `W` means width of feature map. Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -144,7 +150,7 @@ class InstanceNorm1D(_InstanceNormBase): will create ParamAttr as bias_attr, the name of bias can be set in ParamAttr. 
If the Initializer of the bias_attr is not set, the bias is initialized zero. If it is set to False, will not create bias_attr. Default: None. - data_format(str, optional): Specify the input data format, may be "NC", "NCL". Defalut "NCL". + data_format(str, optional): Specify the input data format, may be "NC", "NCL". Default "NCL". name(str, optional): Name for the InstanceNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. @@ -161,11 +167,8 @@ class InstanceNorm1D(_InstanceNormBase): .. code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 2, 3)) instance_norm = paddle.nn.InstanceNorm1D(2) instance_norm_out = instance_norm(x) @@ -175,13 +178,16 @@ class InstanceNorm1D(_InstanceNormBase): def _check_input_dim(self, input): if len(input.shape) != 2 and len(input.shape) != 3: - raise ValueError('expected 2D or 3D input (got {}D input)'.format( - len(input.shape))) + raise ValueError( + 'expected 2D or 3D input (got {}D input)'.format( + len(input.shape) + ) + ) class InstanceNorm2D(_InstanceNormBase): r""" - Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + Create a callable object of `InstanceNorm2D`. Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . DataLayout: NCHW `[batch, in_channels, in_height, in_width]` @@ -198,8 +204,7 @@ class InstanceNorm2D(_InstanceNormBase): \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - Note: - `H` means height of feature map, `W` means width of feature map. +Where `H` means height of feature map, `W` means width of feature map. Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -232,11 +237,8 @@ class InstanceNorm2D(_InstanceNormBase): .. code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 2, 2, 3)) instance_norm = paddle.nn.InstanceNorm2D(2) instance_norm_out = instance_norm(x) @@ -245,13 +247,14 @@ class InstanceNorm2D(_InstanceNormBase): def _check_input_dim(self, input): if len(input.shape) != 4: - raise ValueError('expected 4D input (got {}D input)'.format( - len(input.shape))) + raise ValueError( + 'expected 4D input (got {}D input)'.format(len(input.shape)) + ) class InstanceNorm3D(_InstanceNormBase): r""" - Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . + Create a callable object of `InstanceNorm3D`. Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . 
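A small sanity-check sketch for the normalization described above: each (sample, channel) slice is normalized over its own D*H*W elements, so the per-slice statistics come out close to zero mean and unit variance. Shapes are illustrative only:

.. code-block:: python

    import paddle

    x = paddle.rand((2, 2, 2, 4, 3))
    out = paddle.nn.InstanceNorm3D(2)(x)
    # statistics are taken per sample and per channel
    print(out.mean(axis=[2, 3, 4]))  # values close to 0
    print(out.std(axis=[2, 3, 4]))   # values close to 1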
DataLayout: NCHW `[batch, in_channels, D, in_height, in_width]` @@ -268,8 +271,7 @@ class InstanceNorm3D(_InstanceNormBase): \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\ y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift - Note: - `H` means height of feature map, `W` means width of feature map. +Where `H` means height of feature map, `W` means width of feature map. Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -302,11 +304,8 @@ class InstanceNorm3D(_InstanceNormBase): .. code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 2, 2, 2, 3)) instance_norm = paddle.nn.InstanceNorm3D(2) instance_norm_out = instance_norm(x) @@ -315,12 +314,14 @@ class InstanceNorm3D(_InstanceNormBase): def _check_input_dim(self, input): if len(input.shape) != 5: - raise ValueError('expected 5D input (got {}D input)'.format( - len(input.shape))) + raise ValueError( + 'expected 5D input (got {}D input)'.format(len(input.shape)) + ) class GroupNorm(Layer): """ + This interface is used to construct a callable object of the ``GroupNorm`` class. For more details, refer to code examples. It implements the function of the Group Normalization Layer. @@ -341,7 +342,7 @@ class GroupNorm(Layer): name(str, optional): Name for the GroupNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: - - x: Tensor with shape: (batch, num_features, *). + - x: Tensor with shape: attr:`(batch, num_features, *)`. - output: The same shape as input x. Returns: @@ -350,27 +351,25 @@ class GroupNorm(Layer): Examples: .. code-block:: python - import paddle - import numpy as np + import paddle - paddle.disable_static() - np.random.seed(123) - x_data = np.random.random(size=(2, 6, 2, 2)).astype('float32') - x = paddle.to_tensor(x_data) - group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=6) - group_norm_out = group_norm(x) + x = paddle.arange(48, dtype="float32").reshape((2, 6, 2, 2)) + group_norm = paddle.nn.GroupNorm(num_channels=6, num_groups=6) + group_norm_out = group_norm(x) - print(group_norm_out.numpy()) + print(group_norm_out) """ - def __init__(self, - num_groups, - num_channels, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - data_format='NCHW', - name=None): + def __init__( + self, + num_groups, + num_channels, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCHW', + name=None, + ): super(GroupNorm, self).__init__() self._weight_attr = weight_attr self._bias_attr = bias_attr @@ -384,39 +383,57 @@ def __init__(self, if weight_attr == False: self.weight = self.create_parameter( - attr=None, shape=param_shape, default_initializer=Constant(1.0)) + attr=None, shape=param_shape, default_initializer=Constant(1.0) + ) self.weight.stop_gradient = True else: self.weight = self.create_parameter( attr=self._weight_attr, shape=param_shape, - default_initializer=Constant(1.0)) - self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. 
+ default_initializer=Constant(1.0), + ) + self.weight.stop_gradient = ( + self._weight_attr != None + and self._weight_attr.learning_rate == 0.0 + ) if bias_attr == False: - self.bias = self.create_parameter(attr=None, - shape=param_shape, - default_initializer=Constant(0.0), - is_bias=True) + self.bias = self.create_parameter( + attr=None, + shape=param_shape, + default_initializer=Constant(0.0), + is_bias=True, + ) self.bias.stop_gradient = True else: - self.bias = self.create_parameter(attr=self._bias_attr, - shape=param_shape, - is_bias=True) - self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0. + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True + ) + self.bias.stop_gradient = ( + self._bias_attr != None and self._bias_attr.learning_rate == 0.0 + ) def forward(self, input): mean_out = self._helper.create_variable_for_type_inference( - dtype=input.dtype, stop_gradient=True) + dtype=input.dtype, stop_gradient=True + ) variance_out = self._helper.create_variable_for_type_inference( - dtype=input.dtype, stop_gradient=True) + dtype=input.dtype, stop_gradient=True + ) if in_dygraph_mode(): - pre_act = _C_ops.group_norm(input, self.weight, self.bias, - self._epsilon, self._num_groups, "NCHW") + pre_act = _C_ops.group_norm( + input, + self.weight, + self.bias, + self._epsilon, + self._num_groups, + "NCHW", + ) - return dygraph_utils._append_activation_in_dygraph(pre_act, - act=None) + return dygraph_utils._append_activation_in_dygraph( + pre_act, act=None + ) elif _in_legacy_dygraph(): pre_act, _, _ = _legacy_C_ops.group_norm( @@ -430,8 +447,9 @@ def forward(self, input): 'groups', self._num_groups, ) - return dygraph_utils._append_activation_in_dygraph(pre_act, - act=None) + return dygraph_utils._append_activation_in_dygraph( + pre_act, act=None + ) inputs = {'X': input} if self.bias is not None: @@ -441,34 +459,31 @@ def forward(self, input): # create output group_norm_out = self._helper.create_variable_for_type_inference( - dtype=input.dtype) - - self._helper.append_op(type="group_norm", - inputs=inputs, - outputs={ - "Y": group_norm_out, - "Mean": mean_out, - "Variance": variance_out, - }, - attrs={ - "epsilon": self._epsilon, - "groups": self._num_groups - }) + dtype=input.dtype + ) + + self._helper.append_op( + type="group_norm", + inputs=inputs, + outputs={ + "Y": group_norm_out, + "Mean": mean_out, + "Variance": variance_out, + }, + attrs={"epsilon": self._epsilon, "groups": self._num_groups}, + ) return self._helper.append_activation(group_norm_out, None) def extra_repr(self): return 'num_groups={}, num_channels={}, epsilon={}'.format( - self._num_groups, self._num_channels, self._epsilon) + self._num_groups, self._num_channels, self._epsilon + ) class LayerNorm(Layer): r""" - :alias_main: paddle.nn.LayerNorm - :alias: paddle.nn.LayerNorm,paddle.nn.layer.LayerNorm,paddle.nn.layer.norm.LayerNorm - :old_api: paddle.fluid.dygraph.LayerNorm - - This interface is used to construct a callable object of the ``LayerNorm`` class. + Construct a callable object of the ``LayerNorm`` class. For more details, refer to code examples. It implements the function of the Layer Normalization Layer and can be applied to mini-batch input data. Refer to `Layer Normalization `_ @@ -516,23 +531,22 @@ class LayerNorm(Layer): .. 
code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) - layer_norm = paddle.nn.LayerNorm(x_data.shape[1:]) + x = paddle.rand((2, 2, 2, 3)) + layer_norm = paddle.nn.LayerNorm(x.shape[1:]) layer_norm_out = layer_norm(x) print(layer_norm_out) """ - def __init__(self, - normalized_shape, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - name=None): + def __init__( + self, + normalized_shape, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + name=None, + ): super(LayerNorm, self).__init__() if isinstance(normalized_shape, numbers.Integral): normalized_shape = [normalized_shape] @@ -549,25 +563,29 @@ def __init__(self, self.weight = self.create_parameter( attr=self._weight_attr, shape=param_shape, - default_initializer=Constant(1.0)) + default_initializer=Constant(1.0), + ) if bias_attr is False: self.bias = None else: - self.bias = self.create_parameter(attr=self._bias_attr, - shape=param_shape, - is_bias=True) + self.bias = self.create_parameter( + attr=self._bias_attr, shape=param_shape, is_bias=True + ) def forward(self, input): - return layer_norm(input, - normalized_shape=self._normalized_shape, - weight=self.weight, - bias=self.bias, - epsilon=self._epsilon) + return layer_norm( + input, + normalized_shape=self._normalized_shape, + weight=self.weight, + bias=self.bias, + epsilon=self._epsilon, + ) def extra_repr(self): - return 'normalized_shape={}, epsilon={}'.format(self._normalized_shape, - self._epsilon) + return 'normalized_shape={}, epsilon={}'.format( + self._normalized_shape, self._epsilon + ) class _BatchNormBase(Layer): @@ -575,15 +593,17 @@ class _BatchNormBase(Layer): BatchNorm base . """ - def __init__(self, - num_features, - momentum=0.9, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - data_format='NCHW', - use_global_stats=None, - name=None): + def __init__( + self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCHW', + use_global_stats=None, + name=None, + ): super(_BatchNormBase, self).__init__() self._num_features = num_features self._weight_attr = weight_attr @@ -603,29 +623,40 @@ def __init__(self, attr=None, shape=param_shape, dtype=self._dtype, - default_initializer=Constant(1.0)) + default_initializer=Constant(1.0), + ) self.weight.stop_gradient = True else: self.weight = self.create_parameter( attr=self._weight_attr, shape=param_shape, dtype=self._dtype, - default_initializer=Constant(1.0)) - self.weight.stop_gradient = self._weight_attr != None and self._weight_attr.learning_rate == 0. + default_initializer=Constant(1.0), + ) + self.weight.stop_gradient = ( + self._weight_attr != None + and self._weight_attr.learning_rate == 0.0 + ) if bias_attr == False: - self.bias = self.create_parameter(attr=None, - shape=param_shape, - dtype=self._dtype, - default_initializer=Constant(0.0), - is_bias=True) + self.bias = self.create_parameter( + attr=None, + shape=param_shape, + dtype=self._dtype, + default_initializer=Constant(0.0), + is_bias=True, + ) self.bias.stop_gradient = True else: - self.bias = self.create_parameter(attr=self._bias_attr, - shape=param_shape, - dtype=self._dtype, - is_bias=True) - self.bias.stop_gradient = self._bias_attr != None and self._bias_attr.learning_rate == 0. 
+ self.bias = self.create_parameter( + attr=self._bias_attr, + shape=param_shape, + dtype=self._dtype, + is_bias=True, + ) + self.bias.stop_gradient = ( + self._bias_attr != None and self._bias_attr.learning_rate == 0.0 + ) moving_mean_name = None moving_variance_name = None @@ -634,22 +665,28 @@ def __init__(self, moving_mean_name = name + "_mean" moving_variance_name = name + "_variance" - self._mean = self.create_parameter(dtype=self._dtype, - attr=ParamAttr( - name=moving_mean_name, - initializer=Constant(0.0), - trainable=False, - do_model_average=True), - shape=param_shape) + self._mean = self.create_parameter( + dtype=self._dtype, + attr=ParamAttr( + name=moving_mean_name, + initializer=Constant(0.0), + trainable=False, + do_model_average=True, + ), + shape=param_shape, + ) self._mean.stop_gradient = True - self._variance = self.create_parameter(dtype=self._dtype, - attr=ParamAttr( - name=moving_variance_name, - initializer=Constant(1.0), - trainable=False, - do_model_average=True), - shape=param_shape) + self._variance = self.create_parameter( + dtype=self._dtype, + attr=ParamAttr( + name=moving_variance_name, + initializer=Constant(1.0), + trainable=False, + do_model_average=True, + ), + shape=param_shape, + ) self._variance.stop_gradient = True self._data_format = data_format @@ -673,22 +710,26 @@ def forward(self, input): if self.training: warnings.warn( - "When training, we now always track global mean and variance.") - - return batch_norm(input, - self._mean, - self._variance, - weight=self.weight, - bias=self.bias, - training=self.training, - momentum=self._momentum, - epsilon=self._epsilon, - data_format=self._data_format, - use_global_stats=self._use_global_stats) + "When training, we now always track global mean and variance." + ) + + return batch_norm( + input, + self._mean, + self._variance, + weight=self.weight, + bias=self.bias, + training=self.training, + momentum=self._momentum, + epsilon=self._epsilon, + data_format=self._data_format, + use_global_stats=self._use_global_stats, + ) def extra_repr(self): main_str = 'num_features={}, momentum={}, epsilon={}'.format( - self._num_features, self._momentum, self._epsilon) + self._num_features, self._momentum, self._epsilon + ) if self._data_format != 'NCHW': main_str += ', data_format={}'.format(self._data_format) if self._name is not None: @@ -743,7 +784,7 @@ class BatchNorm1D(_BatchNormBase): If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - data_format(str, optional): Specify the input data format, may be "NC", "NCL" or "NLC". Defalut "NCL". + data_format(str, optional): Specify the input data format, may be "NC", "NCL" or "NLC". Default "NCL". use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None. name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. @@ -760,29 +801,35 @@ class BatchNorm1D(_BatchNormBase): .. 
code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 1, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 1, 3)) batch_norm = paddle.nn.BatchNorm1D(1) batch_norm_out = batch_norm(x) print(batch_norm_out) """ - def __init__(self, - num_features, - momentum=0.9, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - data_format='NCL', - use_global_stats=None, - name=None): - super(BatchNorm1D, - self).__init__(num_features, momentum, epsilon, weight_attr, - bias_attr, data_format, use_global_stats, name) + def __init__( + self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCL', + use_global_stats=None, + name=None, + ): + super(BatchNorm1D, self).__init__( + num_features, + momentum, + epsilon, + weight_attr, + bias_attr, + data_format, + use_global_stats, + name, + ) def _check_data_format(self, input): if input == 'NCHW' or input == 'NC' or input == 'NCL': @@ -791,12 +838,16 @@ def _check_data_format(self, input): self._data_format = "NHWC" else: raise ValueError( - 'expected NC , NCL, NLC or None for data_format input') + 'expected NC , NCL, NLC or None for data_format input' + ) def _check_input_dim(self, input): if len(input.shape) != 2 and len(input.shape) != 3: - raise ValueError('expected 2D or 3D input (got {}D input)'.format( - len(input.shape))) + raise ValueError( + 'expected 2D or 3D input (got {}D input)'.format( + len(input.shape) + ) + ) class BatchNorm2D(_BatchNormBase): @@ -862,11 +913,8 @@ class BatchNorm2D(_BatchNormBase): .. code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 1, 2, 3)) batch_norm = paddle.nn.BatchNorm2D(1) batch_norm_out = batch_norm(x) @@ -883,8 +931,9 @@ def _check_data_format(self, input): def _check_input_dim(self, input): if len(input.shape) != 4: - raise ValueError('expected 4D input (got {}D input)'.format( - len(input.shape))) + raise ValueError( + 'expected 4D input (got {}D input)'.format(len(input.shape)) + ) class BatchNorm3D(_BatchNormBase): @@ -950,29 +999,35 @@ class BatchNorm3D(_BatchNormBase): .. 
code-block:: python import paddle - import numpy as np - np.random.seed(123) - x_data = np.random.random(size=(2, 1, 2, 2, 3)).astype('float32') - x = paddle.to_tensor(x_data) + x = paddle.rand((2, 1, 2, 2, 3)) batch_norm = paddle.nn.BatchNorm3D(1) batch_norm_out = batch_norm(x) print(batch_norm_out) """ - def __init__(self, - num_features, - momentum=0.9, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - data_format='NCDHW', - use_global_stats=None, - name=None): - super(BatchNorm3D, - self).__init__(num_features, momentum, epsilon, weight_attr, - bias_attr, data_format, use_global_stats, name) + def __init__( + self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCDHW', + use_global_stats=None, + name=None, + ): + super(BatchNorm3D, self).__init__( + num_features, + momentum, + epsilon, + weight_attr, + bias_attr, + data_format, + use_global_stats, + name, + ) def _check_data_format(self, input): if input == 'NCHW' or input == 'NCDHW': @@ -981,16 +1036,19 @@ def _check_data_format(self, input): self._data_format = 'NHWC' else: raise ValueError( - 'expected NCDHW, NDHWC or None for data_format input') + 'expected NCDHW, NDHWC or None for data_format input' + ) def _check_input_dim(self, input): if len(input.shape) != 5: - raise ValueError('expected 5D input (got {}D input)'.format( - len(input.shape))) + raise ValueError( + 'expected 5D input (got {}D input)'.format(len(input.shape)) + ) class SyncBatchNorm(_BatchNormBase): r""" + This interface is used to construct a callable object of the ``SyncBatchNorm`` class. It implements the function of the Cross-GPU Synchronized Batch Normalization Layer, and can be used as a normalizer function for other operations, such as conv2d and fully connected @@ -1036,9 +1094,9 @@ class SyncBatchNorm(_BatchNormBase): - :math:`\beta` : trainable shift parameter vector Note: - If you want to use container to pack your model and has ``SyncBatchNorm`` in the - evaluation phase, please use ``nn.LayerList`` or ``nn.Sequential`` instead of - ``list`` to pack the model. + If you want to use container to pack your model and has :ref:`api_paddle_nn_SyncBatchNorm` in the + evaluation phase, please use :ref:`api_paddle_nn_LayerList` or :ref:`api_paddle_nn_Sequential` instead of + :ref:`api_paddle_hub_list` to pack the model. Parameters: num_features(int): Indicate the number of channels of the input ``Tensor``. @@ -1056,37 +1114,52 @@ class SyncBatchNorm(_BatchNormBase): have trainable bias parameter. Default: None. Shapes: - input: Tensor that the dimension from 2 to 5. - output: Tensor with the same shape as input. + - input: Tensor that the dimension from 2 to 5. + - output: Tensor with the same shape as input. Examples: .. 
code-block:: python - import paddle - import paddle.nn as nn - import numpy as np + # required: gpu + + import paddle + import paddle.nn as nn - x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') - x = paddle.to_tensor(x) + x = paddle.to_tensor([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') + + if paddle.is_compiled_with_cuda(): + sync_batch_norm = nn.SyncBatchNorm(2) + hidden1 = sync_batch_norm(x) + print(hidden1) + # Tensor(shape=[1, 2, 2, 2], dtype=float32, place=Place(gpu:0), stop_gradient=False, + # [[[[ 0.26824948, 1.09363246], + # [ 0.26824948, -1.63013160]], + + # [[ 0.80956620, -0.66528702], + # [-1.27446556, 1.13018656]]]]) - if paddle.is_compiled_with_cuda(): - sync_batch_norm = nn.SyncBatchNorm(2) - hidden1 = sync_batch_norm(x) - print(hidden1) - # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]] """ - def __init__(self, - num_features, - momentum=0.9, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - data_format='NCHW', - name=None): - super(SyncBatchNorm, - self).__init__(num_features, momentum, epsilon, weight_attr, - bias_attr, data_format, None, name) + def __init__( + self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCHW', + name=None, + ): + super(SyncBatchNorm, self).__init__( + num_features, + momentum, + epsilon, + weight_attr, + bias_attr, + data_format, + None, + name, + ) def _check_data_format(self): if self._data_format in ['NCHW', 'NCDHW', 'NC', 'NCL']: @@ -1110,24 +1183,55 @@ def forward(self, x): ### use_global_stats only support False in sync_batch_norm if in_dygraph_mode(): sync_batch_norm_out, _, _, _, _, _ = _C_ops.sync_batch_norm_( - x, self.weight, self.bias, self._mean, self._variance, - self._momentum, self._epsilon, self._data_format, - not self.training, False, False, False) + x, + self.weight, + self.bias, + self._mean, + self._variance, + self._momentum, + self._epsilon, + self._data_format, + not self.training, + False, + False, + False, + ) return sync_batch_norm_out elif in_dynamic_mode(): - attrs = ("momentum", self._momentum, "epsilon", self._epsilon, - "is_test", not self.training, "data_layout", - self._data_format, "use_mkldnn", False, "fuse_with_relu", - False, "use_global_stats", False, 'trainable_statistics', - False) + attrs = ( + "momentum", + self._momentum, + "epsilon", + self._epsilon, + "is_test", + not self.training, + "data_layout", + self._data_format, + "use_mkldnn", + False, + "fuse_with_relu", + False, + "use_global_stats", + False, + 'trainable_statistics', + False, + ) sync_batch_norm_out, _, _, _, _, _ = _legacy_C_ops.sync_batch_norm( - x, self.weight, self.bias, self._mean, self._variance, mean_out, - variance_out, *attrs) + x, + self.weight, + self.bias, + self._mean, + self._variance, + mean_out, + variance_out, + *attrs + ) return sync_batch_norm_out - check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'], - 'SyncBatchNorm') + check_variable_and_dtype( + x, 'input', ['float16', 'float32', 'float64'], 'SyncBatchNorm' + ) attrs = { "momentum": self._momentum, @@ -1145,28 +1249,30 @@ def forward(self, x): "Scale": [self.weight], "Bias": [self.bias], "Mean": [self._mean], - "Variance": [self._variance] + "Variance": [self._variance], } saved_mean = self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True) + dtype=self._dtype, stop_gradient=True + ) saved_variance = 
self._helper.create_variable_for_type_inference( - dtype=self._dtype, stop_gradient=True) + dtype=self._dtype, stop_gradient=True + ) sync_batch_norm_out = self._helper.create_variable_for_type_inference( - self._dtype) + self._dtype + ) outputs = { "Y": [sync_batch_norm_out], "MeanOut": [mean_out], "VarianceOut": [variance_out], "SavedMean": [saved_mean], - "SavedVariance": [saved_variance] + "SavedVariance": [saved_variance], } - self._helper.append_op(type="sync_batch_norm", - inputs=inputs, - outputs=outputs, - attrs=attrs) + self._helper.append_op( + type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs + ) return sync_batch_norm_out @classmethod @@ -1181,8 +1287,8 @@ def convert_sync_batchnorm(cls, layer): The original model with converted SyncBatchNorm layers. If BatchNorm*d layer in the model, use SyncBatchNorm layer instead. Examples: - .. code-block:: python + import paddle import paddle.nn as nn @@ -1192,18 +1298,28 @@ def convert_sync_batchnorm(cls, layer): """ layer_output = layer if isinstance(layer, _BatchNormBase): - if layer._weight_attr != None and not isinstance( - layer._weight_attr, - bool) and layer._weight_attr.name != None: + if ( + layer._weight_attr != None + and not isinstance(layer._weight_attr, bool) + and layer._weight_attr.name != None + ): layer._weight_attr.name = layer._weight_attr.name + '_sync' - if layer._bias_attr != None and not isinstance( - layer._bias_attr, bool) and layer._bias_attr.name != None: + if ( + layer._bias_attr != None + and not isinstance(layer._bias_attr, bool) + and layer._bias_attr.name != None + ): layer._bias_attr.name = layer._bias_attr.name + '_sync' - layer_output = SyncBatchNorm(layer._num_features, layer._momentum, - layer._epsilon, layer._weight_attr, - layer._bias_attr, layer._data_format, - layer._name) + layer_output = SyncBatchNorm( + layer._num_features, + layer._momentum, + layer._epsilon, + layer._weight_attr, + layer._bias_attr, + layer._data_format, + layer._name, + ) if layer._weight_attr != False and layer._bias_attr != False: with no_grad(): @@ -1213,58 +1329,61 @@ def convert_sync_batchnorm(cls, layer): layer_output._variance = layer._variance for name, sublayer in layer.named_children(): - layer_output.add_sublayer(name, - cls.convert_sync_batchnorm(sublayer)) + layer_output.add_sublayer( + name, cls.convert_sync_batchnorm(sublayer) + ) del layer return layer_output class LocalResponseNorm(Layer): """ - Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions. - For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks `_ + Local Response Normalization performs a type of "lateral inhibition" by normalizing over local input regions. + For more information, please refer to `ImageNet Classification with Deep Convolutional Neural Networks `_ - See more details in :ref:`api_paddle_nn_functional_local_response_norm` . + See more details in :ref:`api_paddle_nn_functional_local_response_norm` . - Parameters: - size (int): The number of channels to sum over. - alpha (float, optional): The scaling parameter, positive. Default:1e-4 - beta (float, optional): The exponent, positive. Default:0.75 - k (float, optional): An offset, positive. Default: 1.0 - data_format (str, optional): Specify the data format of the input, and the data format of the output - will be consistent with that of the input. An optional string from: - If input is 3-D Tensor, the string could be `"NCL"` or `"NLC"` . 
When it is `"NCL"`, - the data is stored in the order of: `[batch_size, input_channels, feature_length]`. - If input is 4-D Tensor, the string could be `"NCHW"`, `"NHWC"`. When it is `"NCHW"`, - the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. - If input is 5-D Tensor, the string could be `"NCDHW"`, `"NDHWC"` . When it is `"NCDHW"`, - the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. - name (str, optional): Name for the operation (optional, default is None). For more information, - please refer to :ref:`api_guide_Name`. - - Shape: - - input: 3-D/4-D/5-D tensor. - - output: 3-D/4-D/5-D tensor, the same shape as input. + Parameters: + size (int): The number of channels to sum over. + alpha (float, optional): The scaling parameter, positive. Default:1e-4 + beta (float, optional): The exponent, positive. Default:0.75 + k (float, optional): An offset, positive. Default: 1.0 + data_format (str, optional): Specify the data format of the input, and the data format of the output + will be consistent with that of the input. An optional string from: + If input is 3-D Tensor, the string could be `"NCL"` or `"NLC"` . When it is `"NCL"`, + the data is stored in the order of: `[batch_size, input_channels, feature_length]`. + If input is 4-D Tensor, the string could be `"NCHW"`, `"NHWC"`. When it is `"NCHW"`, + the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. + If input is 5-D Tensor, the string could be `"NCDHW"`, `"NDHWC"` . When it is `"NCDHW"`, + the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. + name (str, optional): Name for the operation (optional, default is None). For more information, + please refer to :ref:`api_guide_Name`. - Examples: + Shape: + - input: 3-D/4-D/5-D tensor. + - output: 3-D/4-D/5-D tensor, the same shape as input. - .. code-block:: python + Examples: - import paddle + .. 
code-block:: python - x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32") - m = paddle.nn.LocalResponseNorm(size=5) - y = m(x) - print(y.shape) # [3, 3, 112, 112] - """ + import paddle + + x = paddle.rand(shape=(3, 3, 112, 112), dtype="float32") + m = paddle.nn.LocalResponseNorm(size=5) + y = m(x) + print(y.shape) # [3, 3, 112, 112] + """ - def __init__(self, - size, - alpha=0.0001, - beta=0.75, - k=1.0, - data_format="NCHW", - name=None): + def __init__( + self, + size, + alpha=0.0001, + beta=0.75, + k=1.0, + data_format="NCHW", + name=None, + ): super(LocalResponseNorm, self).__init__() self.size = size self.alpha = alpha @@ -1274,13 +1393,21 @@ def __init__(self, self.name = name def forward(self, input): - out = F.local_response_norm(input, self.size, self.alpha, self.beta, - self.k, self.data_format, self.name) + out = F.local_response_norm( + input, + self.size, + self.alpha, + self.beta, + self.k, + self.data_format, + self.name, + ) return out def extra_repr(self): main_str = 'size={}, alpha={}, beta={}, k={}'.format( - self.size, self.alpha, self.beta, self.k) + self.size, self.alpha, self.beta, self.k + ) if self.data_format != 'NCHW': main_str += ', data_format={}'.format(self.data_format) if self.name is not None: diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index ccba13316a17b5..75580342b392c2 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -61,29 +61,30 @@ class AvgPool1D(Layer): Returns: A callable object of AvgPool1D. - + Examples: .. code-block:: python import paddle import paddle.nn as nn - import numpy as np - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1) AvgPool1D = nn.AvgPool1D(kernel_size=2, stride=2, padding=0) pool_out = AvgPool1D(data) # pool_out shape: [1, 3, 16] """ - def __init__(self, - kernel_size, - stride=None, - padding=0, - exclusive=True, - ceil_mode=False, - name=None): + def __init__( + self, + kernel_size, + stride=None, + padding=0, + exclusive=True, + ceil_mode=False, + name=None, + ): super(AvgPool1D, self).__init__() self.kernel_size = kernel_size self.stride = stride @@ -93,13 +94,21 @@ def __init__(self, self.name = name def forward(self, x): - out = F.avg_pool1d(x, self.kernel_size, self.stride, self.padding, - self.exclusive, self.ceil_mode, self.name) + out = F.avg_pool1d( + x, + self.kernel_size, + self.stride, + self.padding, + self.exclusive, + self.ceil_mode, + self.name, + ) return out def extra_repr(self): return 'kernel_size={kernel_size}, stride={stride}, padding={padding}'.format( - **self.__dict__) + **self.__dict__ + ) class AvgPool2D(Layer): @@ -163,10 +172,9 @@ class AvgPool2D(Layer): import paddle import paddle.nn as nn - import numpy as np # max pool2d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + input = paddle.uniform([1, 3, 32, 32], dtype="float32", min=-1, max=1) AvgPool2D = nn.AvgPool2D(kernel_size=2, stride=2, padding=0) output = AvgPool2D(input) @@ -174,15 +182,17 @@ class AvgPool2D(Layer): """ - def __init__(self, - kernel_size, - stride=None, - padding=0, - ceil_mode=False, - exclusive=True, - divisor_override=None, - data_format="NCHW", - name=None): + def __init__( + self, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + exclusive=True, + divisor_override=None, + data_format="NCHW", + name=None, + ): super(AvgPool2D, self).__init__() self.ksize = kernel_size self.stride = 
stride @@ -194,23 +204,27 @@ def __init__(self, self.name = name def forward(self, x): - return F.avg_pool2d(x, - kernel_size=self.ksize, - stride=self.stride, - padding=self.padding, - ceil_mode=self.ceil_mode, - exclusive=self.exclusive, - divisor_override=self.divisor, - data_format=self.data_format, - name=self.name) + return F.avg_pool2d( + x, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + ceil_mode=self.ceil_mode, + exclusive=self.exclusive, + divisor_override=self.divisor, + data_format=self.data_format, + name=self.name, + ) def extra_repr(self): return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format( - **self.__dict__) + **self.__dict__ + ) class AvgPool3D(Layer): """ + This operation applies 3D max pooling over input features based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are in NCDHW format, where N is batch size, C is the number of channels, @@ -251,15 +265,15 @@ class AvgPool3D(Layer): The data type can be float32, float64. - output(Tensor): The output tensor of avg pool3d operator, which is a 5-D tensor. The data type is same as input x. + Examples: .. code-block:: python import paddle import paddle.nn as nn - import numpy as np # avg pool3d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) + input = paddle.uniform([1, 2, 3, 32, 32], dtype="float32", min=-1, max=1) AvgPool3D = nn.AvgPool3D(kernel_size=2, stride=2, padding=0) output = AvgPool3D(input) @@ -267,15 +281,17 @@ class AvgPool3D(Layer): """ - def __init__(self, - kernel_size, - stride=None, - padding=0, - ceil_mode=False, - exclusive=True, - divisor_override=None, - data_format="NCDHW", - name=None): + def __init__( + self, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + exclusive=True, + divisor_override=None, + data_format="NCDHW", + name=None, + ): super(AvgPool3D, self).__init__() self.ksize = kernel_size self.stride = stride @@ -287,19 +303,22 @@ def __init__(self, self.name = name def forward(self, x): - return F.avg_pool3d(x, - kernel_size=self.ksize, - stride=self.stride, - padding=self.padding, - ceil_mode=self.ceil_mode, - exclusive=self.exclusive, - divisor_override=self.divisor, - data_format=self.data_format, - name=self.name) + return F.avg_pool3d( + x, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + ceil_mode=self.ceil_mode, + exclusive=self.exclusive, + divisor_override=self.divisor, + data_format=self.data_format, + name=self.name, + ) def extra_repr(self): return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format( - **self.__dict__) + **self.__dict__ + ) class MaxPool1D(Layer): @@ -338,14 +357,6 @@ class MaxPool1D(Layer): Returns: A callable object of MaxPool1D. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ValueError: If `padding` is a list or tuple but its length greater than 1. - ShapeError: If the input is not a 3-D. - ShapeError: If the output's shape calculated is not greater than 0. - - Shape: - x(Tensor): The input tensor of max pool1d operator, which is a 3-D tensor. The data type can be float32, float64. 
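For the fixed-kernel pooling layers above, the output length follows the usual sliding-window arithmetic (with ``ceil_mode=False`` the division is floored). A quick illustrative check:

.. code-block:: python

    import paddle

    data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1)
    pool_out = paddle.nn.MaxPool1D(kernel_size=2, stride=2, padding=0)(data)
    # L_out = floor((L_in + 2 * padding - kernel_size) / stride) + 1
    #       = floor((32 + 0 - 2) / 2) + 1 = 16
    print(pool_out.shape)  # [1, 3, 16]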
@@ -358,9 +369,8 @@ class MaxPool1D(Layer): import paddle import paddle.nn as nn - import numpy as np - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1) MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0) pool_out = MaxPool1D(data) # pool_out shape: [1, 3, 16] @@ -371,13 +381,15 @@ class MaxPool1D(Layer): """ - def __init__(self, - kernel_size, - stride=None, - padding=0, - return_mask=False, - ceil_mode=False, - name=None): + def __init__( + self, + kernel_size, + stride=None, + padding=0, + return_mask=False, + ceil_mode=False, + name=None, + ): super(MaxPool1D, self).__init__() self.kernel_size = kernel_size self.stride = stride @@ -387,13 +399,21 @@ def __init__(self, self.name = name def forward(self, input): - out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding, - self.return_mask, self.ceil_mode, self.name) + out = F.max_pool1d( + input, + self.kernel_size, + self.stride, + self.padding, + self.return_mask, + self.ceil_mode, + self.name, + ) return out def extra_repr(self): return 'kernel_size={kernel_size}, stride={stride}, padding={padding}'.format( - **self.__dict__) + **self.__dict__ + ) class MaxPool2D(Layer): @@ -442,10 +462,6 @@ class MaxPool2D(Layer): Returns: A callable object of MaxPool2D. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. Shape: - x(Tensor): The input tensor of max pool2d operator, which is a 4-D tensor. @@ -458,10 +474,9 @@ class MaxPool2D(Layer): import paddle import paddle.nn as nn - import numpy as np # max pool2d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) + input = paddle.uniform([1, 3, 32, 32], dtype="float32", min=-1, max=1) MaxPool2D = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) output = MaxPool2D(input) @@ -473,14 +488,16 @@ class MaxPool2D(Layer): # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], """ - def __init__(self, - kernel_size, - stride=None, - padding=0, - return_mask=False, - ceil_mode=False, - data_format="NCHW", - name=None): + def __init__( + self, + kernel_size, + stride=None, + padding=0, + return_mask=False, + ceil_mode=False, + data_format="NCHW", + name=None, + ): super(MaxPool2D, self).__init__() self.ksize = kernel_size self.stride = stride @@ -491,18 +508,21 @@ def __init__(self, self.name = name def forward(self, x): - return F.max_pool2d(x, - kernel_size=self.ksize, - stride=self.stride, - padding=self.padding, - return_mask=self.return_mask, - ceil_mode=self.ceil_mode, - data_format=self.data_format, - name=self.name) + return F.max_pool2d( + x, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + return_mask=self.return_mask, + ceil_mode=self.ceil_mode, + data_format=self.data_format, + name=self.name, + ) def extra_repr(self): return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format( - **self.__dict__) + **self.__dict__ + ) class MaxPool3D(Layer): @@ -539,10 +559,6 @@ class MaxPool3D(Layer): Returns: A callable object of MaxPool3D. - Raises: - ValueError: If `padding` is a string, but not "SAME" or "VALID". - ValueError: If `padding` is "VALID", but `ceil_mode` is True. - ShapeError: If the output's shape calculated is not greater than 0. Shape: - x(Tensor): The input tensor of max pool3d operator, which is a 5-D tensor. 
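When ``return_mask=True`` the layer also returns the argmax indices alongside the pooled values, both with the pooled shape. A minimal sketch with illustrative shapes:

.. code-block:: python

    import paddle

    x = paddle.uniform([1, 2, 4, 16, 16], dtype="float32", min=-1, max=1)
    pool = paddle.nn.MaxPool3D(kernel_size=2, stride=2, padding=0, return_mask=True)
    out, indices = pool(x)
    # both carry the pooled shape [1, 2, 2, 8, 8]
    print(out.shape, indices.shape)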
@@ -555,10 +571,9 @@ class MaxPool3D(Layer): import paddle import paddle.nn as nn - import numpy as np # max pool3d - input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) + input = paddle.uniform([1, 2, 3, 32, 32], dtype="float32", min=-1, max=1) MaxPool3D = nn.MaxPool3D(kernel_size=2, stride=2, padding=0) output = MaxPool3D(input) @@ -570,14 +585,16 @@ class MaxPool3D(Layer): # output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16], """ - def __init__(self, - kernel_size, - stride=None, - padding=0, - return_mask=False, - ceil_mode=False, - data_format="NCDHW", - name=None): + def __init__( + self, + kernel_size, + stride=None, + padding=0, + return_mask=False, + ceil_mode=False, + data_format="NCDHW", + name=None, + ): super(MaxPool3D, self).__init__() self.ksize = kernel_size self.stride = stride @@ -588,18 +605,21 @@ def __init__(self, self.name = name def forward(self, x): - return F.max_pool3d(x, - kernel_size=self.ksize, - stride=self.stride, - padding=self.padding, - return_mask=self.return_mask, - ceil_mode=self.ceil_mode, - data_format=self.data_format, - name=self.name) + return F.max_pool3d( + x, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + return_mask=self.return_mask, + ceil_mode=self.ceil_mode, + data_format=self.data_format, + name=self.name, + ) def extra_repr(self): return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format( - **self.__dict__) + **self.__dict__ + ) class AdaptiveAvgPool1D(Layer): @@ -645,9 +665,8 @@ class AdaptiveAvgPool1D(Layer): # import paddle import paddle.nn as nn - import numpy as np - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1) AdaptiveAvgPool1D = nn.AdaptiveAvgPool1D(output_size=16) pool_out = AdaptiveAvgPool1D(data) # pool_out shape: [1, 3, 16] @@ -724,11 +743,9 @@ class AdaptiveAvgPool2D(Layer): # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) # import paddle - import numpy as np - input_data = np.random.rand(2, 3, 32, 32) - x = paddle.to_tensor(input_data) - # x.shape is [2, 3, 32, 32] + x = paddle.rand([2, 3, 32, 32]) + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=3) pool_out = adaptive_avg_pool(x = x) # pool_out.shape is [2, 3, 3, 3] @@ -741,10 +758,12 @@ def __init__(self, output_size, data_format="NCHW", name=None): self._name = name def forward(self, x): - return F.adaptive_avg_pool2d(x, - output_size=self._output_size, - data_format=self._data_format, - name=self._name) + return F.adaptive_avg_pool2d( + x, + output_size=self._output_size, + data_format=self._data_format, + name=self._name, + ) def extra_repr(self): return 'output_size={}'.format(self._output_size) @@ -816,11 +835,9 @@ class AdaptiveAvgPool3D(Layer): # output[:, :, i, j, k] = # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) import paddle - import numpy as np - input_data = np.random.rand(2, 3, 8, 32, 32) - x = paddle.to_tensor(input_data) - # x.shape is [2, 3, 8, 32, 32] + x = paddle.rand([2, 3, 8, 32, 32]) + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(output_size=3) pool_out = adaptive_avg_pool(x = x) # pool_out = [2, 3, 3, 3, 3] @@ -833,10 +850,12 @@ def __init__(self, output_size, data_format="NCDHW", name=None): self._name = name def forward(self, x): - return F.adaptive_avg_pool3d(x, - output_size=self._output_size, - data_format=self._data_format, - name=self._name) + return F.adaptive_avg_pool3d( + x, + 
output_size=self._output_size, + data_format=self._data_format, + name=self._name, + ) def extra_repr(self): return 'output_size={}'.format(self._output_size) @@ -871,9 +890,6 @@ class AdaptiveMaxPool1D(Layer): Returns: A callable object of AdaptiveMaxPool1D. - Raises: - ValueError: 'pool_size' should be a integer or list or tuple with length as 1. - Shape: - x(Tensor): The input tensor of adaptive max pool1d operator, which is a 3-D tensor. The data type can be float32, float64. @@ -897,9 +913,8 @@ class AdaptiveMaxPool1D(Layer): # import paddle import paddle.nn as nn - import numpy as np - data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) + data = paddle.uniform([1, 3, 32], dtype="float32", min=-1, max=1) AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16) pool_out = AdaptiveMaxPool1D(data) # pool_out shape: [1, 3, 16] @@ -918,12 +933,14 @@ def __init__(self, output_size, return_mask=False, name=None): self.name = name def forward(self, input): - return F.adaptive_max_pool1d(input, self.output_size, self.return_mask, - self.name) + return F.adaptive_max_pool1d( + input, self.output_size, self.return_mask, self.name + ) def extra_repr(self): - return 'output_size={}, return_mask={}'.format(self.output_size, - self.return_mask) + return 'output_size={}, return_mask={}'.format( + self.output_size, self.return_mask + ) class AdaptiveMaxPool2D(Layer): @@ -981,10 +998,9 @@ class AdaptiveMaxPool2D(Layer): # output[:, :, i, j] = max(input[:, :, hstart: hend, wstart: wend]) # import paddle - import numpy as np - input_data = np.random.rand(2, 3, 32, 32) - x = paddle.to_tensor(input_data) + x = paddle.rand([2, 3, 32, 32]) + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=3, return_mask=True) pool_out, indices = adaptive_max_pool(x = x) """ @@ -996,14 +1012,17 @@ def __init__(self, output_size, return_mask=False, name=None): self._name = name def forward(self, x): - return F.adaptive_max_pool2d(x, - output_size=self._output_size, - return_mask=self._return_mask, - name=self._name) + return F.adaptive_max_pool2d( + x, + output_size=self._output_size, + return_mask=self._return_mask, + name=self._name, + ) def extra_repr(self): - return 'output_size={}, return_mask={}'.format(self._output_size, - self._return_mask) + return 'output_size={}, return_mask={}'.format( + self._output_size, self._return_mask + ) class AdaptiveMaxPool3D(Layer): @@ -1068,10 +1087,8 @@ class AdaptiveMaxPool3D(Layer): # output[:, :, i, j, k] = # max(input[:, :, dstart:dend, hstart: hend, wstart: wend]) import paddle - import numpy as np - input_data = np.random.rand(2, 3, 8, 32, 32) - x = paddle.to_tensor(input_data) + x = paddle.rand([2, 3, 8, 32, 32]) pool = paddle.nn.AdaptiveMaxPool3D(output_size=4) out = pool(x) # out shape: [2, 3, 4, 4, 4] @@ -1088,39 +1105,42 @@ def __init__(self, output_size, return_mask=False, name=None): self._name = name def forward(self, x): - return F.adaptive_max_pool3d(x, - output_size=self._output_size, - return_mask=self._return_mask, - name=self._name) + return F.adaptive_max_pool3d( + x, + output_size=self._output_size, + return_mask=self._return_mask, + name=self._name, + ) def extra_repr(self): - return 'output_size={}, return_mask={}'.format(self._output_size, - self._return_mask) + return 'output_size={}, return_mask={}'.format( + self._output_size, self._return_mask + ) class MaxUnPool1D(Layer): r""" This API implements max unpooling 1d opereation. 
- `max_unpool1d` accepts the output of `max_pool1d` as input, - including the indices of the maximum value and calculate the partial inverse. + `max_unpool1d` accepts the output of `max_pool1d` as input, + including the indices of the maximum value and calculate the partial inverse. All non-maximum values ​​are set to zero. - Input: :math:`(N, C, L_{in})` - Output: :math:`(N, C, L_{out})`, where - + .. math:: L_{out} = (L_{in} - 1) * stride - 2 * padding + kernel\_size or as given by :attr:`output_size` in the call operator. - + Parameters: kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, it must contain an integer. stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, it must contain an integer. padding (int | tuple): Padding that was added to the input. - output_size(list|tuple, optional): The target output size. If output_size is not specified, + output_size(list|tuple, optional): The target output size. If output_size is not specified, the actual output shape will be automatically calculated by (input_shape, kernel_size, stride, padding). data_format (string): The data format of the input and output data. @@ -1136,10 +1156,9 @@ class MaxUnPool1D(Layer): Examples: .. code-block:: python - + import paddle import paddle.nn.functional as F - import numpy as np data = paddle.rand(shape=[1, 3, 16]) pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True) @@ -1150,13 +1169,15 @@ class MaxUnPool1D(Layer): """ - def __init__(self, - kernel_size, - stride=None, - padding=0, - data_format="NCL", - output_size=None, - name=None): + def __init__( + self, + kernel_size, + stride=None, + padding=0, + data_format="NCL", + output_size=None, + name=None, + ): super(MaxUnPool1D, self).__init__() self.ksize = kernel_size self.stride = stride @@ -1166,14 +1187,16 @@ def __init__(self, self.name = name def forward(self, x, indices): - return F.max_unpool1d(x, - indices, - kernel_size=self.ksize, - stride=self.stride, - padding=self.padding, - data_format=self.data_format, - output_size=self.output_size, - name=self.name) + return F.max_unpool1d( + x, + indices, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + data_format=self.data_format, + output_size=self.output_size, + name=self.name, + ) def extra_repr(self): return 'output_size={}'.format(self.output_size) @@ -1186,7 +1209,7 @@ class MaxUnPool2D(Layer): 'max_unpool2d' accepts the output of 'max_unpool2d' as input Including the indices of the maximum value and calculating the partial inverse All non-maximum values ​​are set to zero. - + Parameters: kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, @@ -1195,7 +1218,7 @@ class MaxUnPool2D(Layer): it must contain an integer. kernel_size (int|tuple): Size of the max unpooling window. padding (int | tuple): Padding that was added to the input. - output_size(list|tuple, optional): The target output size. If output_size is not specified, + output_size(list|tuple, optional): The target output size. If output_size is not specified, the actual output shape will be automatically calculated by (input_shape, kernel_size, padding). name(str, optional): For detailed information, please refer @@ -1217,11 +1240,11 @@ class MaxUnPool2D(Layer): Returns: A callable object of MaxUnPool2D. - + Examples: .. 
code-block:: python - + import paddle import paddle.nn.functional as F @@ -1234,13 +1257,15 @@ class MaxUnPool2D(Layer): """ - def __init__(self, - kernel_size, - stride=None, - padding=0, - data_format="NCHW", - output_size=None, - name=None): + def __init__( + self, + kernel_size, + stride=None, + padding=0, + data_format="NCHW", + output_size=None, + name=None, + ): super(MaxUnPool2D, self).__init__() self.ksize = kernel_size self.stride = stride @@ -1250,14 +1275,16 @@ def __init__(self, self.name = name def forward(self, x, indices): - return F.max_unpool2d(x, - indices, - kernel_size=self.ksize, - stride=self.stride, - padding=self.padding, - data_format=self.data_format, - output_size=self.output_size, - name=self.name) + return F.max_unpool2d( + x, + indices, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + data_format=self.data_format, + output_size=self.output_size, + name=self.name, + ) def extra_repr(self): return 'output_size={}'.format(self.output_size) @@ -1267,13 +1294,13 @@ class MaxUnPool3D(Layer): r""" This API implements max unpooling 3d opereation. - `max_unpool3d` accepts the output of `max_pool3d` as input, - including the indices of the maximum value and calculate the partial inverse. + `max_unpool3d` accepts the output of `max_pool3d` as input, + including the indices of the maximum value and calculate the partial inverse. All non-maximum values ​​are set to zero. - Input: :math:`(N, C, D_{in}, H_{in}, W_{in})` - Output: :math:`(N, C, D_{out}, H_{out}, W_{out})`, where - + .. math:: D_{out} = (D_{in} - 1) * stride[0] - 2 * padding[0] + kernel\_size[0] @@ -1285,14 +1312,14 @@ class MaxUnPool3D(Layer): or as given by :attr:`output_size` in the call operator - + Parameters: kernel_size (int|list|tuple): The unpool kernel size. If unpool kernel size is a tuple or list, it must contain an integer. stride (int|list|tuple): The unpool stride size. If unpool stride size is a tuple or list, it must contain an integer. padding (int | tuple): Padding that was added to the input. - output_size(list|tuple, optional): The target output size. If output_size is not specified, + output_size(list|tuple, optional): The target output size. If output_size is not specified, the actual output shape will be automatically calculated by (input_shape, kernel_size, stride, padding). data_format (string): The data format of the input and output data. @@ -1308,10 +1335,9 @@ class MaxUnPool3D(Layer): Examples: .. 
code-block:: python - + import paddle import paddle.nn.functional as F - import numpy as np data = paddle.rand(shape=[1, 1, 4, 4, 6]) pool_out, indices = F.max_pool3d(data, kernel_size=2, stride=2, padding=0, return_mask=True) @@ -1322,13 +1348,15 @@ class MaxUnPool3D(Layer): """ - def __init__(self, - kernel_size, - stride=None, - padding=0, - data_format="NCDHW", - output_size=None, - name=None): + def __init__( + self, + kernel_size, + stride=None, + padding=0, + data_format="NCDHW", + output_size=None, + name=None, + ): super(MaxUnPool3D, self).__init__() self.ksize = kernel_size self.stride = stride @@ -1338,14 +1366,16 @@ def __init__(self, self.name = name def forward(self, x, indices): - return F.max_unpool3d(x, - indices, - kernel_size=self.ksize, - stride=self.stride, - padding=self.padding, - data_format=self.data_format, - output_size=self.output_size, - name=self.name) + return F.max_unpool3d( + x, + indices, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + data_format=self.data_format, + output_size=self.output_size, + name=self.name, + ) def extra_repr(self): return 'output_size={}'.format(self.output_size) diff --git a/python/paddle/nn/quant/quant_layers.py b/python/paddle/nn/quant/quant_layers.py index 59d1389f099743..72bad0d44a8c38 100644 --- a/python/paddle/nn/quant/quant_layers.py +++ b/python/paddle/nn/quant/quant_layers.py @@ -514,14 +514,17 @@ def forward(self, input): class QuantizedConv2DTranspose(Layer): """ + The computational logic of QuantizedConv2DTranspose is the same with Conv2DTranspose. The only difference is that its inputs are all fake quantized. Examples: .. code-block:: python + import paddle import paddle.nn as nn from paddle.nn.quant.quant_layers import QuantizedConv2DTranspose + x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) 
conv = nn.Conv2DTranspose(4, 6, (3, 3)) conv_quantized = QuantizedConv2DTranspose(conv) @@ -531,6 +534,7 @@ class QuantizedConv2DTranspose(Layer): y_np = y_var.numpy() print(y_np.shape, y_quantized_np.shape) # (2, 6, 10, 10), (2, 6, 10, 10) + """ def __init__(self, diff --git a/python/paddle/nn/utils/spectral_norm_hook.py b/python/paddle/nn/utils/spectral_norm_hook.py index 375fe9013b8303..ec9abf46fff1a1 100644 --- a/python/paddle/nn/utils/spectral_norm_hook.py +++ b/python/paddle/nn/utils/spectral_norm_hook.py @@ -23,21 +23,21 @@ __all__ = [] -def normal_(x, mean=0., std=1.): +def normal_(x, mean=0.0, std=1.0): temp_value = paddle.normal(mean, std, shape=x.shape) x.set_value(temp_value) return x class SpectralNorm(object): - def __init__(self, name='weight', n_power_iterations=1, dim=0, eps=1e-12): self.name = name self.dim = dim if n_power_iterations <= 0: raise ValueError( 'Expected n_power_iterations to be positive, but ' - 'got n_power_iterations={}'.format(n_power_iterations)) + 'got n_power_iterations={}'.format(n_power_iterations) + ) self.n_power_iterations = n_power_iterations self.eps = eps @@ -46,8 +46,9 @@ def reshape_weight_to_matrix(self, weight): if self.dim != 0: # transpose dim to front weight_mat = weight_mat.transpose( - [self.dim] + - [d for d in range(weight_mat.dim()) if d != self.dim]) + [self.dim] + + [d for d in range(weight_mat.dim()) if d != self.dim] + ) height = weight_mat.shape[0] @@ -64,20 +65,24 @@ def compute_weight(self, layer, do_power_iteration): for _ in range(self.n_power_iterations): v.set_value( F.normalize( - paddle.matmul(weight_mat, - u, - transpose_x=True, - transpose_y=False), + paddle.matmul( + weight_mat, + u, + transpose_x=True, + transpose_y=False, + ), axis=0, epsilon=self.eps, - )) + ) + ) u.set_value( F.normalize( paddle.matmul(weight_mat, v), axis=0, epsilon=self.eps, - )) + ) + ) if self.n_power_iterations > 0: u = u.clone() v = v.clone() @@ -87,15 +92,20 @@ def compute_weight(self, layer, do_power_iteration): return weight def __call__(self, layer, inputs): - setattr(layer, self.name, - self.compute_weight(layer, do_power_iteration=layer.training)) + setattr( + layer, + self.name, + self.compute_weight(layer, do_power_iteration=layer.training), + ) @staticmethod def apply(layer, name, n_power_iterations, dim, eps): for k, hook in layer._forward_pre_hooks.items(): if isinstance(hook, SpectralNorm) and hook.name == name: - raise RuntimeError("Cannot register two spectral_norm hooks on " - "the same parameter {}".format(name)) + raise RuntimeError( + "Cannot register two spectral_norm hooks on " + "the same parameter {}".format(name) + ) fn = SpectralNorm(name, n_power_iterations, dim, eps) weight = layer._parameters[name] @@ -106,9 +116,9 @@ def apply(layer, name, n_power_iterations, dim, eps): # randomly initialize u and v u = layer.create_parameter([h]) - u = normal_(u, 0., 1.) + u = normal_(u, 0.0, 1.0) v = layer.create_parameter([w]) - v = normal_(v, 0., 1.) 
+ v = normal_(v, 0.0, 1.0) u = F.normalize(u, axis=0, epsilon=fn.eps) v = F.normalize(v, axis=0, epsilon=fn.eps) @@ -127,13 +137,11 @@ def apply(layer, name, n_power_iterations, dim, eps): return fn -def spectral_norm(layer, - name='weight', - n_power_iterations=1, - eps=1e-12, - dim=None): +def spectral_norm( + layer, name='weight', n_power_iterations=1, eps=1e-12, dim=None +): r""" - This spectral_norm layer applies spectral normalization to a parameter according to the + Applies spectral normalization to a parameter according to the following Calculation: Step 1: @@ -169,9 +177,9 @@ def spectral_norm(layer, n_power_iterations(int, optional): The number of power iterations to calculate spectral norm. Default: 1. eps(float, optional): The epsilon for numerical stability in calculating norms. Default: 1e-12. dim(int, optional): The index of dimension which should be permuted to the first before reshaping Input(Weight) to matrix, it should be set as 0 if Input(Weight) is the weight of fc layer, and should be set as 1 if Input(Weight) is the weight of conv layer. Default: None. - + Returns: - The original layer with the spectral norm hook + Layer, the original layer with the spectral norm hook. Examples: .. code-block:: python @@ -188,11 +196,11 @@ def spectral_norm(layer, # [[[[-0.21090528, 0.18563725, -0.14127982], # [-0.02310637, 0.03197737, 0.34353802], # [-0.17117859, 0.33152047, -0.28408015]], - # + # # [[-0.13336606, -0.01862637, 0.06959272], # [-0.02236020, -0.27091628, -0.24532901], # [ 0.27254242, 0.15516677, 0.09036587]], - # + # # [[ 0.30169338, -0.28146112, -0.11768346], # [-0.45765871, -0.12504843, -0.17482486], # [-0.36866254, -0.19969313, 0.08783543]]]]) @@ -201,8 +209,8 @@ def spectral_norm(layer, if dim is None: if isinstance( - layer, - (Conv1DTranspose, Conv2DTranspose, Conv3DTranspose, Linear)): + layer, (Conv1DTranspose, Conv2DTranspose, Conv3DTranspose, Linear) + ): dim = 1 else: dim = 0 diff --git a/python/paddle/nn/utils/weight_norm_hook.py b/python/paddle/nn/utils/weight_norm_hook.py index 40c1021848c868..55e2d408ffaead 100755 --- a/python/paddle/nn/utils/weight_norm_hook.py +++ b/python/paddle/nn/utils/weight_norm_hook.py @@ -37,16 +37,15 @@ def l2_norm(x, axis, epsilon=1e-12, name=None): helper = LayerHelper("l2_normalize", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) norm = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="norm", - inputs={"X": x}, - outputs={ - "Out": out, - "Norm": norm - }, - attrs={ - "axis": 1 if axis is None else axis, - "epsilon": epsilon, - }) + helper.append_op( + type="norm", + inputs={"X": x}, + outputs={"Out": out, "Norm": norm}, + attrs={ + "axis": 1 if axis is None else axis, + "epsilon": epsilon, + }, + ) return paddle.squeeze(norm, axis=[axis]) @@ -93,14 +92,13 @@ def _weight_norm(v, g, dim): v_normalized = F.l2_normalize(p_matrix, axis=1) v_normalized = paddle.reshape(v_normalized, transposed_shape) v_normalized = paddle.transpose(v_normalized, perm) - weight = F.elementwise_mul(v_normalized, - g, - axis=dim if dim is not None else -1) + weight = F.elementwise_mul( + v_normalized, g, axis=dim if dim is not None else -1 + ) return weight class WeightNorm(object): - def __init__(self, name, dim): if dim is None: dim = -1 @@ -116,8 +114,10 @@ def compute_weight(self, layer): def apply(layer, name, dim): for k, hook in layer._forward_pre_hooks.items(): if isinstance(hook, WeightNorm) and hook.name == name: - raise RuntimeError("Cannot register two weight_norm hooks on " - 
"the same parameter {}".format(name)) + raise RuntimeError( + "Cannot register two weight_norm hooks on " + "the same parameter {}".format(name) + ) if dim is None: dim = -1 @@ -164,40 +164,38 @@ def __call__(self, layer, inputs): def weight_norm(layer, name='weight', dim=0): r""" - This weight_norm layer applies weight normalization to a parameter according to the + Applies weight normalization to a parameter according to the following formula: .. math:: \mathbf{w} = g \dfrac{v}{\|v\|} - Weight normalization is a reparameterization of the weight vectors in a neural network that - decouples the magnitude of those weight vectors from their direction. Weight normalization - replaces the parameter specified by `name`(eg: 'weight') with two parameters: one parameter - specifying the magnitude (eg: 'weight_g') and one parameter specifying the direction - (eg: 'weight_v'). Weight normalization has been implemented as discussed in this paper: + Weight normalization is a reparameterization of the weight vectors in a neural network that + decouples the magnitude of those weight vectors from their direction. Weight normalization + replaces the parameter specified by `name`(eg: 'weight') with two parameters: one parameter + specifying the magnitude (eg: 'weight_g') and one parameter specifying the direction + (eg: 'weight_v'). Weight normalization has been implemented as discussed in this paper: `Weight Normalization: A Simple Reparameterization to Accelerate Training of Deep Neural Networks `_. Parameters: layer(Layer): Layer of paddle, which has weight. name(str, optional): Name of the weight parameter. Default: 'weight'. - dim(int, optional): Dimension over which to compute the norm. Dim is a non-negative number - which is less than the rank of weight Tensor. For Example, dim can be chosen from 0, - 1, 2, 3 for convolution whose weight shape is [cout, cin, kh, kw] and rank is 4. + dim(int, optional): Dimension over which to compute the norm. Dim is a non-negative number + which is less than the rank of weight Tensor. For Example, dim can be chosen from 0, + 1, 2, 3 for convolution whose weight shape is [cout, cin, kh, kw] and rank is 4. If dim is set to None, meaning that all elements will be normalized. Default: 0. - + Returns: Origin layer with weight norm hook. Examples: .. code-block:: python - import numpy as np from paddle.nn import Conv2D from paddle.nn.utils import weight_norm - x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') conv = Conv2D(3, 5, 3) wn = weight_norm(conv) print(conv.weight_g.shape) @@ -218,11 +216,11 @@ def remove_weight_norm(layer, name='weight'): name(str, optional): Name of the weight parameter. Default: 'weight'. Returns: - Origin layer without weight norm + Layer, the origin layer without weight norm Examples: .. code-block:: python - + import paddle from paddle.nn import Conv2D from paddle.nn.utils import weight_norm, remove_weight_norm diff --git a/python/paddle/onnx/export.py b/python/paddle/onnx/export.py index 666cd7c08623af..ea7a9299f5a526 100644 --- a/python/paddle/onnx/export.py +++ b/python/paddle/onnx/export.py @@ -26,27 +26,26 @@ def export(layer, path, input_spec=None, opset_version=9, **configs): Args: layer (Layer): The Layer to be exported. path (str): The path prefix to export model. The format is ``dirname/file_prefix`` or ``file_prefix`` , - and the exported ONNX file suffix is ``.onnx`` . 
- input_spec (list[InputSpec|Tensor], optional): Describes the input of the exported model's forward - method, which can be described by InputSpec or example Tensor. If None, all input variables of + and the exported ONNX file suffix is ``.onnx`` . + input_spec (list[InputSpec|Tensor], optional): Describes the input of the exported model's forward + method, which can be described by InputSpec or example Tensor. If None, all input variables of the original Layer's forward method would be the inputs of the exported ``ONNX`` model. Default: None. opset_version(int, optional): Opset version of exported ONNX model. Now, stable supported opset version include 9, 10, 11. Default: 9. - **configs (dict, optional): Other export configuration options for compatibility. We do not - recommend using these configurations, they may be removed in the future. If not necessary, + **configs (dict, optional): Other export configuration options for compatibility. We do not + recommend using these configurations, they may be removed in the future. If not necessary, DO NOT use them. Default None. The following options are currently supported: (1) output_spec (list[Tensor]): Selects the output targets of the exported model. - By default, all return variables of original Layer's forward method are kept as the - output of the exported model. If the provided ``output_spec`` list is not all output variables, - the exported model will be pruned according to the given ``output_spec`` list. + By default, all return variables of original Layer's forward method are kept as the + output of the exported model. If the provided ``output_spec`` list is not all output variables, + the exported model will be pruned according to the given ``output_spec`` list. Returns: None Examples: .. code-block:: python import paddle - import numpy as np class LinearNet(paddle.nn.Layer): def __init__(self): @@ -77,8 +76,8 @@ def forward(self, x, y, z): # Export model with 'Tensor' to support pruned model by set 'output_spec'. def export_logic(): model = Logic() - x = paddle.to_tensor(np.array([1])) - y = paddle.to_tensor(np.array([2])) + x = paddle.to_tensor([1]) + y = paddle.to_tensor([2]) # Static and run model. paddle.jit.to_static(model) out = model(x, y, z=True) @@ -94,11 +93,14 @@ def export_logic(): raise ValueError( "The input path MUST be format of dirname/file_prefix " "[dirname\\file_prefix in Windows system], but " - "the file_prefix is empty in received path: {}".format(path)) + "the file_prefix is empty in received path: {}".format(path) + ) save_file = path + '.onnx' - p2o.dygraph2onnx(layer, - save_file, - input_spec=input_spec, - opset_version=opset_version, - **configs) + p2o.dygraph2onnx( + layer, + save_file, + input_spec=input_spec, + opset_version=opset_version, + **configs + ) diff --git a/python/paddle/optimizer/adadelta.py b/python/paddle/optimizer/adadelta.py index f3c15ce479da73..6d9c5bac75e87e 100644 --- a/python/paddle/optimizer/adadelta.py +++ b/python/paddle/optimizer/adadelta.py @@ -70,10 +70,9 @@ class Adadelta(Optimizer): .. 
code-block:: python import paddle - import numpy as np - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1) linear = paddle.nn.Linear(10, 10) - inp = paddle.to_tensor(inp) out = linear(inp) loss = paddle.mean(out) beta1 = paddle.to_tensor([0.9], dtype="float32") @@ -109,25 +108,29 @@ class Adadelta(Optimizer): _avg_squared_grad_acc_str = "_avg_squared_grad" _avg_squared_update_acc_str = "_avg_squared_update" - def __init__(self, - learning_rate=0.001, - epsilon=1.0e-6, - rho=0.95, - parameters=None, - weight_decay=None, - grad_clip=None, - name=None): + def __init__( + self, + learning_rate=0.001, + epsilon=1.0e-6, + rho=0.95, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None, + ): if learning_rate is None: raise ValueError("learning_rate is not set.") if epsilon is None: raise ValueError("epsilon is not set.") if rho is None: raise ValueError("rho is not set.") - super(Adadelta, self).__init__(learning_rate=learning_rate, - parameters=parameters, - weight_decay=weight_decay, - grad_clip=grad_clip, - name=name) + super(Adadelta, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name, + ) self.type = "adadelta" self._epsilon = epsilon self._rho = rho @@ -151,43 +154,44 @@ def _append_optimize_op(self, block, param_and_grad): param_and_grad = self._update_param_group(param_and_grad) avg_squared_grad_acc = self._get_accumulator( - self._avg_squared_grad_acc_str, param_and_grad[0]) + self._avg_squared_grad_acc_str, param_and_grad[0] + ) avg_squared_update_acc = self._get_accumulator( - self._avg_squared_update_acc_str, param_and_grad[0]) + self._avg_squared_update_acc_str, param_and_grad[0] + ) if in_dygraph_mode(): with no_grad(): - _C_ops.adadelta_(param_and_grad[0], param_and_grad[1], - avg_squared_grad_acc, avg_squared_update_acc, - self._rho, self._epsilon) + _C_ops.adadelta_( + param_and_grad[0], + param_and_grad[1], + avg_squared_grad_acc, + avg_squared_update_acc, + self._rho, + self._epsilon, + ) return None if not isinstance(block, framework.Block): raise TypeError("block is not instance of framework.Block.") # Create the adadelta optimizer op - adadelta_op = block.append_op(type=self.type, - inputs={ - "Param": param_and_grad[0], - "Grad": param_and_grad[1], - "AvgSquaredGrad": - avg_squared_grad_acc, - "AvgSquaredUpdate": - avg_squared_update_acc - }, - outputs={ - "ParamOut": - param_and_grad[0], - "AvgSquaredGradOut": - avg_squared_grad_acc, - "AvgSquaredUpdateOut": - avg_squared_update_acc - }, - attrs={ - "epsilon": self._epsilon, - "rho": self._rho - }, - stop_gradient=True) + adadelta_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "AvgSquaredGrad": avg_squared_grad_acc, + "AvgSquaredUpdate": avg_squared_update_acc, + }, + outputs={ + "ParamOut": param_and_grad[0], + "AvgSquaredGradOut": avg_squared_grad_acc, + "AvgSquaredUpdateOut": avg_squared_update_acc, + }, + attrs={"epsilon": self._epsilon, "rho": self._rho}, + stop_gradient=True, + ) return adadelta_op diff --git a/python/paddle/optimizer/adagrad.py b/python/paddle/optimizer/adagrad.py index f5cd7bdaa83e09..634e73ccf3d6d2 100644 --- a/python/paddle/optimizer/adagrad.py +++ b/python/paddle/optimizer/adagrad.py @@ -72,7 +72,6 @@ class Adagrad(Optimizer): .. 
code-block:: python import paddle - import numpy as np inp = paddle.rand(shape=[10, 10]) linear = paddle.nn.Linear(10, 10) @@ -108,21 +107,25 @@ class Adagrad(Optimizer): """ _moment_acc_str = "moment" - def __init__(self, - learning_rate, - epsilon=1.0e-6, - parameters=None, - weight_decay=None, - grad_clip=None, - name=None, - initial_accumulator_value=0.0): + def __init__( + self, + learning_rate, + epsilon=1.0e-6, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None, + initial_accumulator_value=0.0, + ): assert learning_rate is not None assert epsilon is not None - super(Adagrad, self).__init__(learning_rate=learning_rate, - parameters=parameters, - weight_decay=weight_decay, - grad_clip=grad_clip, - name=name) + super(Adagrad, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name, + ) self.type = "adagrad" self._epsilon = epsilon self.initial_accumulator_value = initial_accumulator_value @@ -138,9 +141,11 @@ def _create_accumulators(self, block, parameters): parameters = self._update_param_group(parameters) for p in parameters: - self._add_accumulator(self._moment_acc_str, - p, - fill_value=self.initial_accumulator_value) + self._add_accumulator( + self._moment_acc_str, + p, + fill_value=self.initial_accumulator_value, + ) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -148,26 +153,22 @@ def _append_optimize_op(self, block, param_and_grad): if isinstance(param_and_grad, dict): param_and_grad = self._update_param_group(param_and_grad) - moment_acc = self._get_accumulator(self._moment_acc_str, - param_and_grad[0]) + moment_acc = self._get_accumulator( + self._moment_acc_str, param_and_grad[0] + ) # Create the adagrad optimizer op - adagrad_op = block.append_op(type=self.type, - inputs={ - "Param": - param_and_grad[0], - "Grad": - param_and_grad[1], - "Moment": - moment_acc, - "LearningRate": - self._create_param_lr(param_and_grad) - }, - outputs={ - "ParamOut": param_and_grad[0], - "MomentOut": moment_acc - }, - attrs={"epsilon": self._epsilon}, - stop_gradient=True) + adagrad_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Moment": moment_acc, + "LearningRate": self._create_param_lr(param_and_grad), + }, + outputs={"ParamOut": param_and_grad[0], "MomentOut": moment_acc}, + attrs={"epsilon": self._epsilon}, + stop_gradient=True, + ) return adagrad_op @@ -175,6 +176,7 @@ def _update_param_group(self, parameters): self._epsilon = parameters.get('epsilon', self._default_dict['epsilon']) self.initial_accumulator_value = parameters.get( 'initial_accumulator_value', - self._default_dict['initial_accumulator_value']) + self._default_dict['initial_accumulator_value'], + ) parameters = parameters.get('params') return parameters diff --git a/python/paddle/optimizer/adam.py b/python/paddle/optimizer/adam.py index 1140516cdc5302..ba3bd964bf14cb 100644 --- a/python/paddle/optimizer/adam.py +++ b/python/paddle/optimizer/adam.py @@ -163,18 +163,20 @@ class Adam(Optimizer): _beta1_pow_acc_str = "beta1_pow_acc" _beta2_pow_acc_str = "beta2_pow_acc" - def __init__(self, - learning_rate=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8, - parameters=None, - weight_decay=None, - grad_clip=None, - lazy_mode=False, - multi_precision=False, - use_multi_tensor=False, - name=None): + def __init__( + self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameters=None, + 
weight_decay=None, + grad_clip=None, + lazy_mode=False, + multi_precision=False, + use_multi_tensor=False, + name=None, + ): assert learning_rate is not None assert beta1 is not None assert beta2 is not None @@ -182,20 +184,25 @@ def __init__(self, if not isinstance(beta1, Variable): if not 0 <= beta1 < 1: raise ValueError( - "Invaild value of beta1, expect beta1 in [0,1).") + "Invaild value of beta1, expect beta1 in [0,1)." + ) if not isinstance(beta2, Variable): if not 0 <= beta2 < 1: raise ValueError( - "Invaild value of beta2, expect beta2 in [0,1).") + "Invaild value of beta2, expect beta2 in [0,1)." + ) if not isinstance(epsilon, Variable): if not 0 <= epsilon: raise ValueError( - "Invaild value of epsilon, expect epsilon >= 0.") - super(Adam, self).__init__(learning_rate=learning_rate, - parameters=parameters, - weight_decay=weight_decay, - grad_clip=grad_clip, - name=name) + "Invaild value of epsilon, expect epsilon >= 0." + ) + super(Adam, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name, + ) self.type = "adam" self._beta1 = beta1 self._beta2 = beta2 @@ -212,21 +219,13 @@ def __init__(self, self._use_multi_tensor = use_multi_tensor if self._use_multi_tensor: - self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} - self._moment1_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} - self._moment2_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} - self._beta1_pow_acc_dict = { - 'FP32_LODTensor': [], - 'FP16_LODTensor': [] - } - self._beta2_pow_acc_dict = { - 'FP32_LODTensor': [], - 'FP16_LODTensor': [] - } - self._master_weight_dict = { - 'FP32_LODTensor': None, - 'FP16_LODTensor': [] - } + self._param_dict = self._create_multi_tensor_dict() + self._moment1_dict = self._create_multi_tensor_dict() + self._moment2_dict = self._create_multi_tensor_dict() + self._beta1_pow_acc_dict = self._create_multi_tensor_dict() + self._beta2_pow_acc_dict = self._create_multi_tensor_dict() + self._master_weight_dict = self._create_multi_tensor_dict() + self._master_weight_dict['FP32_LODTensor'] = None def _create_master_weight(self, param): if param.name in self._master_weights: @@ -236,19 +235,23 @@ def _create_master_weight(self, param): var_name = param.name + "_fp32_master" var_name = unique_name.generate(var_name) - var = layers.create_global_var(name=var_name, - shape=param.shape, - value=0, - dtype='float32', - persistable=True) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True, + ) block = self.helper.startup_program.global_block() - block.append_op(type="cast", - inputs={"X": [param]}, - outputs={"Out": [var]}, - attrs={ - "in_dtype": param.dtype, - "out_dtype": core.VarDesc.VarType.FP32 - }) + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) self._master_weights[param.name] = var return var @@ -262,20 +265,30 @@ def _get_accumulator(self, name, param): """ if self._name is not None: name = self._name + "_" + name - find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 - target_param = self._master_weights[ - param.name] if find_master else param + find_master = ( + self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + ) + target_param = ( + self._master_weights[param.name] if find_master else param + ) target_name = target_param.name - if (name not in 
self._accumulators - or target_name not in self._accumulators[name]): + if ( + name not in self._accumulators + or target_name not in self._accumulators[name] + ): raise Exception( "Accumulator {} does not exist for parameter {}".format( - name, target_name)) + name, target_name + ) + ) return self._accumulators[name][target_name] def _add_moments_pows(self, p): acc_dtype = p.dtype - if acc_dtype == core.VarDesc.VarType.FP16: + if ( + acc_dtype == core.VarDesc.VarType.FP16 + or acc_dtype == core.VarDesc.VarType.BF16 + ): acc_dtype = core.VarDesc.VarType.FP32 self._add_accumulator(self._moment1_acc_str, p, dtype=acc_dtype) self._add_accumulator(self._moment2_acc_str, p, dtype=acc_dtype) @@ -283,18 +296,24 @@ def _add_moments_pows(self, p): name=self._beta1_pow_acc_str, param=p, dtype=acc_dtype, - fill_value=0.9 if isinstance(self._beta1, Variable) \ - else self._beta1, + fill_value=0.9 + if isinstance(self._beta1, Variable) + else self._beta1, shape=[1], - type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + type=core.VarDesc.VarType.LOD_TENSOR, + device='cpu', + ) self._add_accumulator( name=self._beta2_pow_acc_str, param=p, dtype=acc_dtype, - fill_value=0.999 if isinstance(self._beta2, Variable) \ - else self._beta2, + fill_value=0.999 + if isinstance(self._beta2, Variable) + else self._beta2, shape=[1], - type=core.VarDesc.VarType.LOD_TENSOR, device='cpu') + type=core.VarDesc.VarType.LOD_TENSOR, + device='cpu', + ) def _create_accumulators(self, block, parameters): assert isinstance(block, framework.Block) @@ -307,7 +326,10 @@ def _create_accumulators(self, block, parameters): master_p = self._create_master_weight(p) self._add_moments_pows(master_p) continue - if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: + if ( + p.dtype == core.VarDesc.VarType.FP16 + and not self._multi_precision + ): warnings.warn( "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." "Consider using multi_precision=True option of the Adam optimizer." 
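The warning above only applies when float16 parameters are trained with multi_precision left at False; with multi_precision=True the optimizer keeps an FP32 master copy of each FP16 parameter. A hedged sketch of the intended construction (the layer and hyperparameters are placeholders, not taken from this patch):

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    # meaningful when the parameters are float16 (e.g. AMP 'O2'-style training);
    # FP32 master weights are created and updated instead of the raw FP16 params
    opt = paddle.optimizer.Adam(
        learning_rate=0.001,
        parameters=linear.parameters(),
        multi_precision=True,
    )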
@@ -319,50 +341,105 @@ def _append_optimize_op(self, block, param_and_grad): if isinstance(param_and_grad, dict): param_and_grad = self._update_param_group(param_and_grad) - moment1 = self._get_accumulator(self._moment1_acc_str, - param_and_grad[0]) - moment2 = self._get_accumulator(self._moment2_acc_str, - param_and_grad[0]) - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param_and_grad[0]) - beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - param_and_grad[0]) - find_master = self._multi_precision and param_and_grad[ - 0].dtype == core.VarDesc.VarType.FP16 - master_weight = (self._master_weights[param_and_grad[0].name] - if find_master else None) + moment1 = self._get_accumulator( + self._moment1_acc_str, param_and_grad[0] + ) + moment2 = self._get_accumulator( + self._moment2_acc_str, param_and_grad[0] + ) + beta1_pow_acc = self._get_accumulator( + self._beta1_pow_acc_str, param_and_grad[0] + ) + beta2_pow_acc = self._get_accumulator( + self._beta2_pow_acc_str, param_and_grad[0] + ) + find_master = ( + self._multi_precision + and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + ) + master_weight = ( + self._master_weights[param_and_grad[0].name] + if find_master + else None + ) lr = self._create_param_lr(param_and_grad) # create the adam optimize op if framework.in_dygraph_mode(): found_inf = self._get_auxiliary_var('found_inf') - _beta1 = self._beta1 if not isinstance( - self._beta1, Variable) else self._beta1.numpy().item(0) - _beta2 = self._beta2 if not isinstance( - self._beta2, Variable) else self._beta2.numpy().item(0) + _beta1 = ( + self._beta1 + if not isinstance(self._beta1, Variable) + else self._beta1.numpy().item(0) + ) + _beta2 = ( + self._beta2 + if not isinstance(self._beta2, Variable) + else self._beta2.numpy().item(0) + ) _, _, _, _, _, _ = _C_ops.adam_( - param_and_grad[0], param_and_grad[1], lr, moment1, moment2, - beta1_pow_acc, beta2_pow_acc, master_weight, found_inf, _beta1, - _beta2, self._epsilon, self._lazy_mode, 1000, find_master, - False) + param_and_grad[0], + param_and_grad[1], + lr, + moment1, + moment2, + beta1_pow_acc, + beta2_pow_acc, + master_weight, + found_inf, + _beta1, + _beta2, + self._epsilon, + self._lazy_mode, + 1000, + find_master, + False, + ) return None if framework._in_legacy_dygraph(): - _beta1 = self._beta1 if not isinstance( - self._beta1, Variable) else self._beta1.numpy().item(0) - _beta2 = self._beta2 if not isinstance( - self._beta2, Variable) else self._beta2.numpy().item(0) + _beta1 = ( + self._beta1 + if not isinstance(self._beta1, Variable) + else self._beta1.numpy().item(0) + ) + _beta2 = ( + self._beta2 + if not isinstance(self._beta2, Variable) + else self._beta2.numpy().item(0) + ) _, _, _, _, _, _ = _legacy_C_ops.adam( - param_and_grad[0], param_and_grad[1], lr, moment1, moment2, - beta1_pow_acc, beta2_pow_acc, master_weight, param_and_grad[0], - moment1, moment2, beta1_pow_acc, beta2_pow_acc, master_weight, - 'epsilon', self._epsilon, 'lazy_mode', self._lazy_mode, - 'min_row_size_to_use_multithread', 1000, 'beta1', _beta1, - 'beta2', _beta2, 'multi_precision', find_master) + param_and_grad[0], + param_and_grad[1], + lr, + moment1, + moment2, + beta1_pow_acc, + beta2_pow_acc, + master_weight, + param_and_grad[0], + moment1, + moment2, + beta1_pow_acc, + beta2_pow_acc, + master_weight, + 'epsilon', + self._epsilon, + 'lazy_mode', + self._lazy_mode, + 'min_row_size_to_use_multithread', + 1000, + 'beta1', + _beta1, + 'beta2', + _beta2, + 'multi_precision', + find_master, + ) return None @@ 
-373,7 +450,7 @@ def _append_optimize_op(self, block, param_and_grad): "Moment1": [moment1], "Moment2": [moment2], "Beta1Pow": [beta1_pow_acc], - "Beta2Pow": [beta2_pow_acc] + "Beta2Pow": [beta2_pow_acc], } outputs = { "ParamOut": [param_and_grad[0]], @@ -385,7 +462,7 @@ def _append_optimize_op(self, block, param_and_grad): attrs = { "lazy_mode": self._lazy_mode, "min_row_size_to_use_multithread": 1000, - "multi_precision": find_master + "multi_precision": find_master, } if isinstance(self._beta1, Variable): @@ -405,11 +482,13 @@ def _append_optimize_op(self, block, param_and_grad): inputs["MasterParam"] = master_weight outputs["MasterParamOut"] = master_weight - adam_op = block.append_op(type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True) + adam_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, + ) return adam_op @@ -426,7 +505,7 @@ def step(self): .. code-block:: python import paddle - + a = paddle.rand([2,13], dtype="float32") linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. @@ -445,27 +524,34 @@ def step(self): if param._grad_ivar() is not None: grad_var = param._grad_ivar() if in_dygraph_mode(): - if hasattr(grad_var, "is_selected_rows" - ) and grad_var.is_selected_rows( - ) and self.regularization is not None: + if ( + hasattr(grad_var, "is_selected_rows") + and grad_var.is_selected_rows() + and self.regularization is not None + ): raise RuntimeError( "Adam don't support weight_decay with sparse parameters, please set it to None." ) else: - if hasattr( - grad_var, "_is_sparse") and grad_var._is_sparse( - ) and self.regularization is not None: + if ( + hasattr(grad_var, "_is_sparse") + and grad_var._is_sparse() + and self.regularization is not None + ): raise RuntimeError( "Adam don't support weight_decay with sparse parameters, please set it to None." ) params_grads.append((param, grad_var)) - optimize_ops = self._apply_optimize(loss=None, - startup_program=None, - params_grads=params_grads) + optimize_ops = self._apply_optimize( + loss=None, + startup_program=None, + params_grads=params_grads, + param_group_idx=0, + ) else: # optimize parameters in groups - for param_group in self._param_groups: + for idx, param_group in enumerate(self._param_groups): params_grads = defaultdict(lambda: list()) for param in param_group['params']: if param.stop_gradient: @@ -474,13 +560,16 @@ def step(self): grad_var = param._grad_ivar() params_grads['params'].append((param, grad_var)) params_grads.update( - {k: v - for k, v in param_group.items() if k != 'params'}) - self._apply_optimize(loss=None, - startup_program=None, - params_grads=params_grads) + {k: v for k, v in param_group.items() if k != 'params'} + ) + self._apply_optimize( + loss=None, + startup_program=None, + params_grads=params_grads, + param_group_idx=idx, + ) - def _multi_tensor_init(self, target_block, parameters): + def _multi_tensor_init(self, target_block, parameters, param_group_idx): """ All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32). This function will be overridden in the corresponding optimizer file. 
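With use_multi_tensor=True the accumulators described above are grouped per dtype ('FP32_LODTensor' / 'FP16_LODTensor') and, after this patch, also per parameter group via param_group_idx, so each group can be updated by a single merged_adam op. A minimal sketch of the public switch that exercises this path, assuming an arbitrary small layer and dygraph mode:

.. code-block:: python

    import paddle

    linear = paddle.nn.Linear(10, 10)
    opt = paddle.optimizer.Adam(
        learning_rate=0.001,
        parameters=linear.parameters(),
        use_multi_tensor=True,  # enable the fused multi-tensor update path
    )

    out = linear(paddle.rand([4, 10]))
    loss = paddle.mean(out)
    loss.backward()
    opt.step()  # parameters in each dtype bucket are updated together
    opt.clear_grad()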
@@ -492,26 +581,49 @@ def _multi_tensor_init(self, target_block, parameters): for param in parameters: moment1 = self._get_accumulator(self._moment1_acc_str, param) moment2 = self._get_accumulator(self._moment2_acc_str, param) - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param) - beta2_pow_acc = self._get_accumulator(self._beta2_pow_acc_str, - param) + beta1_pow_acc = self._get_accumulator( + self._beta1_pow_acc_str, param + ) + beta2_pow_acc = self._get_accumulator( + self._beta2_pow_acc_str, param + ) if param.dtype == paddle.float32: - self._param_dict['FP32_LODTensor'].append(param) - self._moment1_dict['FP32_LODTensor'].append(moment1) - self._moment2_dict['FP32_LODTensor'].append(moment2) - self._beta1_pow_acc_dict['FP32_LODTensor'].append(beta1_pow_acc) - self._beta2_pow_acc_dict['FP32_LODTensor'].append(beta2_pow_acc) + self._param_dict['FP32_LODTensor'][param_group_idx].append( + param + ) + self._moment1_dict['FP32_LODTensor'][param_group_idx].append( + moment1 + ) + self._moment2_dict['FP32_LODTensor'][param_group_idx].append( + moment2 + ) + self._beta1_pow_acc_dict['FP32_LODTensor'][ + param_group_idx + ].append(beta1_pow_acc) + self._beta2_pow_acc_dict['FP32_LODTensor'][ + param_group_idx + ].append(beta2_pow_acc) elif param.dtype == paddle.float16: - self._param_dict['FP16_LODTensor'].append(param) - self._moment1_dict['FP16_LODTensor'].append(moment1) - self._moment2_dict['FP16_LODTensor'].append(moment2) - self._beta1_pow_acc_dict['FP16_LODTensor'].append(beta1_pow_acc) - self._beta2_pow_acc_dict['FP16_LODTensor'].append(beta2_pow_acc) + self._param_dict['FP16_LODTensor'][param_group_idx].append( + param + ) + self._moment1_dict['FP16_LODTensor'][param_group_idx].append( + moment1 + ) + self._moment2_dict['FP16_LODTensor'][param_group_idx].append( + moment2 + ) + self._beta1_pow_acc_dict['FP16_LODTensor'][ + param_group_idx + ].append(beta1_pow_acc) + self._beta2_pow_acc_dict['FP16_LODTensor'][ + param_group_idx + ].append(beta2_pow_acc) if self._multi_precision: - self._master_weight_dict['FP16_LODTensor'].append( - self._master_weights[param.name]) + self._master_weight_dict['FP16_LODTensor'][ + param_group_idx + ].append(self._master_weights[param.name]) else: self._master_weight_dict['FP16_LODTensor'] = None else: @@ -519,9 +631,13 @@ def _multi_tensor_init(self, target_block, parameters): "Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR." ) - def _append_optimize_multi_tensor_op(self, target_block, - parameters_and_grads): - """ + def _append_optimize_multi_tensor_op( + self, + target_block, + parameters_and_grads, + param_group_idx, + ): + """ For Multi Tensor, append optimize merged_operator to block. 
""" assert isinstance(target_block, framework.Block) @@ -534,15 +650,19 @@ def _append_optimize_multi_tensor_op(self, target_block, if param_and_grad[1] is None: continue if param_and_grad[0].stop_gradient is False: - if param_and_grad[ - 0].dtype == paddle.float32 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + if ( + param_and_grad[0].dtype == paddle.float32 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP32_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP32_LODTensor'].append(lr) - elif param_and_grad[ - 0].dtype == paddle.float16 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + elif ( + param_and_grad[0].dtype == paddle.float16 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP16_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP16_LODTensor'].append(lr) @@ -553,97 +673,149 @@ def _append_optimize_multi_tensor_op(self, target_block, if param_and_grad[0].stop_gradient is False: param_grad_dict = dict() param_grad_dict['params'] = param_and_grad - param_grad_dict.update({ - k: v - for k, v in parameters_and_grads.items() - if k != 'params' - }) + param_grad_dict.update( + { + k: v + for k, v in parameters_and_grads.items() + if k != 'params' + } + ) param_and_grad = self._update_param_group(param_grad_dict) - if param_and_grad[ - 0].dtype == paddle.float32 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + if ( + param_and_grad[0].dtype == paddle.float32 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP32_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP32_LODTensor'].append(lr) - elif param_and_grad[ - 0].dtype == paddle.float16 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + elif ( + param_and_grad[0].dtype == paddle.float16 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP16_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP16_LODTensor'].append(lr) multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor'] for key in multi_tensor_list: - if len(self._param_dict[key]) > 0: + if len(self._param_dict[key][param_group_idx]) > 0: find_master = self._multi_precision and key == 'FP16_LODTensor' - _beta1 = self._beta1 if not isinstance( - self._beta1, Variable) else self._beta1.numpy().item(0) - _beta2 = self._beta2 if not isinstance( - self._beta2, Variable) else self._beta2.numpy().item(0) + _beta1 = ( + self._beta1 + if not isinstance(self._beta1, Variable) + else self._beta1.numpy().item(0) + ) + _beta2 = ( + self._beta2 + if not isinstance(self._beta2, Variable) + else self._beta2.numpy().item(0) + ) if framework._non_static_mode(): + master_weight = self._master_weight_dict[key] + master_weight = ( + master_weight[param_group_idx] + if master_weight is not None + else None + ) if in_dygraph_mode(): + _, _, _, _, _, _ = _C_ops.merged_adam_( - self._param_dict[key], grad_dict[key], lr_dict[key], - self._moment1_dict[key], self._moment2_dict[key], - self._beta1_pow_acc_dict[key], - self._beta2_pow_acc_dict[key], - self._master_weight_dict[key], _beta1, _beta2, - self._epsilon, find_master, False) + self._param_dict[key][param_group_idx], + grad_dict[key], + lr_dict[key], + self._moment1_dict[key][param_group_idx], + self._moment2_dict[key][param_group_idx], + self._beta1_pow_acc_dict[key][param_group_idx], + 
self._beta2_pow_acc_dict[key][param_group_idx], + master_weight, + _beta1, + _beta2, + self._epsilon, + find_master, + False, + ) else: _, _, _, _, _, _ = _legacy_C_ops.merged_adam( - self._param_dict[key], grad_dict[key], lr_dict[key], - self._moment1_dict[key], self._moment2_dict[key], - self._beta1_pow_acc_dict[key], - self._beta2_pow_acc_dict[key], - self._master_weight_dict[key], - self._param_dict[key], self._moment1_dict[key], - self._moment2_dict[key], - self._beta1_pow_acc_dict[key], - self._beta2_pow_acc_dict[key], - self._master_weight_dict[key], 'epsilon', - self._epsilon, 'beta1', _beta1, 'beta2', _beta2, - 'multi_precision', find_master) + self._param_dict[key][param_group_idx], + grad_dict[key], + lr_dict[key], + self._moment1_dict[key][param_group_idx], + self._moment2_dict[key][param_group_idx], + self._beta1_pow_acc_dict[key][param_group_idx], + self._beta2_pow_acc_dict[key][param_group_idx], + master_weight, + self._param_dict[key][param_group_idx], + self._moment1_dict[key][param_group_idx], + self._moment2_dict[key][param_group_idx], + self._beta1_pow_acc_dict[key][param_group_idx], + self._beta2_pow_acc_dict[key][param_group_idx], + master_weight, + 'epsilon', + self._epsilon, + 'beta1', + _beta1, + 'beta2', + _beta2, + 'multi_precision', + find_master, + ) else: inputs = { - "Param": self._param_dict[key], + "Param": self._param_dict[key][param_group_idx], "Grad": grad_dict[key], "LearningRate": lr_dict[key], - "Moment1": self._moment1_dict[key], - "Moment2": self._moment2_dict[key], - "Beta1Pow": self._beta1_pow_acc_dict[key], - "Beta2Pow": self._beta2_pow_acc_dict[key] + "Moment1": self._moment1_dict[key][param_group_idx], + "Moment2": self._moment2_dict[key][param_group_idx], + "Beta1Pow": self._beta1_pow_acc_dict[key][ + param_group_idx + ], + "Beta2Pow": self._beta2_pow_acc_dict[key][ + param_group_idx + ], } outputs = { - "ParamOut": self._param_dict[key], - "Moment1Out": self._moment1_dict[key], - "Moment2Out": self._moment2_dict[key], - "Beta1PowOut": self._beta1_pow_acc_dict[key], - "Beta2PowOut": self._beta2_pow_acc_dict[key] + "ParamOut": self._param_dict[key][param_group_idx], + "Moment1Out": self._moment1_dict[key][param_group_idx], + "Moment2Out": self._moment2_dict[key][param_group_idx], + "Beta1PowOut": self._beta1_pow_acc_dict[key][ + param_group_idx + ], + "Beta2PowOut": self._beta2_pow_acc_dict[key][ + param_group_idx + ], } attrs = { "epsilon": self._epsilon, "beta1": _beta1, - "beta2": _beta2 + "beta2": _beta2, } if find_master: - inputs["MasterParam"] = self._master_weight_dict[key] + inputs["MasterParam"] = self._master_weight_dict[key][ + param_group_idx + ] outputs["MasterParamOut"] = self._master_weight_dict[ - key] + key + ][param_group_idx] attrs["multi_precision"] = find_master - target_block.append_op(type="merged_adam", - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True) + target_block.append_op( + type="merged_adam", + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, + ) return None def _update_param_group(self, parameters): self._beta1 = parameters.get('beta1', self._default_dict['beta1']) self._beta2 = parameters.get('beta2', self._default_dict['beta2']) self._epsilon = parameters.get('epsilon', self._default_dict['epsilon']) - self._lazy_mode = parameters.get('lazy_mode', - self._default_dict['lazy_mode']) + self._lazy_mode = parameters.get( + 'lazy_mode', self._default_dict['lazy_mode'] + ) parameters = parameters.get('params') return parameters diff --git 
a/python/paddle/optimizer/adamax.py b/python/paddle/optimizer/adamax.py index cb07fdb7f56e9d..e3959fa67d7eca 100644 --- a/python/paddle/optimizer/adamax.py +++ b/python/paddle/optimizer/adamax.py @@ -85,9 +85,8 @@ class Adamax(Optimizer): .. code-block:: python import paddle - import numpy as np - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1) linear = paddle.nn.Linear(10, 10) inp = paddle.to_tensor(inp) out = linear(inp) @@ -133,15 +132,17 @@ class Adamax(Optimizer): _inf_norm_acc_str = "inf_norm" _beta1_pow_acc_str = "beta1_pow_acc" - def __init__(self, - learning_rate=0.001, - beta1=0.9, - beta2=0.999, - epsilon=1e-8, - parameters=None, - weight_decay=None, - grad_clip=None, - name=None): + def __init__( + self, + learning_rate=0.001, + beta1=0.9, + beta2=0.999, + epsilon=1e-8, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None, + ): assert learning_rate is not None assert beta1 is not None assert beta2 is not None @@ -152,11 +153,13 @@ def __init__(self, raise ValueError("Invaild value of beta2, expect beta2 in [0,1).") if not 0 <= epsilon: raise ValueError("Invaild value of epsilon, expect epsilon >= 0.") - super(Adamax, self).__init__(learning_rate=learning_rate, - parameters=parameters, - weight_decay=weight_decay, - grad_clip=grad_clip, - name=name) + super(Adamax, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name, + ) self.type = "adamax" self._beta1 = beta1 self._beta2 = beta2 @@ -164,7 +167,7 @@ def __init__(self, self._default_dict = { 'beta1': beta1, 'beta2': beta2, - 'epsilon': epsilon + 'epsilon': epsilon, } def _create_accumulators(self, block, parameters): @@ -175,10 +178,12 @@ def _create_accumulators(self, block, parameters): for p in parameters: self._add_accumulator(self._moment_acc_str, p) self._add_accumulator(self._inf_norm_acc_str, p) - self._add_accumulator(name=self._beta1_pow_acc_str, - param=p, - fill_value=self._beta1, - shape=[1]) + self._add_accumulator( + name=self._beta1_pow_acc_str, + param=p, + fill_value=self._beta1, + shape=[1], + ) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) @@ -186,22 +191,43 @@ def _append_optimize_op(self, block, param_and_grad): param_and_grad = self._update_param_group(param_and_grad) moment = self._get_accumulator(self._moment_acc_str, param_and_grad[0]) - inf_norm = self._get_accumulator(self._inf_norm_acc_str, - param_and_grad[0]) - beta1_pow_acc = self._get_accumulator(self._beta1_pow_acc_str, - param_and_grad[0]) + inf_norm = self._get_accumulator( + self._inf_norm_acc_str, param_and_grad[0] + ) + beta1_pow_acc = self._get_accumulator( + self._beta1_pow_acc_str, param_and_grad[0] + ) if framework.in_dygraph_mode(): - _C_ops.adamax_(param_and_grad[0], param_and_grad[1], - self._create_param_lr(param_and_grad), moment, - inf_norm, beta1_pow_acc, self._beta1, self._beta2, - self._epsilon) + _C_ops.adamax_( + param_and_grad[0], + param_and_grad[1], + self._create_param_lr(param_and_grad), + moment, + inf_norm, + beta1_pow_acc, + self._beta1, + self._beta2, + self._epsilon, + ) elif framework._in_legacy_dygraph(): - _legacy_C_ops.adamax(param_and_grad[0], param_and_grad[1], - self._create_param_lr(param_and_grad), moment, - inf_norm, beta1_pow_acc, param_and_grad[0], - moment, inf_norm, "beta1", self._beta1, - "beta2", self._beta2, "epsilon", self._epsilon) + _legacy_C_ops.adamax( + 
param_and_grad[0], + param_and_grad[1], + self._create_param_lr(param_and_grad), + moment, + inf_norm, + beta1_pow_acc, + param_and_grad[0], + moment, + inf_norm, + "beta1", + self._beta1, + "beta2", + self._beta2, + "epsilon", + self._epsilon, + ) else: # create the adamax optimize op adamax_op = block.append_op( @@ -212,25 +238,25 @@ def _append_optimize_op(self, block, param_and_grad): "LearningRate": self._create_param_lr(param_and_grad), "Moment": moment, "InfNorm": inf_norm, - "Beta1Pow": beta1_pow_acc + "Beta1Pow": beta1_pow_acc, }, outputs={ "ParamOut": param_and_grad[0], "MomentOut": moment, - "InfNormOut": inf_norm + "InfNormOut": inf_norm, }, attrs={ "beta1": self._beta1, "beta2": self._beta2, - "epsilon": self._epsilon + "epsilon": self._epsilon, }, - stop_gradient=True) + stop_gradient=True, + ) return adamax_op def _finish_update(self, block, parameters_and_grads): - """Update Beta1 Power accumulator - """ + """Update Beta1 Power accumulator""" assert isinstance(block, framework.Block) if isinstance(parameters_and_grads, list): for param, grad in parameters_and_grads: @@ -238,47 +264,61 @@ def _finish_update(self, block, parameters_and_grads): continue if framework.in_dygraph_mode(): beta1_pow_acc = self._get_accumulator( - self._beta1_pow_acc_str, param) + self._beta1_pow_acc_str, param + ) with no_grad(): - tmp = _C_ops.scale(beta1_pow_acc, self._beta1, 0.0, - True) + tmp = _C_ops.scale( + beta1_pow_acc, self._beta1, 0.0, True + ) beta1_pow_acc.copy_(tmp, False) continue with param.block.program._optimized_guard( - [param, grad]), name_scope('adamax'): + [param, grad] + ), name_scope('adamax'): beta1_pow_acc = self._get_accumulator( - self._beta1_pow_acc_str, param) - block.append_op(type="scale", - inputs={"X": beta1_pow_acc}, - outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}, - stop_gradient=True) + self._beta1_pow_acc_str, param + ) + block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}, + stop_gradient=True, + ) else: for param, grad in parameters_and_grads['params']: if grad is None or param.stop_gradient is True: continue if framework.in_dygraph_mode(): beta1_pow_acc = self._get_accumulator( - self._beta1_pow_acc_str, param) + self._beta1_pow_acc_str, param + ) self._beta1 = parameters_and_grads.get( - 'beta1', self._default_dict['beta1']) + 'beta1', self._default_dict['beta1'] + ) with no_grad(): - tmp = _C_ops.scale(beta1_pow_acc, self._beta1, 0.0, - True) + tmp = _C_ops.scale( + beta1_pow_acc, self._beta1, 0.0, True + ) beta1_pow_acc.copy_(tmp, False) continue with param.block.program._optimized_guard( - [param, grad]), name_scope('adamax'): + [param, grad] + ), name_scope('adamax'): beta1_pow_acc = self._get_accumulator( - self._beta1_pow_acc_str, param) + self._beta1_pow_acc_str, param + ) self._beta1 = parameters_and_grads.get( - 'beta1', self._default_dict['beta1']) - block.append_op(type="scale", - inputs={"X": beta1_pow_acc}, - outputs={"Out": beta1_pow_acc}, - attrs={"scale": self._beta1}, - stop_gradient=True) + 'beta1', self._default_dict['beta1'] + ) + block.append_op( + type="scale", + inputs={"X": beta1_pow_acc}, + outputs={"Out": beta1_pow_acc}, + attrs={"scale": self._beta1}, + stop_gradient=True, + ) def _update_param_group(self, parameters): self._beta1 = parameters.get('beta1', self._default_dict['beta1']) diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 4d7d128e05e498..77a332cbdd9971 100644 --- a/python/paddle/optimizer/lr.py 
+++ b/python/paddle/optimizer/lr.py @@ -59,7 +59,7 @@ class LRScheduler(object): instance to schedule learning rate. Examples: - Here is an example of a simple ``StepDecay`` implementation. + Here is an example of a simple ``StepDecay`` implementation. .. code-block:: python @@ -93,8 +93,10 @@ def get_lr(self): def __init__(self, learning_rate=0.1, last_epoch=-1, verbose=False): if not isinstance(learning_rate, (float, int)): raise TypeError( - "The type of learning rate must be float, but received {}". - format(type(learning_rate))) + "The type of learning rate must be float, but received {}".format( + type(learning_rate) + ) + ) self.base_lr = float(learning_rate) self.last_lr = float(learning_rate) self.last_epoch = last_epoch @@ -133,8 +135,11 @@ def step(self, epoch=None): self.last_lr = self.get_lr() if self.verbose: - print('Epoch {}: {} set learning rate to {}.'.format( - self.last_epoch, self.__class__.__name__, self.last_lr)) + print( + 'Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, self.__class__.__name__, self.last_lr + ) + ) def state_dict(self): """ @@ -153,7 +158,8 @@ def state_dict(self): assert value.shape == [ 1 ], "shape of Tensor in state_dict must be [1] {}".format( - value.shape) + value.shape + ) value = value.numpy()[0] state_dict[key] = value @@ -184,8 +190,10 @@ def set_state_dict(self, state_dict): self.__dict__[key] = state_dict[key] else: raise RuntimeError( - "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict" - .format(key)) + "Please check whether state_dict is correct for optimizer. Can't find [ {} ] in state_dict".format( + key + ) + ) if len(state_dict) > len(self.keys): warnings.warn( "There are some unused values in state_dict. Maybe the optimizer have different 'LearningRateDecay' when invoking state_dict and set_dict" @@ -279,12 +287,14 @@ class NoamDecay(LRScheduler): """ - def __init__(self, - d_model, - warmup_steps, - learning_rate=1.0, - last_epoch=-1, - verbose=False): + def __init__( + self, + d_model, + warmup_steps, + learning_rate=1.0, + last_epoch=-1, + verbose=False, + ): self.d_model = d_model self.warmup_steps = warmup_steps super(NoamDecay, self).__init__(learning_rate, last_epoch, verbose) @@ -379,8 +389,9 @@ class PiecewiseDecay(LRScheduler): def __init__(self, boundaries, values, last_epoch=-1, verbose=False): self.boundaries = boundaries self.values = values - super(PiecewiseDecay, self).__init__(last_epoch=last_epoch, - verbose=verbose) + super(PiecewiseDecay, self).__init__( + last_epoch=last_epoch, verbose=verbose + ) def get_lr(self): for i in range(len(self.boundaries)): @@ -460,10 +471,13 @@ class NaturalExpDecay(LRScheduler): """ def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): - assert gamma > 0.0, " 'gamma' must be a positive number so that the learning rate will decay." + assert ( + gamma > 0.0 + ), " 'gamma' must be a positive number so that the learning rate will decay." 
self.gamma = gamma - super(NaturalExpDecay, self).__init__(learning_rate, last_epoch, - verbose) + super(NaturalExpDecay, self).__init__( + learning_rate, last_epoch, verbose + ) def get_lr(self): return self.base_lr * math.exp(-1 * self.gamma * self.last_epoch) @@ -543,8 +557,9 @@ class InverseTimeDecay(LRScheduler): def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): self.gamma = gamma - super(InverseTimeDecay, self).__init__(learning_rate, last_epoch, - verbose) + super(InverseTimeDecay, self).__init__( + learning_rate, last_epoch, verbose + ) def get_lr(self): return self.base_lr / (1 + self.gamma * self.last_epoch) @@ -637,30 +652,37 @@ class PolynomialDecay(LRScheduler): # scheduler.step() # If you update learning rate each epoch """ - def __init__(self, - learning_rate, - decay_steps, - end_lr=0.0001, - power=1.0, - cycle=False, - last_epoch=-1, - verbose=False): + def __init__( + self, + learning_rate, + decay_steps, + end_lr=0.0001, + power=1.0, + cycle=False, + last_epoch=-1, + verbose=False, + ): assert decay_steps > 0 and isinstance( - decay_steps, int), " 'decay_steps' must be a positive integer." + decay_steps, int + ), " 'decay_steps' must be a positive integer." self.decay_steps = decay_steps self.end_lr = end_lr - assert power > 0.0, " 'power' must be greater than 0.0 so that the learning rate will decay." + assert ( + power > 0.0 + ), " 'power' must be greater than 0.0 so that the learning rate will decay." self.power = power self.cycle = cycle - super(PolynomialDecay, self).__init__(learning_rate, last_epoch, - verbose) + super(PolynomialDecay, self).__init__( + learning_rate, last_epoch, verbose + ) def get_lr(self): tmp_epoch_num = self.last_epoch tmp_decay_steps = self.decay_steps if self.cycle: div_res = math.ceil( - float(self.last_epoch) / float(self.decay_steps)) + float(self.last_epoch) / float(self.decay_steps) + ) if self.last_epoch == 0: div_res = 1 @@ -669,8 +691,8 @@ def get_lr(self): tmp_epoch_num = min(self.last_epoch, self.decay_steps) return (self.base_lr - self.end_lr) * ( - (1 - float(tmp_epoch_num) / float(tmp_decay_steps))** - self.power) + self.end_lr + (1 - float(tmp_epoch_num) / float(tmp_decay_steps)) ** self.power + ) + self.end_lr class LinearWarmup(LRScheduler): @@ -758,27 +780,36 @@ class LinearWarmup(LRScheduler): # scheduler.step() # If you update learning rate each epoch """ - def __init__(self, - learning_rate, - warmup_steps, - start_lr, - end_lr, - last_epoch=-1, - verbose=False): - type_check = isinstance(learning_rate, float) or isinstance( - learning_rate, int) or isinstance(learning_rate, LRScheduler) + def __init__( + self, + learning_rate, + warmup_steps, + start_lr, + end_lr, + last_epoch=-1, + verbose=False, + ): + type_check = ( + isinstance(learning_rate, float) + or isinstance(learning_rate, int) + or isinstance(learning_rate, LRScheduler) + ) if not type_check: raise TypeError( - "the type of learning_rate should be [int, float or LRScheduler], the current type is {}" - .format(learning_rate)) + "the type of learning_rate should be [int, float or LRScheduler], the current type is {}".format( + learning_rate + ) + ) self.learning_rate = learning_rate assert warmup_steps > 0 and isinstance( - warmup_steps, int), " 'warmup_steps' must be a positive integer." + warmup_steps, int + ), " 'warmup_steps' must be a positive integer." 
self.warmup_steps = warmup_steps self.start_lr = start_lr self.end_lr = end_lr - assert end_lr > start_lr, "end_lr {} must be greater than start_lr {}".format( - end_lr, start_lr) + assert ( + end_lr > start_lr + ), "end_lr {} must be greater than start_lr {}".format(end_lr, start_lr) super(LinearWarmup, self).__init__(start_lr, last_epoch, verbose) def state_dict(self): @@ -803,7 +834,8 @@ def set_state_dict(self, state_dict): def get_lr(self): if self.last_epoch < self.warmup_steps: return (self.end_lr - self.start_lr) * float( - self.last_epoch) / float(self.warmup_steps) + self.start_lr + self.last_epoch + ) / float(self.warmup_steps) + self.start_lr else: if isinstance(self.learning_rate, LRScheduler): self.learning_rate.step(self.last_epoch - self.warmup_steps) @@ -884,10 +916,13 @@ class ExponentialDecay(LRScheduler): """ def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): - assert gamma > 0.0 and gamma < 1.0, " 'gamma' must be in interval (0.0, 1.0) so that the learning rate will decay." + assert ( + gamma > 0.0 and gamma < 1.0 + ), " 'gamma' must be in interval (0.0, 1.0) so that the learning rate will decay." self.gamma = gamma - super(ExponentialDecay, self).__init__(learning_rate, last_epoch, - verbose) + super(ExponentialDecay, self).__init__( + learning_rate, last_epoch, verbose + ) def get_lr(self): return self.base_lr * (self.gamma**self.last_epoch) @@ -973,21 +1008,21 @@ class MultiStepDecay(LRScheduler): # scheduler.step() # If you update learning rate each epoch """ - def __init__(self, - learning_rate, - milestones, - gamma=0.1, - last_epoch=-1, - verbose=False): + def __init__( + self, learning_rate, milestones, gamma=0.1, last_epoch=-1, verbose=False + ): if not isinstance(milestones, (tuple, list)): raise TypeError( "The type of 'milestones' in 'MultiStepDecay' must be 'tuple, list', but received %s." - % type(milestones)) + % type(milestones) + ) - if not all([ + if not all( + [ milestones[i] < milestones[i + 1] for i in range(len(milestones) - 1) - ]): + ] + ): raise ValueError('The elements of milestones must be incremented') if gamma >= 1.0: raise ValueError('gamma should be < 1.0.') @@ -1000,7 +1035,7 @@ def get_lr(self): for i in range(len(self.milestones)): if self.last_epoch < self.milestones[i]: return self.base_lr * (self.gamma**i) - return self.base_lr * (self.gamma**len(self.milestones)) + return self.base_lr * (self.gamma ** len(self.milestones)) class StepDecay(LRScheduler): @@ -1082,21 +1117,20 @@ class StepDecay(LRScheduler): # scheduler.step() # If you update learning rate each epoch """ - def __init__(self, - learning_rate, - step_size, - gamma=0.1, - last_epoch=-1, - verbose=False): + def __init__( + self, learning_rate, step_size, gamma=0.1, last_epoch=-1, verbose=False + ): if not isinstance(step_size, int): raise TypeError( - "The type of 'step_size' must be 'int', but received %s." % - type(step_size)) + "The type of 'step_size' must be 'int', but received %s." + % type(step_size) + ) if gamma >= 1.0: raise ValueError('gamma should be < 1.0.') assert step_size > 0 and isinstance( - step_size, int), " 'step_size' must be a positive integer." + step_size, int + ), " 'step_size' must be a positive integer." 
self.step_size = step_size self.gamma = gamma super(StepDecay, self).__init__(learning_rate, last_epoch, verbose) @@ -1185,7 +1219,8 @@ def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False): if not callable(lr_lambda): raise TypeError( "The type of 'lr_lambda' in 'LambdaDecay' must be 'function', but received %s." - % type(lr_lambda)) + % type(lr_lambda) + ) self.lr_lambda = lr_lambda super(LambdaDecay, self).__init__(learning_rate, last_epoch, verbose) @@ -1281,17 +1316,19 @@ class ReduceOnPlateau(LRScheduler): """ - def __init__(self, - learning_rate, - mode='min', - factor=0.1, - patience=10, - threshold=1e-4, - threshold_mode='rel', - cooldown=0, - min_lr=0, - epsilon=1e-8, - verbose=False): + def __init__( + self, + learning_rate, + mode='min', + factor=0.1, + patience=10, + threshold=1e-4, + threshold_mode='rel', + cooldown=0, + min_lr=0, + epsilon=1e-8, + verbose=False, + ): mode = mode.lower() if mode not in ['min', 'max']: raise ValueError('mode: ' + mode + ' is unknown!') @@ -1299,18 +1336,21 @@ def __init__(self, if factor >= 1.0: raise ValueError( - 'new_lr = origin_lr * gamma and gamma should be < 1.0.') + 'new_lr = origin_lr * gamma and gamma should be < 1.0.' + ) self.factor = factor threshold_mode = threshold_mode.lower() if threshold_mode not in ['rel', 'abs']: - raise ValueError('threshold mode: ' + threshold_mode + - ' is unknown!') + raise ValueError( + 'threshold mode: ' + threshold_mode + ' is unknown!' + ) self.threshold_mode = threshold_mode if not isinstance(learning_rate, (float, int)): raise TypeError( "The type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s." - % type(learning_rate)) + % type(learning_rate) + ) self.patience = patience self.threshold = threshold @@ -1333,8 +1373,11 @@ def __init__(self, # "cooldown_counter / best / num_bad_epochs / last_epoch / last_lr" will be stored. def state_keys(self): self.keys = [ - 'cooldown_counter', 'best', 'num_bad_epochs', 'last_epoch', - 'last_lr' + 'cooldown_counter', + 'best', + 'num_bad_epochs', + 'last_epoch', + 'last_lr', ] def step(self, metrics, epoch=None): @@ -1364,18 +1407,25 @@ def step(self, metrics, epoch=None): else: # need to declarate explicitly from paddle.framework import VarBase as Tensor + tmp = Tensor # loss must be float, numpy.ndarray or 1-D Tensor with shape [1] if isinstance(metrics, (tmp, numpy.ndarray)): - assert len(metrics.shape) == 1 and metrics.shape[0] == 1, "the metrics.shape " \ - "should be (1L,), but the current metrics.shape is {}. Maybe that " \ - "you should call paddle.mean to process it first.".format( - metrics.shape) - elif not isinstance(metrics, - (int, float, numpy.float32, numpy.float64)): + assert len(metrics.shape) == 1 and metrics.shape[0] == 1, ( + "the metrics.shape " + "should be (1L,), but the current metrics.shape is {}. 
Maybe that " + "you should call paddle.mean to process it first.".format( + metrics.shape + ) + ) + elif not isinstance( + metrics, (int, float, numpy.float32, numpy.float64) + ): raise TypeError( - "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but receive {}" - .format(type(metrics))) + "metrics must be 'int', 'float', 'np.float', 'numpy.ndarray' or 'paddle.Tensor', but receive {}".format( + type(metrics) + ) + ) if self.cooldown_counter > 0: self.cooldown_counter -= 1 @@ -1393,9 +1443,13 @@ def step(self, metrics, epoch=None): if self.last_lr - new_lr > self.epsilon: self.last_lr = new_lr if self.verbose: - print('Epoch {}: {} set learning rate to {}.'.format( - self.last_epoch, self.__class__.__name__, - self.last_lr)) + print( + 'Epoch {}: {} set learning rate to {}.'.format( + self.last_epoch, + self.__class__.__name__, + self.last_lr, + ) + ) def _is_better(self, current, best): if self.mode == 'min' and self.threshold_mode == 'rel': @@ -1493,41 +1547,50 @@ class CosineAnnealingDecay(LRScheduler): # scheduler.step() # If you update learning rate each epoch """ - def __init__(self, - learning_rate, - T_max, - eta_min=0, - last_epoch=-1, - verbose=False): + def __init__( + self, learning_rate, T_max, eta_min=0, last_epoch=-1, verbose=False + ): if not isinstance(T_max, int): raise TypeError( "The type of 'T_max' in 'CosineAnnealingDecay' must be 'int', but received %s." - % type(T_max)) + % type(T_max) + ) if not isinstance(eta_min, (float, int)): raise TypeError( "The type of 'eta_min' in 'CosineAnnealingDecay' must be 'float, int', but received %s." - % type(eta_min)) + % type(eta_min) + ) assert T_max > 0 and isinstance( - T_max, int), " 'T_max' must be a positive integer." + T_max, int + ), " 'T_max' must be a positive integer." self.T_max = T_max self.eta_min = float(eta_min) - super(CosineAnnealingDecay, self).__init__(learning_rate, last_epoch, - verbose) + super(CosineAnnealingDecay, self).__init__( + learning_rate, last_epoch, verbose + ) def get_lr(self): if self.last_epoch == 0: return self.base_lr elif (self.last_epoch - 1 - self.T_max) % (2 * self.T_max) == 0: - return self.last_lr + (self.base_lr - self.eta_min) * ( - 1 - math.cos(math.pi / self.T_max)) / 2 + return ( + self.last_lr + + (self.base_lr - self.eta_min) + * (1 - math.cos(math.pi / self.T_max)) + / 2 + ) return (1 + math.cos(math.pi * self.last_epoch / self.T_max)) / ( - 1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max)) * ( - self.last_lr - self.eta_min) + self.eta_min + 1 + math.cos(math.pi * (self.last_epoch - 1) / self.T_max) + ) * (self.last_lr - self.eta_min) + self.eta_min def _get_closed_form_lr(self): - return self.eta_min + (self.base_lr - self.eta_min) * ( - 1 + math.cos(math.pi * self.last_epoch / self.T_max)) / 2 + return ( + self.eta_min + + (self.base_lr - self.eta_min) + * (1 + math.cos(math.pi * self.last_epoch / self.T_max)) + / 2 + ) class MultiplicativeDecay(LRScheduler): @@ -1559,7 +1622,6 @@ class MultiplicativeDecay(LRScheduler): .. code-block:: python import paddle - import numpy as np # train on default dynamic graph mode linear = paddle.nn.Linear(10, 10) @@ -1582,11 +1644,13 @@ def __init__(self, learning_rate, lr_lambda, last_epoch=-1, verbose=False): if not callable(lr_lambda): raise TypeError( "The type of 'lr_lambda' in 'MultiplicativeDecay' must be 'function', but received %s." 
- % type(lr_lambda)) + % type(lr_lambda) + ) self.lr_lambda = lr_lambda - super(MultiplicativeDecay, self).__init__(learning_rate, last_epoch, - verbose) + super(MultiplicativeDecay, self).__init__( + learning_rate, last_epoch, verbose + ) def get_lr(self): cur_lr = self.base_lr @@ -1597,6 +1661,7 @@ def get_lr(self): class OneCycleLR(LRScheduler): r""" + Sets the learning rate according to the one cycle learning rate scheduler. The scheduler adjusts the learning rate from an initial learning rate to the maximum learning rate and then from that maximum learning rate to the minimum learning rate, which is much less than the initial learning rate. @@ -1610,22 +1675,25 @@ class OneCycleLR(LRScheduler): Also note that you should update learning rate each step. Args: - max_learning_rate (float): The maximum learning rate. It is a python float number. - Functionally, it defines the initial learning rate by ``divide_factor`` . + max_learning_rate (float): The maximum learning rate. It is a python float number. Functionally, it defines the initial learning rate by ``divide_factor`` . total_steps (int): Number of total training steps. - divide_factor (float): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25. + divide_factor (float, optional): Initial learning rate will be determined by initial_learning_rate = max_learning_rate / divide_factor. Default: 25. end_learning_rate (float, optional): The minimum learning rate during training, it should be much less than initial learning rate. phase_pct (float): The percentage of total steps which used to increasing learning rate. Default: 0.3. - anneal_strategy (str, optional): Strategy of adjusting learning rate.'cos' for cosine annealing, - 'linear' for linear annealing. Default: 'cos'. + anneal_strategy (str, optional): Strategy of adjusting learning rate.'cos' for cosine annealing, 'linear' for linear annealing. Default: 'cos'. three_phase (bool, optional): Whether to use three phase. + If ``True``: + 1. The learning rate will first increase from initial learning rate to maximum learning rate. 2. Then it will decrease to initial learning rate. Number of step in this phase is the same as the one in first phase. 3. Finally, it will decrease to minimum learning rate which is much less than initial learning rate. + If ``False``: + 1. The learning rate will increase to maximum learning rate. 2. Then it will directly decrease to minimum learning rate. + last_epoch (int, optional): The index of last epoch. Can be set to restart training. Default: -1, means initial learning rate. verbose (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . @@ -1677,31 +1745,38 @@ class OneCycleLR(LRScheduler): }, fetch_list=loss.name) scheduler.step() # You should update learning rate each step + """ - def __init__(self, - max_learning_rate, - total_steps, - divide_factor=25., - end_learning_rate=0.0001, - phase_pct=0.3, - anneal_strategy='cos', - three_phase=False, - last_epoch=-1, - verbose=False): + def __init__( + self, + max_learning_rate, + total_steps, + divide_factor=25.0, + end_learning_rate=0.0001, + phase_pct=0.3, + anneal_strategy='cos', + three_phase=False, + last_epoch=-1, + verbose=False, + ): # Check type and value of max_learning_rate if not isinstance(max_learning_rate, (float, int)): raise TypeError( - "'max_learning_rate' must be 'float' or 'int', but received {}". 
- format(type(max_learning_rate))) + "'max_learning_rate' must be 'float' or 'int', but received {}".format( + type(max_learning_rate) + ) + ) if max_learning_rate < 0: raise ValueError("'max_learning_rate' must be a positive integer.") # Check type and value of end_learning_rate if not isinstance(end_learning_rate, (float, int)): raise TypeError( - "'end_learning_rate' must be 'float' or 'int', but received {}". - format(type(end_learning_rate))) + "'end_learning_rate' must be 'float' or 'int', but received {}".format( + type(end_learning_rate) + ) + ) if end_learning_rate < 0: raise ValueError("'end_learning_rate' must be a positive integer.") @@ -1709,7 +1784,9 @@ def __init__(self, if not isinstance(total_steps, int): raise TypeError( "'total_step' must be 'int', but received {}".format( - type(total_steps))) + type(total_steps) + ) + ) if total_steps <= 0: raise ValueError("'total_step' must be a positive integer.") self.total_steps = total_steps @@ -1718,17 +1795,23 @@ def __init__(self, if not isinstance(phase_pct, float): raise TypeError( "'phase_pct' must be 'float', but received {}".format( - type(phase_pct))) + type(phase_pct) + ) + ) if phase_pct < 0 or phase_pct > 1: raise ValueError( "'phase_pct' must be between 0 and 1, but received {}".format( - phase_pct)) + phase_pct + ) + ) # Check type and value of divide_factor if not isinstance(divide_factor, (float, int)): raise TypeError( - "'divide_factor' must be 'float' or 'int', but received {}". - format(type(divide_factor))) + "'divide_factor' must be 'float' or 'int', but received {}".format( + type(divide_factor) + ) + ) initial_lr = max_learning_rate / float(divide_factor) min_lr = float(end_learning_rate) @@ -1751,17 +1834,22 @@ def __init__(self, self._step_config[1] - self._step_config[0], self._step_config[2] - self._step_config[1], self._step_config[3] - self._step_config[2], - self._step_config[3] - - self._step_config[2], # for the last step. + self._step_config[3] + - self._step_config[2], # for the last step. ] # start lr and end lr of each phase. self._lr_config = [ - initial_lr, max_learning_rate, initial_lr, min_lr + initial_lr, + max_learning_rate, + initial_lr, + min_lr, ] else: self._step_config = [ - 0, phase_pct * self.total_steps - 1, self.total_steps - 1, - self.total_steps - 1 + 0, + phase_pct * self.total_steps - 1, + self.total_steps - 1, + self.total_steps - 1, ] self._steps_size = [ self._step_config[1] - self._step_config[0], @@ -1777,8 +1865,10 @@ def __init__(self, self.anneal_func = self._linear_annealing else: raise ValueError( - "'anneal_strategy' must by one of 'cos' or 'linear', but received {}" - .format(anneal_strategy)) + "'anneal_strategy' must by one of 'cos' or 'linear', but received {}".format( + anneal_strategy + ) + ) super(OneCycleLR, self).__init__(initial_lr, last_epoch, verbose) def _cos_annealing(self, start_lr, end_lr, pct): @@ -1793,17 +1883,21 @@ def get_lr(self): if current_step > self.total_steps: raise ValueError( - "Tried to step {} times. However the number of total steps is {}" - .format(current_step, self.total_steps)) + "Tried to step {} times. However the number of total steps is {}".format( + current_step, self.total_steps + ) + ) for (i, (end_step, step_size)) in enumerate( - zip(self._step_config[1:], self._steps_size)): + zip(self._step_config[1:], self._steps_size) + ): # i == len(self._lr_config) - 2 catch the last step, otherwise it will return None. 
if current_step <= end_step or i == len(self._lr_config) - 2: # self._step_config[i] means start step of a phase. percentage = (current_step - self._step_config[i]) / step_size - return self.anneal_func(self._lr_config[i], - self._lr_config[i + 1], percentage) + return self.anneal_func( + self._lr_config[i], self._lr_config[i + 1], percentage + ) class CyclicLR(LRScheduler): @@ -1847,7 +1941,7 @@ class CyclicLR(LRScheduler): verbose: (bool, optional): If ``True``, prints a message to stdout for each update. Default: ``False`` . Returns: - ``CyclicLR`` instance to schedule learning rate. + ``CyclicLR`` instance to schedule learning rate. Examples: .. code-block:: python @@ -1897,71 +1991,93 @@ class CyclicLR(LRScheduler): scheduler.step() # You should update learning rate each step """ - def __init__(self, - base_learning_rate, - max_learning_rate, - step_size_up, - step_size_down=None, - mode='triangular', - exp_gamma=1., - scale_fn=None, - scale_mode='cycle', - last_epoch=-1, - verbose=False): + def __init__( + self, + base_learning_rate, + max_learning_rate, + step_size_up, + step_size_down=None, + mode='triangular', + exp_gamma=1.0, + scale_fn=None, + scale_mode='cycle', + last_epoch=-1, + verbose=False, + ): # check type and value of max_learning_rate if not isinstance(max_learning_rate, (float, int)): raise TypeError( - "'max_learning_rate' must be 'float' or 'int', but received {}". - format(type(max_learning_rate))) + "'max_learning_rate' must be 'float' or 'int', but received {}".format( + type(max_learning_rate) + ) + ) if max_learning_rate < 0: raise ValueError( - "'max_learning_rate' must be a positive integer, but received {}" - .format(max_learning_rate)) + "'max_learning_rate' must be a positive integer, but received {}".format( + max_learning_rate + ) + ) # check type and value of step_size_up if not isinstance(step_size_up, int): raise TypeError( - "The type of 'step_size_up' must be int, but received {}". - format(type(step_size_up))) + "The type of 'step_size_up' must be int, but received {}".format( + type(step_size_up) + ) + ) if step_size_up <= 0: raise ValueError( - "'step_size_up' must be a positive integer, but received {}". - format(step_size_up)) + "'step_size_up' must be a positive integer, but received {}".format( + step_size_up + ) + ) # check type and value of step_size_down if step_size_down is not None: if not isinstance(step_size_down, int): raise TypeError( - "The type of 'step_size_down' must be int, but received {}". 
- format(type(step_size_down))) + "The type of 'step_size_down' must be int, but received {}".format( + type(step_size_down) + ) + ) if step_size_down <= 0: raise ValueError( - "'step_size_down' must be a positive integer, but received {}" - .format(step_size_down)) + "'step_size_down' must be a positive integer, but received {}".format( + step_size_down + ) + ) # check type of exp_gamma if not isinstance(exp_gamma, float): raise TypeError( "The type of 'exp_gamma' must be float, but received {}".format( - type(exp_gamma))) + type(exp_gamma) + ) + ) step_size_up = float(step_size_up) - step_size_down = float( - step_size_down) if step_size_down is not None else step_size_up + step_size_down = ( + float(step_size_down) + if step_size_down is not None + else step_size_up + ) self.cycle_size = step_size_up + step_size_down self.step_up_pct = step_size_up / self.cycle_size self.max_lr = float(max_learning_rate) self.amplitude = self.max_lr - base_learning_rate - if mode not in ['triangular', 'triangular2', 'exp_range' - ] and scale_fn is None: + if ( + mode not in ['triangular', 'triangular2', 'exp_range'] + and scale_fn is None + ): raise ValueError( "'mode' is invalid and 'scale_fn' is not specified, make sure one of 'mode' or 'scale_fn' is valid" ) if scale_mode not in ['cycle', 'iterations']: raise ValueError( - "'scale_mode' must be one of 'cycle' or 'iterations") + "'scale_mode' must be one of 'cycle' or 'iterations" + ) self.mode = mode self.gamma = exp_gamma # only for exp_range mode @@ -1982,10 +2098,10 @@ def __init__(self, super().__init__(base_learning_rate, last_epoch, verbose) def _triangular_scale_fn(self, x): - return 1. + return 1.0 def _triangular2_scale_fn(self, x): - return 1 / (2.**(x - 1)) + return 1 / (2.0 ** (x - 1)) def _exp_range_scale_fn(self, x): return self.gamma**x @@ -1994,7 +2110,7 @@ def get_lr(self): iterations = self.last_epoch cycle = 1 + iterations // self.cycle_size - pct_per_cycle = 1. + iterations / self.cycle_size - cycle + pct_per_cycle = 1.0 + iterations / self.cycle_size - cycle if pct_per_cycle <= self.step_up_pct: scale_factor = pct_per_cycle / self.step_up_pct diff --git a/python/paddle/optimizer/momentum.py b/python/paddle/optimizer/momentum.py index 988ac052b0307b..da70ca1303ab77 100644 --- a/python/paddle/optimizer/momentum.py +++ b/python/paddle/optimizer/momentum.py @@ -85,8 +85,8 @@ class Momentum(Optimizer): .. 
code-block:: python import paddle - import numpy as np - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + + inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1) linear = paddle.nn.Linear(10, 10) inp = paddle.to_tensor(inp) out = linear(inp) @@ -123,29 +123,35 @@ class Momentum(Optimizer): """ _velocity_acc_str = "velocity" - def __init__(self, - learning_rate=0.001, - momentum=0.9, - parameters=None, - use_nesterov=False, - weight_decay=None, - grad_clip=None, - multi_precision=False, - rescale_grad=1.0, - use_multi_tensor=False, - name=None): + def __init__( + self, + learning_rate=0.001, + momentum=0.9, + parameters=None, + use_nesterov=False, + weight_decay=None, + grad_clip=None, + multi_precision=False, + rescale_grad=1.0, + use_multi_tensor=False, + name=None, + ): if learning_rate is None: raise ValueError("learning_rate is not set") if momentum is None: raise ValueError("momentum is not set") - predicate = lambda regular: isinstance(regular, - (L2DecayRegularizer, float)) + predicate = lambda regular: isinstance( + regular, (L2DecayRegularizer, float) + ) if isinstance(parameters, list): if isinstance(parameters[0], dict): for param_group in parameters: - decay = param_group[ - 'weight_decay'] if 'weight_decay' in param_group else weight_decay + decay = ( + param_group['weight_decay'] + if 'weight_decay' in param_group + else weight_decay + ) reg_method, reg_coeff = self._update_regularization(decay) param_group['regularization_method'] = reg_method param_group['regularization_coeff'] = reg_coeff @@ -153,16 +159,20 @@ def __init__(self, param_group['weight_decay'] = py_regular py_regular = None if predicate(weight_decay) else weight_decay - super(Momentum, self).__init__(learning_rate=learning_rate, - parameters=parameters, - weight_decay=py_regular, - grad_clip=grad_clip, - name=name) + super(Momentum, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=py_regular, + grad_clip=grad_clip, + name=name, + ) self.type = "momentum" self._momentum = momentum self._use_nesterov = bool(use_nesterov) - self._regularization_method, self._regularization_coeff = self._update_regularization( - weight_decay) + ( + self._regularization_method, + self._regularization_coeff, + ) = self._update_regularization(weight_decay) self._multi_precision = multi_precision self._rescale_grad = rescale_grad self._master_weights = {} @@ -176,29 +186,21 @@ def __init__(self, } self._use_multi_tensor = use_multi_tensor if self._use_multi_tensor: - self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} - self._velocity_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} - self._master_weight_dict = { - 'FP32_LODTensor': None, - 'FP16_LODTensor': [] - } - self._regularization_method_dict = { - 'FP32_LODTensor': [], - 'FP16_LODTensor': [] - } - self._regularization_coeff_dict = { - 'FP32_LODTensor': [], - 'FP16_LODTensor': [] - } + self._param_dict = self._create_multi_tensor_dict() + self._velocity_dict = self._create_multi_tensor_dict() + self._master_weight_dict = self._create_multi_tensor_dict() + self._master_weight_dict['FP32_LODTensor'] = None + self._regularization_method_dict = self._create_multi_tensor_dict() + self._regularization_coeff_dict = self._create_multi_tensor_dict() def _update_regularization(self, weight_decay): reg_method = "" reg_coeff = 0.0 - if (isinstance(weight_decay, L2DecayRegularizer)): + if isinstance(weight_decay, L2DecayRegularizer): reg_method = "l2_decay" reg_coeff = weight_decay._regularization_coeff - if 
(isinstance(weight_decay, float)): + if isinstance(weight_decay, float): reg_method = "l2_decay" reg_coeff = weight_decay return reg_method, reg_coeff @@ -211,19 +213,23 @@ def _create_master_weight(self, param): var_name = param.name + "_fp32_master" var_name = unique_name.generate(var_name) - var = layers.create_global_var(name=var_name, - shape=param.shape, - value=0, - dtype='float32', - persistable=True) + var = layers.create_global_var( + name=var_name, + shape=param.shape, + value=0, + dtype='float32', + persistable=True, + ) block = self.helper.startup_program.global_block() - block.append_op(type="cast", - inputs={"X": [param]}, - outputs={"Out": [var]}, - attrs={ - "in_dtype": param.dtype, - "out_dtype": core.VarDesc.VarType.FP32 - }) + block.append_op( + type="cast", + inputs={"X": [param]}, + outputs={"Out": [var]}, + attrs={ + "in_dtype": param.dtype, + "out_dtype": core.VarDesc.VarType.FP32, + }, + ) self._master_weights[param.name] = var return var @@ -239,15 +245,22 @@ def _get_accumulator(self, name, param): """ if self._name is not None: name = self._name + "_" + name - find_master = self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 - target_param = self._master_weights[ - param.name] if find_master else param + find_master = ( + self._multi_precision and param.dtype == core.VarDesc.VarType.FP16 + ) + target_param = ( + self._master_weights[param.name] if find_master else param + ) target_name = target_param.name - if (name not in self._accumulators - or target_name not in self._accumulators[name]): + if ( + name not in self._accumulators + or target_name not in self._accumulators[name] + ): raise Exception( "Accumulator {} does not exist for parameter {}".format( - name, target_name)) + name, target_name + ) + ) return self._accumulators[name][target_name] def _create_accumulators(self, block, parameters): @@ -265,7 +278,10 @@ def _create_accumulators(self, block, parameters): master_p = self._create_master_weight(p) self._add_accumulator(self._velocity_acc_str, master_p) continue - if p.dtype == core.VarDesc.VarType.FP16 and not self._multi_precision: + if ( + p.dtype == core.VarDesc.VarType.FP16 + and not self._multi_precision + ): warnings.warn( "Accumulating with FP16 in optimizer can lead to poor accuracy or slow convergence." "Consider using multi_precision=True option of the Momentum optimizer." @@ -273,25 +289,28 @@ def _create_accumulators(self, block, parameters): self._add_accumulator(self._velocity_acc_str, p) def _create_regularization_of_grad(self, param, grad, regularization=None): - """ Create and add backward regularization Operators - + """Create and add backward regularization Operators + Function helper of append_regularization_ops. """ # If ParamAttr is set to L2Decay, we skip doing regularization here. And then we fused # L2Decay with momentum which can refer to _append_optimize_op below. 
- if hasattr(param, 'regularizer') and isinstance(param.regularizer, - L2DecayRegularizer): + if hasattr(param, 'regularizer') and isinstance( + param.regularizer, L2DecayRegularizer + ): return grad return super(Momentum, self)._create_regularization_of_grad( - param, grad, regularization) + param, grad, regularization + ) def _append_optimize_op(self, block, param_and_grad): assert isinstance(block, framework.Block) if isinstance(param_and_grad, dict): param_and_grad = self._update_param_group(param_and_grad) - velocity_acc = self._get_accumulator(self._velocity_acc_str, - param_and_grad[0]) + velocity_acc = self._get_accumulator( + self._velocity_acc_str, param_and_grad[0] + ) lr = self._create_param_lr(param_and_grad) # For fusion of momentum and l2decay @@ -308,30 +327,56 @@ def _append_optimize_op(self, block, param_and_grad): regularization_method = "" regularization_coeff = 0.0 - find_master = self._multi_precision and param_and_grad[ - 0].dtype == core.VarDesc.VarType.FP16 - master_weight = (self._master_weights[param_and_grad[0].name] - if find_master else None) + find_master = ( + self._multi_precision + and param_and_grad[0].dtype == core.VarDesc.VarType.FP16 + ) + master_weight = ( + self._master_weights[param_and_grad[0].name] + if find_master + else None + ) if _in_legacy_dygraph(): if isinstance(param_and_grad, dict): self._update_regularization(param_and_grad['weight_decay']) _, _, _ = _legacy_C_ops.momentum( - param_and_grad[0], param_and_grad[1], velocity_acc, lr, - master_weight, param_and_grad[0], velocity_acc, master_weight, - 'mu', self._momentum, 'use_nesterov', self._use_nesterov, - 'regularization_method', regularization_method, - 'regularization_coeff', regularization_coeff, 'multi_precision', - find_master) + param_and_grad[0], + param_and_grad[1], + velocity_acc, + lr, + master_weight, + param_and_grad[0], + velocity_acc, + master_weight, + 'mu', + self._momentum, + 'use_nesterov', + self._use_nesterov, + 'regularization_method', + regularization_method, + 'regularization_coeff', + regularization_coeff, + 'multi_precision', + find_master, + ) return None if in_dygraph_mode(): if isinstance(param_and_grad, dict): self._update_regularization(param_and_grad['weight_decay']) - return _C_ops.momentum_(param_and_grad[0], param_and_grad[1], - velocity_acc, lr, master_weight, - self._momentum, self._use_nesterov, - regularization_method, regularization_coeff, - find_master, self._rescale_grad) + return _C_ops.momentum_( + param_and_grad[0], + param_and_grad[1], + velocity_acc, + lr, + master_weight, + self._momentum, + self._use_nesterov, + regularization_method, + regularization_coeff, + find_master, + self._rescale_grad, + ) attrs = { "mu": self._momentum, @@ -339,19 +384,19 @@ def _append_optimize_op(self, block, param_and_grad): "regularization_method": regularization_method, "regularization_coeff": regularization_coeff, "multi_precision": find_master, - "rescale_grad": self._rescale_grad + "rescale_grad": self._rescale_grad, } inputs = { "Param": [param_and_grad[0]], "Grad": [param_and_grad[1]], "Velocity": [velocity_acc], - "LearningRate": [lr] + "LearningRate": [lr], } outputs = { "ParamOut": [param_and_grad[0]], - "VelocityOut": [velocity_acc] + "VelocityOut": [velocity_acc], } if find_master: @@ -359,15 +404,17 @@ def _append_optimize_op(self, block, param_and_grad): outputs["MasterParamOut"] = master_weight # create the momentum optimize op - momentum_op = block.append_op(type=self.type, - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True) 
+ momentum_op = block.append_op( + type=self.type, + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, + ) return momentum_op - def _multi_tensor_init(self, target_block, parameters): + def _multi_tensor_init(self, target_block, parameters, param_group_idx): """ All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32). This function will be overridden in the corresponding optimizer file. @@ -385,38 +432,59 @@ def _multi_tensor_init(self, target_block, parameters): # we skip param's l2decay before, so fuse it with momentum here. if isinstance(param.regularizer, L2DecayRegularizer): regularization_method = "l2_decay" - regularization_coeff = param.regularizer._regularization_coeff + regularization_coeff = ( + param.regularizer._regularization_coeff + ) elif param.regularizer is not None: regularization_method = "" regularization_coeff = 0.0 if param.dtype == paddle.float32: - self._param_dict['FP32_LODTensor'].append(param) - self._velocity_dict['FP32_LODTensor'].append(velocity_acc) + self._param_dict['FP32_LODTensor'][param_group_idx].append( + param + ) + self._velocity_dict['FP32_LODTensor'][param_group_idx].append( + velocity_acc + ) # fp32 no master weight - self._regularization_method_dict['FP32_LODTensor'].append( - regularization_method) - self._regularization_coeff_dict['FP32_LODTensor'].append( - regularization_coeff) + self._regularization_method_dict['FP32_LODTensor'][ + param_group_idx + ].append(regularization_method) + self._regularization_coeff_dict['FP32_LODTensor'][ + param_group_idx + ].append(regularization_coeff) elif param.dtype == paddle.float16: - self._param_dict['FP16_LODTensor'].append(param) - self._velocity_dict['FP16_LODTensor'].append(velocity_acc) + self._param_dict['FP16_LODTensor'][param_group_idx].append( + param + ) + self._velocity_dict['FP16_LODTensor'][param_group_idx].append( + velocity_acc + ) if self._multi_precision: - self._master_weight_dict['FP16_LODTensor'].append( - self._master_weights[param.name]) + self._master_weight_dict['FP16_LODTensor'][ + param_group_idx + ].append(self._master_weights[param.name]) else: - self._master_weight_dict['FP16_LODTensor'] = None - self._regularization_method_dict['FP16_LODTensor'].append( - regularization_method) - self._regularization_coeff_dict['FP16_LODTensor'].append( - regularization_coeff) + self._master_weight_dict['FP16_LODTensor'][ + param_group_idx + ] = None + self._regularization_method_dict['FP16_LODTensor'][ + param_group_idx + ].append(regularization_method) + self._regularization_coeff_dict['FP16_LODTensor'][ + param_group_idx + ].append(regularization_coeff) else: raise ValueError( "Now multi_tensor_momentum only support fp32 and fp16 parameters and grad is LOD_TENSOR." ) - def _append_optimize_multi_tensor_op(self, target_block, - parameters_and_grads): - """ + def _append_optimize_multi_tensor_op( + self, + target_block, + parameters_and_grads, + param_group_idx, + ): + """ For Multi Tensor, append optimize merged_operator to block. 
""" assert isinstance(target_block, framework.Block) @@ -429,15 +497,19 @@ def _append_optimize_multi_tensor_op(self, target_block, if param_and_grad[1] is None: continue if param_and_grad[0].stop_gradient is False: - if param_and_grad[ - 0].dtype == paddle.float32 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + if ( + param_and_grad[0].dtype == paddle.float32 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP32_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP32_LODTensor'].append(lr) - elif param_and_grad[ - 0].dtype == paddle.float16 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + elif ( + param_and_grad[0].dtype == paddle.float16 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP16_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP16_LODTensor'].append(lr) @@ -448,97 +520,144 @@ def _append_optimize_multi_tensor_op(self, target_block, if param_and_grad[0].stop_gradient is False: param_grad_dict = dict() param_grad_dict['params'] = param_and_grad - param_grad_dict.update({ - k: v - for k, v in parameters_and_grads.items() - if k != 'params' - }) + param_grad_dict.update( + { + k: v + for k, v in parameters_and_grads.items() + if k != 'params' + } + ) param_and_grad = self._update_param_group(param_grad_dict) - if param_and_grad[ - 0].dtype == paddle.float32 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + if ( + param_and_grad[0].dtype == paddle.float32 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP32_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP32_LODTensor'].append(lr) - elif param_and_grad[ - 0].dtype == paddle.float16 and param_and_grad[ - 1].type == core.VarDesc.VarType.LOD_TENSOR: + elif ( + param_and_grad[0].dtype == paddle.float16 + and param_and_grad[1].type + == core.VarDesc.VarType.LOD_TENSOR + ): grad_dict['FP16_LODTensor'].append(param_and_grad[1]) lr = self._create_param_lr(param_and_grad) lr_dict['FP16_LODTensor'].append(lr) multi_tensor_list = ['FP32_LODTensor', 'FP16_LODTensor'] for key in multi_tensor_list: - if len(self._param_dict[key]) > 0: + if len(self._param_dict[key][param_group_idx]) > 0: find_master = self._multi_precision and key == 'FP16_LODTensor' + master_weight = self._master_weight_dict[key] + master_weight = ( + master_weight[param_group_idx] + if master_weight is not None + else None + ) + if framework._non_static_mode(): if in_dygraph_mode(): _, _, _ = _C_ops.merged_momentum_( - self._param_dict[key], grad_dict[key], - self._velocity_dict[key], lr_dict[key], - self._master_weight_dict[key], self._momentum, + self._param_dict[key][param_group_idx], + grad_dict[key], + self._velocity_dict[key][param_group_idx], + lr_dict[key], + master_weight, + self._momentum, self._use_nesterov, - self._regularization_method_dict[key], - self._regularization_coeff_dict[key], find_master, - self._rescale_grad) + self._regularization_method_dict[key][ + param_group_idx + ], + self._regularization_coeff_dict[key][ + param_group_idx + ], + find_master, + self._rescale_grad, + ) else: _, _, _ = _legacy_C_ops.merged_momentum( - self._param_dict[key], grad_dict[key], - self._velocity_dict[key], lr_dict[key], - self._master_weight_dict[key], - self._param_dict[key], self._velocity_dict[key], - self._master_weight_dict[key], 'mu', self._momentum, - 'use_nesterov', self._use_nesterov, + 
self._param_dict[key][param_group_idx], + grad_dict[key], + self._velocity_dict[key][param_group_idx], + lr_dict[key], + master_weight, + self._param_dict[key][param_group_idx], + self._velocity_dict[key][param_group_idx], + master_weight, + 'mu', + self._momentum, + 'use_nesterov', + self._use_nesterov, 'regularization_method', - self._regularization_method_dict[key], + self._regularization_method_dict[key][ + param_group_idx + ], 'regularization_coeff', - self._regularization_coeff_dict[key], - 'multi_precision', find_master) + self._regularization_coeff_dict[key][ + param_group_idx + ], + 'multi_precision', + find_master, + ) else: inputs = { - "Param": self._param_dict[key], + "Param": self._param_dict[key][param_group_idx], "Grad": grad_dict[key], - "Velocity": self._velocity_dict[key], + "Velocity": self._velocity_dict[key][param_group_idx], "LearningRate": lr_dict[key], } outputs = { - "ParamOut": self._param_dict[key], - "VelocityOut": self._velocity_dict[key], + "ParamOut": self._param_dict[key][param_group_idx], + "VelocityOut": self._velocity_dict[key][ + param_group_idx + ], } attrs = { - "mu": - self._momentum, - "use_nesterov": - self._use_nesterov, - "regularization_method": - self._regularization_method_dict[key], - "regularization_coeff": - self._regularization_coeff_dict[key], + "mu": self._momentum, + "use_nesterov": self._use_nesterov, + "regularization_method": self._regularization_method_dict[ + key + ][ + param_group_idx + ], + "regularization_coeff": self._regularization_coeff_dict[ + key + ][param_group_idx], } if find_master: - inputs["MasterParam"] = self._master_weight_dict[key] + inputs["MasterParam"] = self._master_weight_dict[key][ + param_group_idx + ] outputs["MasterParamOut"] = self._master_weight_dict[ - key] + key + ][param_group_idx] attrs["multi_precision"] = find_master - target_block.append_op(type="merged_momentum", - inputs=inputs, - outputs=outputs, - attrs=attrs, - stop_gradient=True) + target_block.append_op( + type="merged_momentum", + inputs=inputs, + outputs=outputs, + attrs=attrs, + stop_gradient=True, + ) return None def _update_param_group(self, parameters): - self._momentum = parameters.get('momentum', - self._default_dict['momentum']) - self._use_nesterov = parameters.get('use_nesterov', - self._default_dict['use_nesterov']) - self._rescale_grad = parameters.get('rescale_grad', - self._default_dict['rescale_grad']) + self._momentum = parameters.get( + 'momentum', self._default_dict['momentum'] + ) + self._use_nesterov = parameters.get( + 'use_nesterov', self._default_dict['use_nesterov'] + ) + self._rescale_grad = parameters.get( + 'rescale_grad', self._default_dict['rescale_grad'] + ) self._regularization_method = parameters.get( - 'regularization_method', - self._default_dict['regularization_method']) + 'regularization_method', self._default_dict['regularization_method'] + ) self._regularization_coeff = parameters.get( - 'regularization_coeff', self._default_dict['regularization_coeff']) + 'regularization_coeff', self._default_dict['regularization_coeff'] + ) parameters = parameters.get('params') return parameters diff --git a/python/paddle/optimizer/optimizer.py b/python/paddle/optimizer/optimizer.py index 1d399021c8e8d0..2ab61bb548731b 100644 --- a/python/paddle/optimizer/optimizer.py +++ b/python/paddle/optimizer/optimizer.py @@ -21,13 +21,30 @@ import paddle from paddle.fluid.distribute_lookup_table import find_distributed_lookup_table -from paddle.fluid.framework import Program, Variable, name_scope, default_main_program, 
default_startup_program, device_guard +from paddle.fluid.framework import ( + Program, + Variable, + name_scope, + default_main_program, + default_startup_program, + device_guard, +) from ..fluid import framework from ..fluid import layers from ..fluid import unique_name -from ..fluid.backward import append_backward, _some_in_set_, _append_grad_suffix_, _get_no_grad_set_name -from ..fluid.clip import GradientClipBase, GradientClipByNorm, error_clip_callback, append_gradient_clip_ops +from ..fluid.backward import ( + append_backward, + _some_in_set_, + _append_grad_suffix_, + _get_no_grad_set_name, +) +from ..fluid.clip import ( + GradientClipBase, + GradientClipByNorm, + error_clip_callback, + append_gradient_clip_ops, +) from ..fluid.framework import program_guard, Parameter from ..fluid.initializer import Constant from ..fluid.layer_helper import LayerHelper @@ -42,24 +59,36 @@ from .lr import LRScheduler import copy from paddle import _C_ops, _legacy_C_ops -from paddle.fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check, _current_expected_place, in_dygraph_mode +from paddle.fluid.framework import ( + _in_legacy_dygraph, + _in_eager_without_dygraph_check, + _current_expected_place, + in_dygraph_mode, +) __all__ = [] @framework.static_only -def append_backward_new(loss_list, - parameter_list=None, - no_grad_set=None, - callbacks=None, - checkpoints=None, - distop_context=None): +def append_backward_new( + loss_list, + parameter_list=None, + no_grad_set=None, + callbacks=None, + checkpoints=None, + distop_context=None, +): from paddle.incubate.autograd.primx import orig2prim, Transform + program = default_main_program() - assert program.num_blocks == 1, "The append_backward_new interface is designed to process only one block." + assert ( + program.num_blocks == 1 + ), "The append_backward_new interface is designed to process only one block." block = program.current_block() for el in loss_list: - assert el.block == block, f'variable in loss_list should be in current block of main program' + assert ( + el.block == block + ), f'variable in loss_list should be in current block of main program' orig2prim(block) ad = Transform(block) @@ -163,12 +192,14 @@ class Optimizer(object): """ @imperative_base.no_grad - def __init__(self, - learning_rate, - parameters=None, - weight_decay=None, - grad_clip=None, - name=None): + def __init__( + self, + learning_rate, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None, + ): if parameters is not None: # paddle.Tensor is also iterable, so here we don't check whether @@ -177,13 +208,16 @@ def __init__(self, if isinstance(parameters, (paddle.Tensor, core.eager.Tensor)): raise TypeError( "`parameters` argument given to the optimizer should be " - "an iterable of paddle Tensors, but got argument type is `{}`." 
- .format(type(parameters))) + "an iterable of paddle Tensors, but got argument type is `{}`.".format( + type(parameters) + ) + ) if isinstance(parameters, dict): raise TypeError( "`parameters` argument should not get dict type, " "if parameter groups is needed, please set `parameters`" - " as list of dict") + " as list of dict" + ) self._parameter_list = list(parameters) else: self._parameter_list = None @@ -197,18 +231,22 @@ def __init__(self, if weight_decay is not None: if not isinstance(self._parameter_list[0], dict): for param in self._parameter_list: - if hasattr(param, 'regularizer' - ) and param.regularizer is not None: + if ( + hasattr(param, 'regularizer') + and param.regularizer is not None + ): logging.info( "If regularizer of a Parameter has been set by 'paddle.ParamAttr' or 'static.WeightNormParamAttr' already. " "The weight_decay[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" - % weight_decay.__str__()) + % weight_decay.__str__() + ) break if not isinstance(learning_rate, (float, LRScheduler)): raise TypeError( - "learning rate should be float or LRScheduler, got %s here" % - type(learning_rate)) + "learning rate should be float or LRScheduler, got %s here" + % type(learning_rate) + ) if grad_clip is not None: if not isinstance(grad_clip, GradientClipBase): raise TypeError( @@ -216,6 +254,7 @@ def __init__(self, ) if isinstance(weight_decay, float): from ..fluid.regularizer import L2Decay + self.regularization = L2Decay(weight_decay) else: self.regularization = weight_decay @@ -227,8 +266,9 @@ def __init__(self, if self._parameter_list: if isinstance(self._parameter_list[0], dict): for param_group in self._parameter_list: - assert 'params' in param_group, \ - 'params should be set in parameters if parameter groups are optimized in different options' + assert ( + 'params' in param_group + ), 'params should be set in parameters if parameter groups are optimized in different options' self._dtype = self._parameter_list[0]['params'][0].dtype else: self._dtype = self._parameter_list[0].dtype @@ -248,7 +288,7 @@ def __init__(self, self.clear_gradients = self.clear_grad self._default_dict = { 'weight_decay': self.regularization, - 'grad_clip': self._grad_clip + 'grad_clip': self._grad_clip, } self._param_groups = [] @@ -261,13 +301,20 @@ def __init__(self, # NOTE: Multi Tensor: Pass in all parameters and gradients to the op kernel of the Optimizer at one time for updating for dygraph mode. # Optimizer support list: [ paddle.optimizer.Momentum, paddle.optimizer.Adam]. self._use_multi_tensor = None - self._param_dict = {'FP32_LODTensor': [], 'FP16_LODTensor': []} + self._param_dict = self._create_multi_tensor_dict() self._auxiliary_vars = {} def _set_auxiliary_var(self, key, val): self._auxiliary_vars[key] = val + def _create_multi_tensor_dict(self): + n = len(self._param_groups) if self._param_groups is not None else 1 + return { + 'FP32_LODTensor': [[] for _ in range(n)], + 'FP16_LODTensor': [[] for _ in range(n)], + } + def _get_auxiliary_var(self, key): return self._auxiliary_vars.get(key, None) @@ -277,12 +324,12 @@ def state_dict(self): Get state dict information from optimizer. It contain all the tensor used by optimizer. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be include in state dict. If the optimizer never be called(minimize function), the state_dict is empty. 
- Args: + Args: None Returns: state_dict(dict) : dict contains all the Tensor used by optimizer - + Examples: .. code-block:: python @@ -311,11 +358,11 @@ def set_state_dict(self, state_dict): ''' Load optimizer state dict. For Adam optimizer, contains beta1, beta2, momentum etc. If LRScheduler have been used, global_step will be changed. - Args: + Args: state_dict(dict) : Dict contains all the Tensor needed by optimizer Return: None - + Examples: .. code-block:: python @@ -326,7 +373,7 @@ def set_state_dict(self, state_dict): layer_state_dict = emb.state_dict() paddle.save(layer_state_dict, "emb.pdparams") - scheduler = paddle.optimizer.lr.NoamDecay( + scheduler = paddle.optimizer.lr.NoamDecay( d_model=0.01, warmup_steps=100, verbose=True) adam = paddle.optimizer.Adam( learning_rate=scheduler, @@ -353,8 +400,9 @@ def set_state_dict(self, state_dict): self._accumulators_holder = state_dict for k, v in self._accumulators.items(): for para_name, var_tmp in v.items(): - assert var_tmp.name in state_dict, \ - "optimizer Tensor {} not found".format( var_tmp.name ) + assert ( + var_tmp.name in state_dict + ), "optimizer Tensor {} not found".format(var_tmp.name) var = var_tmp.value() tensor = var.get_tensor() model_np = np.array(tensor) @@ -368,16 +416,23 @@ def set_state_dict(self, state_dict): elif isinstance(load_para, np.ndarray): load_para_np = load_para else: - raise RuntimeError("State dict type {} not supprt".format( - str(type(load_para)))) - - assert model_np.shape == load_para_np.shape, \ - "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( - model_np.name, model_np.shape, load_para_np.shape) + raise RuntimeError( + "State dict type {} not supprt".format( + str(type(load_para)) + ) + ) + + assert ( + model_np.shape == load_para_np.shape + ), "Parameter shape not match, Dygraph Parameter [ {} ] need tensor with shape {} but load tensor with shape {}".format( + model_np.name, model_np.shape, load_para_np.shape + ) - assert model_np.dtype == load_para_np.dtype, \ - "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( - model_np.name, model_np.dtype, load_para_np.dtype) + assert ( + model_np.dtype == load_para_np.dtype + ), "Parameter dtype not match, Dygraph Parameter [ {} ] need tensor with dtype {} but load tensor with dtype {}".format( + model_np.name, model_np.dtype, load_para_np.dtype + ) tensor.set(load_para_np, framework._current_expected_place()) @@ -386,51 +441,63 @@ def get_opti_var_name_list(self): def _create_global_learning_rate(self): # lr var can't be float16, for pure fp16 training, should extra handle the dtype for lr - _lr_dtype = paddle.get_default_dtype( - ) if self._dtype is None else self._dtype - _lr_dtype = paddle.float32 if ( - paddle.get_default_dtype() != "float16" - and _lr_dtype == paddle.float16) else _lr_dtype + _lr_dtype = ( + paddle.get_default_dtype() if self._dtype is None else self._dtype + ) + _lr_dtype = ( + paddle.float32 + if ( + paddle.get_default_dtype() != "float16" + and _lr_dtype == paddle.float16 + ) + else _lr_dtype + ) if isinstance(self._learning_rate, LRScheduler): lr_var = self._global_learning_rate() # only create global lr_var once if not isinstance(lr_var, framework.Variable): lr_name = unique_name.generate('learning_rate') self._learning_rate._var_name = lr_name - lr_var = self.helper.create_global_variable(name=lr_name, - shape=[1], - persistable=True, - stop_gradient=True, - dtype=_lr_dtype) + lr_var = 
self.helper.create_global_variable( + name=lr_name, + shape=[1], + persistable=True, + stop_gradient=True, + dtype=_lr_dtype, + ) main_prog = framework.default_main_program() main_prog.lr_sheduler = self._learning_rate main_prog.lr_var = lr_var self._learning_rate_map[ - framework.default_main_program()] = lr_var + framework.default_main_program() + ] = lr_var lr_value = float(self._learning_rate()) self.helper.set_variable_initializer( - lr_var, initializer=Constant(value=lr_value)) + lr_var, initializer=Constant(value=lr_value) + ) elif isinstance(self._learning_rate, float): # only create global lr_var once lr = self._global_learning_rate() if isinstance(lr, framework.Variable): return else: - self._learning_rate_map[framework.default_main_program( - )] = layers.create_global_var( + self._learning_rate_map[ + framework.default_main_program() + ] = layers.create_global_var( name=unique_name.generate("learning_rate"), shape=[1], value=float(self._learning_rate), dtype=_lr_dtype, - persistable=True) + persistable=True, + ) @framework.dygraph_only def set_lr(self, value): """ :api_attr: imperative - + Set the value of the learning rate manually in the optimizer. If the optimizer use LRScheduler, this API cannot be invoked, because it will lead to conflict. @@ -439,7 +506,7 @@ def set_lr(self, value): Returns: None - + Examples: .. code-block:: python @@ -465,7 +532,8 @@ def set_lr(self, value): if not isinstance(value, (int, float)): raise TypeError( "The type of 'value' in optimizer.set_lr must be float, but received %s." - % (type(value))) + % (type(value)) + ) if isinstance(self._learning_rate, LRScheduler): raise RuntimeError( "optimizer's learning rate can't be LRScheduler when invoke this API, because this will lead to conflict." @@ -475,27 +543,40 @@ def set_lr(self, value): if current_lr is not None: if in_dygraph_mode(): place = _current_expected_place() - _C_ops.full_(current_lr, list(current_lr.shape), float(value), - current_lr.dtype, place) + _C_ops.full_( + current_lr, + list(current_lr.shape), + float(value), + current_lr.dtype, + place, + ) elif _in_legacy_dygraph(): - _legacy_C_ops.fill_constant(current_lr, 'value', float(value), - 'dtype', current_lr.dtype, 'shape', - list(current_lr.shape)) + _legacy_C_ops.fill_constant( + current_lr, + 'value', + float(value), + 'dtype', + current_lr.dtype, + 'shape', + list(current_lr.shape), + ) else: global_block = framework.default_main_program().global_block() - global_block.append_op(type='fill_constant', - outputs={'Out': [current_lr]}, - attrs={ - 'dtype': current_lr.dtype, - 'shape': list(current_lr.shape), - 'value': float(value) - }, - stop_gradient=True) + global_block.append_op( + type='fill_constant', + outputs={'Out': [current_lr]}, + attrs={ + 'dtype': current_lr.dtype, + 'shape': list(current_lr.shape), + 'value': float(value), + }, + stop_gradient=True, + ) def get_lr(self): """ - Get current learning rate of optimizer. + Get current learning rate of optimizer. If 'LRScheduler' is not used, the return value is all the same. If 'LRScheduler' is used, the return value is the current scheduled learing rete. 
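The set_lr and get_lr hunks above are formatting-only; the behavior stays as documented: set_lr overwrites the global learning-rate variable and raises RuntimeError when an LRScheduler is attached, while get_lr returns the currently scheduled value. A minimal dygraph sketch of that contract (illustration only, not part of the patch; StepDecay and SGD are just one scheduler and optimizer picked for the example):

import paddle

linear = paddle.nn.Linear(10, 10)
adam = paddle.optimizer.Adam(learning_rate=0.01, parameters=linear.parameters())
print(adam.get_lr())   # 0.01, the float passed at construction
adam.set_lr(0.001)     # allowed because learning_rate is a plain float
print(adam.get_lr())   # 0.001

# With an LRScheduler attached, set_lr raises RuntimeError; step the scheduler instead.
sched = paddle.optimizer.lr.StepDecay(learning_rate=0.01, step_size=2)
sgd = paddle.optimizer.SGD(learning_rate=sched, parameters=linear.parameters())
print(sgd.get_lr())    # current value produced by the scheduler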
@@ -565,8 +646,7 @@ def _global_learning_rate(self, program=None): return self._learning_rate_map.get(program, None) def _append_optimize_op(self, block, param_and_grad): - """ append optimize operator to block and return all the added optimize_op - """ + """append optimize operator to block and return all the added optimize_op""" raise NotImplementedError( "Class \"Optimizer\" connot be used directly as an optimizer, please use its subclasses such as \"Adam\"" ) @@ -583,8 +663,8 @@ def _create_param_lr(self, param_and_grad): return self._global_learning_rate() else: with default_main_program()._lr_schedule_guard( - is_with_opt=True), framework.name_scope( - 'scale_with_param_lr'): + is_with_opt=True + ), framework.name_scope('scale_with_param_lr'): return self._global_learning_rate() * param_lr else: return self._global_learning_rate() @@ -611,14 +691,16 @@ def _finish_update(self, block, parameters_and_grads): """ pass - def _add_accumulator(self, - name, - param, - dtype=None, - fill_value=0.0, - shape=None, - type=None, - device=None): + def _add_accumulator( + self, + name, + param, + dtype=None, + fill_value=0.0, + shape=None, + type=None, + device=None, + ): """Utility function to add an accumulator for a parameter Args: @@ -630,13 +712,17 @@ def _add_accumulator(self, """ if self._name is not None: name = self._name + "_" + name - if (name in self._accumulators - and param.name in self._accumulators[name]): + if ( + name in self._accumulators + and param.name in self._accumulators[name] + ): if framework._non_static_mode(): return self._accumulators[name][param.name] raise Exception( "Accumulator {} already exists for parameter {}".format( - name, param.name)) + name, param.name + ) + ) if shape == None: shape = param.shape assert isinstance(self.helper, LayerHelper) @@ -650,20 +736,25 @@ def _add_accumulator(self, persistable=True, dtype=dtype or param.dtype, type=core.VarDesc.VarType.LOD_TENSOR - if framework._in_eager_without_dygraph_check() else - (param.type if type is None else type), + if framework._in_eager_without_dygraph_check() + else (param.type if type is None else type), shape=shape, - belong_to_optimizer=True) + belong_to_optimizer=True, + ) if device is None: device = self._get_device_for_param(param.name) with device_guard(device): self.helper.set_variable_initializer( - var, initializer=Constant(value=float(fill_value))) + var, initializer=Constant(value=float(fill_value)) + ) if framework._non_static_mode(): if len(self._accumulators_holder) > 0: - assert var_name in self._accumulators_holder, \ - "Optimizer set error, {} should in state dict".format( var_name ) + assert ( + var_name in self._accumulators_holder + ), "Optimizer set error, {} should in state dict".format( + var_name + ) var.set_value(self._accumulators_holder[var_name]) self._accumulators[name][param.name] = var @@ -681,11 +772,15 @@ def _get_accumulator(self, name, param): """ if self._name is not None: name = self._name + "_" + name - if (name not in self._accumulators - or param.name not in self._accumulators[name]): + if ( + name not in self._accumulators + or param.name not in self._accumulators[name] + ): raise Exception( "Accumulator {} does not exist for parameter {}".format( - name, param.name)) + name, param.name + ) + ) return self._accumulators[name][param.name] def _update_param_device_map(self, parameters_and_grads, target_block): @@ -693,13 +788,15 @@ def _update_param_device_map(self, parameters_and_grads, target_block): if param_and_grad[0].stop_gradient is False: param_name = 
param_and_grad[0].name ops = target_block.ops - device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName( + device_attr_name = ( + core.op_proto_and_checker_maker.kOpDeviceAttrName() ) for op in ops: input_arg_names = op.input_arg_names if param_name in input_arg_names: self._param_device_map[param_name] = op.attr( - device_attr_name) + device_attr_name + ) break def _get_device_for_param(self, param_name): @@ -708,7 +805,9 @@ def _get_device_for_param(self, param_name): device = self._param_device_map[param_name] return device - def _create_optimization_pass(self, parameters_and_grads): + def _create_optimization_pass( + self, parameters_and_grads, param_group_idx=0 + ): """Add optimization operators to update gradients to tensors. Args: @@ -736,10 +835,12 @@ def _create_optimization_pass(self, parameters_and_grads): target_block = global_block current_block = framework.default_main_program().current_block() if current_block.idx != global_block.idx: - assert current_block.backward_block_idx != -1, \ - "current block is not global_block, but it doesn't have backward block." + assert ( + current_block.backward_block_idx != -1 + ), "current block is not global_block, but it doesn't have backward block." target_block = framework.default_main_program().blocks[ - current_block.backward_block_idx] + current_block.backward_block_idx + ] start = len(target_block.ops) self.helper = LayerHelper(self.__class__.__name__) @@ -748,57 +849,91 @@ def _create_optimization_pass(self, parameters_and_grads): # NOTE: Multi Tensor support [ Momentum, Adam ] for dygraph mode if self._use_multi_tensor and self.__class__.__name__ in [ - 'Momentum', 'Adam' + 'Momentum', + 'Adam', ]: - if len(self._param_dict['FP32_LODTensor']) == 0 and len( - self._param_dict['FP16_LODTensor']) == 0: + if ( + len(self._param_dict['FP32_LODTensor'][param_group_idx]) == 0 + and len(self._param_dict['FP16_LODTensor'][param_group_idx]) + == 0 + ): if isinstance(parameters_and_grads, list): - self._multi_tensor_init(target_block, [ - p[0] - for p in parameters_and_grads if not p[0].stop_gradient - ]) + assert param_group_idx == 0 + self._multi_tensor_init( + target_block, + [ + p[0] + for p in parameters_and_grads + if not p[0].stop_gradient + ], + param_group_idx, + ) else: self._update_param_group(parameters_and_grads) - self._multi_tensor_init(target_block, [ - p[0] for p in parameters_and_grads['params'] - if not p[0].stop_gradient - ]) + self._multi_tensor_init( + target_block, + [ + p[0] + for p in parameters_and_grads['params'] + if not p[0].stop_gradient + ], + param_group_idx, + ) if framework._non_static_mode(): - self._append_optimize_multi_tensor_op(target_block, - parameters_and_grads) + self._append_optimize_multi_tensor_op( + target_block, + parameters_and_grads, + param_group_idx=param_group_idx, + ) else: - self._update_param_device_map(parameters_and_grads, - target_block) + self._update_param_device_map( + parameters_and_grads, target_block + ) # NOTE: Multi Tensor requires all parameters to be in the same device and program. # param_grad_list = [p_0,g_0,p_1,g_1,....] 
param_grad_list = [] for param_and_grad in parameters_and_grads: - if not param_and_grad[0].stop_gradient and param_and_grad[ - 1] is not None: + if ( + not param_and_grad[0].stop_gradient + and param_and_grad[1] is not None + ): param_grad_list.append(param_and_grad[0]) param_grad_list.append(param_and_grad[1]) with param_grad_list[0].block.program._optimized_guard( - param_grad_list), name_scope("optimizer"): + param_grad_list + ), name_scope("optimizer"): device = self._get_device_for_param(param_grad_list[0].name) with device_guard(device): self._append_optimize_multi_tensor_op( - target_block, parameters_and_grads) + target_block, + parameters_and_grads, + param_group_idx=param_group_idx, + ) else: if not framework._non_static_mode(): - params_grads_device_map = parameters_and_grads[ - 'params'] if isinstance(parameters_and_grads, - dict) else parameters_and_grads - self._update_param_device_map(params_grads_device_map, - target_block) + params_grads_device_map = ( + parameters_and_grads['params'] + if isinstance(parameters_and_grads, dict) + else parameters_and_grads + ) + self._update_param_device_map( + params_grads_device_map, target_block + ) if isinstance(parameters_and_grads, list): - self._create_accumulators(target_block, [ - p[0] for p in parameters_and_grads if not p[0].stop_gradient - ]) + self._create_accumulators( + target_block, + [ + p[0] + for p in parameters_and_grads + if not p[0].stop_gradient + ], + ) else: params_acc_dict = parameters_and_grads.copy() params_acc_dict['params'] = [ - p[0] for p in params_acc_dict['params'] + p[0] + for p in params_acc_dict['params'] if not p[0].stop_gradient ] self._create_accumulators(target_block, params_acc_dict) @@ -809,8 +944,9 @@ def _create_optimization_pass(self, parameters_and_grads): if param_and_grad[1] is None: continue if param_and_grad[0].stop_gradient is False: - self._append_optimize_op(target_block, - param_and_grad) + self._append_optimize_op( + target_block, param_and_grad + ) else: for param_and_grad in parameters_and_grads['params']: if param_and_grad[1] is None: @@ -818,25 +954,31 @@ def _create_optimization_pass(self, parameters_and_grads): if param_and_grad[0].stop_gradient is False: param_grad_dict = dict() param_grad_dict['params'] = param_and_grad - param_grad_dict.update({ - k: v - for k, v in parameters_and_grads.items() - if k != 'params' - }) - self._append_optimize_op(target_block, - param_grad_dict) + param_grad_dict.update( + { + k: v + for k, v in parameters_and_grads.items() + if k != 'params' + } + ) + self._append_optimize_op( + target_block, param_grad_dict + ) else: for param_and_grad in parameters_and_grads: if param_and_grad[1] is None: continue with param_and_grad[0].block.program._optimized_guard( - param_and_grad), name_scope("optimizer"): + param_and_grad + ), name_scope("optimizer"): if param_and_grad[0].stop_gradient is False: device = self._get_device_for_param( - param_and_grad[0].name) + param_and_grad[0].name + ) with device_guard(device): optimize_op = self._append_optimize_op( - target_block, param_and_grad) + target_block, param_and_grad + ) # Get custom finish ops for subclasses # FIXME: Need to fix this once we figure out how to handle dependencies @@ -848,12 +990,14 @@ def _create_optimization_pass(self, parameters_and_grads): def _append_dgc_ops(self, param_and_grad): pass - def backward(self, - loss, - startup_program=None, - parameters=None, - no_grad_set=None, - callbacks=None): + def backward( + self, + loss, + startup_program=None, + parameters=None, + 
no_grad_set=None, + callbacks=None, + ): """ The first part of ``minimize``, do auto-diff to append backward operations for the current program. @@ -879,14 +1023,13 @@ def backward(self, .. code-block:: python import paddle - import numpy as np - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.to_tensor(value) + x = paddle.arange(26, dtype="float32").reshape([2, 13]) + linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, + adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) - out = linear(a) + out = linear(x) out.backward() adam.step() adam.clear_grad() @@ -902,8 +1045,7 @@ def backward(self, self._dtype = loss.dtype if framework._non_static_mode(): - parameter_list = parameters if parameters \ - else self._parameter_list + parameter_list = parameters if parameters else self._parameter_list params_grads = [] for param in parameter_list: @@ -917,23 +1059,26 @@ def backward(self, if callbacks is None: callbacks = [error_clip_callback] else: - assert (isinstance(callbacks, list)) + assert isinstance(callbacks, list) program = loss.block.program - assert len(loss.shape) == 1 and loss.shape[0] == 1, \ - "The loss.shape should be (1L,), but the current loss.shape is {}. " \ + assert len(loss.shape) == 1 and loss.shape[0] == 1, ( + "The loss.shape should be (1L,), but the current loss.shape is {}. " "Maybe that you should call paddle.mean to process the current loss.".format( - loss.shape) - parameter_list = parameters if parameters \ - else self._parameter_list + loss.shape + ) + ) + parameter_list = parameters if parameters else self._parameter_list with program_guard(program, startup_program): from paddle.incubate.autograd.utils import prim_enabled + if prim_enabled(): - params_grads = append_backward_new([loss], parameter_list, - act_no_grad_set, - callbacks) + params_grads = append_backward_new( + [loss], parameter_list, act_no_grad_set, callbacks + ) else: - params_grads = append_backward(loss, parameter_list, - act_no_grad_set, callbacks) + params_grads = append_backward( + loss, parameter_list, act_no_grad_set, callbacks + ) # Note: since we can't use all_reduce_op now, # dgc_op should be the last op of one grad. self._append_dgc_ops(params_grads) @@ -954,11 +1099,9 @@ def apply_gradients(self, params_grads): .. code-block:: python import paddle - import numpy as np - inp = np.random.uniform(-0.1, 0.1, [10, 10]).astype("float32") + inp = paddle.uniform([10, 10], dtype="float32", min=-0.1, max=0.1) linear = paddle.nn.Linear(10, 10) - inp = paddle.to_tensor(inp) out = linear(inp) loss = paddle.mean(out) optimizer = paddle.optimizer.Adam(learning_rate=0.1, @@ -978,13 +1121,16 @@ def apply_gradients(self, params_grads): params_grads = append_gradient_clip_ops(params_grads) # Add regularization if any - params_grads = self.append_regularization_ops(params_grads, - self.regularization) + params_grads = self.append_regularization_ops( + params_grads, self.regularization + ) optimize_ops = self._create_optimization_pass(params_grads) return optimize_ops - def _apply_optimize(self, loss, startup_program, params_grads): + def _apply_optimize( + self, loss, startup_program, params_grads, param_group_idx=0 + ): """ Second part of `minimize`, appending optimization operators for given `params_grads` pairs. @@ -997,38 +1143,49 @@ def _apply_optimize(self, loss, startup_program, params_grads): list: A list of operators appended to the current program. 
""" if framework._non_static_mode(): - with program_guard(framework.default_main_program(), - framework.default_startup_program()): + with program_guard( + framework.default_main_program(), + framework.default_startup_program(), + ): if isinstance(params_grads, list): if self._grad_clip is not None: params_grads = self._grad_clip(params_grads) params_grads = self.append_regularization_ops( - params_grads, self.regularization) + params_grads, self.regularization + ) else: grad_clip = params_grads['grad_clip'] if grad_clip is not None: params_grads['params'] = grad_clip( - params_grads['params']) + params_grads['params'] + ) params_grads['params'] = self.append_regularization_ops( - params_grads['params'], self.regularization) - optimize_ops = self._create_optimization_pass(params_grads) + params_grads['params'], self.regularization + ) + optimize_ops = self._create_optimization_pass( + params_grads, param_group_idx=param_group_idx + ) else: + assert param_group_idx == 0 program = loss.block.program with program_guard(program, startup_program): optimize_ops = self.apply_gradients(params_grads) return optimize_ops def _create_regularization_of_grad(self, param, grad, regularization=None): - """ Create and add backward regularization Operators - + """Create and add backward regularization Operators + Function helper of append_regularization_ops. """ # If no gradient or no regularization is specified, then we don't need to do anything if grad is None or ( - (not hasattr(param, 'regularizer') or - (hasattr(param, 'regularizer') and param.regularizer is None)) - and regularization is None): + ( + not hasattr(param, 'regularizer') + or (hasattr(param, 'regularizer') and param.regularizer is None) + ) + and regularization is None + ): return grad regularization_term = None if hasattr(param, 'regularizer') and param.regularizer is not None: @@ -1040,9 +1197,7 @@ def _create_regularization_of_grad(self, param, grad, regularization=None): assert regularization_term is not None if framework.in_dygraph_mode(): - if grad.is_dense() and regularization_term.is_dense(): - return _C_ops.add_n([grad, regularization_term]) - return _legacy_C_ops.sum([grad, regularization_term]) + return _C_ops.add_n([grad, regularization_term]) elif framework._in_legacy_dygraph(): return _legacy_C_ops.sum([grad, regularization_term]) @@ -1057,7 +1212,8 @@ def _create_regularization_of_grad(self, param, grad, regularization=None): dtype=param.dtype, shape=param.shape, lod_level=param.lod_level, - type=core.VarDesc.VarType.LOD_TENSOR) + type=core.VarDesc.VarType.LOD_TENSOR, + ) inputs = {"X": [grad, regularization_term]} outputs = {"Out": [new_grad]} @@ -1065,9 +1221,9 @@ def _create_regularization_of_grad(self, param, grad, regularization=None): return new_grad - def append_regularization_ops(self, - parameters_and_grads, - regularization=None): + def append_regularization_ops( + self, parameters_and_grads, regularization=None + ): r"""Create and add backward regularization Operators Creates and adds backward regularization operators in the BlockDesc. 
@@ -1092,21 +1248,28 @@ def append_regularization_ops(self, if framework._non_static_mode(): for param, grad in parameters_and_grads: new_grad = self._create_regularization_of_grad( - param, grad, regularization) + param, grad, regularization + ) params_and_grads.append((param, new_grad)) else: repeate_regularizer = False with framework.name_scope('regularization'): for param, grad in parameters_and_grads: - if not repeate_regularizer and param.regularizer is not None and regularization is not None: + if ( + not repeate_regularizer + and param.regularizer is not None + and regularization is not None + ): repeate_regularizer = True logging.info( "If regularizer of a Parameter has been set by 'fluid.ParamAttr' or 'fluid.WeightNormParamAttr' already. " "The Regularization[%s] in Optimizer will not take effect, and it will only be applied to other Parameters!" - % regularization.__str__()) + % regularization.__str__() + ) with param.block.program._optimized_guard([param, grad]): new_grad = self._create_regularization_of_grad( - param, grad, regularization) + param, grad, regularization + ) params_and_grads.append((param, new_grad)) return params_and_grads @@ -1114,7 +1277,8 @@ def _get_no_grad_set(self, loss, no_grad_set=None): no_grad_set = _get_no_grad_set_name(no_grad_set) parameters = loss.block.program.global_block().all_parameters() param_no_trainable = set( - [param.name for param in parameters if param.stop_gradient is True]) + [param.name for param in parameters if param.stop_gradient is True] + ) # If the parameter is no trainable, it should not have a gradient. no_grad_set.update(param_no_trainable) @@ -1128,24 +1292,22 @@ def clear_grad(self, set_to_zero=True): If not, new gradient will accumulat on previous gradient. There are two method to clear grad: set_to_zero or delete grad. - + Args: set_to_zero (bool, optional): If set grads to zero or not, default is True. - + Returns: None - + Examples: .. code-block:: python - import numpy as np import paddle - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.to_tensor(value) + a = paddle.arange(26, dtype="float32").reshape([2, 13]) linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, + adam = paddle.optimizer.Adam(learning_rate = 0.01, parameters = linear.parameters()) out = linear(a) out.backward() @@ -1155,7 +1317,8 @@ def clear_grad(self, set_to_zero=True): """ param_list = [] if self._parameter_list is None or not isinstance( - self._parameter_list[0], dict): + self._parameter_list[0], dict + ): for p in self._parameter_list: if not p.stop_gradient: param_list.append(p) @@ -1172,11 +1335,9 @@ def clear_grad(self, set_to_zero=True): core.clear_gradients(param_list, set_to_zero) @imperative_base.no_grad - def minimize(self, - loss, - startup_program=None, - parameters=None, - no_grad_set=None): + def minimize( + self, loss, startup_program=None, parameters=None, no_grad_set=None + ): """ Add operations to minimize ``loss`` by updating ``parameters``. @@ -1195,13 +1356,13 @@ def minimize(self, tuple: tuple (optimize_ops, params_grads), A list of operators appended by minimize and a list of (param, grad) tensor pairs, param is ``Parameter``, grad is the gradient value corresponding to the parameter. - In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to - indicate program pruning. 
If so, the program will be pruned by ``feed`` and + In static graph mode, the returned tuple can be passed to ``fetch_list`` in ``Executor.run()`` to + indicate program pruning. If so, the program will be pruned by ``feed`` and ``fetch_list`` before run, see details in ``Executor``. Examples: .. code-block:: python - + import paddle linear = paddle.nn.Linear(10, 10) input = paddle.uniform(shape=[10, 10], min=-0.1, max=0.1) @@ -1221,17 +1382,18 @@ def minimize(self, """ assert isinstance(loss, Variable), "The loss should be an Tensor." - parameter_list = parameters if parameters \ - else self._parameter_list + parameter_list = parameters if parameters else self._parameter_list - params_grads = self.backward(loss, - startup_program=startup_program, - parameters=parameter_list, - no_grad_set=no_grad_set) + params_grads = self.backward( + loss, + startup_program=startup_program, + parameters=parameter_list, + no_grad_set=no_grad_set, + ) - optimize_ops = self._apply_optimize(loss, - startup_program=startup_program, - params_grads=params_grads) + optimize_ops = self._apply_optimize( + loss, startup_program=startup_program, params_grads=params_grads + ) return optimize_ops, params_grads @@ -1240,7 +1402,7 @@ def minimize(self, def step(self): """ Execute the optimizer and update parameters once. - + Returns: None @@ -1248,14 +1410,12 @@ def step(self): .. code-block:: python import paddle - import numpy as np - value = np.arange(26).reshape(2, 13).astype("float32") - a = paddle.to_tensor(value) + a = paddle.arange(26, dtype="float32").reshape([2, 13]) linear = paddle.nn.Linear(13, 5) # This can be any optimizer supported by dygraph. - adam = paddle.optimizer.Adam(learning_rate = 0.01, - parameters = linear.parameters()) + adam = paddle.optimizer.Adam(learning_rate = 0.01, + parameters = linear.parameters()) out = linear(a) out.backward() adam.step() @@ -1271,13 +1431,16 @@ def step(self): grad_var = param._grad_ivar() params_grads.append((param, grad_var)) - self._apply_optimize(loss=None, - startup_program=None, - params_grads=params_grads) + self._apply_optimize( + loss=None, + startup_program=None, + params_grads=params_grads, + param_group_idx=0, + ) else: # optimize parameters in groups - for param_group in self._param_groups: + for idx, param_group in enumerate(self._param_groups): params_grads = defaultdict(lambda: list()) for param in param_group['params']: if param.stop_gradient: @@ -1286,11 +1449,14 @@ def step(self): grad_var = param._grad_ivar() params_grads['params'].append((param, grad_var)) params_grads.update( - {k: v - for k, v in param_group.items() if k != 'params'}) - self._apply_optimize(loss=None, - startup_program=None, - params_grads=params_grads) + {k: v for k, v in param_group.items() if k != 'params'} + ) + self._apply_optimize( + loss=None, + startup_program=None, + params_grads=params_grads, + param_group_idx=idx, + ) def _add_param_group(self, param_group): """ @@ -1306,7 +1472,8 @@ def _add_param_group(self, param_group): elif isinstance(params, set): raise TypeError( "optimizer parameters should be in ordered collections," - "but received set, please use list instead.") + "but received set, please use list instead." 
+ ) else: param_group['params'] = list(params) @@ -1320,18 +1487,21 @@ def _add_param_group(self, param_group): if not param_set.isdisjoint(set(param_group['params'])): raise ValueError( - "some parameters appear in more than one parameter group") + "some parameters appear in more than one parameter group" + ) for param in param_group['params']: weight_decay = param_group['weight_decay'] if isinstance(weight_decay, float): from ..fluid.regularizer import L2Decay + regularization = L2Decay(weight_decay) else: regularization = weight_decay param.regularizer = regularization param.optimize_attr['learning_rate'] = param_group.get( - 'learning_rate', 1.) + 'learning_rate', 1.0 + ) self._param_groups.append(param_group) @@ -1345,7 +1515,7 @@ def _update_param_group(self, parameters): pass @framework.dygraph_only - def _multi_tensor_init(self, target_block, parameters): + def _multi_tensor_init(self, target_block, parameters, param_group_idx): """ All parameters used for optimizer (such as: parameters, master_weight, velocity_acc for momentum) calculations are grouped into a python list by data type (float16, float32). This function will be overridden in the corresponding optimizer file. @@ -1357,9 +1527,10 @@ def _multi_tensor_init(self, target_block, parameters): pass @framework.dygraph_only - def _append_optimize_multi_tensor_op(self, target_block, - parameters_and_grads): - """ + def _append_optimize_multi_tensor_op( + self, target_block, parameters_and_grads, param_group_idx + ): + """ For Multi Tensor, append optimize merged_operator to block. """ pass diff --git a/python/paddle/optimizer/rmsprop.py b/python/paddle/optimizer/rmsprop.py index 7205a434d388f8..10532c8f62846b 100644 --- a/python/paddle/optimizer/rmsprop.py +++ b/python/paddle/optimizer/rmsprop.py @@ -71,12 +71,12 @@ class RMSProp(Optimizer): Parameters: learning_rate (float|LRScheduler): The learning rate used to update ``Parameter``. It can be a float value or a LRScheduler. - rho(float): rho is :math:`\rho` in equation, default is 0.95. - epsilon(float): :math:`\epsilon` in equation is smoothing term to + rho(float, optional): rho is :math:`\rho` in equation, default is 0.95. + epsilon(float, optional): :math:`\epsilon` in equation is smoothing term to avoid division by zero, default is 1e-6. - momentum(float): :math:`\beta` in equation is the momentum term, + momentum(float, optional): :math:`\beta` in equation is the momentum term, default is 0.0. - centered(bool): If True, gradients are normalized by the estimated variance of + centered(bool, optional): If True, gradients are normalized by the estimated variance of the gradient; if False, by the uncentered second moment. Setting this to True may help with training, but is slightly more expensive in terms of computation and memory. Defaults to False. @@ -100,9 +100,6 @@ class RMSProp(Optimizer): name (str, optional): This parameter is used by developers to print debugging information. For details, please refer to :ref:`api_guide_Name`. Default is None. - Raises: - ValueError: If learning_rate, rho, epsilon, momentum are None. - Examples: .. 
code-block:: python @@ -146,16 +143,18 @@ class RMSProp(Optimizer): _mean_square_acc_str = "mean_square" _mean_grad_acc_str = "mean_grad" - def __init__(self, - learning_rate, - rho=0.95, - epsilon=1.0e-6, - momentum=0.0, - centered=False, - parameters=None, - weight_decay=None, - grad_clip=None, - name=None): + def __init__( + self, + learning_rate, + rho=0.95, + epsilon=1.0e-6, + momentum=0.0, + centered=False, + parameters=None, + weight_decay=None, + grad_clip=None, + name=None, + ): if learning_rate is None: raise ValueError("learning_rate is not set.") if rho is None: @@ -171,11 +170,13 @@ def __init__(self, if not 0.0 <= rho: raise ValueError("Invalid value of rho, expect rho >= 0.") - super(RMSProp, self).__init__(learning_rate=learning_rate, - parameters=parameters, - weight_decay=weight_decay, - grad_clip=grad_clip, - name=name) + super(RMSProp, self).__init__( + learning_rate=learning_rate, + parameters=parameters, + weight_decay=weight_decay, + grad_clip=grad_clip, + name=name, + ) self.type = "rmsprop" self._rho = rho @@ -208,49 +209,50 @@ def _append_optimize_op(self, block, param_and_grad): if isinstance(param_and_grad, dict): param_and_grad = self._update_param_group(param_and_grad) - momentum_acc = self._get_accumulator(self._momentum_acc_str, - param_and_grad[0]) - mean_square_acc = self._get_accumulator(self._mean_square_acc_str, - param_and_grad[0]) - mean_grad_acc = self._get_accumulator(self._mean_grad_acc_str, - param_and_grad[0]) - rmsprop_op = block.append_op(type=self.type, - inputs={ - "Param": - param_and_grad[0], - "Grad": - param_and_grad[1], - "Moment": - momentum_acc, - "MeanSquare": - mean_square_acc, - "MeanGrad": - mean_grad_acc, - "LearningRate": - self._create_param_lr(param_and_grad), - }, - outputs={ - "ParamOut": param_and_grad[0], - "MomentOut": momentum_acc, - "MeanSquareOut": mean_square_acc, - "MeanGradOut": mean_grad_acc - }, - attrs={ - "epsilon": self._epsilon, - "decay": self._rho, - "momentum": self._momentum, - "centered": self._centered - }, - stop_gradient=True) + momentum_acc = self._get_accumulator( + self._momentum_acc_str, param_and_grad[0] + ) + mean_square_acc = self._get_accumulator( + self._mean_square_acc_str, param_and_grad[0] + ) + mean_grad_acc = self._get_accumulator( + self._mean_grad_acc_str, param_and_grad[0] + ) + rmsprop_op = block.append_op( + type=self.type, + inputs={ + "Param": param_and_grad[0], + "Grad": param_and_grad[1], + "Moment": momentum_acc, + "MeanSquare": mean_square_acc, + "MeanGrad": mean_grad_acc, + "LearningRate": self._create_param_lr(param_and_grad), + }, + outputs={ + "ParamOut": param_and_grad[0], + "MomentOut": momentum_acc, + "MeanSquareOut": mean_square_acc, + "MeanGradOut": mean_grad_acc, + }, + attrs={ + "epsilon": self._epsilon, + "decay": self._rho, + "momentum": self._momentum, + "centered": self._centered, + }, + stop_gradient=True, + ) return rmsprop_op def _update_param_group(self, parameters): self._epsilon = parameters.get('epsilon', self._default_dict['epsilon']) self._rho = parameters.get('rho', self._default_dict['rho']) - self._momentum = parameters.get('momentum', - self._default_dict['momentum']) - self._centered = parameters.get('centered', - self._default_dict['centered']) + self._momentum = parameters.get( + 'momentum', self._default_dict['momentum'] + ) + self._centered = parameters.get( + 'centered', self._default_dict['centered'] + ) parameters = parameters.get('params') return parameters diff --git a/python/paddle/profiler/profiler.py b/python/paddle/profiler/profiler.py 
index c44d2f0f611ad3..03cfb3a9a42231 100644 --- a/python/paddle/profiler/profiler.py +++ b/python/paddle/profiler/profiler.py @@ -22,11 +22,16 @@ import json import paddle -from paddle.fluid.core import (_Profiler, _ProfilerResult, ProfilerOptions, - TracerEventType, enable_memory_recorder, - enable_input_shape_recorder, - disable_memory_recorder, - disable_input_shape_recorder) +from paddle.fluid.core import ( + _Profiler, + _ProfilerResult, + ProfilerOptions, + TracerEventType, + enable_memory_recorder, + enable_input_shape_recorder, + disable_memory_recorder, + disable_input_shape_recorder, +) from .utils import RecordEvent, wrap_optimizers from .profiler_statistic import StatisticData, _build_table, SortedKeys @@ -105,12 +110,14 @@ class ProfilerTarget(Enum): CUSTOM_DEVICE = 3 -def make_scheduler(*, - closed: int, - ready: int, - record: int, - repeat: int = 0, - skip_first: int = 0) -> Callable: +def make_scheduler( + *, + closed: int, + ready: int, + record: int, + repeat: int = 0, + skip_first: int = 0 +) -> Callable: r""" Return a scheduler function, which scheduler the :ref:`state ` according to the setting. The state transform confirms to: @@ -132,12 +139,12 @@ def make_scheduler(*, skip_first(int, optional): The number of first steps to drop, not participate in the state transform, and at ProfilerState.CLOSED state. Default value is 0. Returns: - A scheduler function, conforms to above state transform setting. The function will takes one parameter step_num, and returns corresponding ProfilerState. + A scheduler function, conforms to above state transform setting. The function will takes one parameter `step_num`, and returns corresponding ProfilerState. Examples: - 1. profiling range [2, 5] + 1. profiling range [2, 5]. - Assume batch 0: closed, batch 1: ready, batch [2, 5] record + Assume batch 0: closed, batch 1: ready, batch [2, 5] record. .. code-block:: python :name: code-example1 @@ -146,9 +153,9 @@ def make_scheduler(*, profiler.make_scheduler(closed=1, ready=1, record=4, repeat=1) - 2. profiling range [3,6], [9,12], [15,18]... + 2. profiling range [3,6], [9,12], [15,18]. - Assume batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat + Assume batch 0: skiped, batch 1: closed, batch 2: ready, batch [3,6]: record, repeat. .. code-block:: python :name: code-example2 @@ -164,7 +171,9 @@ def getScheduleState(step: int) -> ProfilerState: step = step - skip_first period_steps = closed + ready + record has_repeated = step // period_steps - if repeat > 0 and has_repeated >= repeat: # the period has repeated repeat times, return CLOSED state + if ( + repeat > 0 and has_repeated >= repeat + ): # the period has repeated repeat times, return CLOSED state return ProfilerState.CLOSED mod_step = step % period_steps if mod_step < closed: @@ -176,12 +185,19 @@ def getScheduleState(step: int) -> ProfilerState: return ProfilerState.RECORD else: return ProfilerState.RECORD_AND_RETURN - assert closed >= 0 and ready >= 0 and record > 0 and \ - repeat >= 0 and skip_first >= 0, "Invalid profiler scheduler arguments" + + assert ( + closed >= 0 + and ready >= 0 + and record > 0 + and repeat >= 0 + and skip_first >= 0 + ), "Invalid profiler scheduler arguments" if ready == 0: - warn("Profiler will record data after enabling profiler immediately, \ + warn( + "Profiler will record data after enabling profiler immediately, \ some data collected at the beginning of profiling may be 'noisy' because of overhead." 
- ) + ) return getScheduleState @@ -192,17 +208,18 @@ def _default_state_scheduler(step: int): return ProfilerState.RECORD -def export_chrome_tracing(dir_name: str, - worker_name: Optional[str] = None) -> Callable: +def export_chrome_tracing( + dir_name: str, worker_name: Optional[str] = None +) -> Callable: r""" Return a callable, used for outputing tracing data to chrome tracing format file. - The output file will be saved in directory ``dir_name``, and file name will be set as worker_name. - if worker_name is not set, the default name is [hostname]_[pid]. + The output file will be saved in directory ``dir_name``, and file name will be set as `worker_name`. + if `worker_name` is not set, the default name is `[hostname]_[pid]`. Args: dir_name(str): Directory to save profiling data. - worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid]. - + worker_name(str, optional): Prefix of the file name saved, default is `[hostname]_[pid]`. + Returns: A callable, which takes a Profiler object as parameter and calls its export method to save data to chrome tracing format file. @@ -226,32 +243,37 @@ def export_chrome_tracing(dir_name: str, os.makedirs(dir_name, exist_ok=True) except Exception: raise RuntimeError( - "Can not create directory '{}' for saving profiling results.". - format(dir_name)) + "Can not create directory '{}' for saving profiling results.".format( + dir_name + ) + ) def handle_fn(prof): nonlocal worker_name if not worker_name: - worker_name = "host_{}pid_{}".format(socket.gethostname(), - str(os.getpid())) + worker_name = "host_{}pid_{}".format( + socket.gethostname(), str(os.getpid()) + ) now = datetime.datetime.now() filename = '{}_time_{}.paddle_trace.json'.format( - worker_name, now.strftime('%Y_%m_%d_%H_%M_%S_%f')) + worker_name, now.strftime('%Y_%m_%d_%H_%M_%S_%f') + ) prof.export(os.path.join(dir_name, filename), "json") return handle_fn -def export_protobuf(dir_name: str, - worker_name: Optional[str] = None) -> Callable: +def export_protobuf( + dir_name: str, worker_name: Optional[str] = None +) -> Callable: r""" Return a callable, used for outputing tracing data to protobuf file. - The output file will be saved in directory ``dir_name``, and file name will be set as worker_name. - if worker_name is not set, the default name is [hostname]_[pid]. + The output file will be saved in directory ``dir_name``, and file name will be set as ``worker_name``. + if ``worker_name`` is not set, the default name is `[hostname]_[pid]`. Args: dir_name(str): Directory to save profiling data. - worker_name(str, optional): Prefix of the file name saved, default is [hostname]_[pid]. + worker_name(str, optional): Prefix of the file name saved, default is `[hostname]_[pid]`. Returns: A callable, which takes a Profiler object as parameter and calls its export method to save data to protobuf file. @@ -276,17 +298,21 @@ def export_protobuf(dir_name: str, os.makedirs(dir_name, exist_ok=True) except Exception: raise RuntimeError( - "Can not create directory '{}' for saving profiling results.". 
- format(dir_name)) + "Can not create directory '{}' for saving profiling results.".format( + dir_name + ) + ) def handle_fn(prof): nonlocal worker_name if not worker_name: - worker_name = "host_{}pid_{}".format(socket.gethostname(), - str(os.getpid())) + worker_name = "host_{}pid_{}".format( + socket.gethostname(), str(os.getpid()) + ) now = datetime.datetime.now() filename = '{}_time_{}.paddle_trace.pb'.format( - worker_name, now.strftime('%Y_%m_%d_%H_%M_%S_%f')) + worker_name, now.strftime('%Y_%m_%d_%H_%M_%S_%f') + ) prof.export(os.path.join(dir_name, filename), "pb") return handle_fn @@ -298,11 +324,15 @@ def _get_supported_targets() -> Iterable[ProfilerTarget]: """ if _Profiler.is_cupti_supported(): return [ - ProfilerTarget.CPU, ProfilerTarget.GPU, ProfilerTarget.CUSTOM_DEVICE + ProfilerTarget.CPU, + ProfilerTarget.GPU, + ProfilerTarget.CUSTOM_DEVICE, ] if _Profiler.is_cnpapi_supported(): return [ - ProfilerTarget.CPU, ProfilerTarget.MLU, ProfilerTarget.CUSTOM_DEVICE + ProfilerTarget.CPU, + ProfilerTarget.MLU, + ProfilerTarget.CUSTOM_DEVICE, ] return [ProfilerTarget.CPU, ProfilerTarget.CUSTOM_DEVICE] @@ -317,7 +347,7 @@ class Profiler: If not provided (None), the default scheduler will keep tracing until the profiler exits. If it is a tuple, it has two values start_batch and end_batch, which means profiling range [start_batch, end_batch). on_trace_ready (Callable, optional): Callable object, serves as callback function, and takes the Profiler object as parameter, which provides a way for users to do post-processing. - This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing ` (./profiler_log/). + This callable object will be called when ``scheduler`` returns ``ProfilerState.RECORD_AND_RETURN``. The default value is :ref:`export_chrome_tracing `. timer_only (bool, optional): If it is True, the cost of Dataloader and every step of the model will be count without profiling. Otherwise, the model will be timed and profiled. Default: False. record_shapes (bool, optional): If it is True, collect op's input shape information. Default: False. @@ -339,7 +369,7 @@ class Profiler: #train() p.step() - 2. profiling range [2,4], [7, 9], [11,13] + 2. profiling range [2,4], [7, 9], [11,13]. .. code-block:: python :name: code-example2 @@ -354,7 +384,7 @@ class Profiler: #train() p.step() - 3. Use profiler without context manager, and use default parameters + 3. Use profiler without context manager, and use default parameters. .. code-block:: python :name: code-example3 @@ -369,38 +399,37 @@ class Profiler: p.stop() p.summary() - 4. Use profiler to get throughput and cost of the model + 4. Use profiler to get throughput and cost of the model. .. 
code-block:: python :name: code-example-timer1 import paddle import paddle.profiler as profiler - + class RandomDataset(paddle.io.Dataset): def __init__(self, num_samples): self.num_samples = num_samples - + def __getitem__(self, idx): image = paddle.rand(shape=[100], dtype='float32') label = paddle.randint(0, 10, shape=[1], dtype='int64') return image, label - + def __len__(self): return self.num_samples - + class SimpleNet(paddle.nn.Layer): def __init__(self): super(SimpleNet, self).__init__() self.fc = paddle.nn.Linear(100, 10) - + def forward(self, image, label=None): return self.fc(image) - + dataset = RandomDataset(20 * 4) simple_net = SimpleNet() - opt = paddle.optimizer.SGD(learning_rate=1e-3, - parameters=simple_net.parameters()) + opt = paddle.optimizer.SGD(learning_rate=1e-3, parameters=simple_net.parameters()) BATCH_SIZE = 4 loader = paddle.io.DataLoader( dataset, @@ -433,36 +462,40 @@ def forward(self, image, label=None): # | ips | 1086.42904 | 1227.30604 | 959.92796 | """ - def __init__(self, - *, - targets: Optional[Iterable[ProfilerTarget]] = None, - scheduler: Union[Callable[[int], ProfilerState], tuple, - None] = None, - on_trace_ready: Optional[Callable[..., Any]] = None, - record_shapes: Optional[bool] = False, - profile_memory=False, - timer_only: Optional[bool] = False, - emit_nvtx: Optional[bool] = False, - custom_device_types: Optional[list] = []): + def __init__( + self, + *, + targets: Optional[Iterable[ProfilerTarget]] = None, + scheduler: Union[Callable[[int], ProfilerState], tuple, None] = None, + on_trace_ready: Optional[Callable[..., Any]] = None, + record_shapes: Optional[bool] = False, + profile_memory=False, + timer_only: Optional[bool] = False, + emit_nvtx: Optional[bool] = False, + custom_device_types: Optional[list] = [] + ): supported_targets = _get_supported_targets() if targets: self.targets = set(targets) for target in targets: if target not in supported_targets: self.targets.remove(target) - warn("Profiling {} is not supported in current context.". - format(target)) + warn( + "Profiling {} is not supported in current context.".format( + target + ) + ) else: self.targets = supported_targets profileoption = ProfilerOptions() if ProfilerTarget.CPU in self.targets: profileoption.trace_switch |= 1 if ProfilerTarget.GPU in self.targets: - profileoption.trace_switch |= (1 << 1) + profileoption.trace_switch |= 1 << 1 if ProfilerTarget.MLU in self.targets: - profileoption.trace_switch |= (1 << 2) + profileoption.trace_switch |= 1 << 2 if ProfilerTarget.CUSTOM_DEVICE in self.targets: - profileoption.trace_switch |= (1 << 3) + profileoption.trace_switch |= 1 << 3 if not custom_device_types: custom_device_types = paddle.device.get_all_custom_device_type() wrap_optimizers() @@ -474,17 +507,19 @@ def __init__(self, start_batch, end_batch = scheduler start_batch = max(start_batch, 0) if start_batch >= 1: - self.scheduler = make_scheduler(closed=max(start_batch - 1, 0), - ready=1, - record=(end_batch - - start_batch), - repeat=1) + self.scheduler = make_scheduler( + closed=max(start_batch - 1, 0), + ready=1, + record=(end_batch - start_batch), + repeat=1, + ) else: - self.scheduler = make_scheduler(closed=0, - ready=0, - record=(end_batch - - start_batch), - repeat=1) + self.scheduler = make_scheduler( + closed=0, + ready=0, + record=(end_batch - start_batch), + repeat=1, + ) else: self.scheduler = _default_state_scheduler @@ -531,7 +566,7 @@ def start(self): prof.stop() ''' - # Timing only without profiling + # Timing only without profiling. 
benchmark().begin() if not self.timer_only or self.emit_nvtx: utils._is_profiler_used = True @@ -550,9 +585,10 @@ def start(self): elif self.current_state == ProfilerState.RECORD_AND_RETURN: self.profiler.prepare() self.profiler.start() - self.record_event = RecordEvent(name="ProfileStep#{}".format( - self.step_num), - event_type=TracerEventType.ProfileStep) + self.record_event = RecordEvent( + name="ProfileStep#{}".format(self.step_num), + event_type=TracerEventType.ProfileStep, + ) self.record_event.begin() def stop(self): @@ -584,7 +620,7 @@ def stop(self): if self.profile_memory: disable_memory_recorder() # self.current_state -> CLOSED - # In this situation, RECORD state is regarded as RECORD_AND_RETURN + # In this situation, RECORD state is regarded as RECORD_AND_RETURN. if self.record_event: self.record_event.end() self.record_event = None @@ -594,7 +630,10 @@ def stop(self): ) self.profiler.start() self.profiler.stop() - if self.current_state == ProfilerState.RECORD or self.current_state == ProfilerState.RECORD_AND_RETURN: + if ( + self.current_state == ProfilerState.RECORD + or self.current_state == ProfilerState.RECORD_AND_RETURN + ): self.profiler_result = self.profiler.stop() if self.on_trace_ready: self.on_trace_ready(self) @@ -607,7 +646,7 @@ def step(self, num_samples: Optional[int] = None): Args: num_samples (int|None, optional): Specifies the batch size of every step of the model - that is used to compute throughput when timer_only is True. Default: None. + that is used to compute throughput when `timer_only` is True. Default: None. Examples: .. code-block:: python @@ -636,16 +675,17 @@ def step(self, num_samples: Optional[int] = None): self.step_num += 1 self.current_state = self.scheduler(self.step_num) self._trigger_action() - self.record_event = RecordEvent(name="ProfileStep#{}".format( - self.step_num), - event_type=TracerEventType.ProfileStep) + self.record_event = RecordEvent( + name="ProfileStep#{}".format(self.step_num), + event_type=TracerEventType.ProfileStep, + ) self.record_event.begin() def step_info(self, unit=None): r""" Get statistics for current step. If the function is called at certain iteration intervals, the result is the average of all steps between the previous call and - this call. Statistics are as follows: + this call. Statistics are as follows: 1. reader_cost: the cost of loading data measured in seconds. 
@@ -695,7 +735,9 @@ def _trigger_action(self): if self.current_state == ProfilerState.RECORD: # CLOSED -> RECORD self.profiler.prepare() self.profiler.start() - if self.current_state == ProfilerState.RECORD_AND_RETURN: # CLOSED -> RECORD_AND_RETURN + if ( + self.current_state == ProfilerState.RECORD_AND_RETURN + ): # CLOSED -> RECORD_AND_RETURN self.profiler.prepare() self.profiler.start() @@ -708,7 +750,9 @@ def _trigger_action(self): self.profiler.stop() if self.current_state == ProfilerState.RECORD: # READY -> RECORD self.profiler.start() - if self.current_state == ProfilerState.RECORD_AND_RETURN: # READY -> RECORD_AND_RETURN + if ( + self.current_state == ProfilerState.RECORD_AND_RETURN + ): # READY -> RECORD_AND_RETURN self.profiler.start() elif self.previous_state == ProfilerState.RECORD: @@ -724,21 +768,31 @@ def _trigger_action(self): ) self.profiler.stop() self.profiler.prepare() - if self.current_state == ProfilerState.RECORD_AND_RETURN: # RECORD -> RECORD_AND_RETURN + if ( + self.current_state == ProfilerState.RECORD_AND_RETURN + ): # RECORD -> RECORD_AND_RETURN pass else: assert self.previous_state == ProfilerState.RECORD_AND_RETURN - if self.current_state == ProfilerState.CLOSED: # RECORD_AND_RETURN -> CLOSED + if ( + self.current_state == ProfilerState.CLOSED + ): # RECORD_AND_RETURN -> CLOSED self.profiler_result = self.profiler.stop() - if self.current_state == ProfilerState.READY: # RECORD_AND_RETURN -> READY + if ( + self.current_state == ProfilerState.READY + ): # RECORD_AND_RETURN -> READY self.profiler_result = self.profiler.stop() self.profiler.prepare() - if self.current_state == ProfilerState.RECORD: # RECORD_AND_RETURN -> RECORD + if ( + self.current_state == ProfilerState.RECORD + ): # RECORD_AND_RETURN -> RECORD self.profiler_result = self.profiler.stop() self.profiler.prepare() self.profiler.start() - if self.current_state == ProfilerState.RECORD_AND_RETURN: # RECORD_AND_RETURN -> RECORD_AND_RETURN + if ( + self.current_state == ProfilerState.RECORD_AND_RETURN + ): # RECORD_AND_RETURN -> RECORD_AND_RETURN self.profiler_result = self.profiler.stop() self.profiler.prepare() self.profiler.start() @@ -751,7 +805,7 @@ def export(self, path="", format="json"): Args: path(str): file path of the output. - format(str, optional): output format, can be chosen from ['json', 'pb], 'json' for chrome tracing and 'pb' for protobuf, default value is "json". + format(str, optional): output format, can be chosen from ['json', 'pb'], 'json' for chrome tracing and 'pb' for protobuf, default value is 'json'. Examples: @@ -773,12 +827,14 @@ def export(self, path="", format="json"): if self.profiler_result: self.profiler_result.save(path, format) - def summary(self, - sorted_by=SortedKeys.CPUTotal, - op_detail=True, - thread_sep=False, - time_unit='ms', - views=None): + def summary( + self, + sorted_by=SortedKeys.CPUTotal, + op_detail=True, + thread_sep=False, + time_unit='ms', + views=None, + ): r""" Print the Summary table. Currently support overview, model, distributed, operator, memory manipulation and userdefined summary. 
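The profiler hunks above only reflow the state-transition checks in _trigger_action and the export/summary helpers; the CLOSED, READY, RECORD and RECORD_AND_RETURN transitions are unchanged. For reference, a typical driver loop consistent with the docstring examples in this file (illustrative sketch, not part of the patch):

import paddle.profiler as profiler

prof = profiler.Profiler(
    targets=[profiler.ProfilerTarget.CPU],
    scheduler=(2, 5),      # a (start_batch, end_batch) tuple profiles batches [2, 5)
    on_trace_ready=profiler.export_chrome_tracing('./profiler_log'),
)
prof.start()
for step in range(10):
    # run one training batch here
    prof.step()
prof.stop()
prof.summary(sorted_by=profiler.SortedKeys.CPUTotal, time_unit='ms')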
@@ -812,14 +868,18 @@ def summary(self, if self.profiler_result: statistic_data = StatisticData( self.profiler_result.get_data(), - self.profiler_result.get_extra_info()) + self.profiler_result.get_extra_info(), + ) print( - _build_table(statistic_data, - sorted_by=sorted_by, - op_detail=op_detail, - thread_sep=thread_sep, - time_unit=time_unit, - views=views)) + _build_table( + statistic_data, + sorted_by=sorted_by, + op_detail=op_detail, + thread_sep=thread_sep, + time_unit=time_unit, + views=views, + ) + ) def get_profiler(config_path): @@ -852,17 +912,20 @@ def get_profiler(config_path): method = getattr(module, key) if not use_direct: translated_config_dict['scheduler'] = method( - *value['args'], **value['kwargs']) + *value['args'], **value['kwargs'] + ) else: translated_config_dict['scheduler'] = method else: translated_config_dict['scheduler'] = [ - config_dict['scheduler'][0], config_dict['scheduler'][1] + config_dict['scheduler'][0], + config_dict['scheduler'][1], ] except: print( - 'Set scheduler parameter error, use default parameter instead.') + 'Set scheduler parameter error, use default parameter instead.' + ) translated_config_dict['scheduler'] = None if "on_trace_ready" in config_dict: try: @@ -874,7 +937,8 @@ def get_profiler(config_path): method = getattr(module, key) if not use_direct: translated_config_dict['on_trace_ready'] = method( - *value['args'], **value['kwargs']) + *value['args'], **value['kwargs'] + ) else: translated_config_dict['on_trace_ready'] = method except: diff --git a/python/paddle/profiler/utils.py b/python/paddle/profiler/utils.py index c0146fe92763fe..efe3975f144524 100644 --- a/python/paddle/profiler/utils.py +++ b/python/paddle/profiler/utils.py @@ -18,16 +18,19 @@ from contextlib import ContextDecorator from paddle.fluid import core -from paddle.fluid.core import (_RecordEvent, TracerEventType) +from paddle.fluid.core import _RecordEvent, TracerEventType _is_profiler_used = False _has_optimizer_wrapped = False _AllowedEventTypeList = [ - TracerEventType.Dataloader, TracerEventType.ProfileStep, - TracerEventType.Forward, TracerEventType.Backward, - TracerEventType.Optimization, TracerEventType.PythonOp, - TracerEventType.PythonUserDefined + TracerEventType.Dataloader, + TracerEventType.ProfileStep, + TracerEventType.Forward, + TracerEventType.Backward, + TracerEventType.Optimization, + TracerEventType.PythonOp, + TracerEventType.PythonUserDefined, ] @@ -36,8 +39,10 @@ class RecordEvent(ContextDecorator): Interface for recording a time range by user defined. Args: - name(str): Name of the record event - event_type(TracerEventType, optional): Optional, default value is TracerEventType.PythonUserDefined. It is reserved for internal purpose, and it is better not to specify this parameter. + name (str): Name of the record event. + event_type (TracerEventType, optional): Optional, default value is + `TracerEventType.PythonUserDefined`. It is reserved for internal + purpose, and it is better not to specify this parameter. Examples: .. code-block:: python @@ -59,13 +64,14 @@ class RecordEvent(ContextDecorator): record_event.end() **Note**: - RecordEvent will take effect only when :ref:`Profiler ` is on and at the state of RECORD. + RecordEvent will take effect only when :ref:`Profiler ` is on and at the state of `RECORD`. 
""" def __init__( - self, - name: str, - event_type: TracerEventType = TracerEventType.PythonUserDefined): + self, + name: str, + event_type: TracerEventType = TracerEventType.PythonUserDefined, + ): self.name = name self.event_type = event_type self.event = None @@ -98,8 +104,12 @@ def begin(self): if not _is_profiler_used: return if self.event_type not in _AllowedEventTypeList: - warn("Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\ - can be recorded.".format(*_AllowedEventTypeList)) + warn( + "Only TracerEvent Type in [{}, {}, {}, {}, {}, {},{}]\ + can be recorded.".format( + *_AllowedEventTypeList + ) + ) self.event = None else: self.event = _RecordEvent(self.name, self.event_type) @@ -134,7 +144,7 @@ def load_profiler_result(filename: str): filename(str): Name of the exported protobuf file of profiler data. Returns: - ProfilerResult object, which stores profiling data. + ``ProfilerResult`` object, which stores profiling data. Examples: .. code-block:: python @@ -158,14 +168,13 @@ def in_profiler_mode(): def wrap_optimizers(): - def optimizer_warpper(func): - @functools.wraps(func) def warpper(*args, **kwargs): if in_profiler_mode(): - with RecordEvent('Optimization Step', - event_type=TracerEventType.Optimization): + with RecordEvent( + 'Optimization Step', event_type=TracerEventType.Optimization + ): return func(*args, **kwargs) else: return func(*args, **kwargs) @@ -176,6 +185,7 @@ def warpper(*args, **kwargs): if _has_optimizer_wrapped == True: return import paddle.optimizer as optimizer + for classname in optimizer.__all__: if classname != 'Optimizer': classobject = getattr(optimizer, classname) diff --git a/python/paddle/reader/decorator.py b/python/paddle/reader/decorator.py index 981f6e9253c06a..20874215ddc268 100644 --- a/python/paddle/reader/decorator.py +++ b/python/paddle/reader/decorator.py @@ -51,31 +51,31 @@ def cache(reader): """ - Cache the reader data into memory. + Cache the reader data into memory. - Be careful that this method may take long time to process, - and consume lots of memory. :code:`reader()` would only - call once. + Be careful that this method may take long time to process, + and consume lots of memory. :code:`reader()` would only + call once. Args: - reader (generator): a reader object which yields + reader (generator): a reader object which yields data each time. Returns: generator: a decorated reader object which yields data from cached memory. - + Examples: .. code-block:: python import paddle - + def reader(): for i in range(3): yield i - + # All data is cached into memory cached_reader = paddle.io.cache(reader) - + # Output: 0 1 2 for i in cached_reader(): print(i) @@ -100,10 +100,10 @@ def map_readers(func, *readers): Args: - func: a function to read data and compute result, the output of this function + func: a function to read data and compute result, the output of this function will be set as the output of the resulted data reader. readers (Reader|list of Reader): list of readers whose outputs will be used as arguments of func. - + Returns: the resulted data reader (Reader) @@ -138,9 +138,9 @@ def shuffle(reader, buf_size): This API creates a decorated reader that outputs the shuffled data. - The output data from the origin reader will be saved into a buffer, + The output data from the origin reader will be saved into a buffer, and then shuffle the data. The size of buffer is determined by argument buf_size. - + Args: reader(callable): the original reader whose data will be shuffled. buf_size(int): the size of shuffled buffer. 
@@ -255,18 +255,15 @@ def compose(*readers, **kwargs): (1, 2, 3, 4, 5) Args: - readers (Reader|list of Reader): readers that will be composed together. + readers (Reader|list of Reader): readers that will be composed together. check_alignment(bool, optional): Indicates whether the input readers are checked for alignment. If True, whether input readers are aligned correctly will be checked, else alignment will not be checkout and trailing outputs will be discarded. Defaults to True. - Returns: + Returns: the new data reader (Reader). - Raises: - ComposeNotAligned: outputs of readers are not aligned. This will not raise if check_alignment is set to False. - Examples: .. code-block:: python @@ -284,7 +281,7 @@ def make_tuple(x): if isinstance(x, tuple): return x else: - return (x, ) + return (x,) def reader(): rs = [] @@ -299,7 +296,8 @@ def reader(): if o is None: # None will be not be present if compose is aligned raise ComposeNotAligned( - "outputs of readers are not aligned.") + "outputs of readers are not aligned." + ) yield sum(list(map(make_tuple, outputs)), ()) return reader @@ -319,25 +317,25 @@ def buffered(reader, size): Returns: generator: the buffered data reader. - + Examples: .. code-block:: python import paddle - + def reader(): for i in range(3): yield i - + # Create a buffered reader, and the buffer size is 2. buffered_reader = paddle.io.buffered(reader, 2) - + # Output: 0 1 2 for i in buffered_reader(): print(i) """ - class EndSignal(): + class EndSignal: pass end = EndSignal() @@ -350,10 +348,13 @@ def read_worker(r, q): def data_reader(): r = reader() q = Queue(maxsize=size) - t = Thread(target=read_worker, args=( - r, - q, - )) + t = Thread( + target=read_worker, + args=( + r, + q, + ), + ) t.daemon = True t.start() e = q.get() @@ -368,8 +369,8 @@ def firstn(reader, n): """ paddle.fluid.io.firstn ( :ref:`api_fluid_io_firstn` ) is recommended to use, and paddle.reader.firstn is an alias. - - This API creates a decorated reader, and limits the max number of + + This API creates a decorated reader, and limits the max number of samples that reader could return. Args: @@ -390,7 +391,7 @@ def reader(): firstn_reader = fluid.io.firstn(reader, 5) for e in firstn_reader(): print(e) - # the outputs are: 0 1 2 3 4 + # the outputs are: 0 1 2 3 4 """ # TODO(yuyang18): Check if just drop the reader, could clean the opened @@ -405,7 +406,7 @@ def firstn_reader(): return firstn_reader -class XmapEndSignal(): +class XmapEndSignal: pass @@ -415,14 +416,14 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False): Args: mapper (callable): a function to map the data from reader. - reader (callable): a data reader which yields the data. + reader (callable): a data reader which yields the data. process_num (int): thread number to handle original sample. - buffer_size (int): size of the queue to read data in. - order (bool): whether to keep the data order from original reader. + buffer_size (int): size of the queue to read data in. + order (bool): whether to keep the data order from original reader. Default False. Returns: - callable: a decorated reader with data mapping. + callable: a decorated reader with data mapping. 
""" end = XmapEndSignal() @@ -477,8 +478,11 @@ def xreader(): t.start() # start several handle_workers target = order_handle_worker if order else handle_worker - args = (in_queue, out_queue, mapper, - out_order) if order else (in_queue, out_queue, mapper) + args = ( + (in_queue, out_queue, mapper, out_order) + if order + else (in_queue, out_queue, mapper) + ) workers = [] for i in range(process_num): worker = Thread(target=target, args=args) @@ -505,17 +509,17 @@ def xreader(): def multiprocess_reader(readers, use_pipe=True, queue_size=1000): """ This API use python ``multiprocessing`` to read data from ``readers`` parallelly, - and then ``multiprocess.Queue`` or ``multiprocess.Pipe`` is used to merge - these data. A separate process will be created for each reader in the - ``readers`` list, please guarantee every reader can work independently + and then ``multiprocess.Queue`` or ``multiprocess.Pipe`` is used to merge + these data. A separate process will be created for each reader in the + ``readers`` list, please guarantee every reader can work independently to avoid conflicts in parallel environment. - - ``Multiprocess.Queue`` require the rw access right to /dev/shm, and it's not supported + + ``Multiprocess.Queue`` require the rw access right to /dev/shm, and it's not supported in some platforms. Parameters: - readers (list( ``generator`` ) | tuple( ``generator`` )): a python ``generator`` list + readers (list( ``generator`` ) | tuple( ``generator`` )): a python ``generator`` list used to read input data use_pipe (bool, optional): control the inner API used to implement the multi-processing, default True - use ``multiprocess.Pipe`` which is recommended @@ -534,16 +538,16 @@ def multiprocess_reader(readers, use_pipe=True, queue_size=1000): import paddle.fluid as fluid from paddle.fluid.io import multiprocess_reader import numpy as np - + sample_files = ['sample_file_1', 'sample_file_2'] - + def fake_input_files(): with open(sample_files[0], 'w') as f: np.savez(f, a=np.array([1, 2]), b=np.array([3, 4]), c=np.array([5, 6]), d=np.array([7, 8])) with open(sample_files[1], 'w') as f: np.savez(f, a=np.array([9, 10]), b=np.array([11, 12]), c=np.array([13, 14])) - - + + def generate_reader(file_name): # load data file def _impl(): @@ -551,28 +555,28 @@ def _impl(): for item in sorted(data.files): yield data[item], return _impl - + if __name__ == '__main__': # generate sample input files fake_input_files() - + with fluid.program_guard(fluid.Program(), fluid.Program()): place = fluid.CPUPlace() # the 1st 2 is batch size - image = fluid.data(name='image', dtype='int64', shape=[2, 1, 2]) + image = fluid.data(name='image', dtype='int64', shape=[2, 1, 2]) fluid.layers.Print(image) # print detailed tensor info of image variable - + reader = fluid.io.PyReader(feed_list=[image], capacity=2) - + decorated_reader = multiprocess_reader( [generate_reader(sample_files[0]), generate_reader(sample_files[1])], False) - + reader.decorate_sample_generator(decorated_reader, batch_size=2, places=[place]) - + exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) - + for data in reader(): res = exe.run(feed=data, fetch_list=[image]) print(res[0]) @@ -586,7 +590,8 @@ def _impl(): if sys.platform == 'win32': raise NotImplementedError( - "The multiprocess_reader method is not supported on windows.") + "The multiprocess_reader method is not supported on windows." + ) # ujson is ultra fast json encoder and decoder written in pure C with bindings for Python 3.6+. 
try: @@ -594,11 +599,13 @@ def _impl(): except Exception as e: warnings.warn( "The `ujson` module is not found, use the `json` module, `ujson` encodes and decodes faster, " - "you can install `ujson` through `pip install ujson`.") + "you can install `ujson` through `pip install ujson`." + ) import json - assert isinstance(readers, (list, tuple)) and len(readers) > 0, ( - "`readers` must be list or tuple.") + assert ( + isinstance(readers, (list, tuple)) and len(readers) > 0 + ), "`readers` must be list or tuple." def _read_into_queue(reader, queue): try: @@ -614,8 +621,9 @@ def _read_into_queue(reader, queue): def queue_reader(): queue = fork_context.Queue(queue_size) for reader in readers: - p = fork_context.Process(target=_read_into_queue, - args=(reader, queue)) + p = fork_context.Process( + target=_read_into_queue, args=(reader, queue) + ) p.start() reader_num = len(readers) @@ -656,8 +664,9 @@ def pipe_reader(): for reader in readers: parent_conn, child_conn = fork_context.Pipe() conns.append(parent_conn) - p = fork_context.Process(target=_read_into_pipe, - args=(reader, child_conn)) + p = fork_context.Process( + target=_read_into_pipe, args=(reader, child_conn) + ) p.start() reader_num = len(readers) diff --git a/python/paddle/regularizer.py b/python/paddle/regularizer.py index 586ae0f988c2e5..38060b8233fdba 100644 --- a/python/paddle/regularizer.py +++ b/python/paddle/regularizer.py @@ -20,30 +20,30 @@ class L1Decay(fluid.regularizer.L1Decay): r""" Implement the L1 Weight Decay Regularization, which encourages the weights to be sparse. - - It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). - When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in - ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has - higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined + + It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). + When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in + ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has + higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined in its ParamAttr, then the regularizer in Optimizer will be ignored. Otherwise the regularizer in Optimizer will be used. - + In the implementation, the loss function of L1 Weight Decay Regularization is as follows: - + .. math:: loss = coeff * reduce\_sum(abs(x)) Args: coeff(float, optional): regularization coeff. Default:0.0. - + Examples: .. code-block:: python # Example1: set Regularizer in optimizer import paddle from paddle.regularizer import L1Decay - import numpy as np + linear = paddle.nn.Linear(10, 10) inp = paddle.rand(shape=[10, 10], dtype="float32") out = linear(inp) @@ -82,14 +82,14 @@ def __init__(self, coeff=0.0): class L2Decay(fluid.regularizer.L2Decay): r""" Implement the L2 Weight Decay Regularization, which helps to prevent the model over-fitting. - - It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). - When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in - ``optimizer`` , it takes effect for all trainable parameters. 
When set together, ``ParamAttr`` has - higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined + + It can be set in :ref:`api_paddle_ParamAttr` or ``optimizer`` (such as :ref:`api_paddle_optimizer_Momentum` ). + When set in ``ParamAttr`` , it only takes effect for trainable parameters in this layer. When set in + ``optimizer`` , it takes effect for all trainable parameters. When set together, ``ParamAttr`` has + higher priority than ``optimizer`` , which means that for a trainable parameter, if regularizer is defined in its ParamAttr, then the regularizer in Optimizer will be ignored. Otherwise the regularizer in Optimizer will be used. - + In the implementation, the loss function of L2 Weight Decay Regularization is as follows: .. math:: @@ -98,14 +98,13 @@ class L2Decay(fluid.regularizer.L2Decay): Args: regularization_coeff(float, optional): regularization coeff. Default:0.0 - + Examples: .. code-block:: python # Example1: set Regularizer in optimizer import paddle from paddle.regularizer import L2Decay - import numpy as np linear = paddle.nn.Linear(10, 10) inp = paddle.rand(shape=[10, 10], dtype="float32") out = linear(inp) diff --git a/python/paddle/signal.py b/python/paddle/signal.py index 656605f1bf2b7c..6a4de719a9c8ee 100644 --- a/python/paddle/signal.py +++ b/python/paddle/signal.py @@ -39,15 +39,15 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): with shape `[..., seq_length]` or `[seq_length, ...]`. frame_length (int): Length of the frame and `0 < frame_length <= x.shape[axis]`. hop_length (int): Number of steps to advance between adjacent frames - and `0 < hop_length`. + and `0 < hop_length`. axis (int, optional): Specify the axis to operate on the input Tensors. Its value should be 0(the first dimension) or -1(the last dimension). If not - specified, the last axis is used by default. + specified, the last axis is used by default. Returns: The output frames tensor with shape `[..., frame_length, num_frames]` if `axis==-1`, otherwise `[num_frames, frame_length, ...]` where - + `num_framse = 1 + (x.shape[axis] - frame_length) // hop_length` Examples: @@ -56,7 +56,7 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): import paddle from paddle.signal import frame - + # 1D x = paddle.arange(8) y0 = frame(x, frame_length=4, hop_length=2, axis=-1) # [4, 3] @@ -124,7 +124,8 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): if frame_length > x.shape[axis]: raise ValueError( f'Attribute frame_length should be less equal than sequence length, ' - f'but got ({frame_length}) > ({x.shape[axis]}).') + f'but got ({frame_length}) > ({x.shape[axis]}).' 
+ ) op_type = 'frame' @@ -132,25 +133,33 @@ def frame(x, frame_length, hop_length, axis=-1, name=None): return _C_ops.frame(x, frame_length, hop_length, axis) if _in_legacy_dygraph(): - attrs = ('frame_length', frame_length, 'hop_length', hop_length, 'axis', - axis) + attrs = ( + 'frame_length', + frame_length, + 'hop_length', + hop_length, + 'axis', + axis, + ) op = getattr(_legacy_C_ops, op_type) out = op(x, *attrs) else: check_variable_and_dtype( - x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], - op_type) + x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], op_type + ) helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type=op_type, - inputs={'X': x}, - attrs={ - 'frame_length': frame_length, - 'hop_length': hop_length, - 'axis': axis - }, - outputs={'Out': out}) + helper.append_op( + type=op_type, + inputs={'X': x}, + attrs={ + 'frame_length': frame_length, + 'hop_length': hop_length, + 'axis': axis, + }, + outputs={'Out': out}, + ) return out @@ -163,10 +172,10 @@ def overlap_add(x, hop_length, axis=-1, name=None): with shape `[..., frame_length, num_frames]` or `[num_frames, frame_length ...]`. hop_length (int): Number of steps to advance between adjacent frames and - `0 < hop_length <= frame_length`. + `0 < hop_length <= frame_length`. axis (int, optional): Specify the axis to operate on the input Tensors. Its value should be 0(the first dimension) or -1(the last dimension). If not - specified, the last axis is used by default. + specified, the last axis is used by default. Returns: The output frames tensor with shape `[..., seq_length]` if `axis==-1`, @@ -180,7 +189,7 @@ def overlap_add(x, hop_length, axis=-1, name=None): import paddle from paddle.signal import overlap_add - + # 2D x0 = paddle.arange(16).reshape([8, 2]) # [[0 , 1 ], @@ -205,7 +214,7 @@ def overlap_add(x, hop_length, axis=-1, name=None): y0 = overlap_add(x0, hop_length=2, axis=-1) # [2, 1, 10] x1 = paddle.arange(32).reshape([2, 8, 1, 2]) - y1 = overlap_add(x1, hop_length=2, axis=0) # [10, 1, 2] + y1 = overlap_add(x1, hop_length=2, axis=0) # [10, 1, 2] """ if axis not in [0, -1]: raise ValueError(f'Unexpected axis: {axis}. It should be 0 or -1.') @@ -225,32 +234,34 @@ def overlap_add(x, hop_length, axis=-1, name=None): out = op(x, *attrs) else: check_variable_and_dtype( - x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], - op_type) + x, 'x', ['int32', 'int64', 'float16', 'float32', 'float64'], op_type + ) helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type=op_type, - inputs={'X': x}, - attrs={ - 'hop_length': hop_length, - 'axis': axis - }, - outputs={'Out': out}) + helper.append_op( + type=op_type, + inputs={'X': x}, + attrs={'hop_length': hop_length, 'axis': axis}, + outputs={'Out': out}, + ) return out -def stft(x, - n_fft, - hop_length=None, - win_length=None, - window=None, - center=True, - pad_mode='reflect', - normalized=False, - onesided=True, - name=None): +def stft( + x, + n_fft, + hop_length=None, + win_length=None, + window=None, + center=True, + pad_mode='reflect', + normalized=False, + onesided=True, + name=None, +): r""" + Short-time Fourier transform (STFT). The STFT computes the discrete Fourier transforms (DFT) of short overlapping @@ -263,11 +274,14 @@ def stft(x, Where: - :math:`t`: The :math:`t`-th input window. 
+ - :math:`\omega`: Frequency :math:`0 \leq \omega < \text{n\_fft}` for `onesided=False`, - or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`. + or :math:`0 \leq \omega < \lfloor \text{n\_fft} / 2 \rfloor + 1` for `onesided=True`. + - :math:`N`: Value of `n_fft`. - - :math:`H`: Value of `hop_length`. - + + - :math:`H`: Value of `hop_length`. + Args: x (Tensor): The input data which is a 1-dimensional or 2-dimensional Tensor with shape `[..., seq_length]`. It can be a real-valued or a complex Tensor. @@ -292,10 +306,10 @@ def stft(x, to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - The complex STFT output tensor with shape `[..., n_fft//2 + 1, num_frames]`( - real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]`( - `onesided` is `False`) - + The complex STFT output tensor with shape `[..., n_fft//2 + 1, num_frames]` + (real-valued input and `onesided` is `True`) or `[..., n_fft, num_frames]` + (`onesided` is `False`) + Examples: .. code-block:: python @@ -311,14 +325,17 @@ def stft(x, x = paddle.randn([8, 48000], dtype=paddle.float64) + \ paddle.randn([8, 48000], dtype=paddle.float64)*1j # [8, 48000] complex128 y1 = stft(x, n_fft=512, center=False, onesided=False) # [8, 512, 372] + """ - check_variable_and_dtype(x, 'x', - ['float32', 'float64', 'complex64', 'complex128'], - 'stft') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'complex64', 'complex128'], 'stft' + ) x_rank = len(x.shape) - assert x_rank in [1, 2], \ - f'x should be a 1D or 2D real tensor, but got rank of x is {x_rank}' + assert x_rank in [ + 1, + 2, + ], f'x should be a 1D or 2D real tensor, but got rank of x is {x_rank}' if x_rank == 1: # (batch, seq_length) x = x.unsqueeze(0) @@ -326,69 +343,77 @@ def stft(x, if hop_length is None: hop_length = int(n_fft // 4) - assert hop_length > 0, \ - f'hop_length should be > 0, but got {hop_length}.' + assert hop_length > 0, f'hop_length should be > 0, but got {hop_length}.' if win_length is None: win_length = n_fft if _non_static_mode(): - assert 0 < n_fft <= x.shape[-1], \ - f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.' + assert ( + 0 < n_fft <= x.shape[-1] + ), f'n_fft should be in (0, seq_length({x.shape[-1]})], but got {n_fft}.' - assert 0 < win_length <= n_fft, \ - f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.' + assert ( + 0 < win_length <= n_fft + ), f'win_length should be in (0, n_fft({n_fft})], but got {win_length}.' if window is not None: - assert len(window.shape) == 1 and len(window) == win_length, \ - f'expected a 1D window tensor of size equal to win_length({win_length}), but got window with shape {window.shape}.' + assert ( + len(window.shape) == 1 and len(window) == win_length + ), f'expected a 1D window tensor of size equal to win_length({win_length}), but got window with shape {window.shape}.' 
else: - window = paddle.ones(shape=(win_length, ), dtype=x.dtype) + window = paddle.ones(shape=(win_length,), dtype=x.dtype) if win_length < n_fft: pad_left = (n_fft - win_length) // 2 pad_right = n_fft - win_length - pad_left - window = paddle.nn.functional.pad(window, - pad=[pad_left, pad_right], - mode='constant') + window = paddle.nn.functional.pad( + window, pad=[pad_left, pad_right], mode='constant' + ) if center: - assert pad_mode in ['constant', 'reflect'], \ - 'pad_mode should be "reflect" or "constant", but got "{}".'.format(pad_mode) + assert pad_mode in [ + 'constant', + 'reflect', + ], 'pad_mode should be "reflect" or "constant", but got "{}".'.format( + pad_mode + ) pad_length = n_fft // 2 # FIXME: Input `x` can be a complex tensor but pad does not supprt complex input. - x = paddle.nn.functional.pad(x.unsqueeze(-1), - pad=[pad_length, pad_length], - mode=pad_mode, - data_format="NLC").squeeze(-1) + x = paddle.nn.functional.pad( + x.unsqueeze(-1), + pad=[pad_length, pad_length], + mode=pad_mode, + data_format="NLC", + ).squeeze(-1) x_frames = frame(x=x, frame_length=n_fft, hop_length=hop_length, axis=-1) x_frames = x_frames.transpose( - perm=[0, 2, - 1]) # switch n_fft to last dim, egs: (batch, num_frames, n_fft) + perm=[0, 2, 1] + ) # switch n_fft to last dim, egs: (batch, num_frames, n_fft) x_frames = paddle.multiply(x_frames, window) norm = 'ortho' if normalized else 'backward' if is_complex(x_frames): - assert not onesided, \ - 'onesided should be False when input or window is a complex Tensor.' + assert ( + not onesided + ), 'onesided should be False when input or window is a complex Tensor.' if not is_complex(x): - out = fft_r2c(x=x_frames, - n=None, - axis=-1, - norm=norm, - forward=True, - onesided=onesided, - name=name) + out = fft_r2c( + x=x_frames, + n=None, + axis=-1, + norm=norm, + forward=True, + onesided=onesided, + name=name, + ) else: - out = fft_c2c(x=x_frames, - n=None, - axis=-1, - norm=norm, - forward=True, - name=name) + out = fft_c2c( + x=x_frames, n=None, axis=-1, norm=norm, forward=True, name=name + ) out = out.transpose(perm=[0, 2, 1]) # (batch, n_fft, num_frames) @@ -398,22 +423,24 @@ def stft(x, return out -def istft(x, - n_fft, - hop_length=None, - win_length=None, - window=None, - center=True, - normalized=False, - onesided=True, - length=None, - return_complex=False, - name=None): +def istft( + x, + n_fft, + hop_length=None, + win_length=None, + window=None, + center=True, + normalized=False, + onesided=True, + length=None, + return_complex=False, + name=None, +): r""" Inverse short-time Fourier transform (ISTFT). Reconstruct time-domain signal from the giving complex input and window tensor when - nonzero overlap-add (NOLA) condition is met: + nonzero overlap-add (NOLA) condition is met: .. math:: \sum_{t = -\infty}^{\infty}% @@ -432,7 +459,7 @@ def istft(x, Args: x (Tensor): The input data which is a 2-dimensional or 3-dimensional **complesx** - Tensor with shape `[..., n_fft, num_frames]`. + Tensor with shape `[..., n_fft, num_frames]`. n_fft (int): The size of Fourier transform. hop_length (int, optional): Number of steps to advance between adjacent windows from time-domain signal and `0 < hop_length < win_length`. Default: `None`( @@ -452,10 +479,10 @@ def istft(x, and `istft` will return a real-valued tensor when it is set to `True`. Default: `True`. length (int, optional): Specify the length of time-domain signal. Default: `None`( - treated as the whole length of signal). + treated as the whole length of signal). 
return_complex (bool, optional): It means that whether the time-domain signal is real-valued. If `return_complex` is set to `True`, `onesided` should be set to - `False` cause the output is complex. + `False` cause the output is complex. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. @@ -484,8 +511,12 @@ def istft(x, check_variable_and_dtype(x, 'x', ['complex64', 'complex128'], 'istft') x_rank = len(x.shape) - assert x_rank in [2, 3], \ - 'x should be a 2D or 3D complex tensor, but got rank of x is {}'.format(x_rank) + assert x_rank in [ + 2, + 3, + ], 'x should be a 2D or 3D complex tensor, but got rank of x is {}'.format( + x_rank + ) if x_rank == 2: # (batch, n_fft, n_frames) x = x.unsqueeze(0) @@ -497,83 +528,107 @@ def istft(x, win_length = n_fft # Assure no gaps between frames. - assert 0 < hop_length <= win_length, \ - 'hop_length should be in (0, win_length({})], but got {}.'.format(win_length, hop_length) - - assert 0 < win_length <= n_fft, \ - 'win_length should be in (0, n_fft({})], but got {}.'.format(n_fft, win_length) + assert ( + 0 < hop_length <= win_length + ), 'hop_length should be in (0, win_length({})], but got {}.'.format( + win_length, hop_length + ) + + assert ( + 0 < win_length <= n_fft + ), 'win_length should be in (0, n_fft({})], but got {}.'.format( + n_fft, win_length + ) n_frames = x.shape[-1] fft_size = x.shape[-2] if _non_static_mode(): if onesided: - assert (fft_size == n_fft // 2 + 1), \ - 'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format(n_fft // 2 + 1, fft_size) + assert ( + fft_size == n_fft // 2 + 1 + ), 'fft_size should be equal to n_fft // 2 + 1({}) when onesided is True, but got {}.'.format( + n_fft // 2 + 1, fft_size + ) else: - assert (fft_size == n_fft), \ - 'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format(n_fft, fft_size) + assert ( + fft_size == n_fft + ), 'fft_size should be equal to n_fft({}) when onesided is False, but got {}.'.format( + n_fft, fft_size + ) if window is not None: - assert len(window.shape) == 1 and len(window) == win_length, \ - 'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format(win_length, window.shape) + assert ( + len(window.shape) == 1 and len(window) == win_length + ), 'expected a 1D window tensor of size equal to win_length({}), but got window with shape {}.'.format( + win_length, window.shape + ) else: - window_dtype = paddle.float32 if x.dtype in [ - paddle.float32, paddle.complex64 - ] else paddle.float64 - window = paddle.ones(shape=(win_length, ), dtype=window_dtype) + window_dtype = ( + paddle.float32 + if x.dtype in [paddle.float32, paddle.complex64] + else paddle.float64 + ) + window = paddle.ones(shape=(win_length,), dtype=window_dtype) if win_length < n_fft: pad_left = (n_fft - win_length) // 2 pad_right = n_fft - win_length - pad_left # FIXME: Input `window` can be a complex tensor but pad does not supprt complex input. 
- window = paddle.nn.functional.pad(window, - pad=[pad_left, pad_right], - mode='constant') + window = paddle.nn.functional.pad( + window, pad=[pad_left, pad_right], mode='constant' + ) x = x.transpose( - perm=[0, 2, - 1]) # switch n_fft to last dim, egs: (batch, num_frames, n_fft) + perm=[0, 2, 1] + ) # switch n_fft to last dim, egs: (batch, num_frames, n_fft) norm = 'ortho' if normalized else 'backward' if return_complex: - assert not onesided, \ - 'onesided should be False when input(output of istft) or window is a complex Tensor.' + assert ( + not onesided + ), 'onesided should be False when input(output of istft) or window is a complex Tensor.' out = fft_c2c(x=x, n=None, axis=-1, norm=norm, forward=False, name=None) else: - assert not is_complex(window), \ - 'Data type of window should not be complex when return_complex is False.' + assert not is_complex( + window + ), 'Data type of window should not be complex when return_complex is False.' if onesided is False: - x = x[:, :, :n_fft // 2 + 1] + x = x[:, :, : n_fft // 2 + 1] out = fft_c2r(x=x, n=None, axis=-1, norm=norm, forward=False, name=None) out = paddle.multiply(out, window).transpose( - perm=[0, 2, 1]) # (batch, n_fft, num_frames) - out = overlap_add(x=out, hop_length=hop_length, - axis=-1) # (batch, seq_length) + perm=[0, 2, 1] + ) # (batch, n_fft, num_frames) + out = overlap_add( + x=out, hop_length=hop_length, axis=-1 + ) # (batch, seq_length) window_envelop = overlap_add( x=paddle.tile( x=paddle.multiply(window, window).unsqueeze(0), - repeat_times=[n_frames, - 1]).transpose(perm=[1, 0]), # (n_fft, num_frames) + repeat_times=[n_frames, 1], + ).transpose( + perm=[1, 0] + ), # (n_fft, num_frames) hop_length=hop_length, - axis=-1) # (seq_length, ) + axis=-1, + ) # (seq_length, ) if length is None: if center: - out = out[:, (n_fft // 2):-(n_fft // 2)] - window_envelop = window_envelop[(n_fft // 2):-(n_fft // 2)] + out = out[:, (n_fft // 2) : -(n_fft // 2)] + window_envelop = window_envelop[(n_fft // 2) : -(n_fft // 2)] else: if center: start = n_fft // 2 else: start = 0 - out = out[:, start:start + length] - window_envelop = window_envelop[start:start + length] + out = out[:, start : start + length] + window_envelop = window_envelop[start : start + length] # Check whether the Nonzero Overlap Add (NOLA) constraint is met. 
if _non_static_mode() and window_envelop.abs().min().item() < 1e-11: diff --git a/python/paddle/incubate/sparse/__init__.py b/python/paddle/sparse/__init__.py similarity index 92% rename from python/paddle/incubate/sparse/__init__.py rename to python/paddle/sparse/__init__.py index 8408c3ca277306..9ca932ac46b6ad 100644 --- a/python/paddle/incubate/sparse/__init__.py +++ b/python/paddle/sparse/__init__.py @@ -34,6 +34,8 @@ from .unary import deg2rad from .unary import rad2deg from .unary import expm1 +from .unary import transpose +from .unary import reshape from .binary import mv from .binary import matmul @@ -42,6 +44,7 @@ from .binary import divide from .binary import multiply from .binary import subtract +from .binary import is_same_shape from .multiary import addmm @@ -74,7 +77,10 @@ 'addmm', 'add', 'subtract', + 'transpose', 'multiply', 'divide', 'coalesce', + 'is_same_shape', + 'reshape', ] diff --git a/python/paddle/incubate/sparse/binary.py b/python/paddle/sparse/binary.py similarity index 73% rename from python/paddle/incubate/sparse/binary.py rename to python/paddle/sparse/binary.py index 93ce90c9f021a5..3d2a3af8ec83b0 100644 --- a/python/paddle/incubate/sparse/binary.py +++ b/python/paddle/sparse/binary.py @@ -14,6 +14,9 @@ from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import dygraph_only, core +from paddle import in_dynamic_mode +from paddle.fluid.layer_helper import LayerHelper +from .unary import cast __all__ = [] @@ -30,13 +33,13 @@ @dygraph_only def matmul(x, y, name=None): """ - Note: + Note: This API is only supported from ``CUDA 11.0`` . - Applies matrix multiplication of two Tensors. - + Applies matrix multiplication of two Tensors. + The supported input/output Tensor layout are as follows: - + Note: x[SparseCsrTensor] @ y[SparseCsrTensor] -> out[SparseCsrTensor] x[SparseCsrTensor] @ y[DenseTensor] -> out[DenseTensor] @@ -46,14 +49,14 @@ def matmul(x, y, name=None): It supports backward propagation. Dimensions `x` and `y` must be >= 2D. Automatic broadcasting of Tensor is not supported. - the shape of `x` should be `[*, M, K]` , and the shape of `y` should be `[*, K, N]` , where `*` + the shape of `x` should be `[*, M, K]` , and the shape of `y` should be `[*, K, N]` , where `*` is zero or more batch dimensions. Args: x (Tensor): The input tensor. It can be SparseCooTensor/SparseCsrTensor. The data type can be float32 or float64. y (Tensor): The input tensor. It can be SparseCooTensor/SparseCsrTensor/DenseTensor. The data type can be float32 or float64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: Its layout is determined by that of `x` and `y` . @@ -61,19 +64,20 @@ def matmul(x, y, name=None): .. code-block:: python + # required: gpu import paddle # csr @ dense -> dense crows = [0, 1, 2, 3] cols = [1, 2, 0] values = [1., 2., 3.] 
- csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, [3, 3]) - # Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 1, 2, 3], - # cols=[1, 2, 0], + csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, [3, 3]) + # Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 1, 2, 3], + # cols=[1, 2, 0], # values=[1., 2., 3.]) dense = paddle.ones([3, 2]) - out = paddle.incubate.sparse.matmul(csr, dense) + out = paddle.sparse.matmul(csr, dense) # Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, # [[1., 1.], # [2., 2.], @@ -82,13 +86,13 @@ def matmul(x, y, name=None): # coo @ dense -> dense indices = [[0, 1, 2], [1, 2, 0]] values = [1., 2., 3.] - coo = paddle.incubate.sparse.sparse_coo_tensor(indices, values, [3, 3]) - # Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + coo = paddle.sparse.sparse_coo_tensor(indices, values, [3, 3]) + # Tensor(shape=[3, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, # indices=[[0, 1, 2], - # [1, 2, 0]], + # [1, 2, 0]], # values=[1., 2., 3.]) dense = paddle.ones([3, 2]) - out = paddle.incubate.sparse.matmul(coo, dense) + out = paddle.sparse.matmul(coo, dense) # Tensor(shape=[3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, # [[1., 1.], # [2., 2.], @@ -100,13 +104,13 @@ def matmul(x, y, name=None): @dygraph_only def masked_matmul(x, y, mask, name=None): """ - Note: + Note: This API is only supported from ``CUDA 11.3`` . - Applies matrix multiplication of two Dense Tensors. - + Applies matrix multiplication of two Dense Tensors. + The supported input/output Tensor layout are as follows: - + Note: x[DenseTensor] @ y[DenseTensor] * mask[SparseCooTensor] -> out[SparseCooTensor] x[DenseTensor] @ y[DenseTensor] * mask[SparseCsrTensor] -> out[SparseCsrTensor] @@ -130,6 +134,7 @@ def masked_matmul(x, y, mask, name=None): .. code-block:: python + # required: gpu import paddle paddle.seed(100) @@ -138,7 +143,7 @@ def masked_matmul(x, y, mask, name=None): cols = [1, 3, 2, 0, 1] values = [1., 2., 3., 4., 5.] dense_shape = [3, 4] - mask = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) + mask = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, # crows=[0, 2, 3, 5], # cols=[1, 3, 2, 0, 1], @@ -147,10 +152,10 @@ def masked_matmul(x, y, mask, name=None): x = paddle.rand([3, 5]) y = paddle.rand([5, 4]) - out = paddle.incubate.sparse.masked_matmul(x, y, mask) - # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 3, 5], - # cols=[1, 3, 2, 0, 1], + out = paddle.sparse.masked_matmul(x, y, mask) + # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 3, 5], + # cols=[1, 3, 2, 0, 1], # values=[0.98986477, 0.97800624, 1.14591956, 0.68561077, 0.94714981]) """ @@ -160,11 +165,11 @@ def masked_matmul(x, y, mask, name=None): @dygraph_only def mv(x, vec, name=None): """ - Note: + Note: This API is only supported from ``CUDA 11.0`` . - Applies matrix-vector product of Sparse Matrix 'x' and Dense vector 'vec' . - + Applies matrix-vector product of Sparse Matrix 'x' and Dense vector 'vec' . + The supported input/output Tensor layout are as follows: Note: @@ -173,39 +178,40 @@ def mv(x, vec, name=None): It supports backward propagation. 
- The shape of `x` should be `[M, N]` , and the shape of `y` should be `[N]` , + The shape of `x` should be `[M, N]` , and the shape of `y` should be `[N]` , and the shape of `out` will be `[M]` . Args: x (Tensor): The input 2D tensor. It must be SparseCooTensor/SparseCsrTensor. The data type can be float32 or float64. y (Tensor): The input 1D tensor. It must be DenseTensor vector. The data type can be float32 or float64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: 1D Tensor. Examples: .. code-block:: python - + + # required: gpu import paddle - from paddle.fluid.framework import _test_eager_guard + from paddle.fluid.framework import _test_eager_guard paddle.seed(100) # csr @ dense -> dense - with _test_eager_guard(): + with _test_eager_guard(): crows = [0, 2, 3, 5] cols = [1, 3, 2, 0, 1] values = [1., 2., 3., 4., 5.] dense_shape = [3, 4] - csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) - # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 3, 5], - # cols=[1, 3, 2, 0, 1], + csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) + # Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 3, 5], + # cols=[1, 3, 2, 0, 1], # values=[1., 2., 3., 4., 5.]) vec = paddle.randn([4]) - - out = paddle.incubate.sparse.mv(csr, vec) + + out = paddle.sparse.mv(csr, vec) # Tensor(shape=[3], dtype=float32, place=Place(gpu:0), stop_gradient=True, # [-3.85499096, -2.42975140, -1.75087738]) @@ -235,17 +241,15 @@ def add(x, y, name=None): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard paddle.device.set_device("cpu") - with _test_eager_guard(): - x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') - y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') - sparse_x = x.to_sparse_csr() - sparse_y = y.to_sparse_csr() - sparse_z = paddle.incubate.sparse.add(sparse_x, sparse_y) - print(sparse_z.to_dense()) + x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') + y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') + sparse_x = x.to_sparse_csr() + sparse_y = y.to_sparse_csr() + sparse_z = paddle.sparse.add(sparse_x, sparse_y) + print(sparse_z.to_dense()) # [[ 0., -1., 0., 0.], # [ 0., 2., -6., 0.], @@ -253,8 +257,19 @@ def add(x, y, name=None): """ if y.dtype != x.dtype: - y = _C_ops.sparse_cast(y, None, x.dtype) - return _C_ops.sparse_add(x, y) + y = cast(y, None, x.dtype) + + if in_dynamic_mode(): + return _C_ops.sparse_add(x, y) + else: + op_type = 'sparse_add' + inputs = {'x': x, 'y': y} + helper = LayerHelper(op_type) + out = helper.create_sparse_variable_for_type_inference(x.dtype) + helper.append_op( + type=op_type, inputs=inputs, outputs={'out': out}, attrs={} + ) + return out @dygraph_only @@ -280,17 +295,15 @@ def subtract(x, y, name=None): .. 
code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard paddle.device.set_device("cpu") - with _test_eager_guard(): - x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') - y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') - sparse_x = x.to_sparse_csr() - sparse_y = y.to_sparse_csr() - sparse_z = paddle.incubate.sparse.subtract(sparse_x, sparse_y) - print(sparse_z.to_dense()) + x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') + y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') + sparse_x = x.to_sparse_csr() + sparse_y = y.to_sparse_csr() + sparse_z = paddle.sparse.subtract(sparse_x, sparse_y) + print(sparse_z.to_dense()) # [[ 0., -1., 0., 4.], # [ 0., -2., 0., 0.], @@ -325,17 +338,15 @@ def multiply(x, y, name=None): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard paddle.device.set_device("cpu") - with _test_eager_guard(): - x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') - y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') - sparse_x = x.to_sparse_csr() - sparse_y = y.to_sparse_csr() - sparse_z = paddle.incubate.sparse.multiply(sparse_x, sparse_y) - print(sparse_z.to_dense()) + x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') + y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') + sparse_x = x.to_sparse_csr() + sparse_y = y.to_sparse_csr() + sparse_z = paddle.sparse.multiply(sparse_x, sparse_y) + print(sparse_z.to_dense()) # [[ 0., 0., 0., -4.], # [ 0., 0., 9., 0.], @@ -373,17 +384,15 @@ def divide(x, y, name=None): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard paddle.device.set_device("cpu") - with _test_eager_guard(): - x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') - y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') - sparse_x = x.to_sparse_csr() - sparse_y = y.to_sparse_csr() - sparse_z = paddle.incubate.sparse.divide(sparse_x, sparse_y) - print(sparse_z.to_dense()) + x = paddle.to_tensor([[0, -1, 0, 2], [0, 0, -3, 0], [4, 5, 0, 0]], 'float32') + y = paddle.to_tensor([[0, 0, 0, -2], [0, 2, -3, 0], [2, 3, 4, 8]], 'float32') + sparse_x = x.to_sparse_csr() + sparse_y = y.to_sparse_csr() + sparse_z = paddle.sparse.divide(sparse_x, sparse_y) + print(sparse_z.to_dense()) # [[ nan , -inf. , nan , -1. ], # [ nan , 0. , 1. , nan ], @@ -399,3 +408,36 @@ def divide(x, y, name=None): if y.dtype != x.dtype: y = _C_ops.sparse_cast(y, None, x.dtype) return _C_ops.sparse_divide(x, y) + + +@dygraph_only +def is_same_shape(x, y): + """ + Return the results of shape comparison between two Tensors, check whether x.shape equal to y.shape. + Any two type Tensor among DenseTensor/SparseCooTensor/SparseCsrTensor are supported. + + Args: + x (Tensor): The input tensor. It can be DenseTensor/SparseCooTensor/SparseCsrTensor. + y (Tensor): The input tensor. It can be DenseTensor/SparseCooTensor/SparseCsrTensor. + + Returns: + bool: True for same shape and False for different shape. + + Examples: + + .. 
code-block:: python + + import paddle + + x = paddle.rand([2, 3, 8]) + y = paddle.rand([2, 3, 8]) + y = y.to_sparse_csr() + z = paddle.rand([2, 5]) + + paddle.sparse.is_same_shape(x, y) + # True + paddle.sparse.is_same_shape(x, z) + # False + + """ + return x.is_same_shape(y) diff --git a/python/paddle/incubate/sparse/creation.py b/python/paddle/sparse/creation.py similarity index 55% rename from python/paddle/incubate/sparse/creation.py rename to python/paddle/sparse/creation.py index 1879478883188d..6258f6e05a76c0 100644 --- a/python/paddle/incubate/sparse/creation.py +++ b/python/paddle/sparse/creation.py @@ -17,7 +17,14 @@ from paddle.fluid.framework import core, dygraph_only from paddle.fluid.framework import _current_expected_place, _get_paddle_place from paddle.tensor import to_tensor, max -from paddle.fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype +from paddle.fluid.data_feeder import ( + check_variable_and_dtype, + check_type, + check_dtype, + convert_dtype, +) +from paddle import in_dynamic_mode +from paddle.fluid.layer_helper import LayerHelper import numpy as np @@ -49,8 +56,8 @@ def _get_place(place): if place is None: place = _current_expected_place() elif not isinstance( - place, - (core.Place, core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace)): + place, (core.Place, core.CPUPlace, core.CUDAPinnedPlace, core.CUDAPlace) + ): raise ValueError( "'place' must be any of paddle.Place, paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace" ) @@ -64,15 +71,11 @@ def _check_indices_dtype(dtype): ) -@dygraph_only -def sparse_coo_tensor(indices, - values, - shape=None, - dtype=None, - place=None, - stop_gradient=True): +def sparse_coo_tensor( + indices, values, shape=None, dtype=None, place=None, stop_gradient=True +): r""" - Constructs a sparse ``paddle.Tensor`` in coordinate format according to the indices + Constructs a sparse ``paddle.Tensor`` in coordinate format according to the indices and values of the specified non-zero elements. Args: @@ -81,155 +84,154 @@ def sparse_coo_tensor(indices, values(list|tuple|ndarray|Tensor): Initial values for the tensor. Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor. shape(list|tuple, optional): The shape of the sparse tensor also represents the shape of - original dense tensor. If not provided the smallest shape will be inferred to + original dense tensor. If not provided the smallest shape will be inferred to hold all elements. - dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , + dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8', - 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` + 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` except for python float number which gets dtype from ``get_default_type`` . - place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be - CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is - string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. + place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be + CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is + string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. 
stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True. Returns: Tensor: A Tensor constructed from ``indices`` and ``values`` . - Raises: - TypeError: If the data type of ``values`` is not list, tuple, numpy.ndarray, paddle.Tensor - ValueError: If ``values`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]]. If the ``indices`` is not a 2-D. - TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128 - ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace or specified pattern string. - Examples: .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - - with _test_eager_guard(): - indices = [[0, 1, 2], [1, 2, 0]] - values = [1.0, 2.0, 3.0] - dense_shape = [3, 3] - coo = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape) - # print(coo) - # Tensor(shape=[2, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, - # indices=[[0, 1, 2], - # [1, 2, 0]], - # values=[1., 2., 3.]) - """ - - place = _get_place(place) - - if not isinstance(indices, core.eager.Tensor): - indices = to_tensor(indices, - dtype=None, - place=place, - stop_gradient=True) - if not isinstance(values, core.eager.Tensor): - values = to_tensor(values, dtype, place, stop_gradient) - if len(indices.shape) != 2: - raise ValueError("'indices' must be 2-D.") - nnz = indices.shape[1] - sparse_dim = indices.shape[0] + indices = [[0, 1, 2], [1, 2, 0]] + values = [1.0, 2.0, 3.0] + dense_shape = [3, 3] + coo = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape) + # print(coo) + # Tensor(shape=[2, 3], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True, + # indices=[[0, 1, 2], + # [1, 2, 0]], + # values=[1., 2., 3.]) + """ - _check_indices_dtype(indices.dtype) + if in_dynamic_mode(): + place = _get_place(place) - if nnz != values.shape[0]: - raise ValueError( - "the indices and values must have same number of non-zero, but get {} and {}" - .format(nnz, values.shape[0])) + if not isinstance(indices, core.eager.Tensor): + indices = to_tensor( + indices, dtype=None, place=place, stop_gradient=True + ) + if not isinstance(values, core.eager.Tensor): + values = to_tensor(values, dtype, place, stop_gradient) + if len(indices.shape) != 2: + raise ValueError("'indices' must be 2-D.") - dense_dim = len(values.shape) - 1 + nnz = indices.shape[1] + sparse_dim = indices.shape[0] - if not indices.place._equals(place): - indices = indices._copy_to(place, False) + _check_indices_dtype(indices.dtype) - if not values.place._equals(place): - values = values._copy_to(place, False) - values = _handle_dtype(values, dtype) - values.stop_gradient = stop_gradient - - min_shape = _infer_dense_shape(indices, values) - - if shape is None: - shape = min_shape - else: - if shape < min_shape: - raise ValueError( - "the minimun shape required is {}, but get {}".format( - min_shape, shape)) - if len(shape) != sparse_dim + dense_dim: + if nnz != values.shape[0]: raise ValueError( - "the number of dimensions(len(shape) must be sparse_dim({}) + dense_dim({}), but get {}" - .format(sparse_dim, dense_dim, len(shape))) + "the indices and values must have same number of non-zero, but get {} and {}".format( + nnz, values.shape[0] + ) + ) + + dense_dim = len(values.shape) - 1 + + if not indices.place._equals(place): + indices = indices._copy_to(place, False) + + if not values.place._equals(place): + 
values = values._copy_to(place, False) + values = _handle_dtype(values, dtype) + values.stop_gradient = stop_gradient + + min_shape = _infer_dense_shape(indices, values) + + if shape is None: + shape = min_shape + else: + if shape < min_shape: + raise ValueError( + "the minimun shape required is {}, but get {}".format( + min_shape, shape + ) + ) + if len(shape) != sparse_dim + dense_dim: + raise ValueError( + "the number of dimensions(len(shape) must be sparse_dim({}) + dense_dim({}), but get {}".format( + sparse_dim, dense_dim, len(shape) + ) + ) + + return _C_ops.sparse_sparse_coo_tensor(values, indices, shape) - return _C_ops.sparse_sparse_coo_tensor(values, indices, shape) + else: + op_type = 'sparse_sparse_coo_tensor' + inputs = {'values': values, 'indices': indices} + if shape[0] is None: + shape[0] = -1 + attrs = {'shape': shape} + helper = LayerHelper(op_type) + out = helper.create_sparse_variable_for_type_inference(dtype) + helper.append_op( + type=op_type, inputs=inputs, outputs={'out': out}, attrs=attrs + ) + return out -#TODO: need to support shape is None +# TODO: need to support shape is None @dygraph_only -def sparse_csr_tensor(crows, - cols, - values, - shape, - dtype=None, - place=None, - stop_gradient=True): +def sparse_csr_tensor( + crows, cols, values, shape, dtype=None, place=None, stop_gradient=True +): r""" - Constructs a sparse ``paddle.Tensor`` in CSR(Compressed Sparse Row) format according to the + Constructs a sparse ``paddle.Tensor`` in CSR(Compressed Sparse Row) format according to the ``crows``, ``cols`` and ``values``. Currently, the crows and cols of each batch must be incrementd. Args: - crows(list|tuple|ndarray|Tensor): 1-D array, each element in the rows represents the - starting position of the first non-zero element of each row in values. - Can be a list, tuple, numpy\.ndarray, paddle\.Tensor. + crows(list|tuple|ndarray|Tensor): 1-D array, each element in the rows represents the + starting position of the first non-zero element of each row in values. + Can be a list, tuple, numpy\.ndarray, paddle\.Tensor. cols(list|tuple|ndarray|Tensor): 1-D array, the column of non-zero elements. - Can be a list, tuple, numpy\.ndarray, paddle\.Tensor. + Can be a list, tuple, numpy\.ndarray, paddle\.Tensor. values(list|tuple|ndarray|Tensor): 1-D array, the non-zero elements. Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor. shape(list|tuple, optional): The shape of the sparse tensor also represents the shape of - original dense tensor. + original dense tensor. hold all elements. - dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , + dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8', - 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` + 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` except for python float number which gets dtype from ``get_default_type`` . - place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be - CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is - string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. + place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be + CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. 
If ``place`` is + string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True. Returns: Tensor: A Tensor constructed from ``crows``, ``cols`` and ``values`` . - Raises: - TypeError: If the data type of ``values`` is not list, tuple, numpy.ndarray, paddle.Tensor - ValueError: If ``values`` is tuple|list, it can't contain nested tuple|list with different lengths , such as: [[1, 2], [3, 4, 5]]. If the ``crow``, ``cols`` and ``values`` is not a 2-D. - TypeError: If ``dtype`` is not bool, float16, float32, float64, int8, int16, int32, int64, uint8, complex64, complex128 - ValueError: If ``place`` is not paddle.CPUPlace, paddle.CUDAPinnedPlace, paddle.CUDAPlace or specified pattern string. - Examples: .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - - with _test_eager_guard(): - crows = [0, 2, 3, 5] - cols = [1, 3, 2, 0, 1] - values = [1, 2, 3, 4, 5] - dense_shape = [3, 4] - csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) - # print(csr) - # Tensor(shape=[3, 4], dtype=paddle.int64, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 3, 5], - # cols=[1, 3, 2, 0, 1], - # values=[1, 2, 3, 4, 5]) + + crows = [0, 2, 3, 5] + cols = [1, 3, 2, 0, 1] + values = [1, 2, 3, 4, 5] + dense_shape = [3, 4] + csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, dense_shape) + # print(csr) + # Tensor(shape=[3, 4], dtype=paddle.int64, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 3, 5], + # cols=[1, 3, 2, 0, 1], + # values=[1, 2, 3, 4, 5]) """ place = _get_place(place) @@ -246,8 +248,10 @@ def sparse_csr_tensor(crows, if len(shape) != 2 and len(shape) != 3: raise ValueError( - "SparseCsrTensor only support 2-D or 3-D matrix. but get shape {}". - format(shape)) + "SparseCsrTensor only support 2-D or 3-D matrix. but get shape {}".format( + shape + ) + ) rows = shape[len(shape) - 2] if not crows.place._equals(place): @@ -264,26 +268,32 @@ def sparse_csr_tensor(crows, if len(crows.shape) != 1 or len(cols.shape) != 1 or len(values.shape) != 1: raise ValueError("The 'crows', 'cols' and 'values' must be 1-D.") - if (len(cols) != len(values)): + if len(cols) != len(values): raise ValueError("the length of cols must be same as length of values") if len(shape) == 2: if crows.shape[0] != rows + 1: raise ValueError( - "The length({}) of crows must be equal to the rows({})+1 of matrix." - .format(crows.shape[0], rows)) + "The length({}) of crows must be equal to the rows({})+1 of matrix.".format( + crows.shape[0], rows + ) + ) if crows[0] != 0: raise ValueError("the 0th value of crows must be 0") if crows[-1] != values.shape[0]: raise ValueError( - "the last value of crows must be equal the number of non-zero") + "the last value of crows must be equal the number of non-zero" + ) else: if crows.shape[0] % (rows + 1) != 0: raise ValueError( - "The length({}) of crows must be divisible the rows({})+1 of matrix." 
- .format(crows.shape[0], rows)) + "The length({}) of crows must be divisible the rows({})+1 of matrix.".format( + crows.shape[0], rows + ) + ) # TODO(zkh2016): check whether the value in crows and cols is legal - return core.eager.sparse_csr_tensor(crows, cols, values, shape, - stop_gradient) + return core.eager.sparse_csr_tensor( + crows, cols, values, shape, stop_gradient + ) diff --git a/python/paddle/incubate/sparse/multiary.py b/python/paddle/sparse/multiary.py similarity index 90% rename from python/paddle/incubate/sparse/multiary.py rename to python/paddle/sparse/multiary.py index d65847f1383063..9874b8279135a4 100644 --- a/python/paddle/incubate/sparse/multiary.py +++ b/python/paddle/sparse/multiary.py @@ -58,6 +58,7 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): .. code-block:: python + # required: gpu import paddle # dense + csr @ dense -> dense @@ -65,17 +66,17 @@ def addmm(input, x, y, beta=1.0, alpha=1.0, name=None): crows = [0, 1, 2, 3] cols = [1, 2, 0] values = [1., 2., 3.] - x = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, [3, 3]) + x = paddle.sparse.sparse_csr_tensor(crows, cols, values, [3, 3]) y = paddle.rand([3, 2]) - out = paddle.incubate.sparse.addmm(input, x, y, 3.0, 2.0) + out = paddle.sparse.addmm(input, x, y, 3.0, 2.0) # dense + coo @ dense -> dense input = paddle.rand([3, 2]) indices = [[0, 1, 2], [1, 2, 0]] values = [1., 2., 3.] - x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, [3, 3]) + x = paddle.sparse.sparse_coo_tensor(indices, values, [3, 3]) y = paddle.rand([3, 2]) - out = paddle.incubate.sparse.addmm(input, x, y, 3.0, 2.0) + out = paddle.sparse.addmm(input, x, y, 3.0, 2.0) """ return _C_ops.sparse_addmm(input, x, y, alpha, beta) diff --git a/python/paddle/incubate/sparse/nn/__init__.py b/python/paddle/sparse/nn/__init__.py similarity index 100% rename from python/paddle/incubate/sparse/nn/__init__.py rename to python/paddle/sparse/nn/__init__.py diff --git a/python/paddle/incubate/sparse/nn/functional/__init__.py b/python/paddle/sparse/nn/functional/__init__.py similarity index 100% rename from python/paddle/incubate/sparse/nn/functional/__init__.py rename to python/paddle/sparse/nn/functional/__init__.py diff --git a/python/paddle/incubate/sparse/nn/functional/activation.py b/python/paddle/sparse/nn/functional/activation.py similarity index 88% rename from python/paddle/incubate/sparse/nn/functional/activation.py rename to python/paddle/sparse/nn/functional/activation.py index ddaa6ada01be1e..8a7c671df178e5 100644 --- a/python/paddle/incubate/sparse/nn/functional/activation.py +++ b/python/paddle/sparse/nn/functional/activation.py @@ -16,9 +16,10 @@ from paddle import _C_ops, _legacy_C_ops from paddle.fluid.framework import dygraph_only +from paddle import in_dynamic_mode +from paddle.fluid.layer_helper import LayerHelper -@dygraph_only def relu(x, name=None): """ sparse relu activation, requiring x to be a SparseCooTensor or SparseCsrTensor. @@ -42,10 +43,20 @@ def relu(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.nn.functional.relu(sparse_x) + out = paddle.sparse.nn.functional.relu(sparse_x) # [0., 0., 1.] 
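For reference, a minimal CSR construction sketch consistent with the crows checks above (len(crows) == rows + 1, crows[0] == 0, crows[-1] == number of stored values), assuming dygraph mode:

import paddle

crows = [0, 2, 3, 5]              # 3 rows -> 3 + 1 entries
cols = [1, 3, 2, 0, 1]
values = [1.0, 2.0, 3.0, 4.0, 5.0]
csr = paddle.sparse.sparse_csr_tensor(crows, cols, values, [3, 4])
# crows[-1] == 5 == len(values), so the validation above passes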
""" - return _C_ops.sparse_relu(x) + if in_dynamic_mode(): + return _C_ops.sparse_relu(x) + else: + op_type = 'sparse_relu' + helper = LayerHelper(op_type) + out = helper.create_sparse_variable_for_type_inference(x.dtype) + helper.append_op(type=op_type, + inputs={'x': x}, + outputs={'out': out}, + attrs={}) + return out @dygraph_only @@ -93,7 +104,7 @@ def softmax(x, axis=-1, name=None): # values=[0.96823406, 0.19722934, 0.94373937, 0.02060066, 0.71456372, # 0.98275049]) - out = paddle.incubate.sparse.nn.functional.softmax(csr) + out = paddle.sparse.nn.functional.softmax(csr) # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, # crows=[0, 2, 5, 6], # cols=[2, 3, 0, 2, 3, 3], @@ -128,7 +139,7 @@ def relu6(x, name=None): dense_x = paddle.to_tensor([-2., 0., 8.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.nn.functional.relu6(sparse_x) + out = paddle.sparse.nn.functional.relu6(sparse_x) """ return _C_ops.sparse_relu6(x, 6.0) @@ -164,6 +175,6 @@ def leaky_relu(x, negative_slope=0.01, name=None): dense_x = paddle.to_tensor([-2., 0., 5.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.nn.functional.leaky_relu(sparse_x, 0.5) + out = paddle.sparse.nn.functional.leaky_relu(sparse_x, 0.5) """ return _C_ops.sparse_leaky_relu(x, negative_slope) diff --git a/python/paddle/incubate/sparse/nn/functional/conv.py b/python/paddle/sparse/nn/functional/conv.py similarity index 58% rename from python/paddle/incubate/sparse/nn/functional/conv.py rename to python/paddle/sparse/nn/functional/conv.py index cd3e8e3551f5bc..f9d16f4683982c 100644 --- a/python/paddle/incubate/sparse/nn/functional/conv.py +++ b/python/paddle/sparse/nn/functional/conv.py @@ -18,83 +18,124 @@ from paddle.fluid.layers.utils import convert_to_list from paddle.fluid.layers.nn import elementwise_add from ...creation import sparse_coo_tensor +from ...binary import add from paddle.nn.functional.conv import _update_padding_nd - - -def _conv3d(x, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1, - subm=False, - key=None, - data_format="NDHWC", - name=None): - assert in_dynamic_mode(), "Currently, only support dynamic mode" +from paddle.fluid.layer_helper import LayerHelper + + +def _conv3d( + x, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + subm=False, + key=None, + data_format="NDHWC", + name=None, +): assert groups == 1, "Currently, only support groups=1" dims = 3 # Currently, only support 'NDHWC' if data_format not in ["NDHWC"]: - raise ValueError("Attr(data_format) should be 'NDHWC'. Received " - "Attr(data_format): {}.".format(data_format)) + raise ValueError( + "Attr(data_format) should be 'NDHWC'. Received " + "Attr(data_format): {}.".format(data_format) + ) if len(x.shape) != 5: raise ValueError( - "Input x should be 5D tensor, but received x with the shape of {}". - format(x.shape)) + "Input x should be 5D tensor, but received x with the shape of {}".format( + x.shape + ) + ) - channel_last = (data_format == "NDHWC") + channel_last = data_format == "NDHWC" channel_dim = -1 if channel_last else 1 if len(x.shape) != 5: raise ValueError( - "Input x should be 5D tensor, but received x with the shape of {}". - format(x.shape)) + "Input x should be 5D tensor, but received x with the shape of {}".format( + x.shape + ) + ) num_channels = x.shape[channel_dim] if num_channels < 0: raise ValueError( "The channel dimension of the input({}) should be defined. 
" - "Received: {}.".format(x.shape, num_channels)) + "Received: {}.".format(x.shape, num_channels) + ) padding, padding_algorithm = _update_padding_nd(padding, channel_last, dims) stride = convert_to_list(stride, dims, 'stride') dilation = convert_to_list(dilation, dims, 'dilation') - op_type = "conv3d" - - pre_bias = _C_ops.sparse_conv3d(x, weight, padding, dilation, stride, - groups, subm, - key if key is not None else "") - if bias is not None: - values = pre_bias.values() - add_bias = elementwise_add(values, bias, axis=1) - return sparse_coo_tensor(pre_bias.indices(), - add_bias, - shape=pre_bias.shape, - stop_gradient=pre_bias.stop_gradient) + + if in_dynamic_mode(): + pre_bias = _C_ops.sparse_conv3d( + x, + weight, + padding, + dilation, + stride, + groups, + subm, + key if key is not None else "", + ) + if bias is not None: + return add(pre_bias, bias) + else: + return pre_bias else: - return pre_bias - - -def conv3d(x, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1, - data_format="NDHWC", - name=None): + inputs = {'x': x, 'kernel': weight} + attrs = { + 'paddings': padding, + 'dilations': dilation, + 'strides': stride, + 'groups': groups, + 'subm': subm, + 'key': key, + } + op_type = 'sparse_conv3d' + helper = LayerHelper(op_type, **locals()) + rulebook = helper.create_variable_for_type_inference( + dtype='int32', stop_gradient=True + ) + counter = helper.create_variable_for_type_inference( + dtype='int32', stop_gradient=True + ) + pre_bias = helper.create_sparse_variable_for_type_inference(x.dtype) + outputs = {"out": pre_bias, "rulebook": rulebook, "counter": counter} + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) + if bias is not None: + return add(pre_bias, bias) + else: + return pre_bias + + +def conv3d( + x, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + data_format="NDHWC", + name=None, +): r""" The sparse convolution3d functional calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are multidimensional SparseCooTensors with a shape of + Output(Output) are multidimensional SparseCooTensors with a shape of :math:`[N, D, H, W, C]` . Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, - and W is the width of the feature. If bias attribution is provided, - bias is added to the output of the convolution. + and W is the width of the feature. If bias attribution is provided, + bias is added to the output of the convolution. For each input :math:`X`, the equation is: @@ -110,36 +151,17 @@ def conv3d(x, * :math:`b`: Bias value, a 1-D tensor with shape [M]. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - Example: - - - Input: - - Input shape: :math:`(N, D_{in}, H_{in}, W_{in}, C_{in})` - - Filter shape: :math:`(D_f, H_f, W_f, C_{in}, C_{out})` - - - Output: - Output shape: :math:`(N, D_{out}, H_{out}, W_{out}, C_{out})` - - Where - - .. 
math:: - - D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ - H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 - Args: - x (Tensor): The input is 5-D SparseCooTensor with shape [N, D, H, W, C], the data + x (Tensor): The input is 5-D SparseCooTensor with shape [N, D, H, W, C], the data type of input is float16 or float32 or float64. weight (Tensor): The convolution kernel, a Tensor with shape [kD, kH, kW, C/g, M], where M is the number of filters(output channels), g is the number of groups, kD, kH, kW are the filter's depth, height and width respectively. - bias (Tensor, optional): The bias, a Tensor of shape [M, ], currently, only support bias is None. - stride (int|list|tuple): The stride size. It means the stride in convolution. If stride is a - list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). + bias (Tensor, optional): The bias, a Tensor of shape [M]. + stride (int|list|tuple, optional): The stride size. It means the stride in convolution. If stride is a + list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. - padding (string|int|list|tuple): The padding size. It means the number of zero-paddings + padding (string|int|list|tuple, optional): The padding size. It means the number of zero-paddings on both sides for each dimension. If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or @@ -149,67 +171,78 @@ def conv3d(x, when `data_format` is `"NDHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. - dilation (int|list|tuple): The dilation size. It means the spacing between the kernel points. + dilation (int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. If dilation is a list/tuple, it must contain three integers, (dilation_depth, dilation_height, - dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. + dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. Default: dilation = 1. - groups (int): The groups number of the Conv3D Layer. According to grouped + groups (int, optional): The groups number of the Conv3D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Default: groups=1. Currently, only support groups=1. - data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`. The default is `"NDHWC"`. When it is `"NDHWC"`, the data is stored in the order of: `[batch_size, input_depth, input_height, input_width, input_channels]`. - name(str|None): For detailed information, please refer - to :ref:`api_guide_Name`. 
Usually name is no need to set and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: - A SparseCooTensor representing the conv3d, whose data type is the same with input. + A SparseCooTensor representing the conv3d, whose data type is the same with input. Examples: .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - - with _test_eager_guard(): - indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] - values = [[1], [2], [3], [4]] - indices = paddle.to_tensor(indices, dtype='int32') - values = paddle.to_tensor(values, dtype='float32') - dense_shape = [1, 1, 3, 4, 1] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) - weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') - y = paddle.incubate.sparse.nn.functional.conv3d(sparse_x, weight) - print(y.shape) - # (1, 1, 1, 2, 1) + + indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] + values = [[1], [2], [3], [4]] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + dense_shape = [1, 1, 3, 4, 1] + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') + y = paddle.sparse.nn.functional.conv3d(sparse_x, weight) + print(y.shape) + # (1, 1, 1, 2, 1) """ - return _conv3d(x, weight, bias, stride, padding, dilation, groups, False, - None, data_format, name) - - -def subm_conv3d(x, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - groups=1, - data_format="NDHWC", - key=None, - name=None): + return _conv3d( + x, + weight, + bias, + stride, + padding, + dilation, + groups, + False, + None, + data_format, + name, + ) + + +def subm_conv3d( + x, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + groups=1, + data_format="NDHWC", + key=None, + name=None, +): r""" The sparse submanifold convolution3d functional calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are multidimensional SparseCooTensors with a shape of + Output(Output) are multidimensional SparseCooTensors with a shape of :math:`[N, D, H, W, C]` . Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, - and W is the width of the feature. If bias attribution is provided, - bias is added to the output of the convolution. + and W is the width of the feature. If bias attribution is provided, + bias is added to the output of the convolution. For each input :math:`X`, the equation is: @@ -225,36 +258,17 @@ def subm_conv3d(x, * :math:`b`: Bias value, a 1-D tensor with shape [M]. * :math:`Out`: Output value, the shape of :math:`Out` and :math:`X` may be different. - Example: - - - Input: - - Input shape: :math:`(N, D_{in}, H_{in}, W_{in}, C_{in})` - - Filter shape: :math:`(D_f, H_f, W_f, C_{in}, C_{out})` - - - Output: - Output shape: :math:`(N, D_{out}, H_{out}, W_{out}, C_{out})` - - Where - - .. 
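The spatial-size rule that the removed docstring formulas expressed can be sketched as a small helper (illustrative only, not part of the patched API):

def conv_out_size(in_size, pad, dilation, kernel, stride):
    # out = (in + 2*pad - (dilation*(kernel - 1) + 1)) // stride + 1
    return (in_size + 2 * pad - (dilation * (kernel - 1) + 1)) // stride + 1

# the example above: input spatial (1, 3, 4), kernel (1, 3, 3), no padding, stride 1
assert conv_out_size(1, 0, 1, 1, 1) == 1
assert conv_out_size(3, 0, 1, 3, 1) == 1
assert conv_out_size(4, 0, 1, 3, 1) == 2   # -> output shape (1, 1, 1, 2, 1)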
math:: - - D_{out}&= \\frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{strides[0]} + 1 \\\\ - H_{out}&= \\frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{strides[1]} + 1 \\\\ - W_{out}&= \\frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{strides[2]} + 1 - Args: - x (Tensor): The input is 5-D SparseCooTensor with shape [N, D, H, W, C], the data + x (Tensor): The input is 5-D SparseCooTensor with shape [N, D, H, W, C], the data type of input is float16 or float32 or float64. weight (Tensor): The convolution kernel, a Tensor with shape [kD, kH, kW, C/g, M], where M is the number of filters(output channels), g is the number of groups, kD, kH, kW are the filter's depth, height and width respectively. - bias (Tensor, optional): The bias, a Tensor of shape [M, ], currently, only support bias is None. - stride (int|list|tuple): The stride size. It means the stride in convolution. If stride is a - list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). + bias (Tensor, optional): The bias, a Tensor of shape [M]. + stride (int|list|tuple, optional): The stride size. It means the stride in convolution. If stride is a + list/tuple, it must contain three integers, (stride_depth, stride_height, stride_width). Otherwise, stride_depth = stride_height = stride_width = stride. Default: stride = 1. - padding (string|int|list|tuple): The padding size. It means the number of zero-paddings + padding (string|int|list|tuple): The padding size. It means the number of zero-paddings on both sides for each dimension. If `padding` is a string, either 'VALID' or 'SAME' which is the padding algorithm. If padding size is a tuple or list, it could be in three forms: `[pad_depth, pad_height, pad_width]` or @@ -264,48 +278,57 @@ def subm_conv3d(x, when `data_format` is `"NHWC"`, `padding` can be in the form `[[0,0], [pad_depth_front, pad_depth_back], [pad_height_top, pad_height_bottom], [pad_width_left, pad_width_right], [0,0]]`. Default: padding = 0. - dilation (int|list|tuple): The dilation size. It means the spacing between the kernel points. + dilation (int|list|tuple, optional): The dilation size. It means the spacing between the kernel points. If dilation is a list/tuple, it must contain three integers, (dilation_depth, dilation_height, - dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. + dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. Default: dilation = 1. - groups (int): The groups number of the Conv3D Layer. According to grouped + groups (int, optional): The groups number of the Conv3D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only connected to the second half of the input channels. Currently, only support groups=1. - data_format (str, optional): Specify the data format of the input, and the data format of the output + data_format (str, optional): Specify the data format of the input, and the data format of the output will be consistent with that of the input. An optional string from: `"NCDHW"`, `"NDHWC"`. The default is `"NDHWC"`. When it is `"NDHWC"`, the data is stored in the order of: `[batch_size, input_depth, input_height, input_width, input_channels]`. 
- key(str, optional): the key is used to save or use the same rulebook, + key(str, optional): the key is used to save or use the same rulebook, the definition and role of rulebook refers to - https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf. The + https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf. The default value is None. - name(str|None): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and + name(str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: - A SparseCooTensor representing the conv3d, whose data type is - the same with input. + A SparseCooTensor representing the conv3d, whose data type is + the same with input. Examples: .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - - with _test_eager_guard(): - indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] - values = [[1], [2], [3], [4]] - indices = paddle.to_tensor(indices, dtype='int32') - values = paddle.to_tensor(values, dtype='float32') - dense_shape = [1, 1, 3, 4, 1] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) - weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') - y = paddle.incubate.sparse.nn.functional.subm_conv3d(sparse_x, weight) - print(y.shape) - #(1, 1, 3, 4, 1) + + indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] + values = [[1], [2], [3], [4]] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + dense_shape = [1, 1, 3, 4, 1] + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + weight = paddle.randn((1, 3, 3, 1, 1), dtype='float32') + y = paddle.sparse.nn.functional.subm_conv3d(sparse_x, weight) + print(y.shape) + #(1, 1, 3, 4, 1) """ - return _conv3d(x, weight, bias, stride, padding, dilation, groups, True, - key, data_format, name) + return _conv3d( + x, + weight, + bias, + stride, + padding, + dilation, + groups, + True, + key, + data_format, + name, + ) diff --git a/python/paddle/incubate/sparse/nn/functional/pooling.py b/python/paddle/sparse/nn/functional/pooling.py similarity index 69% rename from python/paddle/incubate/sparse/nn/functional/pooling.py rename to python/paddle/sparse/nn/functional/pooling.py index cae93553b175ae..615a27d3df94e8 100644 --- a/python/paddle/incubate/sparse/nn/functional/pooling.py +++ b/python/paddle/sparse/nn/functional/pooling.py @@ -19,13 +19,15 @@ __all__ = [] -def max_pool3d(x, - kernel_size, - stride=None, - padding=0, - ceil_mode=False, - data_format="NDHWC", - name=None): +def max_pool3d( + x, + kernel_size, + stride=None, + padding=0, + ceil_mode=False, + data_format="NDHWC", + name=None, +): """ Implements sparse max pooling 3d operation. See more details in :ref:`api_sparse_pooling_MaxPool3d` . @@ -37,47 +39,48 @@ def max_pool3d(x, is a tuple or list, it must contain three integers, (kernel_size_Depth, kernel_size_Height, kernel_size_Width). Otherwise, the pool kernel size will be the cube of an int. - stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + stride (int|list|tuple, optional): The pool stride size. If pool stride size is a tuple or list, it must contain three integers, [stride_Depth, stride_Height, stride_Width). Otherwise, the pool stride size will be a cube of an int. 
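Put side by side, the two functional examples make the difference concrete: the regular sparse convolution changes the spatial extent, while the submanifold variant keeps the input's shape and sparsity pattern (a dygraph-mode sketch built from the docstring examples above):

import paddle

indices = paddle.to_tensor(
    [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]], dtype='int32')
values = paddle.to_tensor([[1.], [2.], [3.], [4.]], dtype='float32')
x = paddle.sparse.sparse_coo_tensor(indices, values, [1, 1, 3, 4, 1])
w = paddle.randn((1, 3, 3, 1, 1), dtype='float32')

y = paddle.sparse.nn.functional.conv3d(x, w)            # shape (1, 1, 1, 2, 1)
y_subm = paddle.sparse.nn.functional.subm_conv3d(x, w)  # shape (1, 1, 3, 4, 1), same as x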
- padding (string|int|list|tuple): The padding size. Padding could be in one of the following forms. + padding (string|int|list|tuple, optional): The padding size. Padding could be in one of the following forms. 1. A string in ['valid', 'same']. 2. An int, which means the feature map is zero padded by size of `padding` on every sides. 3. A list[int] or tuple(int) whose length is 3, [pad_depth, pad_height, pad_weight] whose value means the padding size of each dimension. 4. A list[int] or tuple(int) whose length is 6. [pad_depth_front, pad_depth_back, pad_height_top, pad_height_bottom, pad_width_left, pad_width_right] whose value means the padding size of each side. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - ceil_mode (bool): ${ceil_mode_comment} - data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. + ceil_mode (bool, optional): ${ceil_mode_comment} + data_format (string, optional): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. Currently only support `"NDHWC"` . name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. - + Returns: Tensor: The output tensor of pooling result. The data type is same as input tensor. - + Examples: .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - with _test_eager_guard(): - dense_x = paddle.randn((1, 4, 4, 4, 3)) - sparse_x = dense_x.to_sparse_coo(4) - kernel_sizes = [3, 3, 3] - paddings = [0, 0, 0] - strides = [1, 1, 1] - out = paddle.incubate.sparse.nn.functional.max_pool3d(sparse_x, kernel_sizes, stride=strides, padding=paddings) - #[1, 2, 2, 2, 3] + dense_x = paddle.randn((1, 4, 4, 4, 3)) + sparse_x = dense_x.to_sparse_coo(4) + kernel_sizes = [3, 3, 3] + paddings = [0, 0, 0] + strides = [1, 1, 1] + out = paddle.sparse.nn.functional.max_pool3d(sparse_x, kernel_sizes, stride=strides, padding=paddings) + #[1, 2, 2, 2, 3] """ assert in_dynamic_mode(), "Currently, Sparse API only support dynamic mode" - assert x.is_sparse_coo( + assert ( + x.is_sparse_coo() ), "Currently, sparse.relu only support the input of SparseCooTensor" - assert data_format == 'NDHWC', "Currently, sparse.max_pool3d only support data format of 'NDHWC'" + assert ( + data_format == 'NDHWC' + ), "Currently, sparse.max_pool3d only support data format of 'NDHWC'" kernel_size = utils.convert_to_list(kernel_size, 3, 'pool_size') if stride is None: @@ -87,12 +90,11 @@ def max_pool3d(x, channel_last = True - padding, padding_algorithm = _update_padding_nd(padding, - 3, - channel_last=channel_last, - ceil_mode=ceil_mode) + padding, padding_algorithm = _update_padding_nd( + padding, 3, channel_last=channel_last, ceil_mode=ceil_mode + ) - #TODO(zkh2016): remove the dependency on dilation from the backend + # TODO(zkh2016): remove the dependency on dilation from the backend dilation = [1, 1, 1] return _C_ops.sparse_maxpool(x, kernel_size, padding, dilation, stride) diff --git a/python/paddle/incubate/sparse/nn/functional/transformer.py b/python/paddle/sparse/nn/functional/transformer.py similarity index 96% rename from 
python/paddle/incubate/sparse/nn/functional/transformer.py rename to python/paddle/sparse/nn/functional/transformer.py index a4c9faf9ad57a1..d2fe100b7d9045 100644 --- a/python/paddle/incubate/sparse/nn/functional/transformer.py +++ b/python/paddle/sparse/nn/functional/transformer.py @@ -63,7 +63,8 @@ def attention(query, Examples: .. code-block:: python - + + # required: gpu import paddle batch_size = 16 @@ -85,7 +86,7 @@ def attention(query, kp_mask = paddle.randint(0, 2, [batch_size, seq_len]) attn_mask = paddle.randint(0, 2, [seq_len, seq_len]) - output = paddle.incubate.sparse.nn.functional.attention(query, key, value, sp_mask, kp_mask, attn_mask) + output = paddle.sparse.nn.functional.attention(query, key, value, sp_mask, kp_mask, attn_mask) output.backward() """ return _C_ops.sparse_fused_attention(query, key, value, sparse_mask, diff --git a/python/paddle/incubate/sparse/nn/layer/activation.py b/python/paddle/sparse/nn/layer/activation.py similarity index 91% rename from python/paddle/incubate/sparse/nn/layer/activation.py rename to python/paddle/sparse/nn/layer/activation.py index da374fa87a88b8..3e8e81ea0b11dc 100644 --- a/python/paddle/incubate/sparse/nn/layer/activation.py +++ b/python/paddle/sparse/nn/layer/activation.py @@ -20,6 +20,7 @@ class ReLU(Layer): """ + Sparse ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. .. math:: @@ -41,9 +42,10 @@ class ReLU(Layer): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) - relu = paddle.incubate.sparse.nn.ReLU() + relu = paddle.sparse.nn.ReLU() out = relu(sparse_x) # [0., 0., 1.] + """ def __init__(self, name=None): @@ -59,14 +61,15 @@ def extra_repr(self): class Softmax(Layer): - """ + r""" + Sparse Softmax Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. Note: - Only support axis=-1 for SparseCsrTensor, which is faster when read data + Only support axis=-1 for SparseCsrTensor, which is faster when read data by row (axis=-1). - From the point of view of dense matrix, for each row :math:`i` and each column :math:`j` + From the point of view of dense matrix, for each row :math:`i` and each column :math:`j` in the matrix, we have: .. math:: @@ -96,17 +99,17 @@ class Softmax(Layer): # [0. 0. 0. 0.98275049]] csr = paddle.to_tensor(np_x).to_sparse_csr() - # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 5, 6], - # cols=[2, 3, 0, 2, 3, 3], + # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 5, 6], + # cols=[2, 3, 0, 2, 3, 3], # values=[0.96823406, 0.19722934, 0.94373937, 0.02060066, 0.71456372, # 0.98275049]) - softmax = paddle.incubate.sparse.nn.Softmax() + softmax = paddle.sparse.nn.Softmax() out = softmax(csr) - # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, - # crows=[0, 2, 5, 6], - # cols=[2, 3, 0, 2, 3, 3], + # Tensor(shape=[3, 4], dtype=paddle.float64, place=Place(gpu:0), stop_gradient=True, + # crows=[0, 2, 5, 6], + # cols=[2, 3, 0, 2, 3, 3], # values=[0.68373820, 0.31626180, 0.45610887, 0.18119845, 0.36269269, # 1. ]) """ @@ -126,6 +129,7 @@ def extra_repr(self): class ReLU6(Layer): """ + Sparse ReLU6 Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. .. 
math:: @@ -147,8 +151,9 @@ class ReLU6(Layer): dense_x = paddle.to_tensor([-2., 0., 8.]) sparse_x = dense_x.to_sparse_coo(1) - relu6 = paddle.incubate.sparse.nn.ReLU6() + relu6 = paddle.sparse.nn.ReLU6() out = relu6(sparse_x) + """ def __init__(self, name=None): @@ -164,8 +169,9 @@ def extra_repr(self): class LeakyReLU(Layer): - """ - Sparse Leaky ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. + r""" + + Sparse Leaky ReLU Activation, requiring x to be a SparseCooTensor or SparseCsrTensor. .. math:: @@ -194,8 +200,9 @@ class LeakyReLU(Layer): dense_x = paddle.to_tensor([-2., 0., 5.]) sparse_x = dense_x.to_sparse_coo(1) - leaky_relu = paddle.incubate.sparse.nn.LeakyReLU(0.5) + leaky_relu = paddle.sparse.nn.LeakyReLU(0.5) out = leaky_relu(sparse_x) + """ def __init__(self, negative_slope=0.01, name=None): diff --git a/python/paddle/incubate/sparse/nn/layer/conv.py b/python/paddle/sparse/nn/layer/conv.py similarity index 71% rename from python/paddle/incubate/sparse/nn/layer/conv.py rename to python/paddle/sparse/nn/layer/conv.py index f44358bbe9f3e1..faf0f86f2d125d 100644 --- a/python/paddle/incubate/sparse/nn/layer/conv.py +++ b/python/paddle/sparse/nn/layer/conv.py @@ -23,23 +23,26 @@ class _Conv3D(Layer): - - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - subm=False, - key=None, - padding_mode='zeros', - weight_attr=None, - bias_attr=None, - data_format="NDHWC"): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + subm=False, + key=None, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format="NDHWC", + ): super(_Conv3D, self).__init__() - assert weight_attr is not False, "weight_attr should not be False in Conv." + assert ( + weight_attr is not False + ), "weight_attr should not be False in Conv." self._param_attr = weight_attr self._bias_attr = bias_attr self._groups = groups @@ -49,56 +52,66 @@ def __init__(self, self._subm = subm self._key = key - assert padding_mode == 'zeros', "Currently, only support padding_mode='zeros'" + assert ( + padding_mode == 'zeros' + ), "Currently, only support padding_mode='zeros'" assert groups == 1, "Currently, only support groups=1" valid_format = {'NDHWC'} if data_format not in valid_format: raise ValueError( - "data_format must be one of {}, but got data_format='{}'". 
- format(valid_format, data_format)) + "data_format must be one of {}, but got data_format='{}'".format( + valid_format, data_format + ) + ) channel_last = data_format == "NDHWC" dims = 3 self._stride = utils.convert_to_list(stride, dims, 'stride') self._dilation = utils.convert_to_list(dilation, dims, 'dilation') - self._kernel_size = utils.convert_to_list(kernel_size, dims, - 'kernel_size') + self._kernel_size = utils.convert_to_list( + kernel_size, dims, 'kernel_size' + ) self._padding = padding self._padding_mode = padding_mode self._updated_padding, self._padding_algorithm = _update_padding_nd( - padding, channel_last, dims) + padding, channel_last, dims + ) # the sparse conv restricts the shape is [D, H, W, in_channels, out_channels] filter_shape = self._kernel_size + [ - self._in_channels, self._out_channels + self._in_channels, + self._out_channels, ] def _get_default_param_initializer(): filter_elem_num = np.prod(self._kernel_size) * self._in_channels - std = (2.0 / filter_elem_num)**0.5 + std = (2.0 / filter_elem_num) ** 0.5 return Normal(0.0, std) self.weight = self.create_parameter( shape=filter_shape, attr=self._param_attr, - default_initializer=_get_default_param_initializer()) - self.bias = self.create_parameter(attr=self._bias_attr, - shape=[self._out_channels], - is_bias=True) + default_initializer=_get_default_param_initializer(), + ) + self.bias = self.create_parameter( + attr=self._bias_attr, shape=[self._out_channels], is_bias=True + ) def forward(self, x): - out = F.conv._conv3d(x, - self.weight, - bias=self.bias, - stride=self._stride, - padding=self._updated_padding, - dilation=self._dilation, - groups=self._groups, - subm=self._subm, - key=self._key, - data_format=self._data_format) + out = F.conv._conv3d( + x, + self.weight, + bias=self.bias, + stride=self._stride, + padding=self._updated_padding, + dilation=self._dilation, + groups=self._groups, + subm=self._subm, + key=self._key, + data_format=self._data_format, + ) return out def extra_repr(self): @@ -122,11 +135,11 @@ class Conv3D(_Conv3D): **Sparse Convlution3d Layer** The Sparse convolution3d layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are multidimensional SparseCooTensors with a shape of + Output(Output) are multidimensional SparseCooTensors with a shape of :math:`[N, D, H, W, C]` . Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, - and W is the width of the feature. If bias attribution is provided, - bias is added to the output of the convolution. + and W is the width of the feature. If bias attribution is provided, + bias is added to the output of the convolution. For each input :math:`X`, the equation is: .. math:: @@ -150,7 +163,7 @@ class Conv3D(_Conv3D): stride_D = stride_H = stride_W = stride. The default value is 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. 
It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). @@ -208,63 +221,65 @@ class Conv3D(_Conv3D): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - - with _test_eager_guard(): - indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] - values = [[1], [2], [3], [4]] - indices = paddle.to_tensor(indices, dtype='int32') - values = paddle.to_tensor(values, dtype='float32') - dense_shape = [1, 1, 3, 4, 1] - sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) - conv = paddle.incubate.sparse.nn.Conv3D(1, 1, (1, 3, 3)) - y = conv(sparse_x) - print(y.shape) - # (1, 1, 1, 2, 1) + + indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] + values = [[1], [2], [3], [4]] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + dense_shape = [1, 1, 3, 4, 1] + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + conv = paddle.sparse.nn.Conv3D(1, 1, (1, 3, 3)) + y = conv(sparse_x) + print(y.shape) + # (1, 1, 1, 2, 1) """ - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - padding_mode='zeros', - weight_attr=None, - bias_attr=None, - data_format="NDHWC"): - super(Conv3D, self).__init__(in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - subm=False, - key=None, - padding_mode=padding_mode, - weight_attr=weight_attr, - bias_attr=bias_attr, - data_format=data_format) + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + weight_attr=None, + bias_attr=None, + data_format="NDHWC", + ): + super(Conv3D, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + subm=False, + key=None, + padding_mode=padding_mode, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format, + ) class SubmConv3D(_Conv3D): r""" - **Sparse Submanifold Convlution3d Layer** - The Sparse submanifold convolution3d layer calculates the output based on the input, filter + **Submanifold Sparse Convlution3d Layer** + The submanifold sparse convolution3d layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input(Input) and - Output(Output) are multidimensional SparseCooTensors with a shape of + Output(Output) are multidimensional SparseCooTensors with a shape of :math:`[N, D, H, W, C]` . Where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of the feature, - and W is the width of the feature. If bias attribution is provided, + and W is the width of the feature. If bias attribution is provided, bias is added to the output of the convolution. For each input :math:`X`, the equation is: .. 
math:: - Out =(W \ast X + b + Out = W \ast X + b In the above equation: @@ -283,7 +298,7 @@ class SubmConv3D(_Conv3D): stride_D = stride_H = stride_W = stride. The default value is 1. padding(int|str|tuple|list, optional): The padding size. Padding coule be in one of the following forms. 1. a string in ['valid', 'same']. - 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` + 2. an int, which means each spartial dimension(depth, height, width) is zero paded by size of `padding` 3. a list[int] or tuple[int] whose length is the number of spartial dimensions, which contains the amount of padding on each side for each spartial dimension. It has the form [pad_d1, pad_d2, ...]. 4. a list[int] or tuple[int] whose length is 2 * number of spartial dimensions. It has the form [pad_before, pad_after, pad_before, pad_after, ...] for all spartial dimensions. 5. a list or tuple of pairs of ints. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension are also included. Each pair of integers correspond to the amount of padding for a dimension of the input. Padding in batch dimension and channel dimension should be [0, 0] or (0, 0). @@ -297,7 +312,7 @@ class SubmConv3D(_Conv3D): of the input channels, while the second half of the filters is only connected to the second half of the input channels. The default value is 1. padding_mode(str, optional): ``'zeros'``, ``'reflect'``, ``'replicate'`` or ``'circular'``. Currently only support ``'zeros'``. - key(str, optional): the key is used to save or use the same rulebook, + key(str, optional): the key is used to save or use the same rulebook, the definition and role of rulebook refers to https://pdfs.semanticscholar.org/5125/a16039cabc6320c908a4764f32596e018ad3.pdf. The default value is None. @@ -345,44 +360,46 @@ class SubmConv3D(_Conv3D): .. 
code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - - with _test_eager_guard(): - indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] - values = [[1], [2], [3], [4]] - dense_shape = [1, 1, 3, 4, 1] - indices = paddle.to_tensor(indices, dtype='int32') - values = paddle.to_tensor(values, dtype='float32') - sparse_x = paddle.incubate.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) - subm_conv = paddle.incubate.sparse.nn.SubmConv3D(1, 1, (1, 3, 3)) - y = subm_conv(sparse_x) - print(y.shape) - # (1, 1, 3, 4, 1) + + indices = [[0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 1, 2], [1, 3, 2, 3]] + values = [[1], [2], [3], [4]] + dense_shape = [1, 1, 3, 4, 1] + indices = paddle.to_tensor(indices, dtype='int32') + values = paddle.to_tensor(values, dtype='float32') + sparse_x = paddle.sparse.sparse_coo_tensor(indices, values, dense_shape, stop_gradient=True) + subm_conv = paddle.sparse.nn.SubmConv3D(1, 1, (1, 3, 3)) + y = subm_conv(sparse_x) + print(y.shape) + # (1, 1, 3, 4, 1) """ - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - groups=1, - padding_mode='zeros', - key=None, - weight_attr=None, - bias_attr=None, - data_format="NDHWC"): - super(SubmConv3D, self).__init__(in_channels, - out_channels, - kernel_size, - stride=stride, - padding=padding, - dilation=dilation, - groups=groups, - subm=True, - key=key, - padding_mode=padding_mode, - weight_attr=weight_attr, - bias_attr=bias_attr, - data_format=data_format) + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + groups=1, + padding_mode='zeros', + key=None, + weight_attr=None, + bias_attr=None, + data_format="NDHWC", + ): + super(SubmConv3D, self).__init__( + in_channels, + out_channels, + kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + subm=True, + key=key, + padding_mode=padding_mode, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format, + ) diff --git a/python/paddle/incubate/sparse/nn/layer/norm.py b/python/paddle/sparse/nn/layer/norm.py similarity index 62% rename from python/paddle/incubate/sparse/nn/layer/norm.py rename to python/paddle/sparse/nn/layer/norm.py index 776967ac04dc00..34ed96f9e434ce 100644 --- a/python/paddle/incubate/sparse/nn/layer/norm.py +++ b/python/paddle/sparse/nn/layer/norm.py @@ -12,23 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - import paddle import warnings from paddle.nn.layer.norm import _BatchNormBase from paddle.framework import no_grad +from paddle import _C_ops, in_dynamic_mode +from paddle.fluid.layer_helper import LayerHelper class BatchNorm(paddle.nn.BatchNorm1D): @@ -78,7 +67,7 @@ class BatchNorm(paddle.nn.BatchNorm1D): If it is set to None or one attribute of ParamAttr, batch_norm will create ParamAttr as bias_attr. 
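Since the layer classes moved here keep the dense-layer calling conventions, they compose directly; a small dygraph sketch (channel counts are arbitrary, and the padding is chosen so the submanifold conv keeps the spatial size):

import paddle
import paddle.sparse.nn as nn

block = paddle.nn.Sequential(
    nn.SubmConv3D(3, 8, kernel_size=3, padding=1),  # preserves the sparsity pattern
    nn.BatchNorm(8),
    nn.ReLU(),
)
x = paddle.randn((1, 4, 4, 4, 3)).to_sparse_coo(4)
y = block(x)
print(y.shape)   # [1, 4, 4, 4, 8]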
If it is set to Fasle, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. - data_format(str, optional): Specify the input data format, may be "NC", "NCL" or "NLC". Defalut "NCL". + data_format(str, optional): Specify the input data format, may be "NC", "NCL" or "NLC". Default "NCL". use_global_stats(bool|None, optional): Whether to use global mean and variance. If set to False, use the statistics of one mini-batch, if set to True, use the global statistics, if set to None, use global statistics in the test phase and use the statistics of one mini-batch in the training phase. Default: None. name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. @@ -94,71 +83,125 @@ class BatchNorm(paddle.nn.BatchNorm1D): .. code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - - with _test_eager_guard(): - paddle.seed(123) - channels = 3 - x_data = paddle.randn((1, 6, 6, 6, channels)).astype('float32') - dense_x = paddle.to_tensor(x_data) - sparse_x = dense_x.to_sparse_coo(4) - batch_norm = paddle.incubate.sparse.nn.BatchNorm(channels) - batch_norm_out = batch_norm(sparse_x) - print(batch_norm_out.shape) - # [1, 6, 6, 6, 3] + + paddle.seed(123) + channels = 3 + x_data = paddle.randn((1, 6, 6, 6, channels)).astype('float32') + dense_x = paddle.to_tensor(x_data) + sparse_x = dense_x.to_sparse_coo(4) + batch_norm = paddle.sparse.nn.BatchNorm(channels) + batch_norm_out = batch_norm(sparse_x) + print(batch_norm_out.shape) + # [1, 6, 6, 6, 3] """ - def __init__(self, - num_features, - momentum=0.9, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - data_format='NDHWC', - use_global_stats=None, - name=None): - super(BatchNorm, self).__init__(num_features, - momentum=momentum, - epsilon=epsilon, - weight_attr=weight_attr, - bias_attr=bias_attr, - data_format=data_format, - use_global_stats=use_global_stats, - name=name) + def __init__( + self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NDHWC', + use_global_stats=None, + name=None, + ): + super(BatchNorm, self).__init__( + num_features, + momentum=momentum, + epsilon=epsilon, + weight_attr=weight_attr, + bias_attr=bias_attr, + data_format=data_format, + use_global_stats=use_global_stats, + name=name, + ) def _check_data_format(self, input): if input != "NDHWC": raise ValueError('sparse BatchNorm only support layout of "NDHWC"') def forward(self, input): - values = input.values() self._check_data_format(self._data_format) - if len(values.shape) != 2: - raise ValueError('expected 2D input.values() (got {}D)'.format( - len(values.shape))) - if self.training: warnings.warn( - "When training, we now always track global mean and variance.") - - batch_norm_out = paddle.nn.functional.batch_norm( - values, - self._mean, - self._variance, - weight=self.weight, - bias=self.bias, - training=self.training, - momentum=self._momentum, - epsilon=self._epsilon, - data_format='NC', - use_global_stats=self._use_global_stats) - - return paddle.incubate.sparse.sparse_coo_tensor( - input.indices(), - batch_norm_out, - shape=input.shape, - stop_gradient=input.stop_gradient) + "When training, we now always track global mean and variance." 
+ ) + + if self._use_global_stats == None: + self._use_global_stats = not self.training + trainable_statistics = False + else: + trainable_statistics = not self._use_global_stats + + data_format = 'NCHW' if self._data_format[1] == 'C' else 'NHWC' + + if in_dynamic_mode(): + batch_norm_out, _, _, _, _, _ = _C_ops.sparse_batch_norm( + input, + self.weight, + self.bias, + self._mean, + self._variance, + self._momentum, + self._epsilon, + data_format, + not self.training, + self._use_global_stats, + trainable_statistics, + False, + ) + return batch_norm_out + else: + inputs = { + 'x': input, + 'scale': self.weight, + 'bias': self.bias, + 'mean': self._mean, + 'variance': self._variance, + } + attrs = { + 'momentum': self._momentum, + 'epsilon': self._epsilon, + 'data_layout': data_format, + 'is_test': not self.training, + 'use_global_stats': self._use_global_stats, + 'trainable_statistics': trainable_statistics, + 'fuse_with_relu': False, + } + op_type = 'sparse_batch_norm' + helper = LayerHelper(op_type) + dtype = input.dtype + mean_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + variance_out = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + saved_mean = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + saved_variance = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + reserve_space = helper.create_variable_for_type_inference( + dtype=dtype, stop_gradient=True + ) + out = helper.create_sparse_variable_for_type_inference(dtype) + outputs = { + "out": out, + "mean_out": mean_out, + "variance_out": variance_out, + "saved_mean": saved_mean, + "saved_variance": saved_variance, + "reserve_space": reserve_space, + } + + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) + return out class SyncBatchNorm(paddle.nn.SyncBatchNorm): @@ -226,6 +269,8 @@ class SyncBatchNorm(paddle.nn.SyncBatchNorm): will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. If it is set to False, this layer will not have trainable bias parameter. Default: None. + data_format(str, optional): Specify the input data format, may be "NCHW". Default "NCHW". + name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shapes: input: Tensor that the dimension from 2 to 5. 
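The flag resolution in the sparse BatchNorm forward above can be summarized with a tiny helper (illustrative only, mirroring the branch in the code):

def resolve_stats_flags(use_global_stats, training):
    # returns (use_global_stats, trainable_statistics)
    if use_global_stats is None:
        return (not training), False
    return use_global_stats, (not use_global_stats)

assert resolve_stats_flags(None, training=True) == (False, False)
assert resolve_stats_flags(None, training=False) == (True, False)
assert resolve_stats_flags(True, training=True) == (True, False)
assert resolve_stats_flags(False, training=False) == (False, True)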
@@ -237,11 +282,9 @@ class SyncBatchNorm(paddle.nn.SyncBatchNorm): # required: gpu import paddle - import paddle.incubate.sparse.nn as nn - import numpy as np + import paddle.sparse.nn as nn - x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32') - x = paddle.to_tensor(x) + x = paddle.to_tensor([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]], dtype='float32') x = x.to_sparse_coo(len(x.shape)-1) if paddle.is_compiled_with_cuda(): @@ -258,29 +301,48 @@ class SyncBatchNorm(paddle.nn.SyncBatchNorm): # [-0.88415730, 1.57439375]]) """ - def __init__(self, - num_features, - momentum=0.9, - epsilon=1e-05, - weight_attr=None, - bias_attr=None, - data_format='NCHW', - name=None): - super(SyncBatchNorm, - self).__init__(num_features, momentum, epsilon, weight_attr, - bias_attr, data_format, name) + def __init__( + self, + num_features, + momentum=0.9, + epsilon=1e-05, + weight_attr=None, + bias_attr=None, + data_format='NCHW', + name=None, + ): + super(SyncBatchNorm, self).__init__( + num_features, + momentum, + epsilon, + weight_attr, + bias_attr, + data_format, + name, + ) def forward(self, x): - assert x.is_sparse_coo( - ), "SyncBatchNorm only support SparseTensor in COO format." - out = super(SyncBatchNorm, self).forward(x.values()) - return paddle.incubate.sparse.sparse_coo_tensor( - x.indices(), out, shape=x.shape, stop_gradient=x.stop_gradient) + self._check_data_format() + sync_batch_norm_out, _, _, _, _, _ = _C_ops.sparse_sync_batch_norm_( + x, + self.weight, + self.bias, + self._mean, + self._variance, + self._momentum, + self._epsilon, + self._data_format, + not self.training, + False, + False, + False, + ) + return sync_batch_norm_out @classmethod def convert_sync_batchnorm(cls, layer): r""" - Helper function to convert :class: `paddle.incubate.sparse.nn.BatchNorm` layers in the model to :class: `paddle.incubate.sparse.nn.SyncBatchNorm` layers. + Helper function to convert :class: `paddle.sparse.nn.BatchNorm` layers in the model to :class: `paddle.sparse.nn.SyncBatchNorm` layers. Parameters: layer(paddle.nn.Layer): model containing one or more `BatchNorm` layers. @@ -293,7 +355,7 @@ def convert_sync_batchnorm(cls, layer): .. 
code-block:: python import paddle - import paddle.incubate.sparse.nn as nn + import paddle.sparse.nn as nn model = paddle.nn.Sequential(nn.Conv3D(3, 5, 3), nn.BatchNorm(5)) sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model) @@ -301,27 +363,41 @@ def convert_sync_batchnorm(cls, layer): layer_output = layer if isinstance(layer, _BatchNormBase): - if layer._weight_attr != None and not isinstance( - layer._weight_attr, - bool) and layer._weight_attr.name != None: + if ( + layer._weight_attr != None + and not isinstance(layer._weight_attr, bool) + and layer._weight_attr.name != None + ): layer._weight_attr.name = layer._weight_attr.name + '_sync' - if layer._bias_attr != None and not isinstance( - layer._bias_attr, bool) and layer._bias_attr.name != None: + if ( + layer._bias_attr != None + and not isinstance(layer._bias_attr, bool) + and layer._bias_attr.name != None + ): layer._bias_attr.name = layer._bias_attr.name + '_sync' - #convert sparse BatchNorm + # convert sparse BatchNorm if isinstance(layer, BatchNorm): - layer_output = SyncBatchNorm(layer._num_features, - layer._momentum, layer._epsilon, - layer._weight_attr, - layer._bias_attr, - layer._data_format, layer._name) - #convert dense BatchNorm + layer_output = SyncBatchNorm( + layer._num_features, + layer._momentum, + layer._epsilon, + layer._weight_attr, + layer._bias_attr, + layer._data_format, + layer._name, + ) + # convert dense BatchNorm else: layer_output = paddle.nn.SyncBatchNorm( - layer._num_features, layer._momentum, layer._epsilon, - layer._weight_attr, layer._bias_attr, layer._data_format, - layer._name) + layer._num_features, + layer._momentum, + layer._epsilon, + layer._weight_attr, + layer._bias_attr, + layer._data_format, + layer._name, + ) if layer._weight_attr != False and layer._bias_attr != False: with no_grad(): @@ -331,7 +407,8 @@ def convert_sync_batchnorm(cls, layer): layer_output._variance = layer._variance for name, sublayer in layer.named_children(): - layer_output.add_sublayer(name, - cls.convert_sync_batchnorm(sublayer)) + layer_output.add_sublayer( + name, cls.convert_sync_batchnorm(sublayer) + ) del layer return layer_output diff --git a/python/paddle/incubate/sparse/nn/layer/pooling.py b/python/paddle/sparse/nn/layer/pooling.py similarity index 79% rename from python/paddle/incubate/sparse/nn/layer/pooling.py rename to python/paddle/sparse/nn/layer/pooling.py index 9fb67ecc0a6dd5..340e7e5e1fce1c 100644 --- a/python/paddle/incubate/sparse/nn/layer/pooling.py +++ b/python/paddle/sparse/nn/layer/pooling.py @@ -61,26 +61,26 @@ class MaxPool3D(Layer): .. 
code-block:: python import paddle - from paddle.fluid.framework import _test_eager_guard - with _test_eager_guard(): - dense_x = paddle.randn((2, 3, 6, 6, 3)) - sparse_x = dense_x.to_sparse_coo(4) - max_pool3d = paddle.incubate.sparse.nn.MaxPool3D( - kernel_size=3, data_format='NDHWC') - out = max_pool3d(sparse_x) - #shape=[2, 1, 2, 2, 3] + dense_x = paddle.randn((2, 3, 6, 6, 3)) + sparse_x = dense_x.to_sparse_coo(4) + max_pool3d = paddle.sparse.nn.MaxPool3D( + kernel_size=3, data_format='NDHWC') + out = max_pool3d(sparse_x) + #shape=[2, 1, 2, 2, 3] """ - def __init__(self, - kernel_size, - stride=None, - padding=0, - return_mask=False, - ceil_mode=False, - data_format="NDHWC", - name=None): + def __init__( + self, + kernel_size, + stride=None, + padding=0, + return_mask=False, + ceil_mode=False, + data_format="NDHWC", + name=None, + ): super(MaxPool3D, self).__init__() self.ksize = kernel_size self.stride = stride @@ -91,14 +91,17 @@ def __init__(self, self.name = name def forward(self, x): - return F.max_pool3d(x, - kernel_size=self.ksize, - stride=self.stride, - padding=self.padding, - ceil_mode=self.ceil_mode, - data_format=self.data_format, - name=self.name) + return F.max_pool3d( + x, + kernel_size=self.ksize, + stride=self.stride, + padding=self.padding, + ceil_mode=self.ceil_mode, + data_format=self.data_format, + name=self.name, + ) def extra_repr(self): return 'kernel_size={ksize}, stride={stride}, padding={padding}'.format( - **self.__dict__) + **self.__dict__ + ) diff --git a/python/paddle/incubate/sparse/unary.py b/python/paddle/sparse/unary.py similarity index 75% rename from python/paddle/incubate/sparse/unary.py rename to python/paddle/sparse/unary.py index 472a71d482b81a..32825c32b3650b 100644 --- a/python/paddle/incubate/sparse/unary.py +++ b/python/paddle/sparse/unary.py @@ -15,7 +15,11 @@ import numpy as np from paddle import _C_ops, _legacy_C_ops -from paddle.fluid.framework import dygraph_only, core, convert_np_dtype_to_dtype_ +from paddle.fluid.framework import ( + dygraph_only, + core, + convert_np_dtype_to_dtype_, +) __all__ = [] @@ -33,7 +37,7 @@ def sin(x, name=None): """ Calculate elementwise sin of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = sin(x) @@ -53,8 +57,8 @@ def sin(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.sin(sparse_x) - + out = paddle.sparse.sin(sparse_x) + """ return _C_ops.sparse_sin(x) @@ -63,7 +67,7 @@ def sin(x, name=None): def tan(x, name=None): """ Calculate elementwise tan of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = tan(x) @@ -83,8 +87,8 @@ def tan(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.tan(sparse_x) - + out = paddle.sparse.tan(sparse_x) + """ return _C_ops.sparse_tan(x) @@ -93,7 +97,7 @@ def tan(x, name=None): def asin(x, name=None): """ Calculate elementwise asin of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = asin(x) @@ -113,17 +117,48 @@ def asin(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.asin(sparse_x) - + out = paddle.sparse.asin(sparse_x) + """ return _C_ops.sparse_asin(x) +@dygraph_only +def transpose(x, perm, name=None): + """ + Changes the perm order of ``x`` without changing its data, requiring x to be a SparseCooTensor or SparseCsrTensor. + + .. 
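The unary ops in this module share the same pattern, so they chain naturally; a dygraph sketch (to_dense() is used only to inspect the result):

import paddle

dense_x = paddle.to_tensor([-0.5, 0.0, 1.0])
sparse_x = dense_x.to_sparse_coo(1)
out = paddle.sparse.tanh(paddle.sparse.sin(sparse_x))
print(out.to_dense())   # elementwise tanh(sin(x)) applied to the stored values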
math:: + + out = transpose(x, perm) + + Parameters: + x (Tensor): The input Sparse Tensor with data type float32, float64. + perm (list|tuple): Permute the input according to the data of perm. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + A transposed Sparse Tensor with the same data type as ``x``. + + Examples: + .. code-block:: python + + import paddle + + dense_x = paddle.to_tensor([[-2., 0.], [1., 2.]]) + sparse_x = dense_x.to_sparse_coo(1) + out = paddle.sparse.transpose(sparse_x, [1, 0]) + + """ + return _C_ops.sparse_transpose(x, perm) + + @dygraph_only def atan(x, name=None): """ Calculate elementwise atan of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = atan(x) @@ -143,8 +178,8 @@ def atan(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.atan(sparse_x) - + out = paddle.sparse.atan(sparse_x) + """ return _C_ops.sparse_atan(x) @@ -153,7 +188,7 @@ def atan(x, name=None): def sinh(x, name=None): """ Calculate elementwise sinh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = sinh(x) @@ -173,8 +208,8 @@ def sinh(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.sinh(sparse_x) - + out = paddle.sparse.sinh(sparse_x) + """ return _C_ops.sparse_sinh(x) @@ -183,7 +218,7 @@ def sinh(x, name=None): def asinh(x, name=None): """ Calculate elementwise asinh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = asinh(x) @@ -203,8 +238,8 @@ def asinh(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.asinh(sparse_x) - + out = paddle.sparse.asinh(sparse_x) + """ return _C_ops.sparse_asinh(x) @@ -213,7 +248,7 @@ def asinh(x, name=None): def atanh(x, name=None): """ Calculate elementwise atanh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = atanh(x) @@ -233,8 +268,8 @@ def atanh(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.atanh(sparse_x) - + out = paddle.sparse.atanh(sparse_x) + """ return _C_ops.sparse_atanh(x) @@ -243,7 +278,7 @@ def atanh(x, name=None): def tanh(x, name=None): """ Calculate elementwise tanh of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = tanh(x) @@ -260,11 +295,11 @@ def tanh(x, name=None): .. code-block:: python import paddle - + dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.tanh(sparse_x) - + out = paddle.sparse.tanh(sparse_x) + """ return _C_ops.sparse_tanh(x) @@ -273,7 +308,7 @@ def tanh(x, name=None): def square(x, name=None): """ Calculate elementwise square of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = square(x) @@ -290,11 +325,11 @@ def square(x, name=None): .. 
code-block:: python import paddle - + dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.square(sparse_x) - + out = paddle.sparse.square(sparse_x) + """ return _C_ops.sparse_square(x) @@ -303,7 +338,7 @@ def square(x, name=None): def sqrt(x, name=None): """ Calculate elementwise sqrt of SparseTensor, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: out = sqrt(x) @@ -323,8 +358,8 @@ def sqrt(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.sqrt(sparse_x) - + out = paddle.sparse.sqrt(sparse_x) + """ return _C_ops.sparse_sqrt(x) @@ -353,8 +388,8 @@ def log1p(x, name=None): dense_x = paddle.to_tensor([-2, 0, 1], dtype='float32') sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.log1p(sparse_x) - + out = paddle.sparse.log1p(sparse_x) + """ return _C_ops.sparse_log1p(x) @@ -367,7 +402,7 @@ def cast(x, index_dtype=None, value_dtype=None, name=None): Parameters: x (Tensor): The input Sparse Tensor with data type float32, float64. - index_dtype (np.dtype|str, optional): Data type of the index of SparseCooTensor, + index_dtype (np.dtype|str, optional): Data type of the index of SparseCooTensor, or crows/cols of SparseCsrTensor. Can be uint8, int8, int16, int32, int64. value_dtype (np.dtype|str, optional): Data type of the value of SparseCooTensor, SparseCsrTensor. Can be bool, float16, float32, float64, int8, int32, int64, uint8. @@ -384,8 +419,8 @@ def cast(x, index_dtype=None, value_dtype=None, name=None): dense_x = paddle.to_tensor([-2, 0, 1]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.cast(sparse_x, 'int32', 'float64') - + out = paddle.sparse.cast(sparse_x, 'int32', 'float64') + """ if index_dtype and not isinstance(index_dtype, core.VarDesc.VarType): index_dtype = convert_np_dtype_to_dtype_(index_dtype) @@ -419,8 +454,8 @@ def pow(x, factor, name=None): dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.pow(sparse_x, 2) - + out = paddle.sparse.pow(sparse_x, 2) + """ return _C_ops.sparse_pow(x, float(factor)) @@ -449,8 +484,8 @@ def neg(x, name=None): dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.neg(sparse_x) - + out = paddle.sparse.neg(sparse_x) + """ return _C_ops.sparse_scale(x, -1.0, 0.0, True) @@ -479,19 +514,21 @@ def abs(x, name=None): dense_x = paddle.to_tensor([-2, 0, 3], dtype='float32') sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.abs(sparse_x) - + out = paddle.sparse.abs(sparse_x) + """ return _C_ops.sparse_abs(x) @dygraph_only -def coalesce(x): +def coalesce(x, name=None): r""" the coalesced operator include sorted and merge, after coalesced, the indices of x is sorted and unique. Parameters: x (Tensor): the input SparseCooTensor. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Returns: Tensor: return the SparseCooTensor after coalesced. 
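The elementwise helpers in this module (``sin``, ``sqrt``, ``square``, ``pow``, ``abs`` and friends) all follow the same pattern: they are dygraph-only, act on the stored entries of a SparseCooTensor or SparseCsrTensor, and return a sparse tensor with the same sparsity pattern. A minimal sketch, assuming the renamed ``paddle.sparse`` namespace introduced by this patch is importable:

.. code-block:: python

    import paddle

    dense_x = paddle.to_tensor([-2., 0., 3.])
    sp_x = dense_x.to_sparse_coo(1)

    # each call returns a new sparse tensor; only the stored values change
    sp_sq = paddle.sparse.square(sp_x)
    sp_abs = paddle.sparse.abs(sp_x)
    sp_pow = paddle.sparse.pow(sp_x, 2)

    print(sp_abs.values())   # absolute values of the stored entries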
@@ -501,17 +538,15 @@ def coalesce(x): import paddle - from paddle.incubate import sparse - indices = [[0, 0, 1], [1, 1, 2]] values = [1.0, 2.0, 3.0] - sp_x = sparse.sparse_coo_tensor(indices, values) - sp_x = sparse.coalesce(sp_x) + sp_x = paddle.sparse.sparse_coo_tensor(indices, values) + sp_x = paddle.sparse.coalesce(sp_x) print(sp_x.indices()) #[[0, 1], [1, 2]] print(sp_x.values()) #[3.0, 3.0] - """ + """ return _C_ops.sparse_coalesce(x) @@ -540,8 +575,8 @@ def rad2deg(x, name=None): dense_x = paddle.to_tensor([3.142, 0., -3.142]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.rad2deg(sparse_x) - + out = paddle.sparse.rad2deg(sparse_x) + """ if x.dtype in _int_dtype_: x = _C_ops.sparse_cast(x, None, core.VarDesc.VarType.FP32) @@ -553,7 +588,7 @@ def deg2rad(x, name=None): """ Convert each of the elements of input x from degrees to angles in radians, requiring x to be a SparseCooTensor or SparseCsrTensor. - + .. math:: deg2rad(x) = \pi * x / 180 @@ -573,8 +608,8 @@ def deg2rad(x, name=None): dense_x = paddle.to_tensor([-180, 0, 180]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.deg2rad(sparse_x) - + out = paddle.sparse.deg2rad(sparse_x) + """ if x.dtype in _int_dtype_: x = _C_ops.sparse_cast(x, None, core.VarDesc.VarType.FP32) @@ -605,6 +640,63 @@ def expm1(x, name=None): dense_x = paddle.to_tensor([-2., 0., 1.]) sparse_x = dense_x.to_sparse_coo(1) - out = paddle.incubate.sparse.expm1(sparse_x) + out = paddle.sparse.expm1(sparse_x) """ return _C_ops.sparse_expm1(x) + + +@dygraph_only +def reshape(x, shape, name=None): + """ + Changes the shape of ``x`` without changing its value, requiring x to be a SparseCooTensor or SparseCsrTensor. + Currently this function can only reshape the sparse dims of ``x`` , but ``shape`` argument must be specified + as the shape of the reshaped tensor. + + Note that if x is a SparseCsrTensor, then len(shape) must be 2 or 3. + + There are some tricks when specifying the target shape. + + - 1. -1 means the value of this dimension is inferred from the total element number of x and remaining dimensions. Thus one and only one dimension can be set -1. + + - 2. 0 means the actual dimension value is going to be copied from the corresponding dimension of x. The indices of 0 in the target shape can not exceed the rank of x. + + Here are some examples to explain it. + + - 1. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape is [6, 8], the reshape operator will transform x into a 2-D tensor with shape [6, 8] and leaving x's data unchanged. + + - 2. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape is [2, 3, -1, 2], the reshape operator will transform x into a 4-D tensor with shape [2, 3, 4, 2] and leaving x's data unchanged. In this case, one dimension of the target shape is set to -1, the value of this dimension is inferred from the total element number of x and remaining dimensions. + + - 3. Given a 3-D tensor x with a shape [2, 4, 6], and the target shape is [-1, 0, 3, 2], the reshape operator will transform x into a 4-D tensor with shape [2, 4, 3, 2] and leaving x's data unchanged. In this case, besides -1, 0 means the actual dimension value is going to be copied from the corresponding dimension of x. + + Args: + x (Tensor): The input sparse tensor with data type ``float32``, ``float64``, ``int32``, ``int64`` or ``bool``. + shape (list|tuple): Define the target shape. At most one dimension of the target shape can be -1. + The data type is ``int32``. 
+ name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor: A reshaped Tensor with the same data type as ``x``. + + Examples: + .. code-block:: python + + import paddle + + x_shape = [6, 2, 3] + new_shape = [1, 0, 2, -1, 3] + format = "coo" + + dense_x = paddle.randint(-100, 100, x_shape) * paddle.randint(0, 2, x_shape) + + if format == "coo": + sp_x = dense_x.to_sparse_coo(len(x_shape)) + else: + sp_x = dense_x.to_sparse_csr() + sp_out = paddle.sparse.reshape(sp_x, new_shape) + + print(sp_out) + # the shape of sp_out is [1, 2, 2, 3, 3] + + """ + return _C_ops.sparse_reshape(x, shape) diff --git a/python/paddle/static/input.py b/python/paddle/static/input.py index 4098ae5dbf3581..3b8cf5ea67c9bf 100644 --- a/python/paddle/static/input.py +++ b/python/paddle/static/input.py @@ -107,7 +107,8 @@ def data(name, shape, dtype=None, lod_level=0): stop_gradient=True, lod_level=lod_level, is_data=True, - need_check_feed=True) + need_check_feed=True, + ) else: return helper.create_global_variable( name=name, @@ -117,7 +118,8 @@ def data(name, shape, dtype=None, lod_level=0): stop_gradient=True, lod_level=lod_level, is_data=True, - need_check_feed=True) + need_check_feed=True, + ) class InputSpec(object): @@ -166,7 +168,8 @@ def _create_feed_layer(self): def __repr__(self): return '{}(shape={}, dtype={}, name={})'.format( - type(self).__name__, self.shape, self.dtype, self.name) + type(self).__name__, self.shape, self.dtype, self.name + ) @classmethod def from_tensor(cls, tensor, name=None): @@ -182,13 +185,12 @@ def from_tensor(cls, tensor, name=None): Examples: .. code-block:: python - import numpy as np import paddle from paddle.static import InputSpec paddle.disable_static() - x = paddle.to_tensor(np.ones([2, 2], np.float32)) + x = paddle.ones([2, 2], dtype="float32") x_spec = InputSpec.from_tensor(x, name='x') print(x_spec) # InputSpec(shape=(2, 2), dtype=paddle.float32, name=x) @@ -198,7 +200,9 @@ def from_tensor(cls, tensor, name=None): else: raise ValueError( "Input `tensor` should be a Tensor, but received {}.".format( - type(tensor).__name__)) + type(tensor).__name__ + ) + ) @classmethod def from_numpy(cls, ndarray, name=None): @@ -247,13 +251,17 @@ def batch(self, batch_size): if isinstance(batch_size, (list, tuple)): if len(batch_size) != 1: raise ValueError( - "Length of batch_size: {} shall be 1, but received {}.". - format(batch_size, len(batch_size))) + "Length of batch_size: {} shall be 1, but received {}.".format( + batch_size, len(batch_size) + ) + ) batch_size = batch_size[1] elif not isinstance(batch_size, six.integer_types): raise TypeError( "type(batch_size) shall be `int`, but received {}.".format( - type(batch_size).__name__)) + type(batch_size).__name__ + ) + ) new_shape = [batch_size] + list(self.shape) self.shape = tuple(new_shape) @@ -279,7 +287,8 @@ def unbatch(self): """ if len(self.shape) == 0: raise ValueError( - "Not support to unbatch a InputSpec when len(shape) == 0.") + "Not support to unbatch a InputSpec when len(shape) == 0." + ) self.shape = self._verify(self.shape[1:]) return self @@ -290,20 +299,25 @@ def _verify(self, shape): """ if not isinstance(shape, (list, tuple)): raise TypeError( - "Type of `shape` in InputSpec should be one of (tuple, list), but received {}." 
- .format(type(shape).__name__)) + "Type of `shape` in InputSpec should be one of (tuple, list), but received {}.".format( + type(shape).__name__ + ) + ) if len(shape) == 0: raise ValueError( - "`shape` in InputSpec should contain at least 1 element, but received {}." - .format(shape)) + "`shape` in InputSpec should contain at least 1 element, but received {}.".format( + shape + ) + ) for i, ele in enumerate(shape): if ele is not None: if not isinstance(ele, six.integer_types): raise ValueError( - "shape[{}] should be an `int`, but received `{}`:{}.". - format(i, - type(ele).__name__, ele)) + "shape[{}] should be an `int`, but received `{}`:{}.".format( + i, type(ele).__name__, ele + ) + ) if ele is None or ele < -1: shape[i] = -1 @@ -328,8 +342,9 @@ def __hash__(self): def __eq__(self, other): slots = ['shape', 'dtype', 'name'] - return (type(self) is type(other) and all( - getattr(self, attr) == getattr(other, attr) for attr in slots)) + return type(self) is type(other) and all( + getattr(self, attr) == getattr(other, attr) for attr in slots + ) def __ne__(self, other): return not self == other diff --git a/python/paddle/static/io.py b/python/paddle/static/io.py index de9e48b3367cc7..b880da6abc59db 100644 --- a/python/paddle/static/io.py +++ b/python/paddle/static/io.py @@ -40,9 +40,9 @@ __all__ = [] -_logger = get_logger(__name__, - logging.INFO, - fmt='%(asctime)s-%(levelname)s: %(message)s') +_logger = get_logger( + __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s' +) def _check_args(caller, args, supported_args=None, deprecated_args=None): @@ -51,12 +51,16 @@ def _check_args(caller, args, supported_args=None, deprecated_args=None): for arg in args: if arg in deprecated_args: raise ValueError( - "argument '{}' in function '{}' is deprecated, only {} are supported." - .format(arg, caller, supported_args)) + "argument '{}' in function '{}' is deprecated, only {} are supported.".format( + arg, caller, supported_args + ) + ) elif arg not in supported_args: raise ValueError( - "function '{}' doesn't support argument '{}',\n only {} are supported." - .format(caller, arg, supported_args)) + "function '{}' doesn't support argument '{}',\n only {} are supported.".format( + caller, arg, supported_args + ) + ) def _check_vars(name, var_list): @@ -64,7 +68,8 @@ def _check_vars(name, var_list): var_list = [var_list] if not var_list or not all([isinstance(var, Variable) for var in var_list]): raise ValueError( - "'{}' should be a Variable or a list of Variable.".format(name)) + "'{}' should be a Variable or a list of Variable.".format(name) + ) def _normalize_path_prefix(path_prefix): @@ -93,29 +98,35 @@ def _get_valid_program(program=None): "The type of input program is invalid, expected tyep is Program, but received None" ) warnings.warn( - "The input is a CompiledProgram, this is not recommended.") + "The input is a CompiledProgram, this is not recommended." 
+ ) if not isinstance(program, Program): raise TypeError( "The type of input program is invalid, expected type is fluid.Program, but received %s" - % type(program)) + % type(program) + ) return program def _clone_var_in_block(block, var): assert isinstance(var, Variable) if var.desc.type() == core.VarDesc.VarType.LOD_TENSOR: - return block.create_var(name=var.name, - shape=var.shape, - dtype=var.dtype, - type=var.type, - lod_level=var.lod_level, - persistable=True) + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + lod_level=var.lod_level, + persistable=True, + ) else: - return block.create_var(name=var.name, - shape=var.shape, - dtype=var.dtype, - type=var.type, - persistable=True) + return block.create_var( + name=var.name, + shape=var.shape, + dtype=var.dtype, + type=var.type, + persistable=True, + ) def normalize_program(program, feed_vars, fetch_vars): @@ -132,11 +143,6 @@ def normalize_program(program, feed_vars, fetch_vars): Returns: Program: Normalized/Optimized program. - Raises: - TypeError: If `program` is not a Program, an exception is thrown. - TypeError: If `feed_vars` is not a Variable or a list of Variable, an exception is thrown. - TypeError: If `fetch_vars` is not a Variable or a list of Variable, an exception is thrown. - Examples: .. code-block:: python @@ -163,18 +169,21 @@ def normalize_program(program, feed_vars, fetch_vars): """ if not isinstance(program, Program): raise TypeError( - "program type must be `fluid.Program`, but received `%s`" % - type(program)) + "program type must be `fluid.Program`, but received `%s`" + % type(program) + ) if not isinstance(feed_vars, list): feed_vars = [feed_vars] if not all(isinstance(v, Variable) for v in feed_vars): raise TypeError( - "feed_vars type must be a Variable or a list of Variable.") + "feed_vars type must be a Variable or a list of Variable." + ) if not isinstance(fetch_vars, list): fetch_vars = [fetch_vars] if not all(isinstance(v, Variable) for v in fetch_vars): raise TypeError( - "fetch_vars type must be a Variable or a list of Variable.") + "fetch_vars type must be a Variable or a list of Variable." + ) # remind users to set auc_states to 0 if auc op were found. for op in program.global_block().ops: @@ -182,8 +191,10 @@ def normalize_program(program, feed_vars, fetch_vars): device_attr_name = core.op_proto_and_checker_maker.kOpDeviceAttrName() op._set_attr(device_attr_name, "") if op.type == 'auc': - warnings.warn("Be sure that you have set auc states to 0 " - "before saving inference model.") + warnings.warn( + "Be sure that you have set auc states to 0 " + "before saving inference model." + ) break # fix the bug that the activation op's output as target will be pruned. 
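The surrounding hunks show how ``normalize_program`` prepares a training program for inference export: it resets per-op device attributes, warns about stale ``auc`` states, and prunes the program down to what ``feed_vars`` and ``fetch_vars`` actually need. A condensed usage sketch, assuming the public ``paddle.static.normalize_program`` entry point:

.. code-block:: python

    import paddle

    paddle.enable_static()

    image = paddle.static.data(name='img', shape=[None, 784], dtype='float32')
    predict = paddle.static.nn.fc(image, 10, activation='softmax')

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(paddle.static.default_startup_program())

    # prune the default main program to the feed/fetch interface
    normalized = paddle.static.normalize_program(
        paddle.static.default_main_program(),
        feed_vars=[image],
        fetch_vars=[predict],
    )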
@@ -193,9 +204,9 @@ def normalize_program(program, feed_vars, fetch_vars): uniq_fetch_vars = [] for i, var in enumerate(fetch_vars): if var.dtype != paddle.bool: - var = layers.scale(var, - 1., - name="save_infer_model/scale_{}".format(i)) + var = layers.scale( + var, 1.0, name="save_infer_model/scale_{}".format(i) + ) uniq_fetch_vars.append(var) fetch_vars = uniq_fetch_vars @@ -213,7 +224,8 @@ def normalize_program(program, feed_vars, fetch_vars): feed_var_names = [var.name for var in feed_vars] copy_program = copy_program._prune_with_input( - feeded_var_names=feed_var_names, targets=fetch_vars) + feeded_var_names=feed_var_names, targets=fetch_vars + ) copy_program = copy_program._inference_optimize(prune_read_op=True) fetch_var_names = [var.name for var in fetch_vars] prepend_feed_ops(copy_program, feed_var_names) @@ -243,9 +255,11 @@ def is_persistable(var): param = fluid.default_main_program().global_block().var('fc.b') res = fluid.io.is_persistable(param) """ - if var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH or \ - var.desc.type() == core.VarDesc.VarType.FETCH_LIST or \ - var.desc.type() == core.VarDesc.VarType.READER: + if ( + var.desc.type() == core.VarDesc.VarType.FEED_MINIBATCH + or var.desc.type() == core.VarDesc.VarType.FETCH_LIST + or var.desc.type() == core.VarDesc.VarType.READER + ): return False return var.persistable @@ -266,10 +280,6 @@ def serialize_program(feed_vars, fetch_vars, **kwargs): Returns: bytes: serialized program. - Raises: - ValueError: If `feed_vars` is not a Variable or a list of Variable, an exception is thrown. - ValueError: If `fetch_vars` is not a Variable or a list of Variable, an exception is thrown. - Examples: .. code-block:: python @@ -329,10 +339,6 @@ def serialize_persistables(feed_vars, fetch_vars, executor, **kwargs): Returns: bytes: serialized program. - Raises: - ValueError: If `feed_vars` is not a Variable or a list of Variable, an exception is thrown. - ValueError: If `fetch_vars` is not a Variable or a list of Variable, an exception is thrown. - Examples: .. code-block:: python @@ -377,8 +383,10 @@ def _serialize_persistables(program, executor): vars_ = list(filter(is_persistable, program.list_vars())) # warn if no variable found in model if len(vars_) == 0: - warnings.warn("no variable in your model, please ensure there are any " - "variables in your model to save") + warnings.warn( + "no variable in your model, please ensure there are any " + "variables in your model to save" + ) return None # create a new program and clone persitable vars to it save_program = Program() @@ -395,16 +403,16 @@ def _serialize_persistables(program, executor): in_vars.append(save_var_map[name]) out_var_name = unique_name.generate("out_var") - out_var = save_block.create_var(type=core.VarDesc.VarType.RAW, - name=out_var_name) + out_var = save_block.create_var( + type=core.VarDesc.VarType.RAW, name=out_var_name + ) out_var.desc.set_persistable(True) - save_block.append_op(type='save_combine', - inputs={'X': in_vars}, - outputs={'Y': out_var}, - attrs={ - 'file_path': '', - 'save_to_memory': True - }) + save_block.append_op( + type='save_combine', + inputs={'X': in_vars}, + outputs={'Y': out_var}, + attrs={'file_path': '', 'save_to_memory': True}, + ) # run save_program to save vars # NOTE(zhiqiu): save op will add variable kLookupTablePath to save_program.desc, # which leads to diff between save_program and its desc. 
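The serialization helpers above split an inference model into two byte blobs: ``serialize_program`` captures the pruned topology, while ``serialize_persistables`` builds a temporary program with a ``save_combine`` op to capture the parameters. A rough round-trip sketch, assuming these helpers are exported under ``paddle.static`` (file names are placeholders):

.. code-block:: python

    import paddle

    paddle.enable_static()

    x = paddle.static.data(name='x', shape=[None, 4], dtype='float32')
    y = paddle.static.nn.fc(x, 2)

    exe = paddle.static.Executor(paddle.CPUPlace())
    exe.run(paddle.static.default_startup_program())

    # topology and parameters are serialized separately ...
    program_bytes = paddle.static.serialize_program([x], [y])
    params_bytes = paddle.static.serialize_persistables([x], [y], exe)

    # ... and written out with the plain byte-dump helper
    paddle.static.save_to_file('./sample_model.pdmodel', program_bytes)
    paddle.static.save_to_file('./sample_model.pdiparams', params_bytes)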
Call _sync_with_cpp @@ -451,11 +459,10 @@ def save_to_file(path, content): @static_only -def save_inference_model(path_prefix, feed_vars, fetch_vars, executor, - **kwargs): +def save_inference_model( + path_prefix, feed_vars, fetch_vars, executor, **kwargs +): """ - :api_attr: Static Graph - Save current model and its parameters to given path. i.e. Given path_prefix = "/path/to/modelname", after invoking save_inference_model(path_prefix, feed_vars, fetch_vars, executor), @@ -472,15 +479,11 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor, - program(Program): specify a program if you don't want to use default main program. - - clip_extra(bool): set to True if you want to clip extra information for every operator. + - clip_extra(bool): the flag indicating whether to clip extra information for every operator. Default: True. Returns: None - Raises: - ValueError: If `feed_vars` is not a Variable or a list of Variable, an exception is thrown. - ValueError: If `fetch_vars` is not a Variable or a list of Variable, an exception is thrown. - Examples: .. code-block:: python @@ -534,11 +537,12 @@ def save_inference_model(path_prefix, feed_vars, fetch_vars, executor, _check_vars('fetch_vars', fetch_vars) program = _get_valid_program(kwargs.get('program', None)) - clip_extra = kwargs.get('clip_extra', False) + clip_extra = kwargs.get('clip_extra', True) program = normalize_program(program, feed_vars, fetch_vars) # serialize and save program program_bytes = _serialize_program( - program._remove_training_info(clip_extra=clip_extra)) + program._remove_training_info(clip_extra=clip_extra) + ) save_to_file(model_path, program_bytes) # serialize and save params params_bytes = _serialize_persistables(program, executor) @@ -588,8 +592,9 @@ def deserialize_program(data): """ program = Program.parse_from_string(data) if not core._is_program_version_supported(program._version()): - raise ValueError("Unsupported program version: %d\n" % - program._version()) + raise ValueError( + "Unsupported program version: %d\n" % program._version() + ) return program @@ -638,8 +643,9 @@ def deserialize_persistables(program, data, executor): """ if not isinstance(program, Program): raise TypeError( - "program type must be `fluid.Program`, but received `%s`" % - type(program)) + "program type must be `fluid.Program`, but received `%s`" + % type(program) + ) # load params to a tmp program load_program = Program() load_block = load_program.global_block() @@ -663,9 +669,9 @@ def deserialize_persistables(program, data, executor): load_var_map[var_copy.name] = var_copy if data is None: - assert len( - origin_shape_map - ) == 0, "Required 'data' shall be not None if program contains parameter, but received 'data' is None." + assert ( + len(origin_shape_map) == 0 + ), "Required 'data' shall be not None if program contains parameter, but received 'data' is None." 
return # append load_combine op to load parameters, @@ -677,10 +683,8 @@ def deserialize_persistables(program, data, executor): inputs={}, outputs={"Out": load_var_list}, # if load from memory, file_path is data - attrs={ - 'file_path': data, - 'model_from_memory': True - }) + attrs={'file_path': data, 'model_from_memory': True}, + ) executor.run(load_program) # check var shape for var in check_vars: @@ -695,7 +699,9 @@ def deserialize_persistables(program, data, executor): raise RuntimeError( "Shape mismatch, program needs a parameter with shape ({}), " "but the loaded parameter ('{}') has a shape of ({}).".format( - origin_shape, var.name, new_shape)) + origin_shape, var.name, new_shape + ) + ) def load_from_file(path): @@ -760,9 +766,6 @@ def load_inference_model(path_prefix, executor, **kwargs): ``Variable`` (refer to :ref:`api_guide_Program_en`). It contains variables from which we can get inference results. - Raises: - ValueError: If `path_prefix.pdmodel` or `path_prefix.pdiparams` doesn't exist. - Examples: .. code-block:: python @@ -803,7 +806,7 @@ def load_inference_model(path_prefix, executor, **kwargs): """ # check kwargs supported_args = ('model_filename', 'params_filename') - deprecated_args = ('pserver_endpoints', ) + deprecated_args = ('pserver_endpoints',) caller = inspect.currentframe().f_code.co_name _check_args(caller, kwargs, supported_args, deprecated_args) @@ -814,7 +817,8 @@ def load_inference_model(path_prefix, executor, **kwargs): params_filename = kwargs.get('params_filename', None) if params_filename is None: raise ValueError( - "params_filename cannot be None when path_prefix is None.") + "params_filename cannot be None when path_prefix is None." + ) load_dirname = '' program_bytes = model_filename params_bytes = params_filename @@ -837,21 +841,26 @@ def load_inference_model(path_prefix, executor, **kwargs): if model_filename is None: model_path = os.path.join(path_prefix, "__model__") else: - model_path = os.path.join(path_prefix, - model_filename + ".pdmodel") + model_path = os.path.join( + path_prefix, model_filename + ".pdmodel" + ) if not os.path.exists(model_path): model_path = os.path.join(path_prefix, model_filename) # set params_path if params_filename is None: params_path = os.path.join(path_prefix, "") else: - params_path = os.path.join(path_prefix, - params_filename + ".pdiparams") + params_path = os.path.join( + path_prefix, params_filename + ".pdiparams" + ) if not os.path.exists(params_path): params_path = os.path.join(path_prefix, params_filename) - _logger.warning("The old way to load inference model is deprecated." - " model path: {}, params path: {}".format( - model_path, params_path)) + _logger.warning( + "The old way to load inference model is deprecated." + " model path: {}, params path: {}".format( + model_path, params_path + ) + ) program_bytes = load_from_file(model_path) load_dirname = os.path.dirname(params_path) params_filename = os.path.basename(params_path) diff --git a/python/paddle/static/nn/common.py b/python/paddle/static/nn/common.py index b8133872aa934c..98c5e81a2e8e06 100755 --- a/python/paddle/static/nn/common.py +++ b/python/paddle/static/nn/common.py @@ -19,13 +19,15 @@ @static_only -def fc(x, - size, - num_flatten_dims=1, - weight_attr=None, - bias_attr=None, - activation=None, - name=None): +def fc( + x, + size, + num_flatten_dims=1, + weight_attr=None, + bias_attr=None, + activation=None, + name=None, +): r""" Fully-Connected layer can take a tensor or a list of tensor as its inputs. 
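The opening line of the ``fc`` docstring notes that the layer also accepts a list of tensors; in that form a separate weight is created for every input, the per-input projections are summed, and bias and activation are applied to the sum. A short static-graph sketch of that multi-input form (names are illustrative):

.. code-block:: python

    import paddle

    paddle.enable_static()

    a = paddle.static.data(name='a', shape=[None, 16], dtype='float32')
    b = paddle.static.data(name='b', shape=[None, 8], dtype='float32')

    # one weight per input tensor; the per-input projections are summed
    out = paddle.static.nn.fc(x=[a, b], size=4, activation='relu')
    # out has shape [batch_size, 4]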
@@ -107,12 +109,12 @@ def fc(x, The default value is None, and the weight will be initialized to zero. For detailed information, please refer to :attr:`paddle.ParamAttr`. Warning, if x is a list of tensor, weight_attr should also be a list of same length. - bias_attr (ParamAttr|bool, optional): The attribute of the learnable bias. + bias_attr (ParamAttr|bool, optional): The attribute of the learnable bias. If it is set to False, no bias will be added to the output. If it is set to None or one kind of ParamAttr, a bias parameter will be created according to ParamAttr. For detailed information, please refer to :attr:`paddle.ParamAttr`. The default value is None and the bias will be - initialized to zero. + initialized to zero. activation (str, optional): Activation to be applied to the output of this layer, such as tanh, softmax, sigmoid, relu. For more information, please refer to :ref:`api_guide_activations_en` . Default: None. @@ -122,9 +124,6 @@ def fc(x, Returns: Tensor, its shape is :math:`[batch\_size, *, size]` , and the data type is same with input. - Raises: - ValueError: If dimensions of the input tensor is less than 2. - Examples: .. code-block:: python @@ -157,30 +156,34 @@ def fc(x, bias_attr=paddle.ParamAttr(initializer=paddle.nn.initializer.Constant(value=1.0))) # out: [[1.8 1.8]] """ - return paddle.fluid.layers.fc(input=x, - size=size, - num_flatten_dims=num_flatten_dims, - param_attr=weight_attr, - bias_attr=bias_attr, - act=activation, - name=name) + return paddle.fluid.layers.fc( + input=x, + size=size, + num_flatten_dims=num_flatten_dims, + param_attr=weight_attr, + bias_attr=bias_attr, + act=activation, + name=name, + ) @static_only -def deform_conv2d(x, - offset, - mask, - num_filters, - filter_size, - stride=1, - padding=0, - dilation=1, - groups=1, - deformable_groups=1, - im2col_step=1, - weight_attr=None, - bias_attr=None, - name=None): +def deform_conv2d( + x, + offset, + mask, + num_filters, + filter_size, + stride=1, + padding=0, + dilation=1, + groups=1, + deformable_groups=1, + im2col_step=1, + weight_attr=None, + bias_attr=None, + name=None, +): r""" Compute 2-D deformable convolution on 4-D input. @@ -275,9 +278,7 @@ def deform_conv2d(x, Returns: Tensor: The tensor storing the deformable convolution \ result. A Tensor with type float32, float64. - Raises: - ValueError: If the shapes of input, filter_size, stride, padding and - groups mismatch. + Examples: .. 
code-block:: python @@ -323,7 +324,8 @@ def deform_conv2d(x, param_attr=weight_attr, bias_attr=bias_attr, modulated=False, - name=name) + name=name, + ) else: return paddle.fluid.layers.deformable_conv( input=x, @@ -340,4 +342,5 @@ def deform_conv2d(x, param_attr=weight_attr, bias_attr=bias_attr, modulated=True, - name=name) + name=name, + ) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index c8d433002387eb..82a016ce64da7a 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -24,12 +24,25 @@ from ..framework import core from ..framework import in_dygraph_mode, _non_static_mode from ..framework import LayerHelper -from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype -from ..framework import convert_np_dtype_to_dtype_, _varbase_creator, OpProtoHolder +from ..fluid.data_feeder import ( + check_variable_and_dtype, + check_type, + check_dtype, + convert_dtype, +) +from ..framework import ( + convert_np_dtype_to_dtype_, + _varbase_creator, + OpProtoHolder, +) + # TODO: define functions to get create a tensor import paddle from paddle import _C_ops, _legacy_C_ops -from ..fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check +from ..fluid.framework import ( + _in_legacy_dygraph, + _in_eager_without_dygraph_check, +) import warnings __all__ = [] @@ -100,10 +113,17 @@ def linspace(start, stop, num, dtype=None, name=None): with device_guard("cpu"): tensor_num = fill_constant([1], 'int32', num, force_cpu=True) if in_dygraph_mode(): - return _C_ops.linspace(tensor_start, tensor_stop, tensor_num, dtype) + return _C_ops.linspace( + tensor_start, + tensor_stop, + tensor_num, + dtype, + _current_expected_place(), + ) if _in_legacy_dygraph(): - return _legacy_C_ops.linspace(tensor_start, tensor_stop, tensor_num, - 'dtype', dtype) + return _legacy_C_ops.linspace( + tensor_start, tensor_stop, tensor_num, 'dtype', dtype + ) helper = LayerHelper("linspace", **locals()) @@ -111,41 +131,53 @@ def linspace(start, stop, num, dtype=None, name=None): stop_dtype = convert_dtype(tensor_stop.dtype) out_dtype = convert_dtype(dtype) if isinstance(start, Variable): - check_dtype(start.dtype, 'start', - ['float32', 'float64', 'int32', 'int64'], 'linspace') + check_dtype( + start.dtype, + 'start', + ['float32', 'float64', 'int32', 'int64'], + 'linspace', + ) else: check_type(start, 'start', (int, float), 'linspace') if isinstance(stop, Variable): - check_dtype(stop.dtype, 'stop', - ['float32', 'float64', 'int32', 'int64'], 'linspace') + check_dtype( + stop.dtype, + 'stop', + ['float32', 'float64', 'int32', 'int64'], + 'linspace', + ) else: check_type(stop, 'stop', (int, float), 'linspace') if isinstance(num, Variable): check_dtype(num.dtype, 'num', ['int32'], 'linspace') - check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'], - 'linspace') - if ((stop_dtype == "float64" or start_dtype == "float64") - and out_dtype in ["float32", "int32"]) or ( - (stop_dtype == "int64" or start_dtype == "int64") - and out_dtype == "int32"): + check_dtype( + dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'], 'linspace' + ) + if ( + (stop_dtype == "float64" or start_dtype == "float64") + and out_dtype in ["float32", "int32"] + ) or ( + (stop_dtype == "int64" or start_dtype == "int64") + and out_dtype == "int32" + ): raise ValueError( "The dtype of start/stop is {}/{} but the attr(dtype) of linspace is {}, " - "which may cause data type overflows. Please reset attr(dtype) of linspace." 
- .format(start_dtype, stop_dtype, dtype)) + "which may cause data type overflows. Please reset attr(dtype) of linspace.".format( + start_dtype, stop_dtype, dtype + ) + ) out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type='linspace', - inputs={ - 'Start': tensor_start, - 'Stop': tensor_stop, - 'Num': tensor_num - }, - attrs={'dtype': dtype}, - outputs={'Out': [out]}) + helper.append_op( + type='linspace', + inputs={'Start': tensor_start, 'Stop': tensor_stop, 'Num': tensor_num}, + attrs={'dtype': dtype}, + outputs={'Out': [out]}, + ) if isinstance(num, int): - out.desc.set_shape((num, )) + out.desc.set_shape((num,)) return out @@ -211,8 +243,9 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None): with device_guard("cpu"): tensor_base = fill_constant([1], dtype, base) if _non_static_mode(): - return _legacy_C_ops.logspace(tensor_start, tensor_stop, tensor_num, - tensor_base, 'dtype', dtype) + return _legacy_C_ops.logspace( + tensor_start, tensor_stop, tensor_num, tensor_base, 'dtype', dtype + ) helper = LayerHelper("logspace", **locals()) @@ -221,14 +254,22 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None): base_dtype = convert_dtype(tensor_base.dtype) out_dtype = convert_dtype(dtype) if isinstance(start, Variable): - check_dtype(start.dtype, 'start', - ['float32', 'float64', 'int32', 'int64'], 'logspace') + check_dtype( + start.dtype, + 'start', + ['float32', 'float64', 'int32', 'int64'], + 'logspace', + ) else: check_type(start, 'start', (int, float), 'logspace') if isinstance(stop, Variable): - check_dtype(stop.dtype, 'stop', - ['float32', 'float64', 'int32', 'int64'], 'logspace') + check_dtype( + stop.dtype, + 'stop', + ['float32', 'float64', 'int32', 'int64'], + 'logspace', + ) else: check_type(stop, 'stop', (int, float), 'logspace') @@ -236,37 +277,55 @@ def logspace(start, stop, num, base=10.0, dtype=None, name=None): check_dtype(num.dtype, 'num', ['int32'], 'logspace') if isinstance(base, Variable): - check_dtype(base.dtype, 'base', - ['float32', 'float64', 'int32', 'int64'], 'logspace') + check_dtype( + base.dtype, + 'base', + ['float32', 'float64', 'int32', 'int64'], + 'logspace', + ) else: check_type(base, 'base', (int, float), 'logspace') - check_dtype(dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'], - 'logspace') - if ((stop_dtype == "float64" or start_dtype == "float64" - or base_dtype == "float64") - and out_dtype in ["float32", "int32"]) or \ - ((stop_dtype == "int64" or start_dtype == "int64" - or base_dtype == "int64") - and out_dtype == "int32"): + check_dtype( + dtype, 'dtype', ['int32', 'int64', 'float32', 'float64'], 'logspace' + ) + if ( + ( + stop_dtype == "float64" + or start_dtype == "float64" + or base_dtype == "float64" + ) + and out_dtype in ["float32", "int32"] + ) or ( + ( + stop_dtype == "int64" + or start_dtype == "int64" + or base_dtype == "int64" + ) + and out_dtype == "int32" + ): raise ValueError( "The dtype of start/stop/base is {}/{}/{} but the attr(dtype) of logspace is {}, " - "which may cause data type overflows. Please reset attr(dtype) of logspace." - .format(start_dtype, stop_dtype, base_dtype, dtype)) + "which may cause data type overflows. 
Please reset attr(dtype) of logspace.".format( + start_dtype, stop_dtype, base_dtype, dtype + ) + ) out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type='logspace', - inputs={ - 'Start': tensor_start, - 'Stop': tensor_stop, - 'Num': tensor_num, - 'Base': tensor_base - }, - attrs={'dtype': dtype}, - outputs={'Out': [out]}) + helper.append_op( + type='logspace', + inputs={ + 'Start': tensor_start, + 'Stop': tensor_stop, + 'Num': tensor_num, + 'Base': tensor_base, + }, + attrs={'dtype': dtype}, + outputs={'Out': [out]}, + ) if isinstance(num, int): - out.desc.set_shape((num, )) + out.desc.set_shape((num,)) return out @@ -314,17 +373,25 @@ def _handle_dtype(data, dtype): return data else: raise TypeError( - "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|np.ndarray|paddle.Tensor" - .format(type(data))) + "Can't constructs a 'paddle.Tensor' with data type {}, data type must be scalar|list|tuple|np.ndarray|paddle.Tensor".format( + type(data) + ) + ) if not dtype: if data.dtype in [ - 'float16', 'float32', 'float64', 'complex64', 'complex128' + 'float16', + 'float32', + 'float64', + 'complex64', + 'complex128', ]: default_type = paddle.get_default_dtype() if np.iscomplexobj(data): - default_type = 'complex64' if default_type in [ - 'float16', 'float32' - ] else 'complex128' + default_type = ( + 'complex64' + if default_type in ['float16', 'float32'] + else 'complex128' + ) data = data.astype(default_type) # Windows default type is 'int32', while Linux/Mac is 'int64'. Unify they. if data.dtype in ['int32']: @@ -335,18 +402,22 @@ def _handle_dtype(data, dtype): data = data.astype(convert_dtype(dtype)) if _in_eager_without_dygraph_check() and isinstance(data, np.ndarray): - return core.eager.Tensor(value=data, - place=place, - persistable=False, - zero_copy=False, - name=None, - stop_gradient=stop_gradient) + return core.eager.Tensor( + value=data, + place=place, + persistable=False, + zero_copy=False, + name=None, + stop_gradient=stop_gradient, + ) else: - return paddle.Tensor(value=data, - place=place, - persistable=False, - zero_copy=False, - stop_gradient=stop_gradient) + return paddle.Tensor( + value=data, + place=place, + persistable=False, + zero_copy=False, + stop_gradient=stop_gradient, + ) def _to_tensor_static(data, dtype=None, stop_gradient=None): @@ -361,8 +432,11 @@ def _to_tensor_static(data, dtype=None, stop_gradient=None): elif isinstance(data, (list, tuple)): data = np.array(data) - if isinstance(data, - np.ndarray) and not dtype and data.dtype != 'object': + if ( + isinstance(data, np.ndarray) + and not dtype + and data.dtype != 'object' + ): if data.dtype in ['float16', 'float32', 'float64']: data = data.astype(paddle.get_default_dtype()) elif data.dtype in ['int32']: @@ -377,10 +451,14 @@ def _to_tensor_static(data, dtype=None, stop_gradient=None): target_dtype = convert_dtype(target_dtype) - if isinstance(data, np.ndarray) and len(data.shape) > 0 and any( - isinstance(x, Variable) for x in data): + if ( + isinstance(data, np.ndarray) + and len(data.shape) > 0 + and any(isinstance(x, Variable) for x in data) + ): if not all( - [x.shape == (1, ) for x in data if isinstance(x, Variable)]): + [x.shape == (1,) for x in data if isinstance(x, Variable)] + ): raise TypeError( "Unsupport paddle.to_tensor([Variable, Variable...]) with non-scalar variable." 
) @@ -404,7 +482,7 @@ def _to_tensor_static(data, dtype=None, stop_gradient=None): def to_tensor(data, dtype=None, place=None, stop_gradient=True): r""" - Constructs a ``paddle.Tensor`` from ``data`` , + Constructs a ``paddle.Tensor`` from ``data`` , which can be scalar, tuple, list, numpy\.ndarray, paddle\.Tensor. If the ``data`` is already a Tensor, copy will be performed and return a new tensor. @@ -413,13 +491,13 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): Args: data(scalar|tuple|list|ndarray|Tensor): Initial data for the tensor. Can be a scalar, list, tuple, numpy\.ndarray, paddle\.Tensor. - dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , + dtype(str|np.dtype, optional): The desired data type of returned tensor. Can be 'bool' , 'float16' , 'float32' , 'float64' , 'int8' , 'int16' , 'int32' , 'int64' , 'uint8', - 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` + 'complex64' , 'complex128'. Default: None, infers dtype from ``data`` except for python float number which gets dtype from ``get_default_type`` . - place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be - CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is - string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. + place(CPUPlace|CUDAPinnedPlace|CUDAPlace|str, optional): The place to allocate Tensor. Can be + CPUPlace, CUDAPinnedPlace, CUDAPlace. Default: None, means global place. If ``place`` is + string, It can be ``cpu``, ``gpu:x`` and ``gpu_pinned``, where ``x`` is the index of the GPUs. stop_gradient(bool, optional): Whether to block the gradient propagation of Autograd. Default: True. Returns: @@ -430,7 +508,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): .. code-block:: python import paddle - + type(paddle.to_tensor(1)) # @@ -445,7 +523,7 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): paddle.to_tensor(x) # A new tensor will be created with default stop_gradient=True # Tensor(shape=[1], dtype=int64, place=CPUPlace, stop_gradient=True, - # [1]) + # [1]) paddle.to_tensor([[0.1, 0.2], [0.3, 0.4]], place=paddle.CPUPlace(), stop_gradient=False) # Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=False, @@ -486,18 +564,18 @@ def full_like(x, fill_value, dtype=None, name=None): x(Tensor): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64. fill_value(bool|float|int): The value to fill the tensor with. Note: this value shouldn't exceed the range of the output data type. dtype(np.dtype|str, optional): The data type of output. The data type can be one - of bool, float16, float32, float64, int32, int64. The default value is None, which means the output + of bool, float16, float32, float64, int32, int64. The default value is None, which means the output data type is the same as input. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor: Tensor which is created according to ``x``, ``fill_value`` and ``dtype``. - + Examples: .. code-block:: python import paddle - + input = paddle.full(shape=[2, 3], fill_value=0.0, dtype='float32', name='input') output = paddle.full_like(input, 2.0) # [[2. 2. 2.] 
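The conversion rules spelled out in the ``to_tensor`` docstring above come down to: dtype is inferred from the data (Python floats follow ``paddle.get_default_dtype()``), ``place`` may be given as a string, and converting an existing Tensor copies it into a new one. A small sketch:

.. code-block:: python

    import paddle

    # dtype inferred from the data; floats follow paddle.get_default_dtype()
    t1 = paddle.to_tensor([[0.1, 0.2], [0.3, 0.4]])             # float32 under the default dtype
    t2 = paddle.to_tensor([1, 2, 3], dtype='int32', place='cpu')

    # passing a Tensor copies it into a new Tensor
    t3 = paddle.to_tensor(t1, stop_gradient=False)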
@@ -514,27 +592,31 @@ def full_like(x, fill_value, dtype=None, name=None): return _C_ops.full_like(x, fill_value, dtype, x.place) if _in_legacy_dygraph(): - return _legacy_C_ops.fill_any_like(x, 'value', fill_value, 'dtype', - dtype) + return _legacy_C_ops.fill_any_like( + x, 'value', fill_value, 'dtype', dtype + ) helper = LayerHelper("full_like", **locals()) check_variable_and_dtype( - x, 'x', + x, + 'x', ['bool', 'float16', 'float32', 'float64', 'int16', 'int32', 'int64'], - 'full_like') + 'full_like', + ) check_dtype( - dtype, 'dtype', + dtype, + 'dtype', ['bool', 'float16', 'float32', 'float64', 'int16', 'int32', 'int64'], - 'full_like/zeros_like/ones_like') + 'full_like/zeros_like/ones_like', + ) out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type='fill_any_like', - inputs={'X': [x]}, - attrs={ - 'value': fill_value, - "dtype": dtype - }, - outputs={'Out': [out]}) + helper.append_op( + type='fill_any_like', + inputs={'X': [x]}, + attrs={'value': fill_value, "dtype": dtype}, + outputs={'Out': [out]}, + ) out.stop_gradient = True return out @@ -548,28 +630,28 @@ def ones(shape, dtype=None, name=None): dtype (np.dtype|str, optional): Data type of output Tensor, it should be one of bool, float16, float32, float64, int32 and int64. If it is set to None, the data type will be float32. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor: A Tensor of data type :attr:`dtype` with shape :attr:`shape` and all elements are 1. Examples: .. code-block:: python - import paddle + import paddle # default dtype for ones OP - data1 = paddle.ones(shape=[3, 2]) + data1 = paddle.ones(shape=[3, 2]) # [[1. 1.] # [1. 1.] # [1. 1.]] - data2 = paddle.ones(shape=[2, 2], dtype='int32') + data2 = paddle.ones(shape=[2, 2], dtype='int32') # [[1 1] # [1 1]] # shape is a Tensor shape = paddle.full(shape=[2], dtype='int32', fill_value=2) - data3 = paddle.ones(shape=shape, dtype='int32') + data3 = paddle.ones(shape=shape, dtype='int32') # [[1 1] # [1 1]] """ @@ -627,18 +709,18 @@ def zeros(shape, dtype=None, name=None): .. code-block:: python import paddle - - data = paddle.zeros(shape=[3, 2], dtype='float32') + + data = paddle.zeros(shape=[3, 2], dtype='float32') # [[0. 0.] # [0. 0.] # [0. 0.]] - data = paddle.zeros(shape=[2, 2]) + data = paddle.zeros(shape=[2, 2]) # [[0. 0.] # [0. 0.]] - + # shape is a Tensor shape = paddle.full(shape=[2], dtype='int32', fill_value=2) - data3 = paddle.zeros(shape=shape, dtype='int32') + data3 = paddle.zeros(shape=shape, dtype='int32') # [[0 0] # [0 0]] """ @@ -681,7 +763,7 @@ def zeros_like(x, dtype=None, name=None): def eye(num_rows, num_columns=None, dtype=None, name=None): """ - + This function constructs 2-D Tensor with ones on the diagonal and zeros elsewhere. Args: @@ -698,7 +780,7 @@ def eye(num_rows, num_columns=None, dtype=None, name=None): Examples: .. 
code-block:: python - + import paddle data = paddle.eye(3, dtype='int32') @@ -729,26 +811,34 @@ def _check_attr(attr, message): if _non_static_mode(): if in_dygraph_mode(): - out = _C_ops.eye(num_rows, num_columns, dtype, - _current_expected_place()) + out = _C_ops.eye( + num_rows, num_columns, dtype, _current_expected_place() + ) elif _in_legacy_dygraph(): - out = _legacy_C_ops.eye('dtype', dtype, 'num_rows', num_rows, - 'num_columns', num_columns) + out = _legacy_C_ops.eye( + 'dtype', dtype, 'num_rows', num_rows, 'num_columns', num_columns + ) else: helper = LayerHelper("eye", **locals()) - check_dtype(dtype, 'dtype', - ['float16', 'float32', 'float64', 'int32', 'int64'], 'eye') + check_dtype( + dtype, + 'dtype', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'eye', + ) out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type='eye', - inputs={}, - outputs={'Out': [out]}, - attrs={ - 'num_rows': num_rows, - 'num_columns': num_columns, - 'dtype': dtype - }, - stop_gradient=True) + helper.append_op( + type='eye', + inputs={}, + outputs={'Out': [out]}, + attrs={ + 'num_rows': num_rows, + 'num_columns': num_columns, + 'dtype': dtype, + }, + stop_gradient=True, + ) out.stop_gradient = True return out @@ -758,7 +848,7 @@ def full(shape, fill_value, dtype=None, name=None): """ Return a Tensor with the ``fill_value`` which size is same as ``shape``. - + Args: shape(list|tuple|Tensor): Shape of the Tensor to be created. The data type is ``int32`` or ``int64`` . If ``shape`` is a list or tuple, @@ -770,7 +860,7 @@ def full(shape, fill_value, dtype=None, name=None): which can be float16, float32, float64, int32, int64, if dytpe is `None`, the data type of created Tensor is `float32`. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor: Tensor which is created according to ``shape``, ``fill_value`` and ``dtype``. @@ -779,7 +869,7 @@ def full(shape, fill_value, dtype=None, name=None): import paddle - data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') + data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') #[[0] # [0]] @@ -790,14 +880,14 @@ def full(shape, fill_value, dtype=None, name=None): # attr shape is a Tensor. shape = paddle.full([2], 2, "int32") - data4 = paddle.full(shape=shape, dtype='bool', fill_value=True) - # [[True True] + data4 = paddle.full(shape=shape, dtype='bool', fill_value=True) + # [[True True] # [True True]] - + # attr fill_value is a Tensor. val = paddle.full([1], 2.0, "float32") data5 = paddle.full(shape=[2,1], fill_value=val, dtype='float32') - # [[2.0] + # [[2.0] # [2.0]] """ @@ -835,7 +925,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): If ``dytpe`` is None, the data type is float32. Default is None. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - Returns: + Returns: Tensor: A 1-D Tensor with values from the interval [``start``, ``end``) taken with common difference ``step`` beginning from ``start``. Its data type is set by ``dtype``. 
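A compact illustration of the half-open interval described in the ``arange`` docstring; the hunks that follow derive the static output shape as ``ceil((end - start) / step)``:

.. code-block:: python

    import paddle

    # values are taken from [start, end); `end` itself is excluded
    out1 = paddle.arange(3, 10, 2)                      # [3, 5, 7, 9]

    # length is ceil((end - start) / step); pass dtype explicitly for float steps
    out2 = paddle.arange(0, 1, 0.25, dtype='float32')   # [0., 0.25, 0.5, 0.75]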
@@ -858,7 +948,7 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): start_var = paddle.to_tensor([3]) out4 = paddle.arange(start_var, 7) # [3, 4, 5, 6] - + """ if dtype is None: dtype = 'int64' @@ -867,8 +957,11 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): start = 0 out_shape = None - if not isinstance(start, Variable) and not isinstance( - end, Variable) and not isinstance(step, Variable): + if ( + not isinstance(start, Variable) + and not isinstance(end, Variable) + and not isinstance(step, Variable) + ): out_shape = [int(math.ceil((end - start) / step))] if not isinstance(dtype, core.VarDesc.VarType): @@ -900,17 +993,16 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): out.stop_gradient = True return out - check_dtype(dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], - 'range/arange') + check_dtype( + dtype, 'dtype', ['float32', 'float64', 'int32', 'int64'], 'range/arange' + ) helper = LayerHelper('range', **locals()) out = helper.create_variable_for_type_inference(dtype, shape=out_shape) - helper.append_op(type='range', - inputs={ - 'Start': start, - 'End': end, - 'Step': step - }, - outputs={'Out': out}) + helper.append_op( + type='range', + inputs={'Start': start, 'End': end, 'Step': step}, + outputs={'Out': out}, + ) out.stop_gradient = True if out_shape is not None: out.desc.set_shape(out_shape) @@ -918,27 +1010,30 @@ def arange(start=0, end=None, step=1, dtype=None, name=None): def _tril_triu_op(helper): - """Base op of tril_op and triu_op - """ + """Base op of tril_op and triu_op""" op_type = helper.layer_type x = helper.kwargs.get('x', None) assert x is not None, 'x cannot be None in {}'.format(op_type) check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], op_type) + x, + 'x', + ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'], + op_type, + ) if len(x.shape) < 2: raise ValueError("x shape in {} must be at least 2-D".format(op_type)) diagonal = helper.kwargs.get('diagonal', 0) - if not isinstance(diagonal, (int, )): + if not isinstance(diagonal, (int,)): raise TypeError("diagonal in {} must be a python Int".format(op_type)) name = helper.kwargs.get('name', None) if name is None: out = helper.create_variable_for_type_inference(dtype=x.dtype) else: - out = helper.create_variable(name=name, - dtype=x.dtype, - persistable=False) + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False + ) helper.append_op( type="tril_triu", @@ -956,8 +1051,8 @@ def _tril_triu_op(helper): def tril(x, diagonal=0, name=None): r""" Returns the lower triangular part of a matrix (2-D tensor) or batch - of matrices :attr:`x`, the other elements of the result tensor are set - to 0. The lower triangular part of the matrix is defined as the elements + of matrices :attr:`x`, the other elements of the result tensor are set + to 0. The lower triangular part of the matrix is defined as the elements on and below the diagonal. Args: @@ -1043,33 +1138,34 @@ def triu(x, diagonal=0, name=None): Examples: .. 
code-block:: python - import numpy as np import paddle - data = np.arange(1, 13, dtype="int64").reshape(3,-1) - # array([[ 1, 2, 3, 4], - # [ 5, 6, 7, 8], - # [ 9, 10, 11, 12]]) - + x = paddle.arange(1, 13, dtype="int64").reshape([3,-1]) + # Tensor(shape=[3, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[1 , 2 , 3 , 4 ], + # [5 , 6 , 7 , 8 ], + # [9 , 10, 11, 12]]) # example 1, default diagonal - x = paddle.to_tensor(data) triu1 = paddle.tensor.triu(x) - # array([[ 1, 2, 3, 4], - # [ 0, 6, 7, 8], - # [ 0, 0, 11, 12]]) + # Tensor(shape=[3, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[1 , 2 , 3 , 4 ], + # [0 , 6 , 7 , 8 ], + # [0 , 0 , 11, 12]]) # example 2, positive diagonal value triu2 = paddle.tensor.triu(x, diagonal=2) - # array([[0, 0, 3, 4], - # [0, 0, 0, 8], - # [0, 0, 0, 0]]) + # Tensor(shape=[3, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[0, 0, 3, 4], + # [0, 0, 0, 8], + # [0, 0, 0, 0]]) # example 3, negative diagonal value triu3 = paddle.tensor.triu(x, diagonal=-1) - # array([[ 1, 2, 3, 4], - # [ 5, 6, 7, 8], - # [ 0, 10, 11, 12]]) + # Tensor(shape=[3, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[1 , 2 , 3 , 4 ], + # [5 , 6 , 7 , 8 ], + # [0 , 10, 11, 12]]) """ if in_dygraph_mode(): @@ -1084,15 +1180,16 @@ def triu(x, diagonal=0, name=None): def meshgrid(*args, **kwargs): """ - Takes a list of N tensors as input *args, each of which is 1-dimensional vector, and creates N-dimensional grids. - + + Takes a list of N tensors as input :attr:`*args`, each of which is 1-dimensional vector, and creates N-dimensional grids. + Args: - *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,), + *args(Tensor|list of Tensor) : tensors (tuple(list) of tensor): the shapes of input k tensors are (N1,), (N2,),..., (Nk,). Support data types: ``float64``, ``float32``, ``int32``, ``int64``. - **kwargs (optional): Currently, only accept name in **kwargs + **kwargs (optional): Currently, only accept name in **kwargs The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: k tensors. 
The shape of each tensor is (N1, N2, ..., Nk) @@ -1130,18 +1227,21 @@ def meshgrid(*args, **kwargs): raise TypeError("The type of input args in meshgrid should be list.") for id, input_ in enumerate(args): - check_dtype(input_.dtype, 'create data type', - ['float16', 'float32', 'float64', 'int32', 'int64'], - 'meshgrid') + check_dtype( + input_.dtype, + 'create data type', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'meshgrid', + ) num = len(args) out = [ helper.create_variable_for_type_inference(dtype=args[i].dtype) for i in range(num) ] - helper.append_op(type='meshgrid', - inputs={'X': list(args)}, - outputs={'Out': out}) + helper.append_op( + type='meshgrid', inputs={'X': list(args)}, outputs={'Out': out} + ) return out @@ -1177,24 +1277,27 @@ def diagflat(x, offset=0, name=None): x = paddle.to_tensor([1, 2, 3]) y = paddle.diagflat(x) - print(y.numpy()) - # [[1 0 0] - # [0 2 0] - # [0 0 3]] + print(y) + # Tensor(shape=[3, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[1, 0, 0], + # [0, 2, 0], + # [0, 0, 3]]) y = paddle.diagflat(x, offset=1) - print(y.numpy()) - # [[0 1 0 0] - # [0 0 2 0] - # [0 0 0 3] - # [0 0 0 0]] + print(y) + # Tensor(shape=[4, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[0, 1, 0, 0], + # [0, 0, 2, 0], + # [0, 0, 0, 3], + # [0, 0, 0, 0]]) y = paddle.diagflat(x, offset=-1) - print(y.numpy()) - # [[0 0 0 0] - # [1 0 0 0] - # [0 2 0 0] - # [0 0 3 0]] + print(y) + # Tensor(shape=[4, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[0, 0, 0, 0], + # [1, 0, 0, 0], + # [0, 2, 0, 0], + # [0, 0, 3, 0]]) .. code-block:: python :name: code-example-2 @@ -1203,27 +1306,30 @@ def diagflat(x, offset=0, name=None): x = paddle.to_tensor([[1, 2], [3, 4]]) y = paddle.diagflat(x) - print(y.numpy()) - # [[1 0 0 0] - # [0 2 0 0] - # [0 0 3 0] - # [0 0 0 4]] + print(y) + # Tensor(shape=[4, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[1, 0, 0, 0], + # [0, 2, 0, 0], + # [0, 0, 3, 0], + # [0, 0, 0, 4]]) y = paddle.diagflat(x, offset=1) - print(y.numpy()) - # [[0 1 0 0 0] - # [0 0 2 0 0] - # [0 0 0 3 0] - # [0 0 0 0 4] - # [0 0 0 0 0]] + print(y) + # Tensor(shape=[5, 5], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[0, 1, 0, 0, 0], + # [0, 0, 2, 0, 0], + # [0, 0, 0, 3, 0], + # [0, 0, 0, 0, 4], + # [0, 0, 0, 0, 0]]) y = paddle.diagflat(x, offset=-1) - print(y.numpy()) - # [[0 0 0 0 0] - # [1 0 0 0 0] - # [0 2 0 0 0] - # [0 0 3 0 0] - # [0 0 0 4 0]] + print(y) + # Tensor(shape=[5, 5], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[0, 0, 0, 0, 0], + # [1, 0, 0, 0, 0], + # [0, 2, 0, 0, 0], + # [0, 0, 3, 0, 0], + # [0, 0, 0, 4, 0]]) """ padding_value = 0 if in_dygraph_mode(): @@ -1235,17 +1341,21 @@ def diagflat(x, offset=0, name=None): if _in_legacy_dygraph(): if len(x.shape) == 1: - return _legacy_C_ops.diag_v2(x, "offset", offset, "padding_value", - padding_value) + return _legacy_C_ops.diag_v2( + x, "offset", offset, "padding_value", padding_value + ) else: y, _ = _legacy_C_ops.flatten_contiguous_range( - x, "start_axis", 0, "stop_axis", -1) - return _legacy_C_ops.diag_v2(y, "offset", offset, "padding_value", - padding_value) + x, "start_axis", 0, "stop_axis", -1 + ) + return _legacy_C_ops.diag_v2( + y, "offset", offset, "padding_value", padding_value + ) check_type(x, 'x', (Variable), 'diagflat') - check_dtype(x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'], - 'diagflat') + check_dtype( + x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'], 'diagflat' + ) check_type(offset, 'offset', (int), 
'diagflat') helper = LayerHelper("diagflat", **locals()) @@ -1254,33 +1364,27 @@ def diagflat(x, offset=0, name=None): out2 = helper.create_variable_for_type_inference(dtype=x.dtype) if len(x.shape) == 1: - helper.append_op(type='diag_v2', - inputs={'X': x}, - outputs={'Out': out2}, - attrs={ - 'offset': offset, - 'padding_value': padding_value - }) + helper.append_op( + type='diag_v2', + inputs={'X': x}, + outputs={'Out': out2}, + attrs={'offset': offset, 'padding_value': padding_value}, + ) else: - helper.append_op(type='flatten_contiguous_range', - inputs={'X': x}, - outputs={ - 'Out': out1, - 'XShape': out1_shape - }, - attrs={ - 'start_axis': 0, - 'stop_axis': -1 - }) + helper.append_op( + type='flatten_contiguous_range', + inputs={'X': x}, + outputs={'Out': out1, 'XShape': out1_shape}, + attrs={'start_axis': 0, 'stop_axis': -1}, + ) out1.stop_gradient = True - helper.append_op(type='diag_v2', - inputs={'X': out1}, - outputs={'Out': out2}, - attrs={ - 'offset': offset, - 'padding_value': padding_value - }) + helper.append_op( + type='diag_v2', + inputs={'X': out1}, + outputs={'Out': out2}, + attrs={'offset': offset, 'padding_value': padding_value}, + ) out2.stop_gradient = True return out2 @@ -1304,7 +1408,7 @@ def diag(x, offset=0, padding_value=0, name=None): offset (int, optional): The diagonal offset. A positive value represents superdiagonal, 0 represents the main diagonal, and a negative value represents subdiagonal. padding_value (int|float, optional): Use this value to fill the area outside the specified diagonal band. Only takes effect when the input is a 1-D Tensor. The default value is 0. name (str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor, a square matrix or a vector. The output data type is the same as input data type. @@ -1317,23 +1421,26 @@ def diag(x, offset=0, padding_value=0, name=None): paddle.disable_static() x = paddle.to_tensor([1, 2, 3]) y = paddle.diag(x) - print(y.numpy()) - # [[1 0 0] - # [0 2 0] - # [0 0 3]] + print(y) + # Tensor(shape=[3, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[1, 0, 0], + # [0, 2, 0], + # [0, 0, 3]]) y = paddle.diag(x, offset=1) - print(y.numpy()) - # [[0 1 0 0] - # [0 0 2 0] - # [0 0 0 3] - # [0 0 0 0]] + print(y) + # Tensor(shape=[4, 4], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[0, 1, 0, 0], + # [0, 0, 2, 0], + # [0, 0, 0, 3], + # [0, 0, 0, 0]]) y = paddle.diag(x, padding_value=6) - print(y.numpy()) - # [[1 6 6] - # [6 2 6] - # [6 6 3]] + print(y) + # Tensor(shape=[3, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[1, 6, 6], + # [6, 2, 6], + # [6, 6, 3]]) .. 
code-block:: python :name: code-example-2 @@ -1343,45 +1450,54 @@ def diag(x, offset=0, padding_value=0, name=None): paddle.disable_static() x = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) y = paddle.diag(x) - print(y.numpy()) - # [1 5] + print(y) + # Tensor(shape=[2], dtype=int64, place=Place(cpu), stop_gradient=True, + # [1, 5]) y = paddle.diag(x, offset=1) - print(y.numpy()) - # [2 6] + print(y) + # Tensor(shape=[2], dtype=int64, place=Place(cpu), stop_gradient=True, + # [2, 6]) y = paddle.diag(x, offset=-1) - print(y.numpy()) - # [4] + print(y) + # Tensor(shape=[1], dtype=int64, place=Place(cpu), stop_gradient=True, + # [4]) """ if in_dygraph_mode(): return _C_ops.diag(x, offset, padding_value) else: if _in_legacy_dygraph(): - return _legacy_C_ops.diag_v2(x, "offset", offset, "padding_value", - padding_value) + return _legacy_C_ops.diag_v2( + x, "offset", offset, "padding_value", padding_value + ) else: check_type(x, 'x', (Variable), 'diag_v2') - check_dtype(x.dtype, 'x', ['float32', 'float64', 'int32', 'int64'], - 'diag_v2') + check_dtype( + x.dtype, + 'x', + ['float32', 'float64', 'int32', 'int64'], + 'diag_v2', + ) check_type(offset, 'offset', (int), 'diag_v2') check_type(padding_value, 'padding_value', (int, float), 'diag_v2') if len(x.shape) != 1 and len(x.shape) != 2: raise ValueError( - "The dimension of input x must be either 1 or 2, but received {}" - .format(len(x.shape))) + "The dimension of input x must be either 1 or 2, but received {}".format( + len(x.shape) + ) + ) helper = LayerHelper("diag_v2", **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='diag_v2', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'offset': offset, - 'padding_value': padding_value - }) + helper.append_op( + type='diag_v2', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'offset': offset, 'padding_value': padding_value}, + ) out.stop_gradient = True return out @@ -1390,7 +1506,7 @@ def diag(x, offset=0, padding_value=0, name=None): def empty(shape, dtype=None, name=None): """ Returns a Tensor with uninitialized data which size is same as ``shape``. - + Args: shape(list|tuple|Tensor): Shape of the Tensor to be created. The data type of dimension of shape is ``int32`` or ``int64`` . If ``shape`` is a list or tuple, @@ -1401,7 +1517,7 @@ def empty(shape, dtype=None, name=None): type of created Tensor use global default dtype (see ``get_default_dtype`` for details). name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor: Tensor which is created according to ``shape`` and ``dtype``, and is uninitialized. 
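A minimal usage sketch of ``paddle.empty`` as documented above (not part of this patch). It relies only on the ``shape`` and ``dtype`` parameters described in this hunk; the printed element values are deliberately not shown because the returned buffer is uninitialized, so only shape and dtype are meaningful.

.. code-block:: python

    import paddle

    # shape may be a list/tuple of ints or an int32/int64 Tensor
    out = paddle.empty(shape=[2, 3], dtype='float32')
    # The contents are whatever happened to be in the allocated memory;
    # only the metadata is guaranteed.
    print(out.shape)   # [2, 3]
    print(out.dtype)   # paddle.float32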
@@ -1443,42 +1559,48 @@ def empty(shape, dtype=None, name=None): if in_dygraph_mode(): shape = utils.convert_shape_to_list(shape) - out = _C_ops.empty(shape, convert_np_dtype_to_dtype_(dtype), - _current_expected_place()) + out = _C_ops.empty( + shape, convert_np_dtype_to_dtype_(dtype), _current_expected_place() + ) out.stop_gradient = True return out if _in_legacy_dygraph(): shape = utils.convert_shape_to_list(shape) - out = _legacy_C_ops.empty('shape', shape, 'dtype', - convert_np_dtype_to_dtype_(dtype)) + out = _legacy_C_ops.empty( + 'shape', shape, 'dtype', convert_np_dtype_to_dtype_(dtype) + ) out.stop_gradient = True return out helper = LayerHelper("empty", **locals()) inputs = {} - check_dtype(dtype, 'dtype', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'empty') + check_dtype( + dtype, + 'dtype', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'empty', + ) check_type(shape, 'shape', (Variable, list, tuple), 'empty') if isinstance(shape, Variable): check_dtype(shape.dtype, 'shape', ['int32', 'int64'], 'empty') attrs = {} - utils.get_shape_tensor_inputs(inputs=inputs, - attrs=attrs, - shape=shape, - op_type='empty') + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='empty' + ) out = helper.create_variable_for_type_inference(dtype=dtype) attrs['dtype'] = convert_np_dtype_to_dtype_(dtype) - helper.append_op(type='empty', - inputs=inputs, - outputs={'Out': [out]}, - attrs=attrs, - stop_gradient=True) + helper.append_op( + type='empty', + inputs=inputs, + outputs={'Out': [out]}, + attrs=attrs, + stop_gradient=True, + ) out.stop_gradient = True return out @@ -1487,14 +1609,14 @@ def empty_like(x, dtype=None, name=None): """ Returns a Tensor with uninitialized data which has identical shape of ``x`` and ``dtype``. If the ``dtype`` is None, the data type of Tensor is same with ``x``. - + Args: x(Tensor): The input tensor which specifies shape and data type. The data type can be bool, float16, float32, float64, int32, int64. dtype(np.dtype|str, optional): The data type of output. The data type can be one - of bool, float16, float32, float64, int32, int64. The default value is None, which means the output + of bool, float16, float32, float64, int32, int64. The default value is None, which means the output data type is the same as input. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - + Returns: Tensor: Tensor which is created according to ``x`` and ``dtype``, and is uninitialized. 
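A companion sketch for ``paddle.empty_like`` under the same caveat (also not part of this patch): the result's contents are uninitialized, and ``dtype=None`` falls back to the input's dtype as the docstring above states.

.. code-block:: python

    import paddle

    x = paddle.ones([3, 4], dtype='float64')
    y = paddle.empty_like(x)                  # dtype defaults to x.dtype
    z = paddle.empty_like(x, dtype='int32')   # or override it explicitly
    print(y.shape, y.dtype)                   # [3, 4] paddle.float64
    print(z.shape, z.dtype)                   # [3, 4] paddle.int32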
@@ -1516,40 +1638,51 @@ def empty_like(x, dtype=None, name=None): dtype = convert_dtype(dtype) if in_dygraph_mode(): - out = _C_ops.empty(x.shape, convert_np_dtype_to_dtype_(dtype), - _current_expected_place()) + out = _C_ops.empty( + x.shape, + convert_np_dtype_to_dtype_(dtype), + _current_expected_place(), + ) out.stop_gradient = True return out if _in_legacy_dygraph(): - out = _legacy_C_ops.empty('shape', x.shape, 'dtype', - convert_np_dtype_to_dtype_(dtype)) + out = _legacy_C_ops.empty( + 'shape', x.shape, 'dtype', convert_np_dtype_to_dtype_(dtype) + ) out.stop_gradient = True return out helper = LayerHelper("empty_like", **locals()) check_variable_and_dtype( - x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'empty_like') - check_dtype(dtype, 'dtype', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'empty_like') + x, + 'x', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'empty_like', + ) + check_dtype( + dtype, + 'dtype', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'empty_like', + ) out = helper.create_variable_for_type_inference(dtype=dtype) inputs = {} attrs = {} attrs['dtype'] = convert_np_dtype_to_dtype_(dtype) shape = paddle.shape(x) - utils.get_shape_tensor_inputs(inputs=inputs, - attrs=attrs, - shape=shape, - op_type='empty_like') - - helper.append_op(type='empty', - inputs=inputs, - outputs={'Out': [out]}, - attrs=attrs, - stop_gradient=True) + utils.get_shape_tensor_inputs( + inputs=inputs, attrs=attrs, shape=shape, op_type='empty_like' + ) + + helper.append_op( + type='empty', + inputs=inputs, + outputs={'Out': [out]}, + attrs=attrs, + stop_gradient=True, + ) out.stop_gradient = True return out @@ -1558,16 +1691,16 @@ def assign(x, output=None): """ Copy value of the :attr:`x` to the :attr:`output`. - + Parameters: x (Tensor|np.ndarray|list|tuple|scalar): A Tensor, numpy ndarray, tuple/list of scalar, or scalar. Its data type can be float16, float32, float64, int32, int64 or bool. Note: the float64 data will be converted to float32 because of current platform protobuf data limitation. output (Tensor, optional): A Tensor. If :attr:`output` is None, a new Tensor will be created as :attr:`output`. Default: None. - + Returns: Tensor: A Tensor with the same shape, data type and value as :attr:`x`. - + Examples: .. 
code-block:: python @@ -1584,8 +1717,12 @@ def assign(x, output=None): """ input = x helper = LayerHelper('assign', **locals()) - check_type(input, 'input', - (Variable, np.ndarray, list, tuple, float, int, bool), 'assign') + check_type( + input, + 'input', + (Variable, np.ndarray, list, tuple, float, int, bool), + 'assign', + ) is_inplace = True if output is not None else False if np.isscalar(input) and not isinstance(input, str): @@ -1608,24 +1745,40 @@ def assign(x, output=None): output = core.VarBase() _legacy_C_ops.assign(input, output) else: - check_dtype(input.dtype, 'input', [ - 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', - 'uint8', 'bool' - ], 'assign', '(When the type of input in assign is Variable.)') + check_dtype( + input.dtype, + 'input', + [ + 'float16', + 'uint16', + 'float32', + 'float64', + 'int32', + 'int64', + 'uint8', + 'bool', + ], + 'assign', + '(When the type of input in assign is Variable.)', + ) if output is None: output = helper.create_variable_for_type_inference( - dtype=input.dtype) - helper.append_op(type='assign', - inputs={'X': [input]}, - outputs={'Out': [output]}) + dtype=input.dtype + ) + helper.append_op( + type='assign', inputs={'X': [input]}, outputs={'Out': [output]} + ) elif isinstance(input, np.ndarray): # We now support the form of [var, VAR...] if the Var.shape=[1,] if len(input.shape) > 0 and any(isinstance(x, Variable) for x in input): # We only deal with the case where the list is nested one level, convert all scalars into variables, and then use stack to process. It is necessary to ensure the consistency of types. - if not all([ - x.shape == (1, ) for x in input + if not all( + [ + x.shape == (1,) + for x in input if isinstance(x, (Variable, core.eager.Tensor)) - ]): + ] + ): raise TypeError( "Unsupport paddle.assign([Variable, Variable...]) with non-scalar variable." ) @@ -1641,8 +1794,7 @@ def convert_scalar(x): return ret if input.dtype == 'object': - """ may be this form [[Var], [Var], [3], [4]], we reject them. - """ + """may be this form [[Var], [Var], [3], [4]], we reject them.""" raise TypeError( "The type of received input == `object`, it is not supported to convert to tensor, such as [[Var], [Var], [3], [4]]" ) @@ -1654,7 +1806,8 @@ def convert_scalar(x): warnings.warn( "paddle.assign doesn't support float64 input now due " "to current platform protobuf data limitation, we convert " - "it to float32") + "it to float32" + ) dtype = core.VarDesc.VarType.FP32 if dtype == core.VarDesc.VarType.BOOL: value_name = "bool_values" @@ -1672,31 +1825,49 @@ def convert_scalar(x): raise TypeError( "When the type of 'input' in assign is numpy.ndarray, " "the data type of 'input' must be bool, float32, int32 or int64, but " - "received %s." % convert_dtype(dtype)) + "received %s." % convert_dtype(dtype) + ) if input.size > 1024 * 1024: - raise ValueError("The size of input is too big. Please consider " - "saving it to file and 'load_op' to load it") + raise ValueError( + "The size of input is too big. 
Please consider " + "saving it to file and 'load_op' to load it" + ) if in_dygraph_mode(): if output is None: output = zeros(list(input.shape), dtype) - _C_ops.assign_value_(output, list(input.shape), dtype, values, - _current_expected_place()) + _C_ops.assign_value_( + output, + list(input.shape), + dtype, + values, + _current_expected_place(), + ) elif _in_legacy_dygraph(): if output is None: output = core.VarBase() - _legacy_C_ops.assign_value(output, 'shape', list(input.shape), - 'dtype', dtype, value_name, values) + _legacy_C_ops.assign_value( + output, + 'shape', + list(input.shape), + 'dtype', + dtype, + value_name, + values, + ) else: if output is None: output = helper.create_variable_for_type_inference( - dtype=input.dtype) - helper.append_op(type='assign_value', - outputs={'Out': [output]}, - attrs={ - 'dtype': dtype, - 'shape': list(input.shape), - value_name: values - }) + dtype=input.dtype + ) + helper.append_op( + type='assign_value', + outputs={'Out': [output]}, + attrs={ + 'dtype': dtype, + 'shape': list(input.shape), + value_name: values, + }, + ) if is_inplace and _in_legacy_dygraph(): output._bump_inplace_version() @@ -1706,15 +1877,15 @@ def convert_scalar(x): def clone(x, name=None): """ - Returns a copy of input Tensor. It will always have a Tensor copy. - + Returns a copy of input Tensor. It will always have a Tensor copy. + In addition, This function is derivable, so gradients will flow back from the output to input. Parameters: x (Tensor): The input Tensor. name(str, optional): For details, please refer to :ref:`api_guide_Name`. Generally, no setting is required. Default: None. - Returns: + Returns: Tensor, A Tensor copied from ``input``. Examples: @@ -1734,7 +1905,7 @@ def clone(x, name=None): return x.clone() -#NOTE(zhiqiu): not public +# NOTE(zhiqiu): not public def _memcpy(input, place=None, output=None): """ @@ -1754,7 +1925,7 @@ def _memcpy(input, place=None, output=None): .. 
code-block:: python import paddle - import numpy as np + data = paddle.full(shape=[3, 2], fill_value=2.5, dtype='float64') # [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] result = paddle._memcpy(data, place=paddle.CPUPlace()) # result2 = [[2.5, 2.5], [2.5, 2.5], [2.5, 2.5]] """ @@ -1762,10 +1933,22 @@ def _memcpy(input, place=None, output=None): check_type(input, 'input', (Variable), 'memcpy') if isinstance(input, (Variable, core.VarBase)): - check_dtype(input.dtype, 'input', [ - 'float16', 'uint16', 'float32', 'float64', 'int32', 'int64', - 'uint8', 'bool' - ], 'memcpy', '(When the type of input in memcpy is Variable.)') + check_dtype( + input.dtype, + 'input', + [ + 'float16', + 'uint16', + 'float32', + 'float64', + 'int32', + 'int64', + 'uint8', + 'bool', + ], + 'memcpy', + '(When the type of input in memcpy is Variable.)', + ) if output is None: output = helper.create_variable_for_type_inference(dtype=input.dtype) @@ -1787,10 +1970,12 @@ def _memcpy(input, place=None, output=None): dst_place_type = 4 attrs = {'dst_place_type': dst_place_type} - helper.append_op(type='memcpy', - inputs={'X': [input]}, - outputs={'Out': [output]}, - attrs=attrs) + helper.append_op( + type='memcpy', + inputs={'X': [input]}, + outputs={'Out': [output]}, + attrs=attrs, + ) return output @@ -1815,10 +2000,10 @@ def complex(real, imag, name=None): x = paddle.arange(2, dtype=paddle.float32).unsqueeze(-1) y = paddle.arange(3, dtype=paddle.float32) z = paddle.complex(x, y) - print(z.numpy()) - - # [[0.+0.j 0.+1.j 0.+2.j] - # [1.+0.j 1.+1.j 1.+2.j]] + print(z) + # Tensor(shape=[2, 3], dtype=complex64, place=Place(cpu), stop_gradient=True, + # [[0j , 1j , 2j ], + # [(1+0j), (1+1j), (1+2j)]]) """ if in_dygraph_mode(): return _C_ops.complex(real, imag) @@ -1833,7 +2018,8 @@ def complex(real, imag, name=None): helper = LayerHelper(op_type, **locals()) inputs = {"X": real, "Y": imag} out = helper.create_variable_for_type_inference( - dtype=_real_to_complex_dtype(real.dtype)) + dtype=_real_to_complex_dtype(real.dtype) + ) outputs = {"Out": out} attrs = {} helper.append_op(type=op_type, inputs=inputs, attrs=attrs, outputs=outputs) @@ -1842,20 +2028,20 @@ def complex(real, imag, name=None): def tril_indices(row, col, offset=0, dtype='int64'): """ - Return the indices of the lower triangular part of the 2-D matrix - whose row and col is knowed.Indices are ordered based on row and then columns. + Return the indices of the lower triangular part of the 2-D matrix + whose row and col is knowed.Indices are ordered based on row and then columns. The lower triangular part of the matrix is defined as the elements on and below the diagonal. - + Args: row (int): The input x which is a int number describe the number of row of the matrix. col (int): The input x which is a int number describe the number of col of the matrix. offset (int, optional): The offset to consider, default value is 0. - - If offset = 0, all elements on and below the main diagonal are retained. - - If offset > 0, include just as many diagonals above the main diagonal. - - If offset < 0, excludes just as many diagonals below the main diagonal. - + - If offset = 0, all elements on and below the main diagonal are retained. + - If offset > 0, include just as many diagonals above the main diagonal. + - If offset < 0, excludes just as many diagonals below the main diagonal. + dtype (int, optional): the data type of the output tensor, can be int32, int64. Returns: @@ -1866,17 +2052,17 @@ def tril_indices(row, col, offset=0, dtype='int64'): .. 
code-block:: python import paddle - + # example 1, default offset value data1 = paddle.tril_indices(4,4,0) print(data1) - # [[0, 1, 1, 2, 2, 2, 3, 3, 3, 3], + # [[0, 1, 1, 2, 2, 2, 3, 3, 3, 3], # [0, 0, 1, 0, 1, 2, 0, 1, 2, 3]] # example 2, positive offset value data2 = paddle.tril_indices(4,4,2) print(data2) - # [[0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], + # [[0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], # [0, 1, 2, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]] # example 3, negative offset value @@ -1901,13 +2087,15 @@ def tril_indices(row, col, offset=0, dtype='int64'): dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - out = _C_ops.tril_indices(row, col, offset, dtype, - _current_expected_place()) + out = _C_ops.tril_indices( + row, col, offset, dtype, _current_expected_place() + ) return out if _in_legacy_dygraph(): - out = _legacy_C_ops.tril_indices('rows', row, 'cols', col, 'offset', - offset, "dtype", dtype) + out = _legacy_C_ops.tril_indices( + 'rows', row, 'cols', col, 'offset', offset, "dtype", dtype + ) return out else: @@ -1915,15 +2103,12 @@ def tril_indices(row, col, offset=0, dtype='int64'): out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type='tril_indices', - inputs={}, - outputs={'out': [out]}, - attrs={ - 'rows': row, - 'cols': col, - 'offset': offset, - 'dtype': dtype - }) + helper.append_op( + type='tril_indices', + inputs={}, + outputs={'out': [out]}, + attrs={'rows': row, 'cols': col, 'offset': offset, 'dtype': dtype}, + ) return out @@ -1986,13 +2171,15 @@ def triu_indices(row, col=None, offset=0, dtype='int64'): dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): - out = _C_ops.triu_indices(row, col, offset, dtype, - _current_expected_place()) + out = _C_ops.triu_indices( + row, col, offset, dtype, _current_expected_place() + ) return out if _in_legacy_dygraph(): - out = _legacy_C_ops.triu_indices('row', row, 'col', col, 'offset', - offset, "dtype", dtype) + out = _legacy_C_ops.triu_indices( + 'row', row, 'col', col, 'offset', offset, "dtype", dtype + ) return out else: @@ -2000,13 +2187,10 @@ def triu_indices(row, col=None, offset=0, dtype='int64'): out = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type='triu_indices', - inputs={}, - outputs={'out': [out]}, - attrs={ - 'row': row, - 'col': col, - 'offset': offset, - 'dtype': dtype - }) + helper.append_op( + type='triu_indices', + inputs={}, + outputs={'out': [out]}, + attrs={'row': row, 'col': col, 'offset': offset, 'dtype': dtype}, + ) return out diff --git a/python/paddle/tensor/einsum.py b/python/paddle/tensor/einsum.py index 55726831d2e359..0ffd461c633c5c 100644 --- a/python/paddle/tensor/einsum.py +++ b/python/paddle/tensor/einsum.py @@ -22,9 +22,17 @@ from .math import sum as paddle_sum from ..fluid.framework import _in_legacy_dygraph from paddle import _C_ops, _legacy_C_ops -from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype +from ..fluid.data_feeder import ( + check_variable_and_dtype, + check_type, + check_dtype, +) from ..fluid.layer_helper import LayerHelper -from ..fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph +from ..fluid.framework import ( + _non_static_mode, + in_dygraph_mode, + _in_legacy_dygraph, +) import collections import string import opt_einsum @@ -47,17 +55,18 @@ def parse_op_labels(labelstr, operand): Returns ------- - the input operand's full label string in which all anonymous dimensions are - labeled in dots. 
+ the input operand's full label string in which all anonymous dimensions are + labeled in dots. ''' # Sanity checks for c in labelstr.replace('.', ''): - assert c.isalpha(), ( - f"Invalid equation: {c} is not a valid label, which should be letters." - ) + assert ( + c.isalpha() + ), f"Invalid equation: {c} is not a valid label, which should be letters." - assert labelstr.replace('...', '', 1).find('.') == -1, ( - f"Invalid equation: `.` is found outside of an ellipsis.") + assert ( + labelstr.replace('...', '', 1).find('.') == -1 + ), f"Invalid equation: `.` is found outside of an ellipsis." # Check shape. Note, in Paddle a tensor rank is always nonzero ndims = len(operand.shape) @@ -65,8 +74,9 @@ def parse_op_labels(labelstr, operand): full_labelstr = labelstr.replace('...', '.' * (ndims - len(labelstr) + 3)) - assert len(full_labelstr) == ndims, ( - f"Invalid equation: the label string '{labelstr}' misses dimensions.") + assert ( + len(full_labelstr) == ndims + ), f"Invalid equation: the label string '{labelstr}' misses dimensions." return full_labelstr @@ -74,14 +84,14 @@ def parse_op_labels(labelstr, operand): def parse_labels(labelstr, operands): ''' Parse label strings for all input operands. - + Parameters ---------- labelstr: The equation's label string operands: The input operands - + Returns ------- list of full label strings for all input operands @@ -90,19 +100,21 @@ def parse_labels(labelstr, operands): nop_labels = labelstr.split(',') assert len(nop_labels) == len(operands), ( f"Invalid equation: the number of operands is {len(operands)}, " - f"but found {len(nop_labels)} segments in the label equation.") + f"but found {len(nop_labels)} segments in the label equation." + ) return list(map(parse_op_labels, nop_labels, operands)) def validate_rhs(rhs, input_labels, n_bcast_dims): ''' - Check whether the equation's right hand side is valid + Check whether the equation's right hand side is valid ''' # Sanity check. if n_bcast_dims > 0: - assert '...' in rhs, ( - f"Invalid equation: missing ellipsis in output labels.") + assert ( + '...' in rhs + ), f"Invalid equation: missing ellipsis in output labels." rhs = rhs.replace('...', '') rhs_set = set(rhs) @@ -114,16 +126,18 @@ def validate_rhs(rhs, input_labels, n_bcast_dims): non_input_labels = rhs_set.difference(input_labels) assert not non_input_labels, ( f"Invalid equation: " - f"output label {sorted(non_input_labels)} not used by any input.") + f"output label {sorted(non_input_labels)} not used by any input." + ) # Verify that output labels are not duplicate - assert len(rhs) == len(rhs_set), ( - f"Invalid equation: duplicate output labels are found.") + assert len(rhs) == len( + rhs_set + ), f"Invalid equation: duplicate output labels are found." def build_view(in_labels, out_labels): ''' - Build an inverse map of dimension indices. Three conditions must hold for - the result to be meaningful. + Build an inverse map of dimension indices. Three conditions must hold for + the result to be meaningful. First, no duplicate letter labels in each label string. Second, the number of dots in dimout_labels >= that in in_labels. Third, dots are contiguous in each label string. @@ -134,7 +148,7 @@ def build_view(in_labels, out_labels): The dimension labels to map to out_labels: The dimension labels to map from - + Returns ------- The inverse map from out_labels to in_labels. The length of the inverse map equals that of @@ -159,8 +173,8 @@ def build_view(in_labels, out_labels): # fill the broadcast dimension indices from right to left. 
if s: for ax, dim in zip( - range(start, end)[::-1], - range(s.start(), s.end())[::-1]): + range(start, end)[::-1], range(s.start(), s.end())[::-1] + ): inv_map[ax] = dim # Now work on non-broadcast dimensions @@ -181,7 +195,7 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): plus an index table that maps from the layout to the dimensions in each operand. In the global view, the dimensions are arranged such that output ones are put on the left and contraction ones - are put on the right. + are put on the right. Parameters ---------- @@ -191,7 +205,7 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): The equation right hand side n_bcast_dims: The maxium number of broadcast dimensions - + Returns ------- A tuple of g_labels, g_view, g_nout, g_count @@ -219,7 +233,8 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): g_labels_out = rhs.replace('...', '.' * n_bcast_dims) else: g_labels_out = '.' * n_bcast_dims + ''.join( - l for l, c in zip(labels, count) if c == 1) + l for l, c in zip(labels, count) if c == 1 + ) for i in range(len(count))[::-1]: if labels[i] in g_labels_out: @@ -237,7 +252,7 @@ def build_global_view(nop_labels, rhs, n_bcast_dims): def build_global_shape(g_view, g_labels, op_shapes): ''' - The global shape is the shape of all dimensions rearranged and broadcasting + The global shape is the shape of all dimensions rearranged and broadcasting to the global view. It's a reference data structure for einsum planning. Parameters @@ -267,12 +282,14 @@ def build_global_shape(g_view, g_labels, op_shapes): assert not non_bcastable, ( f"Invalid operands: label {g_labels[non_bcastable[0]]} " - f"corresponds to non-broadcastable dimensions.") + f"corresponds to non-broadcastable dimensions." + ) g_shape = [sizes.pop() if len(sizes) > 0 else 1 for sizes in g_shape] - g_masks = [[s > 1 or s == -1 for s in view_shape] - for view_shape in view_shapes] + g_masks = [ + [s > 1 or s == -1 for s in view_shape] for view_shape in view_shapes + ] return g_shape, g_masks @@ -287,18 +304,19 @@ def has_duplicated_labels(labels): def diagonalize(labels, operand): ''' - Merges dimensions with duplicate labels. - + Merges dimensions with duplicate labels. + For those dimensions with duplicate labels, merge them into one dimension which represents the diagonal elements. This requires the dimensions with duplicate labels are equal sized. - + Examples - -------- + -------- 'ijj...i' would be merged into 'ij...' ''' - assert not has_duplicated_labels(labels), ( - f'Duplicate labels are not supported.') + assert not has_duplicated_labels( + labels + ), f'Duplicate labels are not supported.' 
return labels, operand @@ -358,12 +376,21 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): plan.add_step(step) # Check if conditions hold for turnning the operation into a matmul - if j1 + j2 > 0 and k > 0 and -1 not in np.concatenate( - (op1_vshape, op2_vshape)): - op1_shape = list(op1_vshape[I]) + [np.prod(op1_vshape[J1]) - ] + [np.prod(op1_vshape[K])] - op2_shape = list(op2_vshape[I]) + [np.prod(op2_vshape[J2]) - ] + [np.prod(op2_vshape[K])] + if ( + j1 + j2 > 0 + and k > 0 + and -1 not in np.concatenate((op1_vshape, op2_vshape)) + ): + op1_shape = ( + list(op1_vshape[I]) + + [np.prod(op1_vshape[J1])] + + [np.prod(op1_vshape[K])] + ) + op2_shape = ( + list(op2_vshape[I]) + + [np.prod(op2_vshape[J2])] + + [np.prod(op2_vshape[K])] + ) # Merge J dims and K dims by reshaping step = reshape, [var1], var1, op1_shape @@ -412,15 +439,22 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): step = squeeze, [var2], var2, [-1, -2] plan.add_step(step) elif j1 + j2 == 0 and not -1 in np.concatenate( - (op1_vshape[K], op2_vshape[K])): + (op1_vshape[K], op2_vshape[K]) + ): assert all(op1_vshape[K] == op2_vshape[K]) - step = reshape, [ - var1 - ], var1, list(op1_vshape[I]) + [1] + [np.prod(op1_vshape[K])] + step = ( + reshape, + [var1], + var1, + list(op1_vshape[I]) + [1] + [np.prod(op1_vshape[K])], + ) plan.add_step(step) - step = reshape, [ - var2 - ], var2, list(op2_vshape[I]) + [1] + [np.prod(op2_vshape[K])] + step = ( + reshape, + [var2], + var2, + list(op2_vshape[I]) + [1] + [np.prod(op2_vshape[K])], + ) plan.add_step(step) step = matmul, [var1, var2], var2, False, True plan.add_step(step) @@ -449,8 +483,9 @@ def plan_matmul(plan, g_view, op1, op2, g_supports, g_shape, I, J1, J2, K): g_view[op2] = list(op2_view) -def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count, - n_bcast): +def plan_summation( + plan, g_view, op1, op2, g_supports, g_shape, g_count, n_bcast +): ''' Plan various kinds of summation ''' @@ -464,8 +499,9 @@ def plan_summation(plan, g_view, op1, op2, g_supports, g_shape, g_count, I, K, J1, J2 = list(range(n_bcast)), [], [], [] - for ax, dim1, dim2 in zip(range(n_bcast, ndim), op1_view[n_bcast:], - op2_view[n_bcast:]): + for ax, dim1, dim2 in zip( + range(n_bcast, ndim), op1_view[n_bcast:], op2_view[n_bcast:] + ): if (dim1 != -1) != (dim2 != -1): if dim1 != -1: @@ -531,7 +567,6 @@ def f(*args): class Plan: - def __init__(self): self.env = {} self.steps = [] @@ -635,8 +670,9 @@ def plan_einsum(operands, g_view, g_shape, g_supports, g_count, n_bcast): # op1 is a one element tensor. 
plan_scalar_prod(plan, i - 1, i) else: - plan_summation(plan, g_view, i - 1, i, g_supports, g_shape, g_count, - n_bcast) + plan_summation( + plan, g_view, i - 1, i, g_supports, g_shape, g_count, n_bcast + ) # for ax, dim in enumerate(g_view[nop-1][:nout]): # assert dim == ax @@ -678,7 +714,9 @@ def preprocess(equation, *operands): """ equation = equation.replace(" ", "") nop = len(operands) - assert nop > 0, "Required at least one operand in Einsum API, but received %s " % nop + assert nop > 0, ( + "Required at least one operand in Einsum API, but received %s " % nop + ) # Part the equation to left hand side and right hand side lhs, *rhs = equation.lower().split('->') @@ -692,37 +730,43 @@ def preprocess(equation, *operands): assert len(lhs.split(',')) == len(operands), ( f"Invalid equation: the number of operands is {len(operands)}, " - f"but found {len(lhs.split(','))} segments in the label equation.") + f"but found {len(lhs.split(','))} segments in the label equation." + ) - assert not ('...' in lhs and '...' not in rhs - ), f'Invalid equation: missing ellipsis in output labels.' + assert not ( + '...' in lhs and '...' not in rhs + ), f'Invalid equation: missing ellipsis in output labels.' - assert not (len(list(filter(has_duplicated_labels, lhs.split(',')))) > - 0), f'Duplicate labels are not supported.' + assert not ( + len(list(filter(has_duplicated_labels, lhs.split(',')))) > 0 + ), f'Duplicate labels are not supported.' assert not has_duplicated_labels( - rhs), f'Invalid equation: duplicate output labels are found.' + rhs + ), f'Invalid equation: duplicate output labels are found.' return lhs, rhs, labels def parse_fake_shape(equation, operands, labels): - """ + """ + this shape is just used for operands planning. may differ with the original shape. - for example: + for example: ... is replaced by 1 -1 is replaced by 1 Results ------- list of shape + """ shaped = collections.namedtuple('shaped', ['shape']) def fake_shape(label, op): - assert len(op.shape) == len( - label - ), "length of shape and length of label must be the same, but received %d != %d" % ( - len(op.shape), len(label)) + assert len(op.shape) == len(label), ( + "length of shape and length of label must be the same, but received %d != %d" + % (len(op.shape), len(label)) + ) fakes = [s for i, (l, s) in enumerate(zip(label, op.shape)) if l != '.'] fakes = list(map(abs, fakes)) # make -1 -> 1 if '.' in label: @@ -734,7 +778,6 @@ def fake_shape(label, op): def rhs_inference(lhs): - def is_free(key): return cnt.get(key) == 1 and key not in ['.', ','] @@ -745,7 +788,7 @@ def is_free(key): def gen_equation_for_opteinsum(lhs, rhs): - """ + """ 1. gen rhs if rhs is None 2. '...' -> 'A' """ @@ -753,7 +796,8 @@ def gen_equation_for_opteinsum(lhs, rhs): def get_used_label(counter): used = set(counter.elements()) for c in string.ascii_lowercase: - if c not in used: return c + if c not in used: + return c raise ValueError( "You have used all `a` - `z`, there can't find a unused for einsum optimization" ) @@ -768,7 +812,7 @@ def get_used_label(counter): def einsum_v2(equation, *operands): - """ + """ einsum v2 implementation. 1. Implement C++ EinsumOp. 2. V2 create the EinsumOp to calculate, so just a little verifty work in python. @@ -786,20 +830,21 @@ def einsum_v2(equation, *operands): var_list = list(operands) for path in cons: (a, b), _, eq, *__ = path - assert a > b, "Assume the first var_idx is smaller than the second_idx. opt_einsum can guarantee it." 
+ assert ( + a > b + ), "Assume the first var_idx is smaller than the second_idx. opt_einsum can guarantee it." var_s = [var_list.pop(a), var_list.pop(b)] eq = eq.replace(broadcast_label, "...") var_list.append(gen_einsum_op(eq, *var_s)) - assert len( - var_list - ) == 1, "There must be one elements in list, but received %d." % len( - var_list) + assert ( + len(var_list) == 1 + ), "There must be one elements in list, but received %d." % len(var_list) return var_list[0] def gen_einsum_op(equation, *operands): - """ - EinsumOp Python Interface: + """ + EinsumOp Python Interface: """ assert len(operands) <= 2, "Only support two operands in EinsumOp." if in_dygraph_mode(): @@ -807,8 +852,9 @@ def gen_einsum_op(equation, *operands): if _in_legacy_dygraph(): # dygraph - return _legacy_C_ops.einsum(operands, len(operands), len(operands), - 'equation', equation)[0] + return _legacy_C_ops.einsum( + operands, len(operands), len(operands), 'equation', equation + )[0] for inp in operands: check_variable_and_dtype(inp, 'dtype', ['float32', 'float64'], 'einsum') @@ -825,19 +871,18 @@ def gen_einsum_op(equation, *operands): helper.create_variable_for_type_inference(dtype=operands[0].dtype) for i in range(len(operands)) ] - helper.append_op(type='einsum', - inputs={'Operands': operands}, - outputs={ - 'Out': out, - "InnerCache": caches, - "XShape": xshape - }, - attrs=attrs) + helper.append_op( + type='einsum', + inputs={'Operands': operands}, + outputs={'Out': out, "InnerCache": caches, "XShape": xshape}, + attrs=attrs, + ) return out def einsum(equation, *operands): r""" + einsum(equation, *operands) The current version of this API should be used in dygraph only mode. @@ -862,39 +907,39 @@ def einsum(equation, *operands): - for many operads - broadcasting multiply - chained matrix multiply - + **The summation notation** - The tensor dimensions are labeled using uncased English letters. E.g., `ijk` - relates to a three dimensional tensor whose dimensions are labeled i, j, and k. + relates to a three dimensional tensor whose dimensions are labeled i, j, and k. - The equation is `,` separated into terms, each being a distinct input's - dimension label string. + dimension label string. - Ellipsis `...` enables broadcasting by automatically converting the unlabeled - dimensions into broadcasting dimensions. + dimensions into broadcasting dimensions. - Singular labels are called free labels, duplicate are dummy labels. Dummy labeled - dimensions will be reduced and removed in the output. - - Output labels can be explicitly specified on the right hand side of `->` or omitted. - In the latter case, the output labels will be inferred from the input labels. + dimensions will be reduced and removed in the output. + - Output labels can be explicitly specified on the right hand side of `->` or omitted. In the latter case, the output labels will be inferred from the input labels. - Inference of output labels - Broadcasting label `...`, if present, is put on the leftmost position. - Free labels are reordered alphabetically and put after `...`. - On explicit output labels - If broadcasting is enabled, then `...` must be present. - The output labels can be an empty, an indication to output as a scalar - the sum over the original output. + the sum over the original output. - Non-input labels are invalid. - Duplicate labels are invalid. - - For any dummmy label which is present for the output, it's promoted to - a free label. + - For any dummy label which is present for the output, it's promoted to + a free label. 
- For any free label which is not present for the output, it's lowered to - a dummy label. + a dummy label. + - Examples - '...ij, ...jk', where i and k are free labels, j is dummy. The output label - string is '...ik' - - 'ij -> i', where i is a free label and j is a dummy label. + string is '...ik' + - 'ij -> i', where i is a free label and j is a dummy label. - '...ij, ...jk -> ...ijk', where i, j and k are all free labels. - '...ij, ...jk -> ij', an invalid equation since `...` is not present for - the output. + the output. **The summation rule** @@ -902,15 +947,15 @@ def einsum(equation, *operands): may vary significantly due to implementation specific optimization. - Step 1: preparation for broadcasting, that is, transposing and unsqueezing - the input operands to have each resulting dimension identically labeled across - all the input operands. + the input operands to have each resulting dimension identically labeled across + all the input operands. - Step 2: broadcasting multiply all the resulting operands from step 1. - Step 3: reducing dummy labeled dimensions. - Step 4: transposing the result tensor to match the output labels. **On trace and diagonal** - The trace and diagonal are planned yet unimplemented features. + The trace and diagonal are planned yet unimplemented features. Args: equation (`str`): @@ -918,82 +963,84 @@ def einsum(equation, *operands): operands (`list|Tensor`): The input tensors over which to compute the Einstein summation. The number of operands should equal the number of input terms in the equation. - + Returns: - result (`Tensor`): the result tensor. - + result (`Tensor`), the result tensor. + Examples: .. code-block:: python - import paddle - paddle.seed(102) - x = paddle.rand([4]) - y = paddle.rand([5]) - - # sum - print(paddle.einsum('i->', x)) - # Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # 1.95791852) - - # dot - print(paddle.einsum('i,i->', x, x)) - # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [1.45936954]) - - # outer - print(paddle.einsum("i,j->ij", x, y)) - # Tensor(shape=[4, 5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[0.00079869, 0.00120950, 0.00136844, 0.00187187, 0.00192194], - # [0.23455200, 0.35519385, 0.40186870, 0.54970956, 0.56441545], - # [0.11773264, 0.17828843, 0.20171674, 0.27592498, 0.28330654], - # [0.32897076, 0.49817693, 0.56364071, 0.77099484, 0.79162055]]) - - A = paddle.rand([2, 3, 2]) - B = paddle.rand([2, 2, 3]) - - # transpose - print(paddle.einsum('ijk->kji', A)) - # Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.95649719, 0.49684682], - # [0.80071914, 0.46258664], - # [0.49814570, 0.33383518]], - # - # [[0.07637714, 0.29374704], - # [0.51470858, 0.51907635], - # [0.99066722, 0.55802226]]]) - - # batch matrix multiplication - print(paddle.einsum('ijk, ikl->ijl', A,B)) - # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.32172769, 0.50617385, 0.41394392], - # [0.51736701, 0.49921003, 0.38730967], - # [0.69078457, 0.42282537, 0.30161136]], - # - # [[0.32043904, 0.18164253, 0.27810261], - # [0.50226176, 0.24512935, 0.39881429], - # [0.51476848, 0.23367381, 0.39229113]]]) - - # Ellipsis transpose - print(paddle.einsum('...jk->...kj', A)) - # Tensor(shape=[2, 2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.95649719, 0.80071914, 0.49814570], - # [0.07637714, 0.51470858, 0.99066722]], - # - # [[0.49684682, 0.46258664, 0.33383518], - # 
[0.29374704, 0.51907635, 0.55802226]]]) - - # Ellipsis batch matrix multiplication - print(paddle.einsum('...jk, ...kl->...jl', A,B)) - # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, - # [[[0.32172769, 0.50617385, 0.41394392], - # [0.51736701, 0.49921003, 0.38730967], - # [0.69078457, 0.42282537, 0.30161136]], - # - # [[0.32043904, 0.18164253, 0.27810261], - # [0.50226176, 0.24512935, 0.39881429], - # [0.51476848, 0.23367381, 0.39229113]]]) + import paddle + paddle.seed(102) + x = paddle.rand([4]) + y = paddle.rand([5]) + + # sum + print(paddle.einsum('i->', x)) + # Tensor(shape=[], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # 1.95791852) + + # dot + print(paddle.einsum('i,i->', x, x)) + # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [1.45936954]) + + # outer + print(paddle.einsum("i,j->ij", x, y)) + # Tensor(shape=[4, 5], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[0.00079869, 0.00120950, 0.00136844, 0.00187187, 0.00192194], + # [0.23455200, 0.35519385, 0.40186870, 0.54970956, 0.56441545], + # [0.11773264, 0.17828843, 0.20171674, 0.27592498, 0.28330654], + # [0.32897076, 0.49817693, 0.56364071, 0.77099484, 0.79162055]]) + + A = paddle.rand([2, 3, 2]) + B = paddle.rand([2, 2, 3]) + + # transpose + print(paddle.einsum('ijk->kji', A)) + # Tensor(shape=[2, 3, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.95649719, 0.49684682], + # [0.80071914, 0.46258664], + # [0.49814570, 0.33383518]], + # + # [[0.07637714, 0.29374704], + # [0.51470858, 0.51907635], + # [0.99066722, 0.55802226]]]) + + # batch matrix multiplication + print(paddle.einsum('ijk, ikl->ijl', A,B)) + # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.32172769, 0.50617385, 0.41394392], + # [0.51736701, 0.49921003, 0.38730967], + # [0.69078457, 0.42282537, 0.30161136]], + # + # [[0.32043904, 0.18164253, 0.27810261], + # [0.50226176, 0.24512935, 0.39881429], + # [0.51476848, 0.23367381, 0.39229113]]]) + + # Ellipsis transpose + print(paddle.einsum('...jk->...kj', A)) + # Tensor(shape=[2, 2, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.95649719, 0.80071914, 0.49814570], + # [0.07637714, 0.51470858, 0.99066722]], + # + # [[0.49684682, 0.46258664, 0.33383518], + # [0.29374704, 0.51907635, 0.55802226]]]) + + # Ellipsis batch matrix multiplication + print(paddle.einsum('...jk, ...kl->...jl', A,B)) + # Tensor(shape=[2, 3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [[[0.32172769, 0.50617385, 0.41394392], + # [0.51736701, 0.49921003, 0.38730967], + # [0.69078457, 0.42282537, 0.30161136]], + # + # [[0.32043904, 0.18164253, 0.27810261], + # [0.50226176, 0.24512935, 0.39881429], + # [0.51476848, 0.23367381, 0.39229113]]]) + """ import os + if int(os.environ.get('FLAGS_new_einsum', "1")): return einsum_v2(equation, *operands) @@ -1039,9 +1086,11 @@ def einsum(equation, *operands): # Counting how many non-trivial dimensions remain for each ax g_labels, g_view, g_nout, g_count = build_global_view( - nop_labels, rhs, n_bcast_dims) - g_shape, g_supports = build_global_shape(g_view, g_labels, - [op.shape for op in operands]) + nop_labels, rhs, n_bcast_dims + ) + g_shape, g_supports = build_global_shape( + g_view, g_labels, [op.shape for op in operands] + ) # Now we're ready to build up an execution plan args = operands, g_view, g_shape, g_supports, g_count, n_bcast_dims diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 
700c6c340dc3c5..132129ae628bc2 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -14,8 +14,17 @@ import numpy as np from ..framework import LayerHelper -from ..framework import _varbase_creator, _dygraph_tracer, in_dygraph_mode, _non_static_mode -from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype +from ..framework import ( + _varbase_creator, + _dygraph_tracer, + in_dygraph_mode, + _non_static_mode, +) +from ..fluid.data_feeder import ( + check_variable_and_dtype, + check_type, + check_dtype, +) from ..static import Variable from ..fluid.framework import _in_legacy_dygraph from .manipulation import cast @@ -92,10 +101,21 @@ def transpose(x, perm, name=None): out, _ = _legacy_C_ops.transpose2(x, 'axis', perm) return out - check_variable_and_dtype(x, 'x', [ - 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', - 'complex128' - ], 'transpose') + check_variable_and_dtype( + x, + 'x', + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + 'transpose', + ) check_type(perm, 'perm', (list, tuple), 'transpose') if isinstance(perm, tuple): perm = list(perm) @@ -104,24 +124,25 @@ def transpose(x, perm, name=None): "Input(perm) is the permutation of dimensions of Input(x), " "its length should be equal to dimensions of Input(x), " "but received dimension of Input(x) is %s, " - "the length of Input(perm) is %s." % (len(x.shape), len(perm))) + "the length of Input(perm) is %s." % (len(x.shape), len(perm)) + ) for idx, dim in enumerate(perm): if dim >= len(x.shape): raise ValueError( "Each element in Input(perm) should be less than Input(x)'s dimension, " "but %d-th element in Input(perm) is %d which exceeds Input(x)'s " - "dimension %d." % (idx, perm[idx], len(x.shape))) + "dimension %d." % (idx, perm[idx], len(x.shape)) + ) helper = LayerHelper('transpose', **locals()) out = helper.create_variable_for_type_inference(x.dtype) x_shape = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='transpose2', - inputs={'X': [x]}, - outputs={ - 'Out': [out], - 'XShape': [x_shape] - }, - attrs={'axis': perm}) + helper.append_op( + type='transpose2', + inputs={'X': [x]}, + outputs={'Out': [out], 'XShape': [x_shape]}, + attrs={'axis': perm}, + ) return out @@ -236,21 +257,22 @@ def __check_input(x, y): var_names = {'x': x, 'y': y} for name, val in var_names.items(): check_variable_and_dtype( - val, name, + val, + name, ['float16', 'float32', 'float64', 'complex64', 'complex128'], - 'matmul') + 'matmul', + ) __check_input(x, y) helper = LayerHelper('matmul_v2', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='matmul_v2', - inputs={ - 'X': x, - 'Y': y - }, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='matmul_v2', + inputs={'X': x, 'Y': y}, + outputs={'Out': out}, + attrs=attrs, + ) return out @@ -260,7 +282,7 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): Returns the matrix norm (Frobenius) or vector norm (the 1-norm, the Euclidean or 2-norm, and in general the p-norm for p > 0) of a given tensor. - .. note:: + Note: This norm API is different from `numpy.linalg.norm`. This api supports high-order input tensors (rank >= 3), and certain axis need to be pointed out to calculate the norm. But `numpy.linalg.norm` only supports 1-D vector or 2-D matrix as input tensor. 
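To make the note above concrete, a short cross-check (a sketch, not part of the patch) of what the Frobenius reduction over two axes of a rank-3 tensor computes; it only restates the square-root-of-sum-of-squares definition that the examples below rely on.

.. code-block:: python

    import paddle

    x = paddle.arange(24, dtype='float32').reshape([2, 3, 4])
    # Frobenius norm reduced over axes 0 and 1, leaving shape [4].
    fro = paddle.linalg.norm(x, p='fro', axis=[0, 1])
    # The same reduction written out explicitly.
    manual = paddle.sqrt(paddle.sum(x * x, axis=[0, 1]))
    print(paddle.allclose(fro, manual))   # -> True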
@@ -276,7 +298,7 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): or list(int)/tuple(int) with only one element, the vector norm is computed over the axis. If `axis < 0`, the dimension to norm operation is rank(input) + axis. If axis is a list(int)/tuple(int) with two elements, the matrix norm is computed over the axis. - Defalut value is `None`. + Default value is `None`. keepdim (bool, optional): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have fewer dimension than the :attr:`input` unless :attr:`keepdim` is true, default @@ -292,38 +314,53 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None): .. code-block:: python import paddle - import numpy as np - shape=[2, 3, 4] - np_input = np.arange(24).astype('float32') - 12 - np_input = np_input.reshape(shape) - x = paddle.to_tensor(np_input) - #[[[-12. -11. -10. -9.] [ -8. -7. -6. -5.] [ -4. -3. -2. -1.]] - # [[ 0. 1. 2. 3.] [ 4. 5. 6. 7.] [ 8. 9. 10. 11.]]] + x = paddle.arange(24, dtype="float32").reshape([2, 3, 4]) - 12 + # x: Tensor(shape=[2, 3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[[-12., -11., -10., -9. ], + # [-8. , -7. , -6. , -5. ], + # [-4. , -3. , -2. , -1. ]], + + # [[ 0. , 1. , 2. , 3. ], + # [ 4. , 5. , 6. , 7. ], + # [ 8. , 9. , 10., 11.]]]) # compute frobenius norm along last two dimensions. out_fro = paddle.linalg.norm(x, p='fro', axis=[0,1]) - # out_fro.numpy() [17.435596 16.911535 16.7332 16.911535] + # out_fro: Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + # [17.43559647, 16.91153526, 16.73320007, 16.91153526]) # compute 2-order vector norm along last dimension. out_pnorm = paddle.linalg.norm(x, p=2, axis=-1) - #out_pnorm.numpy(): [[21.118711 13.190906 5.477226] - # [ 3.7416575 11.224972 19.131126]] + # out_pnorm: Tensor(shape=[2, 3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[21.11871147, 13.19090557, 5.47722578 ], + # [3.74165750 , 11.22497177, 19.13112640]]) # compute 2-order norm along [0,1] dimension. out_pnorm = paddle.linalg.norm(x, p=2, axis=[0,1]) - #out_pnorm.numpy(): [17.435596 16.911535 16.7332 16.911535] + # out_pnorm: Tensor(shape=[4], dtype=float32, place=Place(cpu), stop_gradient=True, + # [17.43559647, 16.91153526, 16.73320007, 16.91153526]) # compute inf-order norm - out_pnorm = paddle.linalg.norm(x, p=np.inf) - #out_pnorm.numpy() = [12.] - out_pnorm = paddle.linalg.norm(x, p=np.inf, axis=0) - #out_pnorm.numpy(): [[12. 11. 10. 9.] [8. 7. 6. 7.] [8. 9. 10. 11.]] + out_pnorm = paddle.linalg.norm(x, p=float("inf")) + # out_pnorm = Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [12.]) + + out_pnorm = paddle.linalg.norm(x, p=float("inf"), axis=0) + # out_pnorm: Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[12., 11., 10., 9. ], + # [8. , 7. , 6. , 7. ], + # [8. , 9. , 10., 11.]]) # compute -inf-order norm - out_pnorm = paddle.linalg.norm(x, p=-np.inf) - #out_pnorm.numpy(): [0.] - out_pnorm = paddle.linalg.norm(x, p=-np.inf, axis=0) - #out_pnorm.numpy(): [[0. 1. 2. 3.] [4. 5. 6. 5.] [4. 3. 2. 
1.]] + out_pnorm = paddle.linalg.norm(x, p=-float("inf")) + # out_pnorm: Tensor(shape=[1], dtype=float32, place=Place(cpu), stop_gradient=True, + # [0.]) + + out_pnorm = paddle.linalg.norm(x, p=-float("inf"), axis=0) + # out_pnorm: Tensor(shape=[3, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[0., 1., 2., 3.], + # [4., 5., 6., 5.], + # [4., 3., 2., 1.]]) """ def frobenius_norm(input, dim=None, keepdim=False, name=None): @@ -345,32 +382,35 @@ def frobenius_norm(input, dim=None, keepdim=False, name=None): return _C_ops.frobenius_norm(input, dim, keepdim, False) if _in_legacy_dygraph(): if dim is None: - return _legacy_C_ops.frobenius_norm(input, 'keep_dim', keepdim, - 'reduce_all', True) - return _legacy_C_ops.frobenius_norm(input, 'dim', dim, 'keep_dim', - keepdim, 'reduce_all', False) + return _legacy_C_ops.frobenius_norm( + input, 'keep_dim', keepdim, 'reduce_all', True + ) + return _legacy_C_ops.frobenius_norm( + input, 'dim', dim, 'keep_dim', keepdim, 'reduce_all', False + ) attrs = {'dim': dim, 'keep_dim': keepdim, 'reduce_all': False} if dim is None: attrs['reduce_all'] = True - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'frobenius_norm') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'frobenius_norm' + ) helper = LayerHelper('frobenius_norm', **locals()) out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype() + ) - helper.append_op(type='frobenius_norm', - inputs={'X': input}, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='frobenius_norm', + inputs={'X': input}, + outputs={'Out': out}, + attrs=attrs, + ) return out - def vector_norm(input, - porder=None, - axis=None, - keepdim=False, - asvector=False, - name=None): + def vector_norm( + input, porder=None, axis=None, keepdim=False, asvector=False, name=None + ): """ Calculate the p-order vector norm for certain dimension of Tensor `input`. Args: @@ -380,21 +420,32 @@ def vector_norm(input, keepdim (bool, optional): Whether keep the dimensions as the `input`, Default False. 
""" if in_dygraph_mode(): - if axis is None: axis = -1 + if axis is None: + axis = -1 return _C_ops.p_norm(input, porder, axis, 1e-12, keepdim, asvector) if _in_legacy_dygraph(): - if axis is None: axis = -1 - return _legacy_C_ops.p_norm(input, 'porder', porder, 'axis', axis, - 'keepdim', keepdim, 'asvector', - asvector) + if axis is None: + axis = -1 + return _legacy_C_ops.p_norm( + input, + 'porder', + porder, + 'axis', + axis, + 'keepdim', + keepdim, + 'asvector', + asvector, + ) if porder is not None: check_type(porder, 'porder', (float, int), 'p_norm') if axis is not None: check_type(axis, 'axis', (int), 'p_norm') - check_variable_and_dtype(input, 'input', ['float32', 'float64'], - 'p_norm') + check_variable_and_dtype( + input, 'input', ['float32', 'float64'], 'p_norm' + ) attrs = { 'axis': axis if axis is not None else -1, @@ -405,23 +456,27 @@ def vector_norm(input, } helper = LayerHelper('p_norm', **locals()) out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype() + ) - helper.append_op(type='p_norm', - inputs={'X': input}, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='p_norm', + inputs={'X': input}, + outputs={'Out': out}, + attrs=attrs, + ) return out - def inf_norm(input, - porder=None, - axis=axis, - keepdim=False, - asvector=False, - name=None): + def inf_norm( + input, porder=None, axis=axis, keepdim=False, asvector=False, name=None + ): if in_dygraph_mode(): out = _C_ops.abs(input) - reduce_all = True if axis == None or axis == [] or asvector == True else False + reduce_all = ( + True + if axis == None or axis == [] or asvector == True + else False + ) axis = axis if axis != None and axis != [] else [0] if reduce_all: assert (axis == []) or (axis is None) @@ -432,28 +487,31 @@ def inf_norm(input, helper = LayerHelper('inf_norm', **locals()) out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype() + ) helper.append_op(type='abs', inputs={'X': input}, outputs={'Out': out}) reduce_out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) + dtype=helper.input_dtype() + ) - reduce_all = True if axis == None or axis == [] or asvector == True else False + reduce_all = ( + True if axis == None or axis == [] or asvector == True else False + ) axis = axis if axis != None and axis != [] else [0] - reduce_type = 'reduce_max' if porder == np.float64( - 'inf') else 'reduce_min' - helper.append_op(type=reduce_type, - inputs={'X': out}, - outputs={'Out': reduce_out}, - attrs={ - 'dim': axis, - 'keep_dim': keepdim, - 'reduce_all': reduce_all - }) + reduce_type = ( + 'reduce_max' if porder == np.float64('inf') else 'reduce_min' + ) + helper.append_op( + type=reduce_type, + inputs={'X': out}, + outputs={'Out': reduce_out}, + attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}, + ) return reduce_out - def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None): + def p_matrix_norm(input, porder=1.0, axis=axis, keepdim=False, name=None): """ NOTE: This function actually treats the matrix as flattened vector to calculate vector norm instead of matrix norm. @@ -462,38 +520,48 @@ def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None): abs_out = _C_ops.abs(input) pow_out = _C_ops.pow(abs_out, porder) sum_out = _C_ops.sum(pow_out, axis, None, keepdim) - out = _C_ops.pow(sum_out, float(1. 
/ porder)) + out = _C_ops.pow(sum_out, float(1.0 / porder)) return out block = LayerHelper('norm', **locals()) out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) + dtype=block.input_dtype() + ) abs_out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) - block.append_op(type='abs', - inputs={'X': input}, - outputs={'Out': abs_out}) + dtype=block.input_dtype() + ) + block.append_op( + type='abs', inputs={'X': input}, outputs={'Out': abs_out} + ) pow_out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) + dtype=block.input_dtype() + ) - block.append_op(type='pow', - inputs={'X': abs_out}, - outputs={'Out': pow_out}, - attrs={'factor': porder}) + block.append_op( + type='pow', + inputs={'X': abs_out}, + outputs={'Out': pow_out}, + attrs={'factor': porder}, + ) sum_out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) - block.append_op(type='reduce_sum', - inputs={'X': pow_out}, - outputs={'Out': sum_out}, - attrs={ - 'dim': axis, - 'keep_dim': keepdim, - 'reduce_all': True if axis is None else False - }) - block.append_op(type='pow', - inputs={'X': sum_out}, - outputs={'Out': out}, - attrs={'factor': float(1. / porder)}) + dtype=block.input_dtype() + ) + block.append_op( + type='reduce_sum', + inputs={'X': pow_out}, + outputs={'Out': sum_out}, + attrs={ + 'dim': axis, + 'keep_dim': keepdim, + 'reduce_all': True if axis is None else False, + }, + ) + block.append_op( + type='pow', + inputs={'X': sum_out}, + outputs={'Out': out}, + attrs={'factor': float(1.0 / porder)}, + ) return out if axis is None and p is not None: @@ -502,50 +570,60 @@ def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None): return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name) else: raise ValueError( - "only valid string values are 'fro', found {}".format(p)) + "only valid string values are 'fro', found {}".format(p) + ) elif isinstance(p, (int, float)): - return vector_norm(x, - porder=p, - axis=axis, - keepdim=keepdim, - asvector=True, - name=name) + return vector_norm( + x, + porder=p, + axis=axis, + keepdim=keepdim, + asvector=True, + name=name, + ) else: raise ValueError( - "only valid p type is string or float, found {}".format( - type(p))) + "only valid p type is string or float, found {}".format(type(p)) + ) if isinstance(axis, tuple): axis = list(axis) if isinstance(axis, list) and len(axis) == 1: axis = axis[0] - #calculate vector norm, where axis is int or list with only one integer + # calculate vector norm, where axis is int or list with only one integer if isinstance(axis, int): if isinstance(p, str): if p == "fro": - return vector_norm(x, - porder=2, - axis=axis, - keepdim=keepdim, - asvector=False, - name=name) + return vector_norm( + x, + porder=2, + axis=axis, + keepdim=keepdim, + asvector=False, + name=name, + ) else: raise ValueError( - "only valid string values are 'fro', found {}".format(p)) + "only valid string values are 'fro', found {}".format(p) + ) elif isinstance(p, (int, float)): - return vector_norm(x, - axis=axis, - porder=p, - keepdim=keepdim, - asvector=False, - name=name) + return vector_norm( + x, + axis=axis, + porder=p, + keepdim=keepdim, + asvector=False, + name=name, + ) else: raise ValueError( - "unspport p for p-order vector norm. except float, found {}". - format(p)) - #calculate matrix norm, where axis is list with two integers + "unspport p for p-order vector norm. 
except float, found {}".format( + p + ) + ) + # calculate matrix norm, where axis is list with two integers elif isinstance(axis, list) and len(axis) == 2: if p == "fro": return frobenius_norm(x, dim=axis, keepdim=keepdim, name=name) @@ -553,18 +631,20 @@ def p_matrix_norm(input, porder=1., axis=axis, keepdim=False, name=None): return inf_norm(x, porder=p, axis=axis, keepdim=keepdim, name=name) elif p == 0: raise ValueError( - "just suport axis type int or list (length of list <=1) if p = 0, found {}" - .format(axis)) + "just suport axis type int or list (length of list <=1) if p = 0, found {}".format( + axis + ) + ) else: - return p_matrix_norm(x, - porder=p, - axis=axis, - keepdim=keepdim, - name=name) + return p_matrix_norm( + x, porder=p, axis=axis, keepdim=keepdim, name=name + ) else: raise ValueError( - "except axis type int or list (length of list <=2), found {}". - format(axis)) + "except axis type int or list (length of list <=2), found {}".format( + axis + ) + ) def dist(x, y, p=2, name=None): @@ -634,10 +714,9 @@ def dist(x, y, p=2, name=None): .. code-block:: python import paddle - import numpy as np - x = paddle.to_tensor(np.array([[3, 3],[3, 3]]), "float32") - y = paddle.to_tensor(np.array([[3, 3],[3, 1]]), "float32") + x = paddle.to_tensor([[3, 3],[3, 3]], dtype="float32") + y = paddle.to_tensor([[3, 3],[3, 1]], dtype="float32") out = paddle.dist(x, y, 0) print(out) # out = [1.] @@ -662,10 +741,9 @@ def dist(x, y, p=2, name=None): inputs = {"X": [x], "Y": [y]} outputs = {'Out': [out]} attrs = {"p": float(p)} - helper.append_op(type='dist', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='dist', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return out @@ -690,73 +768,86 @@ def cond(x, p=None, name=None): .. code-block:: python import paddle - import numpy as np x = paddle.to_tensor([[1., 0, -1], [0, 1, 0], [1, 0, 1]]) # compute conditional number when p is None out = paddle.linalg.cond(x) - # out.numpy() [1.4142135] + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [1.41421342]) # compute conditional number when order of the norm is 'fro' out_fro = paddle.linalg.cond(x, p='fro') - # out_fro.numpy() [3.1622777] + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [3.16227770]) # compute conditional number when order of the norm is 'nuc' out_nuc = paddle.linalg.cond(x, p='nuc') - # out_nuc.numpy() [9.2426405] + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [9.24263859]) # compute conditional number when order of the norm is 1 out_1 = paddle.linalg.cond(x, p=1) - # out_1.numpy() [2.] + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [2.]) # compute conditional number when order of the norm is -1 out_minus_1 = paddle.linalg.cond(x, p=-1) - # out_minus_1.numpy() [1.] + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [1.]) # compute conditional number when order of the norm is 2 out_2 = paddle.linalg.cond(x, p=2) - # out_2.numpy() [1.4142135] + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [1.41421342]) # compute conditional number when order of the norm is -1 out_minus_2 = paddle.linalg.cond(x, p=-2) - # out_minus_2.numpy() [0.70710677] + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [0.70710683]) # compute conditional number when order of the norm is inf - out_inf = paddle.linalg.cond(x, p=np.inf) - # out_inf.numpy() [2.] 
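These results can also be checked against the definition the implementation relies on: for an invertible square matrix, cond(x, p) is the induced p-norm of x multiplied by that of its inverse. A small sketch, assuming paddle.linalg.inv is available for the manual check:

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[1., 0., -1.], [0., 1., 0.], [1., 0., 1.]])

    out_1 = paddle.linalg.cond(x, p=1)

    # Induced 1-norm = max absolute column sum; cond_1 = ||x||_1 * ||x^-1||_1.
    x_inv = paddle.linalg.inv(x)
    manual = paddle.abs(x).sum(axis=0).max() * paddle.abs(x_inv).sum(axis=0).max()

    print(float(out_1), float(manual))   # both 2.0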
+ out_inf = paddle.linalg.cond(x, p=float("inf")) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [2.]) # compute conditional number when order of the norm is -inf - out_minus_inf = paddle.linalg.cond(x, p=-np.inf) - # out_minus_inf.numpy() [1.] - - a = paddle.to_tensor(np.random.randn(2, 4, 4).astype('float32')) - # a.numpy() - # [[[ 0.14063153 -0.996288 0.7996131 -0.02571543] - # [-0.16303636 1.5534962 -0.49919784 -0.04402903] - # [-1.1341571 -0.6022629 0.5445269 0.29154757] - # [-0.16816919 -0.30972657 1.7521842 -0.5402487 ]] - # [[-0.58081484 0.12402827 0.7229862 -0.55046535] - # [-0.15178485 -1.1604939 0.75810957 0.30971205] - # [-0.9669573 1.0940945 -0.27363303 -0.35416734] - # [-1.216529 2.0018666 -0.7773689 -0.17556527]]] + out_minus_inf = paddle.linalg.cond(x, p=-float("inf")) + # Tensor(shape=[1], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [1.]) + + a = paddle.randn([2, 4, 4]) + # Tensor(shape=[2, 4, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[-0.06784091, -0.07095790, 1.31792855, -0.58959651], + # [ 0.20818676, -0.85640615, -0.89998871, -1.47439921], + # [-0.49132481, 0.42250812, -0.77383220, -2.19794774], + # [-0.33551720, -1.70003879, -1.09795380, -0.63737559]], + + # [[ 1.12026262, -0.16119350, -1.21157813, 2.74383283], + # [-0.15999718, 0.18798758, -0.69392562, 1.35720372], + # [-0.53013402, -2.26304483, 1.40843511, -1.02288902], + # [ 0.69533503, 2.05261683, -0.02251151, -1.43127477]]]) + a_cond_fro = paddle.linalg.cond(a, p='fro') - # a_cond_fro.numpy() [31.572273 28.120834] - - b = paddle.to_tensor(np.random.randn(2, 3, 4).astype('float64')) - # b.numpy() - # [[[ 1.61707487 0.46829144 0.38130416 0.82546736] - # [-1.72710298 0.08866375 -0.62518804 0.16128892] - # [-0.02822879 -1.67764516 0.11141444 0.3220113 ]] - # [[ 0.22524372 0.62474921 -0.85503233 -1.03960523] - # [-0.76620689 0.56673047 0.85064753 -0.45158196] - # [ 1.47595418 2.23646462 1.5701758 0.10497519]]] + # Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [8.86691189 , 75.23817444]) + + b = paddle.randn([2, 3, 4]) + # Tensor(shape=[2, 3, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[-0.43754861, 1.80796063, -0.78729683, -1.82264030], + # [-0.27670753, 0.06620564, 0.29072434, -0.31155765], + # [ 0.34123746, -0.05444612, 0.05001324, -1.46877074]], + + # [[-0.64331555, -1.51103854, -1.26277697, -0.68024760], + # [ 2.59375715, -1.06665540, 0.96575671, -0.73330832], + # [-0.47064447, -0.23945692, -0.95150250, -1.07125998]]]) b_cond_2 = paddle.linalg.cond(b, p=2) - # b_cond_2.numpy() [3.30064451 2.51976252] + # Tensor(shape=[2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [6.64228773, 3.89068866]) """ - def mat_norm(input, porder=1., axis=None): + def mat_norm(input, porder=1.0, axis=None): """ NOTE: Calculate the matrix norm of a square matrix or batches of square matrices, @@ -777,54 +868,81 @@ def mat_norm(input, porder=1., axis=None): elif _in_legacy_dygraph(): abs_out = _legacy_C_ops.abs(input) - sum_out = _legacy_C_ops.reduce_sum(abs_out, 'dim', axis, 'keepdim', - keepdim, 'reduce_all', - reduce_all) + sum_out = _legacy_C_ops.reduce_sum( + abs_out, + 'dim', + axis, + 'keepdim', + keepdim, + 'reduce_all', + reduce_all, + ) if porder == 1 or porder == np.inf: - return _legacy_C_ops.reduce_max(sum_out, 'dim', [-1], 'keepdim', - keepdim, 'reduce_all', - reduce_all) + return _legacy_C_ops.reduce_max( + sum_out, + 'dim', + [-1], + 'keepdim', + keepdim, + 'reduce_all', + reduce_all, + 
) if porder == -1 or porder == -np.inf: - return _legacy_C_ops.reduce_min(sum_out, 'dim', [-1], 'keepdim', - keepdim, 'reduce_all', - reduce_all) + return _legacy_C_ops.reduce_min( + sum_out, + 'dim', + [-1], + 'keepdim', + keepdim, + 'reduce_all', + reduce_all, + ) else: block = LayerHelper('norm', **locals()) abs_out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) + dtype=block.input_dtype() + ) sum_out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) + dtype=block.input_dtype() + ) out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) - block.append_op(type='abs', - inputs={'X': input}, - outputs={'Out': abs_out}) - block.append_op(type='reduce_sum', - inputs={'X': abs_out}, - outputs={'Out': sum_out}, - attrs={ - 'dim': axis, - 'keep_dim': keepdim, - 'reduce_all': reduce_all - }) + dtype=block.input_dtype() + ) + block.append_op( + type='abs', inputs={'X': input}, outputs={'Out': abs_out} + ) + block.append_op( + type='reduce_sum', + inputs={'X': abs_out}, + outputs={'Out': sum_out}, + attrs={ + 'dim': axis, + 'keep_dim': keepdim, + 'reduce_all': reduce_all, + }, + ) if porder == 1 or porder == np.inf: - block.append_op(type='reduce_max', - inputs={'X': sum_out}, - outputs={'Out': out}, - attrs={ - 'dim': [-1], - 'keep_dim': keepdim, - 'reduce_all': reduce_all - }) + block.append_op( + type='reduce_max', + inputs={'X': sum_out}, + outputs={'Out': out}, + attrs={ + 'dim': [-1], + 'keep_dim': keepdim, + 'reduce_all': reduce_all, + }, + ) if porder == -1 or porder == -np.inf: - block.append_op(type='reduce_min', - inputs={'X': sum_out}, - outputs={'Out': out}, - attrs={ - 'dim': [-1], - 'keep_dim': keepdim, - 'reduce_all': reduce_all - }) + block.append_op( + type='reduce_min', + inputs={'X': sum_out}, + outputs={'Out': out}, + attrs={ + 'dim': [-1], + 'keep_dim': keepdim, + 'reduce_all': reduce_all, + }, + ) return out def fro_norm(input, porder=2, axis=[-1]): @@ -839,50 +957,66 @@ def fro_norm(input, porder=2, axis=[-1]): pow_out = _C_ops.pow(input, porder) sum_out_1 = _C_ops.sum(pow_out, axis, None, keepdim) sum_out_2 = _C_ops.sum(sum_out_1, axis, None, keepdim) - return _C_ops.pow(sum_out_2, float(1. / porder)) + return _C_ops.pow(sum_out_2, float(1.0 / porder)) elif paddle.in_dynamic_mode(): pow_out = _legacy_C_ops.pow(input, 'factor', porder) - sum_out_1 = _legacy_C_ops.reduce_sum(pow_out, 'dim', axis, - 'keepdim', keepdim, - 'reduce_all', reduce_all) - sum_out_2 = _legacy_C_ops.reduce_sum(sum_out_1, 'dim', axis, - 'keepdim', keepdim, - 'reduce_all', reduce_all) - return _legacy_C_ops.pow(sum_out_2, 'factor', float(1. 
/ porder)) + sum_out_1 = _legacy_C_ops.reduce_sum( + pow_out, + 'dim', + axis, + 'keepdim', + keepdim, + 'reduce_all', + reduce_all, + ) + sum_out_2 = _legacy_C_ops.reduce_sum( + sum_out_1, + 'dim', + axis, + 'keepdim', + keepdim, + 'reduce_all', + reduce_all, + ) + return _legacy_C_ops.pow(sum_out_2, 'factor', float(1.0 / porder)) block = LayerHelper('norm', **locals()) pow_out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) + dtype=block.input_dtype() + ) sum_out_1 = block.create_variable_for_type_inference( - dtype=block.input_dtype()) + dtype=block.input_dtype() + ) sum_out_2 = block.create_variable_for_type_inference( - dtype=block.input_dtype()) + dtype=block.input_dtype() + ) out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) - block.append_op(type='pow', - inputs={'X': input}, - outputs={'Out': pow_out}, - attrs={'factor': porder}) - block.append_op(type='reduce_sum', - inputs={'X': pow_out}, - outputs={'Out': sum_out_1}, - attrs={ - 'dim': axis, - 'keep_dim': keepdim, - 'reduce_all': reduce_all - }) - block.append_op(type='reduce_sum', - inputs={'X': sum_out_1}, - outputs={'Out': sum_out_2}, - attrs={ - 'dim': axis, - 'keep_dim': keepdim, - 'reduce_all': reduce_all - }) - block.append_op(type='pow', - inputs={'X': sum_out_2}, - outputs={'Out': out}, - attrs={'factor': float(1. / porder)}) + dtype=block.input_dtype() + ) + block.append_op( + type='pow', + inputs={'X': input}, + outputs={'Out': pow_out}, + attrs={'factor': porder}, + ) + block.append_op( + type='reduce_sum', + inputs={'X': pow_out}, + outputs={'Out': sum_out_1}, + attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}, + ) + block.append_op( + type='reduce_sum', + inputs={'X': sum_out_1}, + outputs={'Out': sum_out_2}, + attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}, + ) + block.append_op( + type='pow', + inputs={'X': sum_out_2}, + outputs={'Out': out}, + attrs={'factor': float(1.0 / porder)}, + ) return out def svd_norm(input, porder, axis=[-1]): @@ -901,9 +1035,15 @@ def svd_norm(input, porder, axis=[-1]): if in_dygraph_mode(): return _C_ops.sum(s, axis, None, keepdim) else: - return _legacy_C_ops.reduce_sum(s, 'dim', axis, 'keepdim', - keepdim, 'reduce_all', - reduce_all) + return _legacy_C_ops.reduce_sum( + s, + 'dim', + axis, + 'keepdim', + keepdim, + 'reduce_all', + reduce_all, + ) if in_dygraph_mode(): max_out = _C_ops.max(s, axis, keepdim) min_out = _C_ops.min(s, axis, keepdim) @@ -913,75 +1053,70 @@ def svd_norm(input, porder, axis=[-1]): return _C_ops.divide(min_out, max_out) else: - max_out = _legacy_C_ops.reduce_max(s, 'dim', axis, 'keepdim', - keepdim, 'reduce_all', - reduce_all) - min_out = _legacy_C_ops.reduce_min(s, 'dim', axis, 'keepdim', - keepdim, 'reduce_all', - reduce_all) + max_out = _legacy_C_ops.reduce_max( + s, 'dim', axis, 'keepdim', keepdim, 'reduce_all', reduce_all + ) + min_out = _legacy_C_ops.reduce_min( + s, 'dim', axis, 'keepdim', keepdim, 'reduce_all', reduce_all + ) if porder == 2: return _legacy_C_ops.elementwise_div( - max_out, min_out, 'aixs', axis, 'use_mkldnn', False) + max_out, min_out, 'aixs', axis, 'use_mkldnn', False + ) if porder == -2: return _legacy_C_ops.elementwise_div( - min_out, max_out, 'aixs', axis, 'use_mkldnn', False) + min_out, max_out, 'aixs', axis, 'use_mkldnn', False + ) block = LayerHelper('norm', **locals()) out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) + dtype=block.input_dtype() + ) if porder == "nuc": - block.append_op(type='reduce_sum', - 
inputs={'X': s}, - outputs={'Out': out}, - attrs={ - 'dim': axis, - 'keep_dim': keepdim, - 'reduce_all': reduce_all - }) + block.append_op( + type='reduce_sum', + inputs={'X': s}, + outputs={'Out': out}, + attrs={ + 'dim': axis, + 'keep_dim': keepdim, + 'reduce_all': reduce_all, + }, + ) return out max_out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) + dtype=block.input_dtype() + ) min_out = block.create_variable_for_type_inference( - dtype=block.input_dtype()) - block.append_op(type='reduce_max', - inputs={'X': s}, - outputs={'Out': max_out}, - attrs={ - 'dim': axis, - 'keep_dim': keepdim, - 'reduce_all': reduce_all - }) - block.append_op(type='reduce_min', - inputs={'X': s}, - outputs={'Out': min_out}, - attrs={ - 'dim': axis, - 'keep_dim': keepdim, - 'reduce_all': reduce_all - }) + dtype=block.input_dtype() + ) + block.append_op( + type='reduce_max', + inputs={'X': s}, + outputs={'Out': max_out}, + attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}, + ) + block.append_op( + type='reduce_min', + inputs={'X': s}, + outputs={'Out': min_out}, + attrs={'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all}, + ) if porder == 2: - block.append_op(type='elementwise_div', - inputs={ - 'X': max_out, - 'Y': min_out - }, - outputs={'Out': out}, - attrs={ - 'aixs': axis, - 'use_mkldnn': False - }) + block.append_op( + type='elementwise_div', + inputs={'X': max_out, 'Y': min_out}, + outputs={'Out': out}, + attrs={'aixs': axis, 'use_mkldnn': False}, + ) return out if porder == -2: - block.append_op(type='elementwise_div', - inputs={ - 'X': min_out, - 'Y': max_out - }, - outputs={'Out': out}, - attrs={ - 'aixs': axis, - 'use_mkldnn': False - }) + block.append_op( + type='elementwise_div', + inputs={'X': min_out, 'Y': max_out}, + outputs={'Out': out}, + attrs={'aixs': axis, 'use_mkldnn': False}, + ) return out def empty_tensor(input, shape): @@ -992,8 +1127,9 @@ def empty_tensor(input, shape): x_shape = list(x.shape) if not len(x_shape) >= 2: raise ValueError( - "input should be a matrix or batches of matrices, " + - "but the dimention of received input is {}".format(len(x_shape))) + "input should be a matrix or batches of matrices, " + + "but the dimention of received input is {}".format(len(x_shape)) + ) if p == None: p = 2 x_size = 0 if (0 in x_shape) else 1 @@ -1008,28 +1144,33 @@ def empty_tensor(input, shape): return svd_norm(x, p) * svd_norm(x_inv, p) if p in (1, -1): return mat_norm(x, porder=p, axis=[-2]) * mat_norm( - x_inv, porder=p, axis=[-2]) + x_inv, porder=p, axis=[-2] + ) if p in (np.inf, -np.inf): return mat_norm(x, porder=p, axis=[-1]) * mat_norm( - x_inv, porder=p, axis=[-1]) + x_inv, porder=p, axis=[-1] + ) else: - raise ValueError("only support p is {} when input is a ".format(p) + - "square matrix or batches of square matrices") + raise ValueError( + "only support p is {} when input is a ".format(p) + + "square matrix or batches of square matrices" + ) elif p in (2, -2): if x_size == 0: return empty_tensor(x, x_shape[:-2]) return svd_norm(x, porder=p) else: raise ValueError( - "unsupported {} for p, only supporting ('fro', 'nuc', ".format(p) + - "1, -1, 2, -2, inf, -inf) or none") + "unsupported {} for p, only supporting ('fro', 'nuc', ".format(p) + + "1, -1, 2, -2, inf, -inf) or none" + ) def dot(x, y, name=None): """ This operator calculates inner product for vectors. - .. note:: + Note: Support 1-d and 2-d Tensor. 
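The p=2 and p=-2 branches handled by svd_norm reduce to a ratio of extreme singular values, which can be reproduced from paddle.linalg.svd directly; a sketch for illustration only:

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[1., 0., -1.], [0., 1., 0.], [1., 0., 1.]])

    out_2 = paddle.linalg.cond(x, p=2)

    # p=2 condition number: largest singular value / smallest singular value.
    s = paddle.linalg.svd(x)[1]
    manual = s.max() / s.min()

    print(float(out_2), float(manual))   # both ~1.4142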
When it is 2d, the first dimension of this matrix is the batch dimension, which means that the vectors of multiple batches are dotted. @@ -1046,14 +1187,18 @@ def dot(x, y, name=None): .. code-block:: python import paddle - import numpy as np - x_data = np.random.uniform(0.1, 1, [10]).astype(np.float32) - y_data = np.random.uniform(1, 3, [10]).astype(np.float32) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) + # 1-D Tensor * 1-D Tensor + x = paddle.to_tensor([1, 2, 3]) + y = paddle.to_tensor([4, 5, 6]) + z = paddle.dot(x, y) + print(z) # [32] + + # 2-D Tensor * 2-D Tensor + x = paddle.to_tensor([[1, 2, 3], [2, 4, 6]]) + y = paddle.to_tensor([[4, 5, 6], [4, 5, 6]]) z = paddle.dot(x, y) - print(z) + print(z) # [[32], [64]] """ if in_dygraph_mode(): @@ -1066,25 +1211,23 @@ def dot(x, y, name=None): assert x is not None, 'x cannot be None in {}'.format(op_type) assert y is not None, 'y cannot be None in {}'.format(op_type) - check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], - op_type) - check_variable_and_dtype(y, 'y', ['float32', 'float64', 'int32', 'int64'], - op_type) + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], op_type + ) + check_variable_and_dtype( + y, 'y', ['float32', 'float64', 'int32', 'int64'], op_type + ) helper = LayerHelper(op_type, **locals()) if name is None: out = helper.create_variable_for_type_inference(dtype=x.dtype) else: - out = helper.create_variable(name=name, - dtype=x.dtype, - persistable=False) - helper.append_op(type="dot", - inputs={ - 'X': x, - 'Y': y - }, - attrs={}, - outputs={"Out": out}) + out = helper.create_variable( + name=name, dtype=x.dtype, persistable=False + ) + helper.append_op( + type="dot", inputs={'X': x, 'Y': y}, attrs={}, outputs={"Out": out} + ) return out @@ -1093,7 +1236,7 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None): Estimate the covariance matrix of the input variables, given data and weights. A covariance matrix is a square matrix, indicate the covariance of each pair variables in the input matrix. - For example, for an N-dimensional samples X=[x1,x2,…xN]T, then the covariance matrix + For example, for an N-dimensional samples X=[x1,x2,…xN]T, then the covariance matrix element Cij is the covariance of xi and xj. The element Cii is the variance of xi itself. Parameters: @@ -1127,7 +1270,8 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None): if len(x.shape) > 2 or len(x.shape) < 1: raise ValueError( "Input(x) only support N-D (1<=N<=2) tensor in cov, but received " - "length of Input(input) is %s." % len(x.shape)) + "length of Input(input) is %s." % len(x.shape) + ) check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'cov') nx = x if len(x.shape) == 1: @@ -1141,16 +1285,20 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None): if len(w.shape) > 1: raise ValueError( "Input(fweights) only support N-D (N<=1) tensor in cov, but received " - "shape of Input(input) is %s." % len(fweights.shape)) + "shape of Input(input) is %s." 
% len(fweights.shape) + ) if fweights.shape[0] != observation_num: raise ValueError( "The number of Input(fweights) should equal to x's dim[1]: {}, but received " - "size of Input(fweights) is {}.".format(observation_num, - fweights.shape[0])) + "size of Input(fweights) is {}.".format( + observation_num, fweights.shape[0] + ) + ) if fweights.min() < 0: raise ValueError( "The value of Input(fweights) cannot be negtive, but received " - "min of Input(fweights) is {}.".format(fweights.min())) + "min of Input(fweights) is {}.".format(fweights.min()) + ) if not paddle.all(fweights == paddle.round(fweights.astype('float64'))): raise ValueError("Input(fweights) must be integer ") @@ -1159,18 +1307,23 @@ def cov(x, rowvar=True, ddof=True, fweights=None, aweights=None, name=None): if len(aw.shape) > 1: raise ValueError( "Input(aweights) only support N-D (N<=1) tensor in cov, but received " - "length of Input(input) is %s." % len(aweights.shape)) - check_variable_and_dtype(aweights, 'dtype', ['float32', 'float64'], - 'cov') + "length of Input(input) is %s." % len(aweights.shape) + ) + check_variable_and_dtype( + aweights, 'dtype', ['float32', 'float64'], 'cov' + ) if aweights.shape[0] != observation_num: raise ValueError( "The number of Input(aweights) should equal to x's dim[1]: {}, but received " - "size of Input(aweights) is {}.".format(observation_num, - aweights.shape[0])) + "size of Input(aweights) is {}.".format( + observation_num, aweights.shape[0] + ) + ) if aweights.min() < 0: raise ValueError( "The value of Input(aweights) cannot be negtive, but received " - "min of Input(aweights) is {}.".format(aweights.min())) + "min of Input(aweights) is {}.".format(aweights.min()) + ) if w is not None: w = w * aw else: @@ -1219,11 +1372,11 @@ def t(input, name=None): .. code-block:: python :name: code-example import paddle - + # Example 1 (0-D tensor) x = paddle.to_tensor([0.79]) paddle.t(x) # [0.79] - + # Example 2 (1-D tensor) x = paddle.to_tensor([0.79, 0.84, 0.32]) paddle.t(x) # [0.79000002, 0.83999997, 0.31999999] @@ -1244,7 +1397,8 @@ def t(input, name=None): raise ValueError( "Input(input) only support N-D (N<=2) tensor, but received " "length of Input(input) is %s. Perhaps you can use paddle." - "tensor.transpose() instead." % len(input.shape)) + "tensor.transpose() instead." 
% len(input.shape) + ) if in_dygraph_mode(): if len(input.shape) == 1: return input @@ -1262,8 +1416,11 @@ def t(input, name=None): return out check_variable_and_dtype( - input, 'input', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'transpose') + input, + 'input', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'transpose', + ) helper = LayerHelper('t', **locals()) out = helper.create_variable_for_type_inference(input.dtype) @@ -1271,13 +1428,12 @@ def t(input, name=None): if len(input.shape) == 1: out = input else: - helper.append_op(type='transpose2', - inputs={'X': [input]}, - outputs={ - 'Out': [out], - 'XShape': [input_shape] - }, - attrs={'axis': [1, 0]}) + helper.append_op( + type='transpose2', + inputs={'X': [input]}, + outputs={'Out': [out], 'XShape': [input_shape]}, + attrs={'axis': [1, 0]}, + ) return out @@ -1334,13 +1490,12 @@ def cross(x, y, axis=9, name=None): attrs = dict() attrs['dim'] = axis - helper.append_op(type='cross', - inputs={ - 'X': x, - 'Y': y - }, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='cross', + inputs={'X': x, 'Y': y}, + outputs={'Out': out}, + attrs=attrs, + ) return out @@ -1361,27 +1516,24 @@ def cholesky(x, upper=False, name=None): Its data type should be float32 or float64. upper (bool): The flag indicating whether to return upper or lower triangular matrices. Default: False. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: A Tensor with same shape and data type as `x`. It represents \ - triangular matrices generated by Cholesky decomposition. + Tensor, A Tensor with same shape and data type as `x`. It represents + triangular matrices generated by Cholesky decomposition. Examples: .. code-block:: python import paddle - import numpy as np - a = np.random.rand(3, 3) - a_t = np.transpose(a, [1, 0]) - x_data = np.matmul(a, a_t) + 1e-03 - x = paddle.to_tensor(x_data) + a = paddle.rand([3, 3], dtype="float32") + a_t = paddle.transpose(a, [1, 0]) + x = paddle.matmul(a, a_t) + 1e-03 + out = paddle.linalg.cholesky(x, upper=False) print(out) - # [[1.190523 0. 0. ] - # [0.9906703 0.27676893 0. 
] - # [1.25450498 0.05600871 0.06400121]] - """ if in_dygraph_mode(): return _C_ops.cholesky(x, upper) @@ -1393,10 +1545,12 @@ def cholesky(x, upper=False, name=None): check_type(upper, 'upper', bool, 'cholesky') helper = LayerHelper('cholesky', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='cholesky', - inputs={'X': [x]}, - outputs={'Out': out}, - attrs={'upper': upper}) + helper.append_op( + type='cholesky', + inputs={'X': [x]}, + outputs={'Out': out}, + attrs={'upper': upper}, + ) return out @@ -1446,8 +1600,9 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): else: tol_tensor = tol use_default_tol = False - return _C_ops.matrix_rank_tol(x, tol_tensor, use_default_tol, - hermitian) + return _C_ops.matrix_rank_tol( + x, tol_tensor, use_default_tol, hermitian + ) if tol is None: tol_attr = 0.0 @@ -1473,9 +1628,16 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): tol_tensor = None tol_attr = float(tol) use_default_tol = False - return _legacy_C_ops.matrix_rank(x, tol_tensor, "tol", tol_attr, - 'hermitian', hermitian, - 'use_default_tol', use_default_tol) + return _legacy_C_ops.matrix_rank( + x, + tol_tensor, + "tol", + tol_attr, + 'hermitian', + hermitian, + 'use_default_tol', + use_default_tol, + ) inputs = {} attrs = {} @@ -1498,10 +1660,9 @@ def matrix_rank(x, tol=None, hermitian=False, name=None): helper = LayerHelper('matrix_rank', **locals()) out = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op(type='matrix_rank', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='matrix_rank', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return out @@ -1548,16 +1709,22 @@ def bmm(x, y, name=None): y_shape = y.shape if not len(x_shape) == len(y_shape) == 3: raise ValueError( - "x and y should be 3-dimensional. But received x's dimention: {}, y's dimention: {}" - .format(x_shape, y_shape)) + "x and y should be 3-dimensional. But received x's dimention: {}, y's dimention: {}".format( + x_shape, y_shape + ) + ) if x_shape[2] != y_shape[1]: raise ValueError( - "x's width must be equal with y's height. But received x's shape: {}, y's shape: {}" - .format(x_shape, y_shape)) + "x's width must be equal with y's height. But received x's shape: {}, y's shape: {}".format( + x_shape, y_shape + ) + ) if x_shape[0] != y_shape[0]: raise ValueError( - "x's batch (shape[0]) must be equal with y's batch (shape[0]). But received x's shape: {}, y's shape: {}" - .format(x_shape, y_shape)) + "x's batch (shape[0]) must be equal with y's batch (shape[0]). 
But received x's shape: {}, y's shape: {}".format( + x_shape, y_shape + ) + ) if in_dygraph_mode(): return _C_ops.bmm(x, y) @@ -1600,28 +1767,27 @@ def histogram(input, bins=100, min=0, max=0, name=None): return _C_ops.histogram(input, bins, min, max) if _in_legacy_dygraph(): - return _legacy_C_ops.histogram(input, "bins", bins, "min", min, "max", - max) + return _legacy_C_ops.histogram( + input, "bins", bins, "min", min, "max", max + ) helper = LayerHelper('histogram', **locals()) - check_variable_and_dtype(input, 'X', - ['int32', 'int64', 'float32', 'float64'], - 'histogram') + check_variable_and_dtype( + input, 'X', ['int32', 'int64', 'float32', 'float64'], 'histogram' + ) out = helper.create_variable_for_type_inference(VarDesc.VarType.INT64) - helper.append_op(type='histogram', - inputs={'X': input}, - outputs={'Out': out}, - attrs={ - 'bins': bins, - 'min': min, - 'max': max - }) + helper.append_op( + type='histogram', + inputs={'X': input}, + outputs={'Out': out}, + attrs={'bins': bins, 'min': min, 'max': max}, + ) return out def bincount(x, weights=None, minlength=0, name=None): """ - Computes frequency of each value in the input tensor. + Computes frequency of each value in the input tensor. Args: x (Tensor): A Tensor with non-negative integer. Should be 1-D tensor. @@ -1657,19 +1823,21 @@ def bincount(x, weights=None, minlength=0, name=None): check_variable_and_dtype(x, 'X', ['int32', 'int64'], 'bincount') if weights is not None: - check_variable_and_dtype(weights, 'Weights', - ['int32', 'int64', 'float32', 'float64'], - 'bincount') + check_variable_and_dtype( + weights, + 'Weights', + ['int32', 'int64', 'float32', 'float64'], + 'bincount', + ) out = helper.create_variable_for_type_inference(dtype=weights.dtype) else: out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='bincount', - inputs={ - 'X': x, - 'Weights': weights - }, - outputs={'Out': out}, - attrs={'minlength': minlength}) + helper.append_op( + type='bincount', + inputs={'X': x, 'Weights': weights}, + outputs={'Out': out}, + attrs={'minlength': minlength}, + ) return out @@ -1714,53 +1882,61 @@ def mv(x, vec, name=None): def __check_input(x, vec): var_names = {'x': x, 'vec': vec} for name, val in var_names.items(): - check_variable_and_dtype(val, name, ['float32', 'float64'], - 'mv') + check_variable_and_dtype( + val, name, ['float32', 'float64'], 'mv' + ) x_shape = list(x.shape) vec_shape = list(vec.shape) if len(x_shape) != 2: raise ValueError( - "x should be 2-dimensional. But received x's dimention: {}" - .format(x_shape)) + "x should be 2-dimensional. But received x's dimention: {}".format( + x_shape + ) + ) if len(vec_shape) != 1: raise ValueError( - "vec should be 1-dimensional. But received vec's dimention: {}" - .format(vec_shape)) + "vec should be 1-dimensional. But received vec's dimention: {}".format( + vec_shape + ) + ) __check_input(x, vec) helper = LayerHelper('mv', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='mv', - inputs={ - 'X': x, - 'Vec': vec - }, - outputs={'Out': out}) + helper.append_op( + type='mv', inputs={'X': x, 'Vec': vec}, outputs={'Out': out} + ) return out def det(x, name=None): """ + Calculates determinant value of a square matrix or batches of square matrices. + Args: - x (Tensor): input (Tensor): the input matrix of size `(n, n)` or the batch of matrices of size - `(*, n, n)` where `*` is one or more batch dimensions. 
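As a concrete check of the semantics described here, the 2x2 case can be verified against the ad - bc formula by hand; the value below is illustrative:

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[2., 1.], [1., 3.]])

    # det([[a, b], [c, d]]) = a*d - b*c = 2*3 - 1*1 = 5.
    d = paddle.linalg.det(x)
    print(float(d))   # 5.0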
+ x (Tensor): the input matrix of size `(n, n)` or the + batch of matrices of size `(*, n, n)` where `*` is one or more + batch dimensions. + name(str, optional): Name of the output. Default is None. It's used + to print debug info for developers. Details: :ref:`api_guide_Name` + Returns: - y (Tensor):the determinant value of a square matrix or batches of square matrices. + Tensor, the determinant value of a square matrix or batches of square matrices. Examples: .. code-block:: python - import paddle + import paddle - x = paddle.randn([3,3,3]) + x = paddle.randn([3,3,3]) - A = paddle.linalg.det(x) + A = paddle.linalg.det(x) - print(A) + print(A) - # [ 0.02547996, 2.52317095, -6.15900707]) + # [ 0.02547996, 2.52317095, -6.15900707]) """ @@ -1773,54 +1949,57 @@ def det(x, name=None): check_dtype(x.dtype, 'Input', ['float32', 'float64'], 'det') input_shape = list(x.shape) - assert len(input_shape) >= 2, \ - "The x must be at least 2-dimensional, " \ - "but received Input x's dimensional: %s.\n" % \ - len(input_shape) - - assert (input_shape[-1] == input_shape[-2]), \ - "Expect squared input," \ - "but received %s by %s matrix.\n" \ - %(input_shape[-2], input_shape[-1]) \ + assert len(input_shape) >= 2, ( + "The x must be at least 2-dimensional, " + "but received Input x's dimensional: %s.\n" % len(input_shape) + ) + assert ( + input_shape[-1] == input_shape[-2] + ), "Expect squared input," "but received %s by %s matrix.\n" % ( + input_shape[-2], + input_shape[-1], + ) helper = LayerHelper('determinant', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='determinant', - inputs={'Input': [x]}, - outputs={'Out': [out]}) + helper.append_op( + type='determinant', inputs={'Input': [x]}, outputs={'Out': [out]} + ) return out def slogdet(x, name=None): """ + Calculates the sign and natural logarithm of the absolute value of a square matrix's or batches square matrices' determinant. - The determinant can be computed with ``sign * exp(logabsdet) + The determinant can be computed with ``sign * exp`` (logabsdet) Supports input of float, double Note that for matrices that have zero determinant, this returns ``(0, -inf)`` + Args: x (Tensor): the batch of matrices of size :math:`(*, n, n)` where math:`*` is one or more batch dimensions. Returns: - y (Tensor): A tensor containing the sign of the determinant and the natural logarithm + y (Tensor), A tensor containing the sign of the determinant and the natural logarithm of the absolute value of determinant, respectively. Examples: - .. code-block:: python + .. code-block:: python - import paddle + import paddle - x = paddle.randn([3,3,3]) + x = paddle.randn([3,3,3]) - A = paddle.linalg.slogdet(x) + A = paddle.linalg.slogdet(x) - print(A) + print(A) - # [[ 1. , 1. , -1. ], - # [-0.98610914, -0.43010661, -0.10872950]]) + # [[ 1. , 1. , -1. 
], + # [-0.98610914, -0.43010661, -0.10872950]]) """ if in_dygraph_mode(): @@ -1832,22 +2011,23 @@ def slogdet(x, name=None): check_dtype(x.dtype, 'Input', ['float32', 'float64'], 'slogdet') input_shape = list(x.shape) - assert len(input_shape) >= 2, \ - "The x must be at least 2-dimensional, " \ - "but received Input x's dimensional: %s.\n" % \ - len(input_shape) - - assert (input_shape[-1] == input_shape[-2]), \ - "Expect squared input," \ - "but received %s by %s matrix.\n" \ - %(input_shape[-2], input_shape[-1]) \ + assert len(input_shape) >= 2, ( + "The x must be at least 2-dimensional, " + "but received Input x's dimensional: %s.\n" % len(input_shape) + ) + assert ( + input_shape[-1] == input_shape[-2] + ), "Expect squared input," "but received %s by %s matrix.\n" % ( + input_shape[-2], + input_shape[-1], + ) helper = LayerHelper('slogdeterminant', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='slogdeterminant', - inputs={'Input': [x]}, - outputs={'Out': [out]}) + helper.append_op( + type='slogdeterminant', inputs={'Input': [x]}, outputs={'Out': [out]} + ) return out @@ -1914,11 +2094,7 @@ def svd(x, full_matrices=False, name=None): helper.append_op( type='svd', inputs={'X': [x]}, - outputs={ - 'U': u, - 'VH': vh, - 'S': s - }, + outputs={'U': u, 'VH': vh, 'S': s}, attrs=attrs, ) return u, s, vh @@ -1926,6 +2102,7 @@ def svd(x, full_matrices=False, name=None): def matrix_power(x, n, name=None): r""" + Computes the n-th power of a square matrix or a batch of square matrices. Let :math:`X` be a sqaure matrix or a batch of square matrices, :math:`n` be @@ -1936,13 +2113,11 @@ def matrix_power(x, n, name=None): Specifically, - - If `n > 0`, it returns the matrix or a batch of matrices raised to the power - of `n`. + - If `n > 0`, it returns the matrix or a batch of matrices raised to the power of `n`. - If `n = 0`, it returns the identity matrix or a batch of identity matrices. - - If `n < 0`, it returns the inverse of each matrix (if invertible) raised to - the power of `abs(n)`. + - If `n < 0`, it returns the inverse of each matrix (if invertible) raised to the power of `abs(n)`. Args: x (Tensor): A square matrix or a batch of square matrices to be raised @@ -1953,8 +2128,8 @@ def matrix_power(x, n, name=None): For more information, please refer to :ref:`api_guide_Name`. Returns: - Tensor: The n-th power of the matrix (or the batch of matrices) `x`. Its - data type should be the same as that of `x`. + - Tensor, The n-th power of the matrix (or the batch of matrices) `x`. Its + data type should be the same as that of `x`. Examples: .. code-block:: python @@ -1989,10 +2164,12 @@ def matrix_power(x, n, name=None): check_type(n, 'n', int, 'matrix_power') helper = LayerHelper('matrix_power', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='matrix_power', - inputs={'X': x}, - outputs={'Out': out}, - attrs={'n': n}) + helper.append_op( + type='matrix_power', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'n': n}, + ) return out @@ -2003,26 +2180,26 @@ def qr(x, mode="reduced", name=None): Args: x (Tensor): The input tensor. Its shape should be `[..., M, N]`, where ... is zero or more batch dimensions. M and N can be arbitrary - positive number. The data type of x should be float32 or float64. - mode (str, optional): A flag to control the behavior of qr, the default is "reduced". + positive number. The data type of x should be float32 or float64. 
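The three cases listed for matrix_power above (positive, zero and negative n) can be spot-checked against plain matrix multiplication and the inverse; a minimal sketch, assuming paddle.linalg.inv for the last check:

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[1., 2.], [3., 4.]])

    # n > 0: repeated matrix multiplication.
    print(paddle.allclose(paddle.linalg.matrix_power(x, 2),
                          paddle.matmul(x, x)))            # -> True

    # n = 0: the identity matrix.
    print(paddle.linalg.matrix_power(x, 0).numpy())         # [[1., 0.], [0., 1.]]

    # n < 0: the inverse raised to |n|.
    print(paddle.allclose(paddle.linalg.matrix_power(x, -1),
                          paddle.linalg.inv(x)))            # -> True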
+ mode (str, optional): A flag to control the behavior of qr, the default is "reduced". Suppose x's shape is `[..., M, N]` and denoting `K = min(M, N)`: - If mode = "reduced", qr op will return reduced Q and R matrices, + If mode = "reduced", qr op will return reduced Q and R matrices, which means Q's shape is `[..., M, K]` and R's shape is `[..., K, N]`. - If mode = "complete", qr op will return complete Q and R matrices, + If mode = "complete", qr op will return complete Q and R matrices, which means Q's shape is `[..., M, M]` and R's shape is `[..., M, N]`. If mode = "r", qr op will only return reduced R matrix, which means R's shape is `[..., K, N]`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: - If mode = "reduced" or mode = "complete", qr will return a two tensor-tuple, which represents Q and R. + If mode = "reduced" or mode = "complete", qr will return a two tensor-tuple, which represents Q and R. If mode = "r", qr will return a tensor which represents R. - - Examples: + + Examples: .. code-block:: python - import paddle + import paddle x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64') q, r = paddle.linalg.qr(x) @@ -2035,8 +2212,8 @@ def qr(x, mode="reduced", name=None): # R = [[-5.91607978, -7.43735744], # [ 0. , 0.82807867]]) - - # one can verify : X = Q * R ; + + # one can verify : X = Q * R ; """ if in_dygraph_mode(): q, r = _C_ops.qr(x, mode) @@ -2057,13 +2234,9 @@ def qr(x, mode="reduced", name=None): r = helper.create_variable_for_type_inference(dtype=x.dtype) attrs = dict() attrs['mode'] = mode - helper.append_op(type='qr', - inputs={'X': [x]}, - outputs={ - 'Q': q, - 'R': r - }, - attrs=attrs) + helper.append_op( + type='qr', inputs={'X': [x]}, outputs={'Q': q, 'R': r}, attrs=attrs + ) if mode == "r": return r else: @@ -2072,17 +2245,19 @@ def qr(x, mode="reduced", name=None): def lu(x, pivot=True, get_infos=False, name=None): r""" - Computes the LU factorization of an N-D(N>=2) matrix x. + Computes the LU factorization of an N-D(N>=2) matrix x. - Returns the LU factorization(inplace x) and Pivots. low triangular matrix L and + Returns the LU factorization(inplace x) and Pivots. low triangular matrix L and upper triangular matrix U are combined to a single LU matrix. Pivoting is done if pivot is set to True. P mat can be get by pivots: - # ones = eye(rows) #eye matrix of rank rows - # for i in range(cols): - # swap(ones[i], ones[pivots[i]]) - # return ones + + .. code-block:: text + ones = eye(rows) #eye matrix of rank rows + for i in range(cols): + swap(ones[i], ones[pivots[i]]) + return ones Args: @@ -2094,23 +2269,23 @@ def lu(x, pivot=True, get_infos=False, name=None): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: - factorization (Tensor): LU matrix, the factorization of input X. + factorization (Tensor), LU matrix, the factorization of input X. + + pivots (IntTensor), the pivots of size(∗(N-2), min(m,n)). `pivots` stores all the + intermediate transpositions of rows. The final permutation `perm` could be + reconstructed by this, details refer to upper example. - pivots (IntTensor): the pivots of size(∗(N-2), min(m,n)). `pivots` stores all the - intermediate transpositions of rows. The final permutation `perm` could be - reconstructed by this, details refer to upper example. 
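Following the pivot description above, the whole factorization can be verified end to end by unpacking the factors and re-multiplying them; a small sketch using the same input as the example below:

.. code-block:: python

    import paddle

    x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64')

    lu, pivots = paddle.linalg.lu(x)
    P, L, U = paddle.linalg.lu_unpack(lu, pivots)

    # The LU factorization can be checked by re-multiplying the factors.
    recon = paddle.matmul(paddle.matmul(P, L), U)
    print(paddle.allclose(recon, x))   # -> True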
+ infos (IntTensor, optional), if `get_infos` is `True`, this is a tensor of size (∗(N-2)) + where non-zero values indicate whether factorization for the matrix or each minibatch + has succeeded or failed. - infos (IntTensor, optional): if `get_infos` is `True`, this is a tensor of size (∗(N-2)) - where non-zero values indicate whether factorization for the matrix or each minibatch - has succeeded or failed. - - Examples: + Examples: .. code-block:: python - import paddle + import paddle x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64') lu,p,info = paddle.linalg.lu(x, get_infos=True) @@ -2126,26 +2301,26 @@ def lu(x, pivot=True, get_infos=False, name=None): # >>> info # Tensor(shape=[], dtype=int32, place=CUDAPlace(0), stop_gradient=True, # 0) - + P,L,U = paddle.linalg.lu_unpack(lu,p) # >>> P # (Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[0., 1., 0.], # [0., 0., 1.], - # [1., 0., 0.]]), + # [1., 0., 0.]]), # >>> L # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[1. , 0. ], # [0.20000000, 1. ], - # [0.60000000, 0.50000000]]), + # [0.60000000, 0.50000000]]), # >>> U # Tensor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[5. , 6. ], # [0. , 0.80000000]])) - - # one can verify : X = P @ L @ U ; + + # one can verify : X = P @ L @ U ; """ if in_dygraph_mode(): @@ -2160,14 +2335,12 @@ def lu(x, pivot=True, get_infos=False, name=None): info = helper.create_variable_for_type_inference(dtype='int') attrs = dict() attrs['pivot'] = pivot - helper.append_op(type='lu', - inputs={'X': x}, - outputs={ - 'Out': lu, - 'Pivots': p, - 'Infos': info - }, - attrs=attrs) + helper.append_op( + type='lu', + inputs={'X': x}, + outputs={'Out': lu, 'Pivots': p, 'Infos': info}, + attrs=attrs, + ) if get_infos: return lu, p, info else: @@ -2176,13 +2349,15 @@ def lu(x, pivot=True, get_infos=False, name=None): def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): r""" - Unpack L U and P to single matrix tensor . + Unpack L U and P to single matrix tensor . unpack L and U matrix from LU, unpack permutation matrix P from Pivtos . P mat can be get by pivots: - # ones = eye(rows) #eye matrix of rank rows - # for i in range(cols): - # swap(ones[i], ones[pivots[i]]) + + .. code-block:: text + ones = eye(rows) #eye matrix of rank rows + for i in range(cols): + swap(ones[i], ones[pivots[i]]) Args: @@ -2196,19 +2371,19 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: - P (Tensor): Permutation matrix P of lu factorization. + P (Tensor), Permutation matrix P of lu factorization. + + L (Tensor), The lower triangular matrix tensor of lu factorization. - L (Tensor): The lower triangular matrix tensor of lu factorization. + U (Tensor), The upper triangular matrix tensor of lu factorization. - U (Tensor): The upper triangular matrix tensor of lu factorization. - - Examples: + Examples: .. 
code-block:: python - import paddle + import paddle x = paddle.to_tensor([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]]).astype('float64') lu,p,info = paddle.linalg.lu(x, get_infos=True) @@ -2224,25 +2399,25 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): # >>> info # Tensor(shape=[], dtype=int32, place=CUDAPlace(0), stop_gradient=True, # 0) - + P,L,U = paddle.linalg.lu_unpack(lu,p) # >>> P # (Tensor(shape=[3, 3], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[0., 1., 0.], # [0., 0., 1.], - # [1., 0., 0.]]), + # [1., 0., 0.]]), # >>> L # Tensor(shape=[3, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[1. , 0. ], # [0.20000000, 1. ], - # [0.60000000, 0.50000000]]), + # [0.60000000, 0.50000000]]), # >>> U # Tensor(shape=[2, 2], dtype=float64, place=CUDAPlace(0), stop_gradient=True, # [[5. , 6. ], # [0. , 0.80000000]])) - # one can verify : X = P @ L @ U ; + # one can verify : X = P @ L @ U ; """ if in_dygraph_mode(): @@ -2250,8 +2425,9 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): return P, L, U if paddle.in_dynamic_mode(): - P, L, U = _legacy_C_ops.lu_unpack(x, y, 'unpack_ludata', unpack_ludata, - 'unpack_pivots', unpack_pivots) + P, L, U = _legacy_C_ops.lu_unpack( + x, y, 'unpack_ludata', unpack_ludata, 'unpack_pivots', unpack_pivots + ) return P, L, U check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'lu_unpack') @@ -2263,35 +2439,30 @@ def lu_unpack(x, y, unpack_ludata=True, unpack_pivots=True, name=None): attrs = dict() attrs['unpack_ludata'] = unpack_ludata attrs['unpack_pivots'] = unpack_pivots - helper.append_op(type='lu_unpack', - inputs={ - 'X': x, - 'Pivots': y - }, - outputs={ - 'Pmat': p, - 'L': l, - 'U': u - }, - attrs=attrs) + helper.append_op( + type='lu_unpack', + inputs={'X': x, 'Pivots': y}, + outputs={'Pmat': p, 'L': l, 'U': u}, + attrs=attrs, + ) return p, l, u def eig(x, name=None): """ - This API performs the eigenvalue decomposition of a square matrix or a batch of square matrices. + Performs the eigenvalue decomposition of a square matrix or a batch of square matrices. - .. note:: - If the matrix is a Hermitian or a real symmetric matrix, please use :ref:`paddle.linalg.eigh` instead, which is much faster. - If only eigenvalues is needed, please use :ref:`paddle.linalg.eigvals` instead. - If the matrix is of any shape, please use :ref:`paddle.linalg.svd`. - This API is only supported on CPU device. - The output datatype is always complex for both real and complex input. + Note: + - If the matrix is a Hermitian or a real symmetric matrix, please use :ref:`paddle.linalg.eigh` instead, which is much faster. + - If only eigenvalues is needed, please use :ref:`paddle.linalg.eigvals` instead. + - If the matrix is of any shape, please use :ref:`paddle.linalg.svd`. + - This API is only supported on CPU device. + - The output datatype is always complex for both real and complex input. Args: x (Tensor): A tensor with shape math:`[*, N, N]`, The data type of the x should be one of ``float32``, ``float64``, ``compplex64`` or ``complex128``. - name (str, optional): The default value is `None`. Normally there is no need for user to set + name (str, optional): The default value is `None`. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -2302,16 +2473,14 @@ def eig(x, name=None): .. 
code-block:: python import paddle - import numpy as np paddle.device.set_device("cpu") - x_data = np.array([[1.6707249, 7.2249975, 6.5045543], + x = paddle.to_tensor([[1.6707249, 7.2249975, 6.5045543], [9.956216, 8.749598, 6.066444 ], - [4.4251957, 1.7983172, 0.370647 ]]).astype("float32") - x = paddle.to_tensor(x_data) + [4.4251957, 1.7983172, 0.370647 ]]) w, v = paddle.linalg.eig(x) - print(w) + print(v) # Tensor(shape=[3, 3], dtype=complex128, place=CPUPlace, stop_gradient=False, # [[(-0.5061363550800655+0j) , (-0.7971760990842826+0j) , # (0.18518077798279986+0j)], @@ -2320,7 +2489,7 @@ def eig(x, name=None): # [(-0.23142567697893396+0j), (0.4944999840400175+0j) , # (0.7058765252952796+0j) ]]) - print(v) + print(w) # Tensor(shape=[3], dtype=complex128, place=CPUPlace, stop_gradient=False, # [ (16.50471283351188+0j) , (-5.5034820550763515+0j) , # (-0.21026087843552282+0j)]) @@ -2331,9 +2500,9 @@ def eig(x, name=None): w, v = _legacy_C_ops.eig(x) return w, v - check_variable_and_dtype(x, 'X', - ['float32', 'float64', 'complex64', 'complex128'], - 'eig') + check_variable_and_dtype( + x, 'X', ['float32', 'float64', 'complex64', 'complex128'], 'eig' + ) helper = LayerHelper('eig', **locals()) w = helper.create_variable_for_type_inference(x.dtype) @@ -2360,10 +2529,10 @@ def eigvals(x, name=None): Its data type should be float32, float64, complex64, or complex128. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: - Tensor: A tensor containing the unsorted eigenvalues which has the same batch dimensions with `x`. - The eigenvalues are complex-valued even when `x` is real. + Tensor, A tensor containing the unsorted eigenvalues which has the same batch + dimensions with `x`. The eigenvalues are complex-valued even when `x` is real. Examples: .. code-block:: python @@ -2382,20 +2551,24 @@ def eigvals(x, name=None): # [(-0.27078833542132674+0j), (0.29962280156230725+0j), (0.8824477020120244+0j)] #complex128 """ - check_variable_and_dtype(x, 'dtype', - ['float32', 'float64', 'complex64', 'complex128'], - 'eigvals') + check_variable_and_dtype( + x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], 'eigvals' + ) x_shape = list(x.shape) if len(x_shape) < 2: raise ValueError( - "The dimension of Input(x) should be at least 2, but received x's dimention = {}, x's shape = {}" - .format(len(x_shape), x_shape)) + "The dimension of Input(x) should be at least 2, but received x's dimention = {}, x's shape = {}".format( + len(x_shape), x_shape + ) + ) if x_shape[-1] != x_shape[-2]: raise ValueError( - "The last two dimensions of Input(x) should be equal, but received x's shape = {}" - .format(x_shape)) + "The last two dimensions of Input(x) should be equal, but received x's shape = {}".format( + x_shape + ) + ) if in_dygraph_mode(): return _C_ops.eigvals(x) @@ -2447,28 +2620,19 @@ def multi_dot(x, name=None): .. 
code-block:: python import paddle - import numpy as np - # A * B - A_data = np.random.random([3, 4]).astype(np.float32) - B_data = np.random.random([4, 5]).astype(np.float32) - A = paddle.to_tensor(A_data) - B = paddle.to_tensor(B_data) + A = paddle.rand([3, 4]) + B = paddle.rand([4, 5]) out = paddle.linalg.multi_dot([A, B]) - print(out.numpy().shape) + print(out.shape) # [3, 5] - # A * B * C - A_data = np.random.random([10, 5]).astype(np.float32) - B_data = np.random.random([5, 8]).astype(np.float32) - C_data = np.random.random([8, 7]).astype(np.float32) - A = paddle.to_tensor(A_data) - B = paddle.to_tensor(B_data) - C = paddle.to_tensor(C_data) + A = paddle.rand([10, 5]) + B = paddle.rand([5, 8]) + C = paddle.rand([8, 7]) out = paddle.linalg.multi_dot([A, B, C]) - print(out.numpy().shape) + print(out.shape) # [10, 7] - """ if _in_legacy_dygraph(): return _legacy_C_ops.multi_dot(x) @@ -2477,11 +2641,16 @@ def multi_dot(x, name=None): check_type(x, 'x', (list, tuple), 'multi_dot') for id, item in enumerate(x): - check_variable_and_dtype(item, 'x[' + str(id) + ']', - ['float16', 'float32', 'float64'], 'multi_dot') + check_variable_and_dtype( + item, + 'x[' + str(id) + ']', + ['float16', 'float32', 'float64'], + 'multi_dot', + ) if item.dtype != x[0].dtype: raise TypeError( - "All the Tensors in the input must have the same data type.") + "All the Tensors in the input must have the same data type." + ) helper = LayerHelper('multi_dot', **locals()) dtype = helper.input_dtype(input_param_name='x') @@ -2504,18 +2673,17 @@ def eigh(x, UPLO='L', name=None): property. For more information, please refer to :ref:`api_guide_Name`. Returns: - - out_value(Tensor): A Tensor with shape [*, N] and data type of float32 and float64. The eigenvalues of eigh op. - out_vector(Tensor): A Tensor with shape [*, N, N] and data type of float32,float64,complex64 and complex128. The eigenvectors of eigh op. + - out_value(Tensor): A Tensor with shape [*, N] and data type of float32 and float64. + The eigenvalues of eigh op. + - out_vector(Tensor): A Tensor with shape [*, N, N] and data type of float32,float64, + complex64 and complex128. The eigenvectors of eigh op. Examples: .. code-block:: python - import numpy as np import paddle - x_data = np.array([[1, -2j], [2j, 5]]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([[1, -2j], [2j, 5]]) out_value, out_vector = paddle.linalg.eigh(x, UPLO='L') print(out_value) #[0.17157288, 5.82842712] @@ -2535,32 +2703,35 @@ def __check_input(x, UPLO): if len(x.shape) < 2: raise ValueError( "Input(input) only support >=2 tensor, but received " - "length of Input(input) is %s." % len(x.shape)) + "length of Input(input) is %s." % len(x.shape) + ) if x_shape[-1] != x_shape[-2]: raise ValueError( - "The input matrix must be batches of square matrices. But received x's dimention: {}" - .format(x_shape)) + "The input matrix must be batches of square matrices. But received x's dimention: {}".format( + x_shape + ) + ) if UPLO != 'L' and UPLO != 'U': raise ValueError( - "UPLO must be L or U. But received UPLO is: {}".format(UPLO)) + "UPLO must be L or U. 
But received UPLO is: {}".format(UPLO) + ) __check_input(x, UPLO) helper = LayerHelper('eigh', **locals()) - check_variable_and_dtype(x, 'dtype', - ['float32', 'float64', 'complex64', 'complex128'], - 'eigh') + check_variable_and_dtype( + x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], 'eigh' + ) out_value = helper.create_variable_for_type_inference(dtype=x.dtype) out_vector = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='eigh', - inputs={'X': x}, - outputs={ - 'Eigenvalues': out_value, - 'Eigenvectors': out_vector - }, - attrs={'UPLO': UPLO}) + helper.append_op( + type='eigh', + inputs={'X': x}, + outputs={'Eigenvalues': out_value, 'Eigenvectors': out_vector}, + attrs={'UPLO': UPLO}, + ) return out_value, out_vector @@ -2589,7 +2760,7 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): True. rcond(Tensor, optional): the tolerance value to determine - when is a singular value zero. Defalut:1e-15. + when is a singular value zero. Default:1e-15. hermitian(bool, optional): indicates whether x is Hermitian if complex or symmetric if real. Default: False. @@ -2679,8 +2850,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): if not hermitian: # combine svd and matmul op u, s, vt = _legacy_C_ops.svd(x, 'full_matrices', False) - max_singular_val = _legacy_C_ops.reduce_max(s, 'dim', [-1], 'keep_dim', True, \ - 'reduce_all', False) + max_singular_val = _legacy_C_ops.reduce_max( + s, 'dim', [-1], 'keep_dim', True, 'reduce_all', False + ) rcond = paddle.to_tensor(rcond, dtype=x.dtype) cutoff = rcond * max_singular_val y = float('inf') @@ -2702,15 +2874,17 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): if in_dygraph_mode(): out_2 = _C_ops.matmul(out_1, u, False, True) else: - out_2 = _legacy_C_ops.matmul_v2(out_1, u, 'trans_x', False, - 'trans_y', True) + out_2 = _legacy_C_ops.matmul_v2( + out_1, u, 'trans_x', False, 'trans_y', True + ) return out_2 else: # combine eigh and matmul op s, u = _legacy_C_ops.eigh(x, 'UPLO', 'L') s_abs = paddle.abs(s) - max_singular_val = _legacy_C_ops.reduce_max(s_abs, 'dim', [-1], 'keep_dim', True, \ - 'reduce_all', False) + max_singular_val = _legacy_C_ops.reduce_max( + s_abs, 'dim', [-1], 'keep_dim', True, 'reduce_all', False + ) rcond = paddle.to_tensor(rcond, dtype=s.dtype) cutoff = rcond * max_singular_val y = float('inf') @@ -2729,8 +2903,9 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): if in_dygraph_mode(): out_2 = _C_ops.matmul(out_1, u_conj, False, True) else: - out_2 = _legacy_C_ops.matmul_v2(out_1, u_conj, 'trans_x', False, - 'trans_y', True) + out_2 = _legacy_C_ops.matmul_v2( + out_1, u_conj, 'trans_x', False, 'trans_y', True + ) return out_2 else: if not hermitian: @@ -2744,23 +2919,17 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): helper.append_op( type='svd', inputs={'X': [x]}, - outputs={ - 'U': u, - 'VH': vt, - 'S': s - }, + outputs={'U': u, 'VH': vt, 'S': s}, attrs={'full_matrices': False}, ) max_singular_val = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='reduce_max', - inputs={'X': s}, - outputs={'Out': max_singular_val}, - attrs={ - 'dim': [-1], - 'keep_dim': True, - 'reduce_all': False - }) + helper.append_op( + type='reduce_max', + inputs={'X': s}, + outputs={'Out': max_singular_val}, + attrs={'dim': [-1], 'keep_dim': True, 'reduce_all': False}, + ) rcond = full(shape=[1], fill_value=rcond, dtype=dtype) cutoff = rcond * max_singular_val @@ -2776,59 +2945,50 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): st = 
helper.create_variable_for_type_inference(dtype=dtype) st_shape = helper.create_variable_for_type_inference(dtype=dtype) - helper.append_op(type='unsqueeze2', - inputs={'X': singular}, - attrs={'axes': [-2]}, - outputs={ - 'Out': st, - 'XShape': st_shape - }) + helper.append_op( + type='unsqueeze2', + inputs={'X': singular}, + attrs={'axes': [-2]}, + outputs={'Out': st, 'XShape': st_shape}, + ) dims = list(range(len(vt.shape))) perm = dims[:-2] + [dims[-1]] + [dims[-2]] v = helper.create_variable_for_type_inference(dtype) v_shape = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='transpose2', - inputs={'X': [vt]}, - outputs={ - 'Out': [v], - 'XShape': [v_shape] - }, - attrs={'axis': perm}) + helper.append_op( + type='transpose2', + inputs={'X': [vt]}, + outputs={'Out': [v], 'XShape': [v_shape]}, + attrs={'axis': perm}, + ) out_1 = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='elementwise_mul', - inputs={ - 'X': v, - 'Y': st - }, - outputs={'Out': out_1}, - attrs={ - 'axis': -1, - 'use_mkldnn': False - }) + helper.append_op( + type='elementwise_mul', + inputs={'X': v, 'Y': st}, + outputs={'Out': out_1}, + attrs={'axis': -1, 'use_mkldnn': False}, + ) out_1 = helper.append_activation(out_1) out_2 = helper.create_variable_for_type_inference(dtype) helper.append_op( type='matmul_v2', - inputs={ - 'X': out_1, - 'Y': u - }, + inputs={'X': out_1, 'Y': u}, outputs={'Out': out_2}, - attrs={ - 'trans_x': False, - 'trans_y': True - }, + attrs={'trans_x': False, 'trans_y': True}, ) return out_2 else: helper = LayerHelper('pinv', **locals()) dtype = x.dtype check_variable_and_dtype( - x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], - 'pinv') + x, + 'dtype', + ['float32', 'float64', 'complex64', 'complex128'], + 'pinv', + ) if dtype == paddle.complex128: s_type = 'float64' @@ -2839,26 +2999,23 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): u = helper.create_variable_for_type_inference(dtype) s = helper.create_variable_for_type_inference(s_type) - helper.append_op(type='eigh', - inputs={'X': x}, - outputs={ - 'Eigenvalues': s, - 'Eigenvectors': u - }, - attrs={'UPLO': 'L'}) + helper.append_op( + type='eigh', + inputs={'X': x}, + outputs={'Eigenvalues': s, 'Eigenvectors': u}, + attrs={'UPLO': 'L'}, + ) s_abs = helper.create_variable_for_type_inference(s_type) - helper.append_op(type='abs', - inputs={'X': s}, - outputs={'Out': s_abs}) + helper.append_op( + type='abs', inputs={'X': s}, outputs={'Out': s_abs} + ) max_singular_val = helper.create_variable_for_type_inference(s_type) - helper.append_op(type='reduce_max', - inputs={'X': s_abs}, - outputs={'Out': max_singular_val}, - attrs={ - 'dim': [-1], - 'keep_dim': True, - 'reduce_all': False - }) + helper.append_op( + type='reduce_max', + inputs={'X': s_abs}, + outputs={'Out': max_singular_val}, + attrs={'dim': [-1], 'keep_dim': True, 'reduce_all': False}, + ) rcond = full(shape=[1], fill_value=rcond, dtype=s_type) cutoff = rcond * max_singular_val @@ -2874,63 +3031,53 @@ def pinv(x, rcond=1e-15, hermitian=False, name=None): st = helper.create_variable_for_type_inference(dtype=s_type) st_shape = helper.create_variable_for_type_inference(dtype=s_type) - helper.append_op(type='unsqueeze2', - inputs={'X': singular}, - attrs={'axes': [-2]}, - outputs={ - 'Out': st, - 'XShape': st_shape - }) + helper.append_op( + type='unsqueeze2', + inputs={'X': singular}, + attrs={'axes': [-2]}, + outputs={'Out': st, 'XShape': st_shape}, + ) out_1 = helper.create_variable_for_type_inference(dtype) - 
helper.append_op(type='elementwise_mul', - inputs={ - 'X': u, - 'Y': st - }, - outputs={'Out': out_1}, - attrs={ - 'axis': -1, - 'use_mkldnn': False - }) + helper.append_op( + type='elementwise_mul', + inputs={'X': u, 'Y': st}, + outputs={'Out': out_1}, + attrs={'axis': -1, 'use_mkldnn': False}, + ) out_1 = helper.append_activation(out_1) u_conj = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='conj', - inputs={'X': u}, - outputs={'Out': [u_conj]}) + helper.append_op( + type='conj', inputs={'X': u}, outputs={'Out': [u_conj]} + ) out_2 = helper.create_variable_for_type_inference(dtype) helper.append_op( type='matmul_v2', - inputs={ - 'X': out_1, - 'Y': u_conj - }, + inputs={'X': out_1, 'Y': u_conj}, outputs={'Out': out_2}, - attrs={ - 'trans_x': False, - 'trans_y': True - }, + attrs={'trans_x': False, 'trans_y': True}, ) return out_2 def solve(x, y, name=None): r""" + Computes the solution of a square system of linear equations with a unique solution for input 'X' and 'Y'. - Let :math: `X` be a sqaure matrix or a batch of square matrices, :math:`Y` be + Let :math:`X` be a sqaure matrix or a batch of square matrices, :math:`Y` be a vector/matrix or a batch of vectors/matrices, the equation should be: .. math:: Out = X^-1 * Y - Specifically, - - This system of linear equations has one solution if and only if input 'X' is invertible. + + Specifically, this system of linear equations has one solution if and only if input 'X' is invertible. Args: - x (Tensor): A square matrix or a batch of square matrices. Its shape should be `[*, M, M]`, where `*` is zero or + x (Tensor): A square matrix or a batch of square matrices. Its shape should be ``[*, M, M]``, where ``*`` is zero or more batch dimensions. Its data type should be float32 or float64. - y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be `[*, M, K]`, where `*` is zero or + y (Tensor): A vector/matrix or a batch of vectors/matrices. Its shape should be ``[*, M, K]``, where ``*`` is zero or more batch dimensions. Its data type should be float32 or float64. name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -2940,23 +3087,21 @@ def solve(x, y, name=None): Its data type should be the same as that of `x`. Examples: - .. code-block:: python - # a square system of linear equations: - # 2*X0 + X1 = 9 - # X0 + 2*X1 = 8 + .. 
code-block:: python + + # a square system of linear equations: + # 2*X0 + X1 = 9 + # X0 + 2*X1 = 8 - import paddle - import numpy as np + import paddle - np_x = np.array([[3, 1],[1, 2]]) - np_y = np.array([9, 8]) - x = paddle.to_tensor(np_x, dtype="float64") - y = paddle.to_tensor(np_y, dtype="float64") - out = paddle.linalg.solve(x, y) + x = paddle.to_tensor([[3, 1],[1, 2]], dtype="float64") + y = paddle.to_tensor([9, 8], dtype="float64") + out = paddle.linalg.solve(x, y) - print(out) - # [2., 3.]) + print(out) + # [2., 3.]) """ if in_dygraph_mode(): return _C_ops.solve(x, y) @@ -2970,37 +3115,31 @@ def solve(x, y, name=None): check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'solve') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="solve", - inputs={ - "X": x, - "Y": y - }, - outputs={"Out": out}) + helper.append_op( + type="solve", inputs={"X": x, "Y": y}, outputs={"Out": out} + ) return out -def triangular_solve(x, - y, - upper=True, - transpose=False, - unitriangular=False, - name=None): +def triangular_solve( + x, y, upper=True, transpose=False, unitriangular=False, name=None +): r""" - Computes the solution of a system of equations with a triangular coefficient matrix `x` and - multiple right-hand sides `y` . + Computes the solution of a system of equations with a triangular coefficient matrix `x` and + multiple right-hand sides `y` . - Input `x` and `y` is 2D matrices or batches of 2D matrices. If the inputs are batches, the outputs - is also batches. + Input `x` and `y` is 2D matrices or batches of 2D matrices. If the inputs are batches, the outputs + is also batches. Args: x (Tensor): The input triangular coefficient matrix. Its shape should be `[*, M, M]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. - y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is + y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. - upper (bool, optional): Whether to solve the upper-triangular system of equations (default) or the lower-triangular + upper (bool, optional): Whether to solve the upper-triangular system of equations (default) or the lower-triangular system of equations. Default: True. transpose (bool, optional): whether `x` should be transposed before calculation. Default: False. - unitriangular (bool, optional): whether `x` is unit triangular. If True, the diagonal elements of `x` are assumed + unitriangular (bool, optional): whether `x` is unit triangular. If True, the diagonal elements of `x` are assumed to be 1 and not referenced from `x` . Default: False. name(str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -3009,32 +3148,38 @@ def triangular_solve(x, Tensor: The solution of the system of equations. Its data type should be the same as that of `x`. Examples: - .. code-block:: python + .. 
code-block:: python - # a square system of linear equations: - # x1 + x2 + x3 = 0 - # 2*x2 + x3 = -9 - # -x3 = 5 + # a square system of linear equations: + # x1 + x2 + x3 = 0 + # 2*x2 + x3 = -9 + # -x3 = 5 - import paddle - import numpy as np + import paddle - x = paddle.to_tensor([[1, 1, 1], - [0, 2, 1], - [0, 0,-1]], dtype="float64") - y = paddle.to_tensor([[0], [-9], [5]], dtype="float64") - out = paddle.linalg.triangular_solve(x, y, upper=True) + x = paddle.to_tensor([[1, 1, 1], + [0, 2, 1], + [0, 0,-1]], dtype="float64") + y = paddle.to_tensor([[0], [-9], [5]], dtype="float64") + out = paddle.linalg.triangular_solve(x, y, upper=True) - print(out) - # [7, -2, -5] + print(out) + # [7, -2, -5] """ if in_dygraph_mode(): return _C_ops.triangular_solve(x, y, upper, transpose, unitriangular) if paddle.in_dynamic_mode(): - return _legacy_C_ops.triangular_solve(x, y, 'upper', upper, 'transpose', - transpose, 'unitriangular', - unitriangular) + return _legacy_C_ops.triangular_solve( + x, + y, + 'upper', + upper, + 'transpose', + transpose, + 'unitriangular', + unitriangular, + ) inputs = {"X": [x], "Y": [y]} helper = LayerHelper("triangular_solve", **locals()) @@ -3042,17 +3187,16 @@ def triangular_solve(x, check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'triangular_solve') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='triangular_solve', - inputs={ - 'X': x, - 'Y': y - }, - outputs={'Out': out}, - attrs={ - 'upper': upper, - 'transpose': transpose, - 'unitriangular': unitriangular - }) + helper.append_op( + type='triangular_solve', + inputs={'X': x, 'Y': y}, + outputs={'Out': out}, + attrs={ + 'upper': upper, + 'transpose': transpose, + 'unitriangular': unitriangular, + }, + ) return out @@ -3066,7 +3210,7 @@ def cholesky_solve(x, y, upper=False, name=None): Args: x (Tensor): The input matrix which is upper or lower triangular Cholesky factor of square matrix A. Its shape should be `[*, M, M]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. - y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is + y (Tensor): Multiple right-hand sides of system of equations. Its shape should be `[*, M, K]`, where `*` is zero or more batch dimensions. Its data type should be float32 or float64. upper (bool, optional): whether to consider the Cholesky factor as a lower or upper triangular matrix. Default: False. name(str, optional): Name for the operation (optional, default is None). @@ -3076,18 +3220,18 @@ def cholesky_solve(x, y, upper=False, name=None): Tensor: The solution of the system of equations. Its data type is the same as that of `x`. Examples: - .. code-block:: python + .. 
code-block:: python - import paddle + import paddle - u = paddle.to_tensor([[1, 1, 1], - [0, 2, 1], - [0, 0,-1]], dtype="float64") - b = paddle.to_tensor([[0], [-9], [5]], dtype="float64") - out = paddle.linalg.cholesky_solve(b, u, upper=True) + u = paddle.to_tensor([[1, 1, 1], + [0, 2, 1], + [0, 0,-1]], dtype="float64") + b = paddle.to_tensor([[0], [-9], [5]], dtype="float64") + out = paddle.linalg.cholesky_solve(b, u, upper=True) - print(out) - # [-2.5, -7, 9.5] + print(out) + # [-2.5, -7, 9.5] """ if in_dygraph_mode(): return _C_ops.cholesky_solve(x, y, upper) @@ -3100,19 +3244,18 @@ def cholesky_solve(x, y, upper=False, name=None): check_variable_and_dtype(y, 'y', ['float32', 'float64'], 'cholesky_solve') out = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='cholesky_solve', - inputs={ - 'X': x, - 'Y': y - }, - outputs={'Out': out}, - attrs={'upper': upper}) + helper.append_op( + type='cholesky_solve', + inputs={'X': x, 'Y': y}, + outputs={'Out': out}, + attrs={'upper': upper}, + ) return out def eigvalsh(x, UPLO='L', name=None): """ - Computes the eigenvalues of a + Computes the eigenvalues of a complex Hermitian (conjugate symmetric) or a real symmetric matrix. Args: @@ -3128,14 +3271,13 @@ def eigvalsh(x, UPLO='L', name=None): Examples: .. code-block:: python - import numpy as np import paddle - x_data = np.array([[1, -2j], [2j, 5]]) - x = paddle.to_tensor(x_data) + x = paddle.to_tensor([[1, -2j], [2j, 5]]) out_value = paddle.eigvalsh(x, UPLO='L') print(out_value) - #[0.17157288, 5.82842712] + # Tensor(shape=[2], dtype=float32, place=Place(cpu), stop_gradient=True, + # [0.17157286, 5.82842731]) """ if in_dygraph_mode(): values, _ = _C_ops.eigvalsh(x, UPLO, x.stop_gradient) @@ -3151,36 +3293,39 @@ def __check_input(x, UPLO): if len(x.shape) < 2: raise ValueError( "Input(input) only support >=2 tensor, but received " - "length of Input(input) is %s." % len(x.shape)) + "length of Input(input) is %s." % len(x.shape) + ) if x_shape[-1] != x_shape[-2]: raise ValueError( - "The input matrix must be batches of square matrices. But received x's dimention: {}" - .format(x_shape)) + "The input matrix must be batches of square matrices. But received x's dimention: {}".format( + x_shape + ) + ) if UPLO != 'L' and UPLO != 'U': raise ValueError( - "UPLO must be L or U. But received UPLO is: {}".format(UPLO)) + "UPLO must be L or U. But received UPLO is: {}".format(UPLO) + ) __check_input(x, UPLO) helper = LayerHelper('eigvalsh', **locals()) - check_variable_and_dtype(x, 'dtype', - ['float32', 'float64', 'complex64', 'complex128'], - 'eigvalsh') + check_variable_and_dtype( + x, + 'dtype', + ['float32', 'float64', 'complex64', 'complex128'], + 'eigvalsh', + ) out_value = helper.create_variable_for_type_inference(dtype=x.dtype) out_vector = helper.create_variable_for_type_inference(dtype=x.dtype) is_test = x.stop_gradient - helper.append_op(type='eigvalsh', - inputs={'X': x}, - outputs={ - 'Eigenvalues': out_value, - 'Eigenvectors': out_vector - }, - attrs={ - 'UPLO': UPLO, - 'is_test': is_test - }) + helper.append_op( + type='eigvalsh', + inputs={'X': x}, + outputs={'Eigenvalues': out_value, 'Eigenvectors': out_vector}, + attrs={'UPLO': UPLO, 'is_test': is_test}, + ) return out_value @@ -3192,26 +3337,26 @@ def lstsq(x, y, rcond=None, driver=None, name=None): Args: x (Tensor): A tensor with shape ``(*, M, N)`` , the data type of the input Tensor ``x`` should be one of float32, float64. 
- y (Tensor): A tensor with shape ``(*, M, K)`` , the data type of the input Tensor ``y`` + y (Tensor): A tensor with shape ``(*, M, K)`` , the data type of the input Tensor ``y`` should be one of float32, float64. - rcond(float, optional): The default value is None. A float pointing number used to determine - the effective rank of ``x``. If ``rcond`` is None, it will be set to max(M, N) times the + rcond(float, optional): The default value is None. A float pointing number used to determine + the effective rank of ``x``. If ``rcond`` is None, it will be set to max(M, N) times the machine precision of x_dtype. - driver(str, optional): The default value is None. The name of LAPACK method to be used. For - CPU inputs the valid values are ‘gels’, ‘gelsy’, ‘gelsd, ‘gelss’. For CUDA input, the only - valid driver is ‘gels’. If ``driver`` is None, ‘gelsy’ is used for CPU inputs and ‘gels’ + driver(str, optional): The default value is None. The name of LAPACK method to be used. For + CPU inputs the valid values are ‘gels’, ‘gelsy’, ‘gelsd, ‘gelss’. For CUDA input, the only + valid driver is ‘gels’. If ``driver`` is None, ‘gelsy’ is used for CPU inputs and ‘gels’ for CUDA inputs. - name(str, optional): The default value is None. Normally there is no need for user to set + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name`. Returns: - Tuple: A tuple of 4 Tensors which is (``solution``, ``residuals``, ``rank``, ``singular_values``). - ``solution`` is a tensor with shape ``(*, N, K)``, meaning the least squares solution. ``residuals`` - is a tensor with shape ``(*, K)``, meaning the squared residuals of the solutions, which is computed - when M > N and every matrix in ``x`` is full-rank, otherwise return an empty tensor. ``rank`` is a tensor - with shape ``(*)``, meaning the ranks of the matrices in ``x``, which is computed when ``driver`` in - (‘gelsy’, ‘gelsd’, ‘gelss’), otherwise return an empty tensor. ``singular_values`` is a tensor with - shape ``(*, min(M, N))``, meaning singular values of the matrices in ``x``, which is computed when + Tuple: A tuple of 4 Tensors which is (``solution``, ``residuals``, ``rank``, ``singular_values``). + ``solution`` is a tensor with shape ``(*, N, K)``, meaning the least squares solution. ``residuals`` + is a tensor with shape ``(*, K)``, meaning the squared residuals of the solutions, which is computed + when M > N and every matrix in ``x`` is full-rank, otherwise return an empty tensor. ``rank`` is a tensor + with shape ``(*)``, meaning the ranks of the matrices in ``x``, which is computed when ``driver`` in + (‘gelsy’, ‘gelsd’, ‘gelss’), otherwise return an empty tensor. ``singular_values`` is a tensor with + shape ``(*, min(M, N))``, meaning singular values of the matrices in ``x``, which is computed when ``driver`` in (‘gelsd’, ‘gelss’), otherwise return an empty tensor. Examples: @@ -3247,14 +3392,18 @@ def lstsq(x, y, rcond=None, driver=None, name=None): if device == "cpu": if driver not in (None, "gels", "gelss", "gelsd", "gelsy"): raise ValueError( - "Only support valid driver is 'gels', 'gelss', 'gelsd', 'gelsy' or None for CPU inputs. But got {}" - .format(driver)) + "Only support valid driver is 'gels', 'gelss', 'gelsd', 'gelsy' or None for CPU inputs. 
But got {}".format( + driver + ) + ) driver = "gelsy" if driver is None else driver elif "gpu" in device: if driver not in (None, "gels"): raise ValueError( - "Only support valid driver is 'gels' or None for CUDA inputs. But got {}" - .format(driver)) + "Only support valid driver is 'gels' or None for CUDA inputs. But got {}".format( + driver + ) + ) driver = "gels" if driver is None else driver else: raise RuntimeError("Only support lstsq api for CPU or CUDA device.") @@ -3275,10 +3424,12 @@ def lstsq(x, y, rcond=None, driver=None, name=None): if _non_static_mode(): if in_dygraph_mode(): solution, residuals, rank, singular_values = _C_ops.lstsq( - x, y, rcond, driver) + x, y, rcond, driver + ) else: solution, residuals, rank, singular_values = _legacy_C_ops.lstsq( - x, y, 'rcond', rcond, 'driver', driver) + x, y, 'rcond', rcond, 'driver', driver + ) if driver == "gels": rank = paddle.empty(shape=[0], dtype=paddle.int32) @@ -3289,33 +3440,29 @@ def lstsq(x, y, rcond=None, driver=None, name=None): return solution, residuals, rank, singular_values helper = LayerHelper('lstsq', **locals()) - check_variable_and_dtype(x, 'dtype', - ['float32', 'float64', 'complex64', 'complex128'], - 'lstsq') - check_variable_and_dtype(y, 'dtype', - ['float32', 'float64', 'complex64', 'complex128'], - 'lstsq') + check_variable_and_dtype( + x, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], 'lstsq' + ) + check_variable_and_dtype( + y, 'dtype', ['float32', 'float64', 'complex64', 'complex128'], 'lstsq' + ) solution = helper.create_variable_for_type_inference(dtype=x.dtype) residuals = helper.create_variable_for_type_inference(dtype=x.dtype) rank = helper.create_variable_for_type_inference(dtype=paddle.int32) singular_values = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type='lstsq', - inputs={ - 'X': x, - 'Y': y - }, - outputs={ - 'Solution': solution, - 'Residuals': residuals, - 'Rank': rank, - 'SingularValues': singular_values - }, - attrs={ - 'rcond': rcond, - 'driver': driver - }) + helper.append_op( + type='lstsq', + inputs={'X': x, 'Y': y}, + outputs={ + 'Solution': solution, + 'Residuals': residuals, + 'Rank': rank, + 'SingularValues': singular_values, + }, + attrs={'rcond': rcond, 'driver': driver}, + ) if driver == "gels": rank = paddle.static.data(name='rank', shape=[0]) @@ -3328,7 +3475,7 @@ def lstsq(x, y, rcond=None, driver=None, name=None): def corrcoef(x, rowvar=True, name=None): """ - + A correlation coefficient matrix indicate the correlation of each pair variables in the input matrix. For example, for an N-dimensional samples X=[x1,x2,…xN]T, then the correlation coefficient matrix element Rij is the correlation of xi and xj. The element Rii is the covariance of xi itself. @@ -3367,11 +3514,12 @@ def corrcoef(x, rowvar=True, name=None): if len(x.shape) > 2 or len(x.shape) < 1: raise ValueError( "Input(x) only support N-D (1<=N<=2) tensor in corrcoef, but received " - "length of Input(input) is %s." % len(x.shape)) + "length of Input(input) is %s." % len(x.shape) + ) check_variable_and_dtype(x, 'dtype', ['float32', 'float64'], 'corrcoef') c = cov(x, rowvar) - if (c.ndim == 0): + if c.ndim == 0: # scalar covariance # nan if incorrect value (nan, inf, 0), 1 otherwise return c / c @@ -3386,8 +3534,9 @@ def corrcoef(x, rowvar=True, name=None): # Clip to [-1, 1]. 
This does not guarantee if paddle.is_complex(c): - return paddle.complex(paddle.clip(c.real(), -1, 1), - paddle.clip(c.imag(), -1, 1)) + return paddle.complex( + paddle.clip(c.real(), -1, 1), paddle.clip(c.imag(), -1, 1) + ) else: c = paddle.clip(c, -1, 1) diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 63a89327505671..c998c198d49216 100755 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -16,8 +16,10 @@ from ..fluid.data_feeder import check_type, check_variable_and_dtype from .layer_function_generator import templatedoc from ..static import Variable + # TODO: define logic functions of a tensor from ..fluid.framework import _in_eager_mode_ + if _in_eager_mode_: Tensor = paddle.fluid.framework.core.eager.Tensor else: @@ -26,6 +28,7 @@ from ..framework import in_dygraph_mode, _non_static_mode from ..framework import LayerHelper from ..fluid.framework import _in_legacy_dygraph + # TODO: define logic functions of a tensor from paddle import _C_ops, _legacy_C_ops from paddle.tensor.creation import full @@ -47,14 +50,18 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): else: return op(x) check_variable_and_dtype( - x, "x", + x, + "x", ["bool", "int8", "int16", "int32", "int64", "float32", "float64"], - op_name) + op_name, + ) if y is not None: check_variable_and_dtype( - y, "y", + y, + "y", ["bool", "int8", "int16", "int32", "int64", "float32", "float64"], - op_name) + op_name, + ) if out is not None: check_type(out, "out", Variable, op_name) @@ -63,18 +70,16 @@ def _logical_op(op_name, x, y, out=None, name=None, binary_op=True): if binary_op and x.dtype != y.dtype: raise ValueError( "(InvalidArgument) The DataType of %s Op's Variable must be consistent, but received %s and %s." - % (op_name, x.dtype, y.dtype)) + % (op_name, x.dtype, y.dtype) + ) if out is None: out = helper.create_variable_for_type_inference(dtype=x.dtype) if binary_op: - helper.append_op(type=op_name, - inputs={ - "X": x, - "Y": y - }, - outputs={"Out": out}) + helper.append_op( + type=op_name, inputs={"X": x, "Y": y}, outputs={"Out": out} + ) else: helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out}) @@ -91,7 +96,7 @@ def logical_and(x, y, out=None, name=None): out = x \&\& y - .. note:: + Note: ``paddle.logical_and`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. Args: @@ -116,12 +121,9 @@ def logical_and(x, y, out=None, name=None): if in_dygraph_mode(): return _C_ops.logical_and(x, y) - return _logical_op(op_name="logical_and", - x=x, - y=y, - name=name, - out=out, - binary_op=True) + return _logical_op( + op_name="logical_and", x=x, y=y, name=name, out=out, binary_op=True + ) def logical_or(x, y, out=None, name=None): @@ -134,9 +136,9 @@ def logical_or(x, y, out=None, name=None): out = x || y - .. note:: + Note: ``paddle.logical_or`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. - + Args: x (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. y (Tensor): the input tensor, it's data type should be one of bool, int8, int16, in32, in64, float32, float64. @@ -150,23 +152,20 @@ def logical_or(x, y, out=None, name=None): .. 
code-block:: python import paddle - import numpy as np - x_data = np.array([True, False], dtype=np.bool_).reshape(2, 1) - y_data = np.array([True, False, True, False], dtype=np.bool_).reshape(2, 2) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) + x = paddle.to_tensor([True, False], dtype="bool").reshape([2, 1]) + y = paddle.to_tensor([True, False, True, False], dtype="bool").reshape([2, 2]) res = paddle.logical_or(x, y) - print(res) # [[ True True] [ True False]] + print(res) + # Tensor(shape=[2, 2], dtype=bool, place=Place(cpu), stop_gradient=True, + # [[True , True ], + # [True , False]]) """ if in_dygraph_mode(): return _C_ops.logical_or(x, y) - return _logical_op(op_name="logical_or", - x=x, - y=y, - name=name, - out=out, - binary_op=True) + return _logical_op( + op_name="logical_or", x=x, y=y, name=name, out=out, binary_op=True + ) def logical_xor(x, y, out=None, name=None): @@ -179,7 +178,7 @@ def logical_xor(x, y, out=None, name=None): out = (x || y) \&\& !(x \&\& y) - .. note:: + Note: ``paddle.logical_xor`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. Args: @@ -195,24 +194,21 @@ def logical_xor(x, y, out=None, name=None): .. code-block:: python import paddle - import numpy as np - x_data = np.array([True, False], dtype=np.bool_).reshape([2, 1]) - y_data = np.array([True, False, True, False], dtype=np.bool_).reshape([2, 2]) - x = paddle.to_tensor(x_data) - y = paddle.to_tensor(y_data) + x = paddle.to_tensor([True, False], dtype="bool").reshape([2, 1]) + y = paddle.to_tensor([True, False, True, False], dtype="bool").reshape([2, 2]) res = paddle.logical_xor(x, y) - print(res) # [[False, True], [ True, False]] + print(res) + # Tensor(shape=[2, 2], dtype=bool, place=Place(cpu), stop_gradient=True, + # [[False, True ], + # [True , False]]) """ if in_dygraph_mode(): return _C_ops.logical_xor(x, y) - return _logical_op(op_name="logical_xor", - x=x, - y=y, - name=name, - out=out, - binary_op=True) + return _logical_op( + op_name="logical_xor", x=x, y=y, name=name, out=out, binary_op=True + ) @templatedoc() @@ -245,12 +241,9 @@ def logical_not(x, out=None, name=None): """ if in_dygraph_mode(): return _C_ops.logical_not(x) - return _logical_op(op_name="logical_not", - x=x, - y=None, - name=name, - out=out, - binary_op=False) + return _logical_op( + op_name="logical_not", x=x, y=None, name=name, out=out, binary_op=False + ) def is_empty(x, name=None): @@ -288,16 +281,17 @@ def is_empty(x, name=None): if _in_legacy_dygraph(): return _legacy_C_ops.is_empty(x) - check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], - 'is_empty') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'is_empty' + ) check_type(name, "name", (str, type(None)), "is_empty") helper = LayerHelper("is_empty", **locals()) cond = helper.create_variable_for_type_inference(dtype='bool') cond.stop_gradient = True - helper.append_op(type='is_empty', - inputs={'X': [x]}, - outputs={'Out': [cond]}) + helper.append_op( + type='is_empty', inputs={'X': [x]}, outputs={'Out': [cond]} + ) return cond @@ -305,7 +299,7 @@ def equal_all(x, y, name=None): """ Returns the truth value of :math:`x == y`. True if two inputs have the same elements, False otherwise. - Note: + Note: The output has no gradient. 
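As a quick illustration of the behaviour described above (not part of this patch; it only assumes the public ``paddle.equal_all`` and ``paddle.equal`` APIs documented in this file), ``equal_all`` reduces the comparison to a single boolean result, while ``equal`` stays elementwise:

.. code-block:: python

    # Illustrative sketch, not part of the diff.
    import paddle

    x = paddle.to_tensor([1, 2, 3])
    y = paddle.to_tensor([1, 2, 3])
    z = paddle.to_tensor([1, 4, 3])

    print(paddle.equal_all(x, y))  # single boolean result: True
    print(paddle.equal_all(x, z))  # single boolean result: False
    print(paddle.equal(x, z))      # elementwise: [True, False, True]
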
Args: @@ -338,12 +332,9 @@ def equal_all(x, y, name=None): helper = LayerHelper("equal_all", **locals()) out = helper.create_variable_for_type_inference(dtype='bool') - helper.append_op(type='equal_all', - inputs={ - 'X': [x], - 'Y': [y] - }, - outputs={'Out': [out]}) + helper.append_op( + type='equal_all', inputs={'X': [x], 'Y': [y]}, outputs={'Out': [out]} + ) return out @@ -373,22 +364,20 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): y = paddle.to_tensor([10000.1, 1e-08]) result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan") - np_result1 = result1.numpy() # [False] + result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=True, name="equal_nan") - np_result2 = result2.numpy() # [False] x = paddle.to_tensor([1.0, float('nan')]) y = paddle.to_tensor([1.0, float('nan')]) result1 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan") - np_result1 = result1.numpy() # [False] + result2 = paddle.allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=True, name="equal_nan") - np_result2 = result2.numpy() # [True] """ @@ -396,12 +385,15 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): # NOTE(dev): Pass tol as Tensor to fix precision loss problem, because # C++ backend will cast it into float32 if passing float from python. as_tensor = lambda x: paddle.to_tensor( - [x], dtype='float64', place='cpu') - return _C_ops.allclose(x, y, as_tensor(rtol), as_tensor(atol), - equal_nan) + [x], dtype='float64', place='cpu' + ) + return _C_ops.allclose( + x, y, as_tensor(rtol), as_tensor(atol), equal_nan + ) if _in_legacy_dygraph(): - return _legacy_C_ops.allclose(x, y, 'rtol', str(rtol), 'atol', - str(atol), 'equal_nan', equal_nan) + return _legacy_C_ops.allclose( + x, y, 'rtol', str(rtol), 'atol', str(atol), 'equal_nan', equal_nan + ) check_variable_and_dtype(x, "input", ['float32', 'float64'], 'allclose') check_variable_and_dtype(y, "input", ['float32', 'float64'], 'allclose') check_type(rtol, 'rtol', float, 'allclose') @@ -414,10 +406,9 @@ def allclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): inputs = {'Input': x, 'Other': y} outputs = {'Out': out} attrs = {'rtol': str(rtol), 'atol': str(atol), 'equal_nan': equal_nan} - helper.append_op(type='allclose', - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type='allclose', inputs=inputs, outputs=outputs, attrs=attrs + ) return out @@ -428,7 +419,7 @@ def equal(x, y, name=None): This layer returns the truth value of :math:`x == y` elementwise. - Note: + Note: The output has no gradient. Args: @@ -439,7 +430,7 @@ def equal(x, y, name=None): Returns: Tensor: output Tensor, it's shape is the same as the input's Tensor, - and the data type is bool. The result of this op is stop_gradient. + and the data type is bool. The result of this op is stop_gradient. Examples: .. 
code-block:: python @@ -453,8 +444,10 @@ def equal(x, y, name=None): """ if not isinstance(y, (int, bool, float, Variable)): raise TypeError( - "Type of input args must be float, bool, int or Tensor, but received type {}" - .format(type(y))) + "Type of input args must be float, bool, int or Tensor, but received type {}".format( + type(y) + ) + ) if not isinstance(y, Variable): y = full(shape=[1], dtype=x.dtype, fill_value=y) @@ -466,21 +459,26 @@ def equal(x, y, name=None): return _legacy_C_ops.equal(x, y) else: check_variable_and_dtype( - x, "x", ["bool", "float32", "float64", "int32", "int64"], - "equal") + x, + "x", + ["bool", "float32", "float64", "int32", "int64"], + "equal", + ) check_variable_and_dtype( - y, "y", ["bool", "float32", "float64", "int32", "int64"], - "equal") + y, + "y", + ["bool", "float32", "float64", "int32", "int64"], + "equal", + ) helper = LayerHelper("equal", **locals()) out = helper.create_variable_for_type_inference(dtype='bool') out.stop_gradient = True - helper.append_op(type='equal', - inputs={ - 'X': [x], - 'Y': [y] - }, - outputs={'Out': [out]}) + helper.append_op( + type='equal', + inputs={'X': [x], 'Y': [y]}, + outputs={'Out': [out]}, + ) return out @@ -489,7 +487,7 @@ def greater_equal(x, y, name=None): """ Returns the truth value of :math:`x >= y` elementwise, which is equivalent function to the overloaded operator `>=`. - Note: + Note: The output has no gradient. Args: @@ -518,21 +516,26 @@ def greater_equal(x, y, name=None): return _legacy_C_ops.greater_equal(x, y) else: check_variable_and_dtype( - x, "x", ["bool", "float32", "float64", "int32", "int64"], - "greater_equal") + x, + "x", + ["bool", "float32", "float64", "int32", "int64"], + "greater_equal", + ) check_variable_and_dtype( - y, "y", ["bool", "float32", "float64", "int32", "int64"], - "greater_equal") + y, + "y", + ["bool", "float32", "float64", "int32", "int64"], + "greater_equal", + ) helper = LayerHelper("greater_equal", **locals()) out = helper.create_variable_for_type_inference(dtype='bool') out.stop_gradient = True - helper.append_op(type='greater_equal', - inputs={ - 'X': [x], - 'Y': [y] - }, - outputs={'Out': [out]}) + helper.append_op( + type='greater_equal', + inputs={'X': [x], 'Y': [y]}, + outputs={'Out': [out]}, + ) return out @@ -541,7 +544,7 @@ def greater_than(x, y, name=None): """ Returns the truth value of :math:`x > y` elementwise, which is equivalent function to the overloaded operator `>`. - Note: + Note: The output has no gradient. Args: @@ -569,21 +572,26 @@ def greater_than(x, y, name=None): return _legacy_C_ops.greater_than(x, y) else: check_variable_and_dtype( - x, "x", ["bool", "float32", "float64", "int32", "int64"], - "greater_than") + x, + "x", + ["bool", "float32", "float64", "int32", "int64"], + "greater_than", + ) check_variable_and_dtype( - y, "y", ["bool", "float32", "float64", "int32", "int64"], - "greater_than") + y, + "y", + ["bool", "float32", "float64", "int32", "int64"], + "greater_than", + ) helper = LayerHelper("greater_than", **locals()) out = helper.create_variable_for_type_inference(dtype='bool') out.stop_gradient = True - helper.append_op(type='greater_than', - inputs={ - 'X': [x], - 'Y': [y] - }, - outputs={'Out': [out]}) + helper.append_op( + type='greater_than', + inputs={'X': [x], 'Y': [y]}, + outputs={'Out': [out]}, + ) return out @@ -592,7 +600,7 @@ def less_equal(x, y, name=None): """ Returns the truth value of :math:`x <= y` elementwise, which is equivalent function to the overloaded operator `<=`. 
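A brief sketch of the ``<=`` equivalence stated above (illustrative only; it assumes the public ``paddle.less_equal`` API and the overloaded operator behave as documented in this file):

.. code-block:: python

    # Illustrative sketch, not part of the diff.
    import paddle

    x = paddle.to_tensor([1, 2, 3])
    y = paddle.to_tensor([1, 3, 2])

    # The functional form and the overloaded operator return the same result.
    print(paddle.less_equal(x, y))  # [True, True, False]
    print(x <= y)                   # [True, True, False]
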
- Note: + Note: The output has no gradient. Args: @@ -622,21 +630,26 @@ def less_equal(x, y, name=None): return _legacy_C_ops.less_equal(x, y) else: check_variable_and_dtype( - x, "x", ["bool", "float32", "float64", "int32", "int64"], - "less_equal") + x, + "x", + ["bool", "float32", "float64", "int32", "int64"], + "less_equal", + ) check_variable_and_dtype( - y, "y", ["bool", "float32", "float64", "int32", "int64"], - "less_equal") + y, + "y", + ["bool", "float32", "float64", "int32", "int64"], + "less_equal", + ) helper = LayerHelper("less_equal", **locals()) out = helper.create_variable_for_type_inference(dtype='bool') out.stop_gradient = True - helper.append_op(type='less_equal', - inputs={ - 'X': [x], - 'Y': [y] - }, - outputs={'Out': [out]}) + helper.append_op( + type='less_equal', + inputs={'X': [x], 'Y': [y]}, + outputs={'Out': [out]}, + ) return out @@ -645,7 +658,7 @@ def less_than(x, y, name=None): """ Returns the truth value of :math:`x < y` elementwise, which is equivalent function to the overloaded operator `<`. - Note: + Note: The output has no gradient. Args: @@ -675,21 +688,26 @@ def less_than(x, y, name=None): return _legacy_C_ops.less_than(x, y) else: check_variable_and_dtype( - x, "x", ["bool", "float32", "float64", "int32", "int64"], - "less_than") + x, + "x", + ["bool", "float32", "float64", "int32", "int64"], + "less_than", + ) check_variable_and_dtype( - y, "y", ["bool", "float32", "float64", "int32", "int64"], - "less_than") + y, + "y", + ["bool", "float32", "float64", "int32", "int64"], + "less_than", + ) helper = LayerHelper("less_than", **locals()) out = helper.create_variable_for_type_inference(dtype='bool') out.stop_gradient = True - helper.append_op(type='less_than', - inputs={ - 'X': [x], - 'Y': [y] - }, - outputs={'Out': [out]}) + helper.append_op( + type='less_than', + inputs={'X': [x], 'Y': [y]}, + outputs={'Out': [out]}, + ) return out @@ -697,8 +715,8 @@ def less_than(x, y, name=None): def not_equal(x, y, name=None): """ Returns the truth value of :math:`x != y` elementwise, which is equivalent function to the overloaded operator `!=`. - - Note: + + Note: The output has no gradient. 
Args: @@ -728,21 +746,26 @@ def not_equal(x, y, name=None): return _legacy_C_ops.not_equal(x, y) else: check_variable_and_dtype( - x, "x", ["bool", "float32", "float64", "int32", "int64"], - "not_equal") + x, + "x", + ["bool", "float32", "float64", "int32", "int64"], + "not_equal", + ) check_variable_and_dtype( - y, "y", ["bool", "float32", "float64", "int32", "int64"], - "not_equal") + y, + "y", + ["bool", "float32", "float64", "int32", "int64"], + "not_equal", + ) helper = LayerHelper("not_equal", **locals()) out = helper.create_variable_for_type_inference(dtype='bool') out.stop_gradient = True - helper.append_op(type='not_equal', - inputs={ - 'X': [x], - 'Y': [y] - }, - outputs={'Out': [out]}) + helper.append_op( + type='not_equal', + inputs={'X': [x], 'Y': [y]}, + outputs={'Out': [out]}, + ) return out @@ -769,7 +792,7 @@ def is_tensor(x): input3 = [1, 4] check = paddle.is_tensor(input3) print(check) #False - + """ return isinstance(x, (Tensor, paddle.fluid.core.eager.Tensor)) @@ -789,11 +812,15 @@ def _bitwise_op(op_name, x, y, out=None, name=None, binary_op=True): return op(x) check_variable_and_dtype( - x, "x", ["bool", "uint8", "int8", "int16", "int32", "int64"], op_name) + x, "x", ["bool", "uint8", "int8", "int16", "int32", "int64"], op_name + ) if y is not None: check_variable_and_dtype( - y, "y", ["bool", "uint8", "int8", "int16", "int32", "int64"], - op_name) + y, + "y", + ["bool", "uint8", "int8", "int16", "int32", "int64"], + op_name, + ) if out is not None: check_type(out, "out", Variable, op_name) @@ -805,12 +832,9 @@ def _bitwise_op(op_name, x, y, out=None, name=None, binary_op=True): out = helper.create_variable_for_type_inference(dtype=x.dtype) if binary_op: - helper.append_op(type=op_name, - inputs={ - "X": x, - "Y": y - }, - outputs={"Out": out}) + helper.append_op( + type=op_name, inputs={"X": x, "Y": y}, outputs={"Out": out} + ) else: helper.append_op(type=op_name, inputs={"X": x}, outputs={"Out": out}) @@ -821,7 +845,7 @@ def _bitwise_op(op_name, x, y, out=None, name=None, binary_op=True): def bitwise_and(x, y, out=None, name=None): """ ${comment} - + Args: x (Tensor): ${x_comment} y (Tensor): ${y_comment} @@ -829,7 +853,7 @@ def bitwise_and(x, y, out=None, name=None): Returns: Tensor: ${out_comment} - + Examples: .. 
code-block:: python @@ -841,19 +865,16 @@ def bitwise_and(x, y, out=None, name=None): """ if in_dygraph_mode() and out is None: return _C_ops.bitwise_and(x, y) - return _bitwise_op(op_name="bitwise_and", - x=x, - y=y, - name=name, - out=out, - binary_op=True) + return _bitwise_op( + op_name="bitwise_and", x=x, y=y, name=name, out=out, binary_op=True + ) @templatedoc() def bitwise_or(x, y, out=None, name=None): """ ${comment} - + Args: x (Tensor): ${x_comment} y (Tensor): ${y_comment} @@ -874,12 +895,9 @@ def bitwise_or(x, y, out=None, name=None): if in_dygraph_mode() and out is None: return _C_ops.bitwise_or(x, y) - return _bitwise_op(op_name="bitwise_or", - x=x, - y=y, - name=name, - out=out, - binary_op=True) + return _bitwise_op( + op_name="bitwise_or", x=x, y=y, name=name, out=out, binary_op=True + ) @templatedoc() @@ -906,12 +924,9 @@ def bitwise_xor(x, y, out=None, name=None): """ if in_dygraph_mode() and out is None: return _C_ops.bitwise_xor(x, y) - return _bitwise_op(op_name="bitwise_xor", - x=x, - y=y, - name=name, - out=out, - binary_op=True) + return _bitwise_op( + op_name="bitwise_xor", x=x, y=y, name=name, out=out, binary_op=True + ) @templatedoc() @@ -922,7 +937,7 @@ def bitwise_not(x, out=None, name=None): Args: x(Tensor): ${x_comment} out(Tensor): ${out_comment} - + Returns: Tensor: ${out_comment} @@ -937,12 +952,9 @@ def bitwise_not(x, out=None, name=None): if in_dygraph_mode() and out is None: return _C_ops.bitwise_not(x) - return _bitwise_op(op_name="bitwise_not", - x=x, - y=None, - name=name, - out=out, - binary_op=False) + return _bitwise_op( + op_name="bitwise_not", x=x, y=None, name=name, out=out, binary_op=False + ) @templatedoc() @@ -962,13 +974,6 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): Returns: Tensor: ${out_comment}. - Raises: - TypeError: The data type of ``x`` must be one of float32, float64. - TypeError: The data type of ``y`` must be one of float32, float64. - TypeError: The type of ``rtol`` must be float. - TypeError: The type of ``atol`` must be float. - TypeError: The type of ``equal_nan`` must be bool. - Examples: .. code-block:: python @@ -978,22 +983,18 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): y = paddle.to_tensor([10000.1, 1e-08]) result1 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan") - np_result1 = result1.numpy() # [True, False] result2 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=True, name="equal_nan") - np_result2 = result2.numpy() # [True, False] x = paddle.to_tensor([1.0, float('nan')]) y = paddle.to_tensor([1.0, float('nan')]) result1 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name="ignore_nan") - np_result1 = result1.numpy() # [True, False] result2 = paddle.isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=True, name="equal_nan") - np_result2 = result2.numpy() # [True, True] """ @@ -1001,11 +1002,13 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): # NOTE(dev): Pass tol as Tensor to fix precision loss problem, because # C++ backend will cast it into float32 if passing float from python. 
as_tensor = lambda x: paddle.to_tensor( - [x], dtype='float64', place='cpu') + [x], dtype='float64', place='cpu' + ) return _C_ops.isclose(x, y, as_tensor(rtol), as_tensor(atol), equal_nan) if _in_legacy_dygraph(): - return _legacy_C_ops.isclose(x, y, 'rtol', str(rtol), 'atol', str(atol), - 'equal_nan', equal_nan) + return _legacy_C_ops.isclose( + x, y, 'rtol', str(rtol), 'atol', str(atol), 'equal_nan', equal_nan + ) check_variable_and_dtype(x, "input", ['float32', 'float64'], 'isclose') check_variable_and_dtype(y, "input", ['float32', 'float64'], 'isclose') @@ -1019,8 +1022,7 @@ def isclose(x, y, rtol=1e-05, atol=1e-08, equal_nan=False, name=None): inputs = {'Input': x, 'Other': y} outputs = {'Out': out} attrs = {'rtol': str(rtol), 'atol': str(atol), 'equal_nan': equal_nan} - helper.append_op(type='isclose', - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type='isclose', inputs=inputs, outputs=outputs, attrs=attrs + ) return out diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py old mode 100755 new mode 100644 index 5e05a93e905963..f987e8b89cf254 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -17,12 +17,22 @@ from ..static import Variable, device_guard from ..framework import core, in_dygraph_mode -from ..fluid.framework import _in_legacy_dygraph, _in_eager_without_dygraph_check, _non_static_mode +from ..fluid.framework import ( + _in_legacy_dygraph, + _in_eager_without_dygraph_check, + _non_static_mode, +) from ..framework import LayerHelper from ..framework import OpProtoHolder, convert_np_dtype_to_dtype_, dygraph_only -from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype +from ..fluid.data_feeder import ( + convert_dtype, + check_variable_and_dtype, + check_type, + check_dtype, +) from ..fluid.layers import utils import numpy as np + # TODO: define functions to manipulate a tensor from ..fluid.layers.nn import _elementwise_op_in_dygraph from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only @@ -72,25 +82,50 @@ def cast(x, dtype): out = _legacy_C_ops.cast(x, 'in_dtype', x.dtype, 'out_dtype', dtype) return out - check_variable_and_dtype(x, 'x', [ - 'bool', 'float16', 'float32', 'float64', 'int16', 'int32', 'int64', - 'uint8', 'uint16' - ], 'cast') - check_dtype(dtype, 'dtype', [ - 'bool', 'float16', 'float32', 'float64', 'int8', 'int16', 'int32', - 'int64', 'uint8', 'uint16' - ], 'cast') + check_variable_and_dtype( + x, + 'x', + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int16', + 'int32', + 'int64', + 'uint8', + 'uint16', + ], + 'cast', + ) + check_dtype( + dtype, + 'dtype', + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int8', + 'int16', + 'int32', + 'int64', + 'uint8', + 'uint16', + ], + 'cast', + ) helper = LayerHelper('cast', **locals()) out = helper.create_variable_for_type_inference( - dtype=dtype, stop_gradient=x.stop_gradient) - helper.append_op(type='cast', - inputs={'X': [x]}, - outputs={'Out': [out]}, - attrs={ - 'in_dtype': x.dtype, - 'out_dtype': out.dtype - }) + dtype=dtype, stop_gradient=x.stop_gradient + ) + helper.append_op( + type='cast', + inputs={'X': [x]}, + outputs={'Out': [out]}, + attrs={'in_dtype': x.dtype, 'out_dtype': out.dtype}, + ) return out @@ -128,7 +163,7 @@ def slice(input, axes, starts, ends): ends = [-1, 1000] # -1 denotes the reverse 0th position of dimension 0. Then: result = [ [2, 3, 4], ] # result = data[0:1, 1:4] - + Args: input (Tensor): A ``Tensor`` . 
The data type is ``float16``, ``float32``, ``float64``, ``int32`` or ``int64``. axes (list|tuple): The data type is ``int32`` . Axes that `starts` and `ends` apply to . @@ -142,10 +177,6 @@ def slice(input, axes, starts, ends): Returns: Tensor: A ``Tensor``. The data type is same as ``input``. - Raises: - TypeError: The type of ``starts`` must be list, tuple or Tensor. - TypeError: The type of ``ends`` must be list, tuple or Tensor. - Examples: .. code-block:: python @@ -175,7 +206,8 @@ def slice(input, axes, starts, ends): axes = list(axes) if len(axes) == 0: raise ValueError( - "Input axes should not be an empty list/tuple.") + "Input axes should not be an empty list/tuple." + ) for i in range(len(axes)): if axes[i] < 0: axes[i] = max(0, axes[i] + len(input.shape)) @@ -184,8 +216,10 @@ def slice(input, axes, starts, ends): else: raise ValueError( - "Input axes must be a python list or tuple, but reveived {}". - format(type(axes))) + "Input axes must be a python list or tuple, but reveived {}".format( + type(axes) + ) + ) infer_flags = list(1 for i in range(len(axes))) @@ -194,7 +228,8 @@ def slice(input, axes, starts, ends): if isinstance(starts, (list, tuple)): starts = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item + if isinstance(item, tmp_tensor_type) + else item for item in starts ] elif isinstance(starts, tmp_tensor_type): @@ -205,7 +240,9 @@ def slice(input, axes, starts, ends): if isinstance(ends, (list, tuple)): ends = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item for item in ends + if isinstance(item, tmp_tensor_type) + else item + for item in ends ] elif isinstance(ends, tmp_tensor_type): tensor_t = ends.numpy() @@ -223,7 +260,8 @@ def slice(input, axes, starts, ends): axes = list(axes) if len(axes) == 0: raise ValueError( - "Input axes should not be an empty list/tuple.") + "Input axes should not be an empty list/tuple." + ) for i in range(len(axes)): if axes[i] < 0: axes[i] = max(0, axes[i] + len(input.shape)) @@ -232,8 +270,10 @@ def slice(input, axes, starts, ends): else: raise ValueError( - "Input axes must be a python list or tuple, but reveived {}" - .format(type(axes))) + "Input axes must be a python list or tuple, but reveived {}".format( + type(axes) + ) + ) infer_flags = list(1 for i in range(len(axes))) @@ -242,7 +282,8 @@ def slice(input, axes, starts, ends): if isinstance(starts, (list, tuple)): starts = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item + if isinstance(item, tmp_tensor_type) + else item for item in starts ] attrs += ('starts', starts) @@ -254,7 +295,8 @@ def slice(input, axes, starts, ends): if isinstance(ends, (list, tuple)): ends = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item + if isinstance(item, tmp_tensor_type) + else item for item in ends ] attrs += ('ends', ends) @@ -263,16 +305,27 @@ def slice(input, axes, starts, ends): ends_tensor.stop_gradient = True infer_flags = list(-1 for i in range(len(axes))) - return _legacy_C_ops.slice(input, starts_tensor, ends_tensor, None, - None, 'axes', axes, 'infer_flags', - infer_flags, *attrs) + return _legacy_C_ops.slice( + input, + starts_tensor, + ends_tensor, + None, + None, + 'axes', + axes, + 'infer_flags', + infer_flags, + *attrs, + ) if not isinstance(starts, (list, tuple, Variable)): raise ValueError( - "Input starts must be an Variable, python list or tuple.") + "Input starts must be an Variable, python list or tuple." 
+ ) if not isinstance(ends, (list, tuple, Variable)): raise ValueError( - "Input ends must be an Variable, python list or tuple.") + "Input ends must be an Variable, python list or tuple." + ) helper = LayerHelper('slice', **locals()) @@ -319,11 +372,11 @@ def slice(input, axes, starts, ends): # infer_flags attrs['infer_flags'] = infer_flags out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype('input')) - helper.append_op(type='slice', - inputs=inputs, - attrs=attrs, - outputs={'Out': out}) + dtype=helper.input_dtype('input') + ) + helper.append_op( + type='slice', inputs=inputs, attrs=attrs, outputs={'Out': out} + ) return out @@ -385,10 +438,21 @@ def transpose(x, perm, name=None): out, _ = _legacy_C_ops.transpose2(x, 'axis', perm) return out - check_variable_and_dtype(x, 'x', [ - 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', - 'complex128' - ], 'transpose') + check_variable_and_dtype( + x, + 'x', + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + 'transpose', + ) check_type(perm, 'perm', (list, tuple), 'transpose') if isinstance(perm, tuple): perm = list(perm) @@ -397,32 +461,33 @@ def transpose(x, perm, name=None): "Input(perm) is the permutation of dimensions of Input(x), " "its length should be equal to dimensions of Input(x), " "but received dimension of Input(x) is %s, " - "the length of Input(perm) is %s." % (len(x.shape), len(perm))) + "the length of Input(perm) is %s." % (len(x.shape), len(perm)) + ) for idx, dim in enumerate(perm): if dim >= len(x.shape): raise ValueError( "Each element in Input(perm) should be less than Input(x)'s dimension, " "but %d-th element in Input(perm) is %d which exceeds Input(x)'s " - "dimension %d." % (idx, perm[idx], len(x.shape))) + "dimension %d." % (idx, perm[idx], len(x.shape)) + ) helper = LayerHelper('transpose', **locals()) out = helper.create_variable_for_type_inference(x.dtype) x_shape = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='transpose2', - inputs={'X': [x]}, - outputs={ - 'Out': [out], - 'XShape': [x_shape] - }, - attrs={'axis': perm}) + helper.append_op( + type='transpose2', + inputs={'X': [x]}, + outputs={'Out': [out], 'XShape': [x_shape]}, + attrs={'axis': perm}, + ) return out def unstack(x, axis=0, num=None): """ :alias_main: paddle.unstack - :alias: paddle.unstack,paddle.tensor.unstack,paddle.tensor.manipulation.unstack - :old_api: paddle.fluid.layers.unstack + :alias: paddle.unstack,paddle.tensor.unstack,paddle.tensor.manipulation.unstack + :old_api: paddle.fluid.layers.unstack **UnStack Layer** @@ -441,9 +506,6 @@ def unstack(x, axis=0, num=None): Returns: list(Tensor): The unstacked Tensors list. The list elements are N-D Tensors of data types float32, float64, int32, int64. - Raises: - ValueError: If x.shape[axis] <= 0 or axis is not in range [-D, D). - Examples: .. 
code-block:: python @@ -477,13 +539,12 @@ def unstack(x, axis=0, num=None): for _ in range(num): outs.append(helper.create_variable_for_type_inference(x.dtype)) - helper.append_op(type='unstack', - inputs={'X': [x]}, - outputs={'Y': outs}, - attrs={ - 'axis': axis, - 'num': num - }) + helper.append_op( + type='unstack', + inputs={'X': [x]}, + outputs={'Y': outs}, + attrs={'axis': axis, 'num': num}, + ) return outs @@ -505,7 +566,7 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): For each value `v` in `input`, we reset it to a new value according to the following formula: :: - + v = v - shard_id * shard_size if shard_id * shard_size <= v < (shard_id+1) * shard_size else ignore_value That is, the value `v` is set to the new offset within the range represented by the shard `shard_id` @@ -534,27 +595,31 @@ def shard_index(input, index_num, nshards, shard_id, ignore_value=-1): # [[-1], [1]] """ if in_dygraph_mode(): - return _C_ops.shard_index(input, index_num, nshards, shard_id, - ignore_value) + return _C_ops.shard_index( + input, index_num, nshards, shard_id, ignore_value + ) check_variable_and_dtype(input, 'input', ['int64', 'int32'], 'shard_index') op_type = 'shard_index' helper = LayerHelper(op_type, **locals()) if shard_id < 0 or shard_id >= nshards: - raise ValueError('The shard_id(%d) should be in [0, %d)' % - (shard_id, nshards)) + raise ValueError( + 'The shard_id(%d) should be in [0, %d)' % (shard_id, nshards) + ) out = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type=op_type, - inputs={'X': [input]}, - outputs={'Out': out}, - attrs={ - 'index_num': index_num, - 'nshards': nshards, - 'shard_id': shard_id, - 'ignore_value': ignore_value - }, - stop_gradient=True) + helper.append_op( + type=op_type, + inputs={'X': [input]}, + outputs={'Out': out}, + attrs={ + 'index_num': index_num, + 'nshards': nshards, + 'shard_id': shard_id, + 'ignore_value': ignore_value, + }, + stop_gradient=True, + ) return out @@ -646,11 +711,13 @@ def crop(x, shape=None, offsets=None, name=None): """ helper = LayerHelper('crop_tensor', **locals()) - check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], - 'crop_tensor') + check_variable_and_dtype( + x, 'x', ['float32', 'float64', 'int32', 'int64'], 'crop_tensor' + ) check_type(shape, 'shape', (list, tuple, Variable), 'crop_tensor') - check_type(offsets, 'offsets', (list, tuple, Variable, type(None)), - 'crop_tensor') + check_type( + offsets, 'offsets', (list, tuple, Variable, type(None)), 'crop_tensor' + ) if offsets is None: offsets = [0] * len(x.shape) @@ -666,25 +733,30 @@ def _attr_shape_check(shape_val): if not isinstance(shape_val, int): raise TypeError( "Attr(shape)'s dtype of Op(crop_tensor) should be int32, but received: %s." - % type(shape_val)) + % type(shape_val) + ) if shape_val == 0: raise ValueError( "Attr(shape) of Op(crop_tensor) should not be zero, but received: %s." - % str(shape_val)) + % str(shape_val) + ) if shape_val < -1: raise ValueError( "When the element in Attr(shape) of Op(crop_tensor) is negative, only -1 is supported, but received: %s." - % str(shape_val)) + % str(shape_val) + ) def _attr_offsets_check(offset_val): if not isinstance(offset_val, int): raise TypeError( "Attr(offsets)'s dtype of Op(crop_tensor) should be int32, but received: %s." - % type(offset_val)) + % type(offset_val) + ) if offset_val < 0: raise ValueError( "Attr(offsets) of Op(crop_tensor) should be greater or equal to zero, but received: %s." 
- % str(offset_val)) + % str(offset_val) + ) if isinstance(offsets, Variable): offsets.stop_gradient = True @@ -725,11 +797,9 @@ def _attr_offsets_check(offset_val): else: _attr_shape_check(dim_size) temp_out = helper.create_variable_for_type_inference('int32') - fill_constant([1], - 'int32', - dim_size, - force_cpu=True, - out=temp_out) + fill_constant( + [1], 'int32', dim_size, force_cpu=True, out=temp_out + ) new_shape_tensor.append(temp_out) shape_attr.append(dim_size) ipts['ShapeTensor'] = new_shape_tensor @@ -739,10 +809,12 @@ def _attr_offsets_check(offset_val): _attr_shape_check(dim_size) attrs['shape'] = shape - helper.append_op(type='crop_tensor', - inputs=ipts, - outputs={'Out': out}, - attrs=None if len(attrs) == 0 else attrs) + helper.append_op( + type='crop_tensor', + inputs=ipts, + outputs={'Out': out}, + attrs=None if len(attrs) == 0 else attrs, + ) return out @@ -774,13 +846,15 @@ def fill_(x, value): """ if not isinstance(value, (float, int)): raise TypeError( - "The type of 'value' must be int or float, but received %s." % - (type(value))) + "The type of 'value' must be int or float, but received %s." + % (type(value)) + ) if in_dygraph_mode(): return _C_ops.fill_(x, value) else: - return _legacy_C_ops.fill_any_(x, "value_float", float(value), - "value_int", int(value)) + return _legacy_C_ops.fill_any_( + x, "value_float", float(value), "value_int", int(value) + ) @dygraph_only @@ -809,10 +883,11 @@ def zero_(x): """ if in_dygraph_mode(): - return _C_ops.fill_(x, 0.) + return _C_ops.fill_(x, 0.0) else: - return _legacy_C_ops.fill_any_(x, "value_float", 0., "value_int", - int(0)) + return _legacy_C_ops.fill_any_( + x, "value_float", 0.0, "value_int", int(0) + ) @dygraph_only @@ -820,16 +895,16 @@ def fill_diagonal_(x, value, offset=0, wrap=False, name=None): """ Note: This API is ONLY available in Dygraph mode. - + This function fill the value into the x Tensor's diagonal inplace. - + Args: x(Tensor): ``x`` is the original Tensor value(Scale): ``value`` is the value to filled in x offset(int,optional): the offset to the main diagonal. Default: 0 (main diagonal). wrap(bool,optional): the diagonal 'wrapped' after N columns for tall matrices. name(str,optional): Name for the operation (optional, default is None) - + Returns: Tensor: Tensor with diagonal filled with value. 
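A minimal usage sketch for the in-place API reformatted in the next hunk (illustrative, not part of the diff; it assumes ``fill_diagonal_`` is exposed as a dygraph-mode ``Tensor`` method, as the docstring above implies):

.. code-block:: python

    # Illustrative sketch, not part of the diff.
    import paddle

    x = paddle.ones((4, 3))
    # Fill the main diagonal in place; rows beyond the diagonal are untouched.
    x.fill_diagonal_(2.0)
    print(x.tolist())
    # [[2.0, 1.0, 1.0], [1.0, 2.0, 1.0], [1.0, 1.0, 2.0], [1.0, 1.0, 1.0]]
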
@@ -844,39 +919,45 @@ def fill_diagonal_(x, value, offset=0, wrap=False, name=None): helper = LayerHelper("fill_diagonal_", **locals()) check_type(x, 'X', (Variable), 'fill_diagonal_') dtype = helper.input_dtype('x') - check_dtype(dtype, 'X', - ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'fill_diagonal_') + check_dtype( + dtype, + 'X', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'fill_diagonal_', + ) check_type(value, 'value', (bool, int, float), 'fill_diagonal_') check_type(wrap, 'wrap', (bool), 'fill_diagonal_') inshape = x.shape inshapeset = set(inshape) - assert len(inshape) >= 2, ('Tensor dims should >= 2 in fill_diagonal_ API') + assert len(inshape) >= 2, 'Tensor dims should >= 2 in fill_diagonal_ API' if len(inshape) > 2: - assert len(inshapeset) == 1, ( - 'Tensor dims should be equal while input dims > 2 in fill_diagonal_ API' - ) + assert ( + len(inshapeset) == 1 + ), 'Tensor dims should be equal while input dims > 2 in fill_diagonal_ API' if in_dygraph_mode(): if len(inshape) == 2: return _C_ops.fill_diagonal_(x, value, offset, wrap) return _C_ops.fill_diagonal_(x, value, offset, True) if len(inshape) == 2: - return _legacy_C_ops.fill_diagonal_(x, 'value', value, 'offset', offset, - 'wrap', wrap) - return _legacy_C_ops.fill_diagonal_(x, 'value', value, 'offset', offset, - 'wrap', True) + return _legacy_C_ops.fill_diagonal_( + x, 'value', value, 'offset', offset, 'wrap', wrap + ) + return _legacy_C_ops.fill_diagonal_( + x, 'value', value, 'offset', offset, 'wrap', True + ) def _fill_diagonal_tensor_impl(x, y, offset=0, dim1=0, dim2=1, inplace=False): inshape = x.shape - assert dim1 < len(inshape) and dim1 >= -len(inshape), ( - 'dim1 should between [-rank,rank) in fill_diagonal_tensor_') - assert dim2 < len(inshape) and dim2 >= -len(inshape), ( - 'dim2 should between [-rank,rank) in fill_diagonal_tensor_') - assert len(inshape) >= 2, ( - 'Tensor dims should >= 2 in fill_diagonal_tensor_') + assert dim1 < len(inshape) and dim1 >= -len( + inshape + ), 'dim1 should between [-rank,rank) in fill_diagonal_tensor_' + assert dim2 < len(inshape) and dim2 >= -len( + inshape + ), 'dim2 should between [-rank,rank) in fill_diagonal_tensor_' + assert len(inshape) >= 2, 'Tensor dims should >= 2 in fill_diagonal_tensor_' dim1 %= len(inshape) dim2 %= len(inshape) @@ -884,11 +965,14 @@ def _fill_diagonal_tensor_impl(x, y, offset=0, dim1=0, dim2=1, inplace=False): for i in range(len(inshape)): if i != dim1 and i != dim2: predshape.append(inshape[i]) - diaglen = min(min(inshape[dim1], inshape[dim1] + offset), - min(inshape[dim2], inshape[dim2] - offset)) + diaglen = min( + min(inshape[dim1], inshape[dim1] + offset), + min(inshape[dim2], inshape[dim2] - offset), + ) predshape.append(diaglen) assert tuple(predshape) == tuple( - y.shape), ("the y shape should be {}".format(predshape)) + y.shape + ), "the y shape should be {}".format(predshape) if len(y.shape) == 1: y = y.reshape([1, -1]) @@ -896,14 +980,15 @@ def _fill_diagonal_tensor_impl(x, y, offset=0, dim1=0, dim2=1, inplace=False): if in_dygraph_mode(): return _C_ops.fill_diagonal_tensor_(x, y, offset, dim1, dim2) else: - return _legacy_C_ops.fill_diagonal_tensor_(x, y, 'offset', offset, - 'dim1', dim1, 'dim2', - dim2) + return _legacy_C_ops.fill_diagonal_tensor_( + x, y, 'offset', offset, 'dim1', dim1, 'dim2', dim2 + ) if in_dygraph_mode(): return _C_ops.fill_diagonal_tensor(x, y, offset, dim1, dim2) else: - return _legacy_C_ops.fill_diagonal_tensor(x, y, 'offset', offset, - 'dim1', dim1, 'dim2', dim2) + 
return _legacy_C_ops.fill_diagonal_tensor( + x, y, 'offset', offset, 'dim1', dim1, 'dim2', dim2 + ) def fill_diagonal_tensor_(x, y, offset=0, dim1=0, dim2=1, name=None): @@ -935,12 +1020,9 @@ def fill_diagonal_tensor_(x, y, offset=0, dim1=0, dim2=1, name=None): print(x.tolist()) #[[1.0, 2.0, 2.0], [2.0, 1.0, 2.0], [2.0, 2.0, 1.0], [2.0, 2.0, 2.0]] """ - return _fill_diagonal_tensor_impl(x, - y, - offset=offset, - dim1=dim1, - dim2=dim2, - inplace=True) + return _fill_diagonal_tensor_impl( + x, y, offset=offset, dim1=dim1, dim2=dim2, inplace=True + ) def fill_diagonal_tensor(x, y, offset=0, dim1=0, dim2=1, name=None): @@ -969,12 +1051,9 @@ def fill_diagonal_tensor(x, y, offset=0, dim1=0, dim2=1, name=None): print(nx.tolist()) #[[1.0, 2.0, 2.0], [2.0, 1.0, 2.0], [2.0, 2.0, 1.0], [2.0, 2.0, 2.0]] """ - return _fill_diagonal_tensor_impl(x, - y, - offset=offset, - dim1=dim1, - dim2=dim2, - inplace=False) + return _fill_diagonal_tensor_impl( + x, y, offset=offset, dim1=dim1, dim2=dim2, inplace=False + ) @dygraph_only @@ -1017,7 +1096,7 @@ def concat(x, axis=0, name=None): x (list|tuple): ``x`` is a Tensor list or Tensor tuple which is with data type bool, float16, float32, float64, int32, int64, int8, uint8. All the Tensors in ``x`` must have same data type. axis (int|Tensor, optional): Specify the axis to operate on the input Tensors. - It's a scalar with data type int or a Tensor with shape [1] and data type int32 + It's a scalar with data type int or a Tensor with shape [1] and data type int32 or int64. The effective range is [-R, R), where R is Rank(x). When ``axis < 0``, it works the same way as ``axis+R``. Default is 0. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -1027,9 +1106,9 @@ def concat(x, axis=0, name=None): Examples: .. code-block:: python - + import paddle - + x1 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) x2 = paddle.to_tensor([[11, 12, 13], @@ -1073,10 +1152,21 @@ def concat(x, axis=0, name=None): check_type(input, 'input', (list, tuple, Variable), 'concat') if not isinstance(input, Variable): for id, x in enumerate(input): - check_variable_and_dtype(x, 'input[' + str(id) + ']', [ - 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', - 'int8', 'unit8' - ], 'concat') + check_variable_and_dtype( + x, + 'input[' + str(id) + ']', + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'int8', + 'unit8', + ], + 'concat', + ) if x.dtype != input[0].dtype: raise TypeError( "All the Tensors in the input must have the same data type." @@ -1087,8 +1177,11 @@ def concat(x, axis=0, name=None): if isinstance(axis, Variable): check_dtype( - axis.dtype, 'axis', ['int32', 'int64'], 'concat', - "The data type of axis must be int32 or int64 when axis is a Tensor" + axis.dtype, + 'axis', + ['int32', 'int64'], + 'concat', + "The data type of axis must be int32 or int64 when axis is a Tensor", ) helper = LayerHelper('concat', **locals()) @@ -1099,19 +1192,17 @@ def concat(x, axis=0, name=None): # This feature is supported for Dynamic-to-Static, because after transformed, the type of inputs[0] # is LOD_TENSOR_ARRAY in some scenarios. And this feature can be used in static mode. - assert len(input) == 1, "If the elements of 'input' in concat are Variable(LoDTensorArray), " \ - "number of the elements must be 1, but received %s." 
% len(input) + assert len(input) == 1, ( + "If the elements of 'input' in concat are Variable(LoDTensorArray), " + "number of the elements must be 1, but received %s." % len(input) + ) out_index = helper.create_variable_for_type_inference(dtype="int32") - helper.append_op(type='tensor_array_to_tensor', - inputs={'X': input[0]}, - outputs={ - 'Out': [out], - 'OutIndex': [out_index] - }, - attrs={ - 'axis': axis, - 'use_stack': False - }) + helper.append_op( + type='tensor_array_to_tensor', + inputs={'X': input[0]}, + outputs={'Out': [out], 'OutIndex': [out_index]}, + attrs={'axis': axis, 'use_stack': False}, + ) else: inputs = {'X': input} attrs = {} @@ -1121,10 +1212,9 @@ def concat(x, axis=0, name=None): else: attrs['axis'] = axis - helper.append_op(type='concat', - inputs=inputs, - outputs={'Out': [out]}, - attrs=attrs) + helper.append_op( + type='concat', inputs=inputs, outputs={'Out': [out]}, attrs=attrs + ) return out @@ -1132,8 +1222,10 @@ def broadcast_tensors(input, name=None): """ This OP broadcast a list of tensors following broadcast semantics - .. note:: - If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. + Note: + If you want know more about broadcasting, please refer to `Introduction to Tensor`_ . + + .. _Introduction to Tensor: ../../guides/beginner/tensor_en.html#chapter5-broadcasting-of-tensor Args: input (list|tuple): ``input`` is a Tensor list or Tensor tuple which is with data type bool, @@ -1164,17 +1256,21 @@ def broadcast_tensors(input, name=None): check_type(input, 'input', (list, tuple), 'broadcast_tensors') if num_inputs < 1: raise TypeError( - "At least 1 tensor is needed to perform broadcast_tensors") + "At least 1 tensor is needed to perform broadcast_tensors" + ) # Check input types for id, x in enumerate(input): check_variable_and_dtype( - x, 'input[' + str(id) + ']', + x, + 'input[' + str(id) + ']', ['bool', 'float32', 'float64', 'int32', 'int64'], - 'broadcast_tensors') + 'broadcast_tensors', + ) if x.dtype != input[0].dtype: raise TypeError( - "All the Tensors in the input must have the same data type.") + "All the Tensors in the input must have the same data type." + ) # Check bcast semantics output_shape_r_last_tensor_index = [] @@ -1192,8 +1288,11 @@ def broadcast_tensors(input, name=None): output_shape_r.append(shape[i]) output_shape_r_last_tensor_index.append(j) else: - invalid = (output_shape_r[i] != shape[i] - and output_shape_r[i] != 1 and shape[i] != 1) + invalid = ( + output_shape_r[i] != shape[i] + and output_shape_r[i] != 1 + and shape[i] != 1 + ) if invalid: last_index = output_shape_r_last_tensor_index[i] raise TypeError( @@ -1212,14 +1311,15 @@ def broadcast_tensors(input, name=None): while i < num_inputs: out.append( helper.create_variable_for_type_inference( - dtype=helper.input_dtype())) + dtype=helper.input_dtype() + ) + ) i += 1 inputs = {'X': input} - helper.append_op(type='broadcast_tensors', - inputs=inputs, - outputs={'Out': out}, - attrs={}) + helper.append_op( + type='broadcast_tensors', inputs=inputs, outputs={'Out': out}, attrs={} + ) return out @@ -1241,12 +1341,9 @@ def flip(x, axis, name=None): .. 
code-block:: python import paddle - import numpy as np image_shape=(3, 2, 2) - x = np.arange(image_shape[0] * image_shape[1] * image_shape[2]).reshape(image_shape) - x = x.astype('float32') - img = paddle.to_tensor(x) + img = paddle.arange(image_shape[0] * image_shape[1] * image_shape[2]).reshape(image_shape) tmp = paddle.flip(img, [0,1]) print(tmp) # [[[10,11],[8, 9]], [[6, 7],[4, 5]], [[2, 3],[0, 1]]] @@ -1265,19 +1362,21 @@ def flip(x, axis, name=None): helper = LayerHelper("flip", **locals()) check_type(x, 'X', (Variable), 'flip') dtype = helper.input_dtype('x') - check_dtype(dtype, 'X', - ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'], - 'flip') + check_dtype( + dtype, + 'X', + ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'], + 'flip', + ) check_type(axis, 'axis', (list, tuple), 'flip') if name is None: out = helper.create_variable_for_type_inference(dtype) else: out = helper.create_variable(name=name, dtype=dtype, persistable=False) - helper.append_op(type="flip", - inputs={"X": x}, - outputs={"Out": out}, - attrs={"axis": axis}) + helper.append_op( + type="flip", inputs={"X": x}, outputs={"Out": out}, attrs={"axis": axis} + ) return out @@ -1303,23 +1402,23 @@ def rot90(x, k=1, axes=[0, 1], name=None): data = paddle.arange(4) data = paddle.reshape(data, (2, 2)) - print(data) + print(data) #[[0, 1], # [2, 3]] y = paddle.rot90(data, 1, [0, 1]) - print(y) + print(y) #[[1, 3], # [0, 2]] y= paddle.rot90(data, -1, [0, 1]) - print(y) + print(y) #[[2, 0], # [3, 1]] data2 = paddle.arange(8) data2 = paddle.reshape(data2, (2,2,2)) - print(data2) + print(data2) #[[[0, 1], # [2, 3]], # [[4, 5], @@ -1336,9 +1435,12 @@ def rot90(x, k=1, axes=[0, 1], name=None): helper = LayerHelper("rot90", **locals()) check_type(x, 'X', (Variable), 'rot90') dtype = helper.input_dtype('x') - check_dtype(dtype, 'X', - ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'], - 'rot90') + check_dtype( + dtype, + 'X', + ['float16', 'float32', 'float64', 'int32', 'int64', 'bool'], + 'rot90', + ) check_type(axes, 'axes', (list, tuple), 'rot90') input_total_dims = len(x.shape) @@ -1346,23 +1448,31 @@ def rot90(x, k=1, axes=[0, 1], name=None): if total_rot_dims != 2: raise ValueError( "expected total rotation axes == 2, but got axes = {}".format( - total_rot_dims)) + total_rot_dims + ) + ) if input_total_dims < 2: raise ValueError( "expected total dims >= 2, but got total dims = {}".format( - input_total_dims)) + input_total_dims + ) + ) if not (axes[0] != axes[1] and abs(axes[0] - axes[1]) != input_total_dims): raise ValueError( - "expected rotation axes to be different, but got axis0 = {}, and axis1 = {}" - .format(axes[0], axes[1])) + "expected rotation axes to be different, but got axis0 = {}, and axis1 = {}".format( + axes[0], axes[1] + ) + ) if not (axes[0] < input_total_dims and axes[0] >= -input_total_dims): - raise ValueError("Rotation axis0 out of range, axis0 = {}".format( - axes[0])) + raise ValueError( + "Rotation axis0 out of range, axis0 = {}".format(axes[0]) + ) if not (axes[1] < input_total_dims and axes[1] >= -input_total_dims): - raise ValueError("Rotation axis1 out of range, axis1 = {}".format( - axes[1])) + raise ValueError( + "Rotation axis1 out of range, axis1 = {}".format(axes[1]) + ) k %= 4 if k == 0: @@ -1371,8 +1481,10 @@ def rot90(x, k=1, axes=[0, 1], name=None): return flip(flip(x, axes[0]), axes[1]) axes_list = list(range(0, input_total_dims)) - (axes_list[axes[0]], axes_list[axes[1]]) = (axes_list[axes[1]], - axes_list[axes[0]]) + (axes_list[axes[0]], 
axes_list[axes[1]]) = ( + axes_list[axes[1]], + axes_list[axes[0]], + ) if k == 1: return transpose(flip(x, axes[1]), axes_list) else: @@ -1428,10 +1540,6 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): axes flattened by indicated start axis and end axis. \ A Tensor with data type same as input x. - Raises: - ValueError: If x is not a Tensor. - ValueError: If start_axis or stop_axis is illegal. - Examples: .. code-block:: python @@ -1455,19 +1563,29 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): if not paddle.in_dynamic_mode(): check_variable_and_dtype( - x, 'x', + x, + 'x', ['float32', 'float64', 'int8', 'int16', 'int32', 'int64', 'uint8'], - 'flatten') + 'flatten', + ) x_dim = len(x.shape) - if not (isinstance(start_axis, - int)) or (start_axis > x_dim - 1) or start_axis < -x_dim: + if ( + not (isinstance(start_axis, int)) + or (start_axis > x_dim - 1) + or start_axis < -x_dim + ): raise ValueError( - "The start_axis should be a int, and in range [-rank(x), rank(x))") - if not (isinstance(stop_axis, - int)) or (stop_axis > x_dim - 1) or stop_axis < -x_dim: + "The start_axis should be a int, and in range [-rank(x), rank(x))" + ) + if ( + not (isinstance(stop_axis, int)) + or (stop_axis > x_dim - 1) + or stop_axis < -x_dim + ): raise ValueError( - "The stop_axis should be a int, and in range [-rank(x), rank(x))") + "The stop_axis should be a int, and in range [-rank(x), rank(x))" + ) if start_axis < 0: start_axis = start_axis + x_dim if stop_axis < 0: @@ -1480,22 +1598,19 @@ def flatten(x, start_axis=0, stop_axis=-1, name=None): if _in_legacy_dygraph(): dy_out, _ = _legacy_C_ops.flatten_contiguous_range( - x, 'start_axis', start_axis, 'stop_axis', stop_axis) + x, 'start_axis', start_axis, 'stop_axis', stop_axis + ) return dy_out helper = LayerHelper('flatten', **locals()) out = helper.create_variable_for_type_inference(x.dtype) x_shape = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='flatten_contiguous_range', - inputs={"X": x}, - outputs={ - 'Out': out, - 'XShape': x_shape - }, - attrs={ - "start_axis": start_axis, - "stop_axis": stop_axis - }) + helper.append_op( + type='flatten_contiguous_range', + inputs={"X": x}, + outputs={'Out': out, 'XShape': x_shape}, + attrs={"start_axis": start_axis, "stop_axis": stop_axis}, + ) return out @@ -1509,14 +1624,22 @@ def flatten_(x, start_axis=0, stop_axis=-1, name=None): raise ValueError("The input x should be a Tensor") x_dim = len(x.shape) - if not (isinstance(start_axis, - int)) or (start_axis > x_dim - 1) or start_axis < -x_dim: + if ( + not (isinstance(start_axis, int)) + or (start_axis > x_dim - 1) + or start_axis < -x_dim + ): raise ValueError( - "The start_axis should be a int, and in range [-rank(x), rank(x))") - if not (isinstance(stop_axis, - int)) or (stop_axis > x_dim - 1) or stop_axis < -x_dim: + "The start_axis should be a int, and in range [-rank(x), rank(x))" + ) + if ( + not (isinstance(stop_axis, int)) + or (stop_axis > x_dim - 1) + or stop_axis < -x_dim + ): raise ValueError( - "The stop_axis should be a int, and in range [-rank(x), rank(x))") + "The stop_axis should be a int, and in range [-rank(x), rank(x))" + ) if start_axis < 0: start_axis = start_axis + x_dim if stop_axis < 0: @@ -1529,15 +1652,16 @@ def flatten_(x, start_axis=0, stop_axis=-1, name=None): if _in_legacy_dygraph(): dy_out, _ = _legacy_C_ops.flatten_contiguous_range_( - x, 'start_axis', start_axis, 'stop_axis', stop_axis) + x, 'start_axis', start_axis, 'stop_axis', stop_axis + ) return dy_out def roll(x, 
shifts, axis=None, name=None): """ - Roll the `x` tensor along the given axis(axes). With specific 'shifts', Elements that - roll beyond the last position are re-introduced at the first according to 'shifts'. - If a axis is not specified, + Roll the `x` tensor along the given axis(axes). With specific 'shifts', Elements that + roll beyond the last position are re-introduced at the first according to 'shifts'. + If a axis is not specified, the tensor will be flattened before rolling and then restored to the original shape. Args: @@ -1554,7 +1678,7 @@ def roll(x, shifts, axis=None, name=None): Examples: .. code-block:: python - + import paddle x = paddle.to_tensor([[1.0, 2.0, 3.0], @@ -1587,8 +1711,10 @@ def roll(x, shifts, axis=None, name=None): for i in range(len(axis)): if axis[i] >= len_origin_shape or axis[i] < -len_origin_shape: raise ValueError( - "axis is out of range, it should be in range [{}, {}), but received {}" - .format(-len_origin_shape, len_origin_shape, axis)) + "axis is out of range, it should be in range [{}, {}), but received {}".format( + -len_origin_shape, len_origin_shape, axis + ) + ) else: axis = [] @@ -1604,34 +1730,32 @@ def roll(x, shifts, axis=None, name=None): out = helper.create_variable_for_type_inference(x.dtype) if isinstance(shifts, Variable): - helper.append_op(type='roll', - inputs={ - 'X': x, - "ShiftsTensor": shifts - }, - outputs={'Out': out}, - attrs={'axis': axis}) + helper.append_op( + type='roll', + inputs={'X': x, "ShiftsTensor": shifts}, + outputs={'Out': out}, + attrs={'axis': axis}, + ) else: check_type(shifts, 'shifts', (list, tuple), 'roll') - helper.append_op(type='roll', - inputs={'X': x}, - outputs={'Out': out}, - attrs={ - 'axis': axis, - 'shifts': shifts - }) + helper.append_op( + type='roll', + inputs={'X': x}, + outputs={'Out': out}, + attrs={'axis': axis, 'shifts': shifts}, + ) return out def stack(x, axis=0, name=None): """ - Stacks all the input tensors ``x`` along ``axis`` dimemsion. + Stacks all the input tensors ``x`` along ``axis`` dimemsion. All tensors must be of the same shape and same dtype. - - For example, given N tensors of shape [A, B], if ``axis == 0``, the shape of stacked - tensor is [N, A, B]; if ``axis == 1``, the shape of stacked + + For example, given N tensors of shape [A, B], if ``axis == 0``, the shape of stacked + tensor is [N, A, B]; if ``axis == 1``, the shape of stacked tensor is [A, N, B], etc. - + .. code-block:: text @@ -1679,35 +1803,35 @@ def stack(x, axis=0, name=None): x (list[Tensor]|tuple[Tensor]): Input ``x`` can be a ``list`` or ``tuple`` of tensors, the Tensors in ``x`` must be of the same shape and dtype. Supported data types: float32, float64, int32, int64. axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``, - where ``R`` is the number of dimensions of the first input tensor ``x[0]``. + where ``R`` is the number of dimensions of the first input tensor ``x[0]``. If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. - + Returns: Tensor: The stacked tensor with same data type as input. - Example: + Example: .. 
code-block:: python import paddle - + x1 = paddle.to_tensor([[1.0, 2.0]]) x2 = paddle.to_tensor([[3.0, 4.0]]) x3 = paddle.to_tensor([[5.0, 6.0]]) - + out = paddle.stack([x1, x2, x3], axis=0) print(out.shape) # [3, 1, 2] print(out) # [[[1., 2.]], # [[3., 4.]], # [[5., 6.]]] - - out = paddle.stack([x1, x2, x3], axis=-2) - print(out.shape) # [1, 3, 2] - print(out) - # [[[1., 2.], - # [3., 4.], - # [5., 6.]]] + + out = paddle.stack([x1, x2, x3], axis=-2) + print(out.shape) # [1, 3, 2] + print(out) + # [[[1., 2.], + # [3., 4.], + # [5., 6.]]] """ axis = 0 if axis is None else axis @@ -1720,42 +1844,53 @@ def stack(x, axis=0, name=None): if not isinstance(x, list) and not isinstance(x, tuple): # NOTE:(zhiqiu) Only support Variable as input if the Variable is a LOD_TENSOR_ARRAY create by create_array, array_write, array_read, etc. # In that case, Variable is array of tensors indeed. - if isinstance(x, Variable) and x.desc.type( - ) == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + if ( + isinstance(x, Variable) + and x.desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY + ): x = [x] else: raise TypeError( - "The type of '%s' in %s must be %s, but received %s" % - ('x', 'stack', 'list[Tensor], tuple[Tensor] or TensorArray', - type(x))) + "The type of '%s' in %s must be %s, but received %s" + % ( + 'x', + 'stack', + 'list[Tensor], tuple[Tensor] or TensorArray', + type(x), + ) + ) helper = LayerHelper('stack', **locals()) out = helper.create_variable_for_type_inference(x[0].dtype) if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY: - assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \ - "number of the elements must be 1, but received %s." % len(x) + assert len(x) == 1, ( + "If the elements of 'x' in stack are Variable(LoDTensorArray), " + "number of the elements must be 1, but received %s." % len(x) + ) out_index = helper.create_variable_for_type_inference(dtype="int32") for i in x: - check_variable_and_dtype(i, 'x', \ - ['float16', 'float32', 'float64', 'int32', 'int64'], 'stack') - - helper.append_op(type='tensor_array_to_tensor', - inputs={'X': x[0]}, - outputs={ - 'Out': [out], - 'OutIndex': [out_index] - }, - attrs={ - 'axis': axis, - 'use_stack': True - }) + check_variable_and_dtype( + i, + 'x', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'stack', + ) + + helper.append_op( + type='tensor_array_to_tensor', + inputs={'X': x[0]}, + outputs={'Out': [out], 'OutIndex': [out_index]}, + attrs={'axis': axis, 'use_stack': True}, + ) else: - helper.append_op(type='stack', - inputs={'X': x}, - outputs={'Y': out}, - attrs={'axis': axis}) + helper.append_op( + type='stack', + inputs={'X': x}, + outputs={'Y': out}, + attrs={'axis': axis}, + ) return out @@ -1763,27 +1898,27 @@ def stack(x, axis=0, name=None): def split(x, num_or_sections, axis=0, name=None): """ Split the input tensor into multiple sub-Tensors. - + Args: x (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, uint8, int8, int32 or int64. - num_or_sections (int|list|tuple): If ``num_or_sections`` is an int, then ``num_or_sections`` + num_or_sections (int|list|tuple): If ``num_or_sections`` is an int, then ``num_or_sections`` indicates the number of equal sized sub-Tensors that the ``x`` will be divided into. If ``num_or_sections`` is a list or tuple, the length of it indicates the number of sub-Tensors and the elements in it indicate the sizes of sub-Tensors' dimension orderly. The length of the list must not be larger than the ``x`` 's size of specified ``axis``. 
- axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type + axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type ``int`` or a ``Tensor`` with shape [1] and data type ``int32`` or ``int64``. If :math::`axis < 0`, the axis to split along is :math:`rank(x) + axis`. Default is 0. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Returns: list(Tensor): The list of segmented Tensors. - + Example: .. code-block:: python - + import paddle - + # x is a Tensor of shape [3, 9, 5] x = paddle.rand([3, 9, 5]) @@ -1801,7 +1936,7 @@ def split(x, num_or_sections, axis=0, name=None): print(out0.shape) # [3, 2, 5] print(out1.shape) # [3, 3, 5] print(out2.shape) # [3, 4, 5] - + # axis is negative, the real axis is (rank(x) + axis)=1 out0, out1, out2 = paddle.split(x, num_or_sections=3, axis=-2) print(out0.shape) # [3, 3, 5] @@ -1829,15 +1964,17 @@ def split(x, num_or_sections, axis=0, name=None): if utils._contain_var(num_or_sections): for index, item in enumerate(num_or_sections): if isinstance(item, Variable): - num_or_sections[index] = num_or_sections[index].numpy( - )[0] + num_or_sections[index] = num_or_sections[index].numpy()[ + 0 + ] attrs += ('sections', list(num_or_sections)) else: attrs += ('sections', list(num_or_sections)) else: raise TypeError( "The type of 'num_or_sections' in split must be int, list or tuple in imperative mode, but " - "received %s." % (type(num_or_sections))) + "received %s." % (type(num_or_sections)) + ) if in_dygraph_mode(): if isinstance(num_or_sections, int): return _C_ops.split_with_num(input, num_or_sections, dim) @@ -1848,10 +1985,21 @@ def split(x, num_or_sections, axis=0, name=None): _legacy_C_ops.split(input, out, *attrs) return out - check_variable_and_dtype(input, 'input', [ - 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'uint8', - 'int8' - ], 'split') + check_variable_and_dtype( + input, + 'input', + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'uint8', + 'int8', + ], + 'split', + ) check_type(num_or_sections, 'num_or_sections', (list, int, tuple), 'split') check_type(dim, 'dim', (int, Variable), 'split') if isinstance(dim, Variable): @@ -1871,19 +2019,18 @@ def _get_SectionsTensorList(one_list): dim_size.stop_gradient = True tensor_list.append(dim_size) else: - assert (isinstance(dim_size, int)) + assert isinstance(dim_size, int) if dim_size == -1: assert unk_dim_idx == -1, ( "Only one value of 'num_or_section' in split can " - "be -1. But received num_or_section[%d] is also -1." % - idx) + "be -1. But received num_or_section[%d] is also -1." + % idx + ) unk_dim_idx = idx temp_out = helper.create_variable_for_type_inference('int32') - fill_constant([1], - 'int32', - dim_size, - force_cpu=True, - out=temp_out) + fill_constant( + [1], 'int32', dim_size, force_cpu=True, out=temp_out + ) tensor_list.append(temp_out) return tensor_list @@ -1898,44 +2045,50 @@ def _get_SectionsTensorList(one_list): if isinstance(num_or_sections, int): assert num_or_sections > 1, 'num_or_sections must be more than 1.' if isinstance(dim, int) and input_shape[dim] > 0: - assert input_shape[dim] % num_or_sections ==0, \ - "The input's size along the split dimension " \ - "must be evenly divisible by Attr(num_or_sections). " \ - "But %d is not evenly divisible by %d. 
" % (num_or_sections,input_shape[dim]) + assert input_shape[dim] % num_or_sections == 0, ( + "The input's size along the split dimension " + "must be evenly divisible by Attr(num_or_sections). " + "But %d is not evenly divisible by %d. " + % (num_or_sections, input_shape[dim]) + ) num = num_or_sections else: if isinstance(dim, int) and input_shape[dim] > 0: - assert len(num_or_sections) <= input_shape[ - dim], 'len(num_or_sections) must not be more than input.shape[dim].' + assert ( + len(num_or_sections) <= input_shape[dim] + ), 'len(num_or_sections) must not be more than input.shape[dim].' num = len(num_or_sections) attrs['sections'] = list( - map(lambda ele: -1 - if isinstance(ele, Variable) else ele, num_or_sections)) + map( + lambda ele: -1 if isinstance(ele, Variable) else ele, + num_or_sections, + ) + ) if utils._contain_var(num_or_sections): inputs['SectionsTensorList'] = _get_SectionsTensorList( - num_or_sections) + num_or_sections + ) outs = [ helper.create_variable_for_type_inference(dtype=helper.input_dtype()) for i in range(num) ] - helper.append_op(type='split', - inputs=inputs, - outputs={'Out': outs}, - attrs=attrs) + helper.append_op( + type='split', inputs=inputs, outputs={'Out': outs}, attrs=attrs + ) return outs def squeeze(x, axis=None, name=None): """ - Squeeze the dimension(s) of size 1 of input tensor x's shape. - - Note that the output Tensor will share data with origin Tensor and doesn't have a - Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, + Squeeze the dimension(s) of size 1 of input tensor x's shape. + + Note that the output Tensor will share data with origin Tensor and doesn't have a + Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, please use `Tensor.clone` like ``squeeze_clone_x = x.squeeze().clone()``. - If axis is provided, it will remove the dimension(s) by given axis that of size 1. - If the dimension of given axis is not of size 1, the dimension remain unchanged. + If axis is provided, it will remove the dimension(s) by given axis that of size 1. + If the dimension of given axis is not of size 1, the dimension remain unchanged. If axis is not provided, all dims equal of size 1 will be removed. .. code-block:: text @@ -1955,11 +2108,11 @@ def squeeze(x, axis=None, name=None): axis = 0 Output: out.shape = [3, 1, 5] - + Case4: Input: - x.shape = [1, 3, 1, 5] # If the dimension of one given axis (3) is not of size 1, the dimension remain unchanged. + x.shape = [1, 3, 1, 5] # If the dimension of one given axis (3) is not of size 1, the dimension remain unchanged. axis = [0, 2, 3] Output: out.shape = [3, 5] @@ -1967,7 +2120,7 @@ def squeeze(x, axis=None, name=None): Case4: Input: - x.shape = [1, 3, 1, 5] # If axis is negative, axis = axis + ndim (number of dimensions in x). + x.shape = [1, 3, 1, 5] # If axis is negative, axis = axis + ndim (number of dimensions in x). axis = [-2] Output: out.shape = [1, 3, 5] @@ -1987,7 +2140,7 @@ def squeeze(x, axis=None, name=None): .. 
code-block:: python import paddle - + x = paddle.rand([5, 1, 10]) output = paddle.squeeze(x, axis=1) @@ -2015,10 +2168,22 @@ def squeeze(x, axis=None, name=None): return out helper = LayerHelper("squeeze", **locals()) - check_variable_and_dtype(input, 'input', [ - 'float16', 'float32', 'float64', 'bool', 'int8', 'int32', 'int64', - 'complex64', 'complex128' - ], 'squeeze') + check_variable_and_dtype( + input, + 'input', + [ + 'float16', + 'float32', + 'float64', + 'bool', + 'int8', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + 'squeeze', + ) check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'squeeze') attrs = {} @@ -2033,13 +2198,12 @@ def squeeze(x, axis=None, name=None): out = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type="squeeze2", - inputs={"X": input}, - attrs=attrs, - outputs={ - "Out": out, - "XShape": x_shape - }) + helper.append_op( + type="squeeze2", + inputs={"X": input}, + attrs=attrs, + outputs={"Out": out, "XShape": x_shape}, + ) return out @@ -2066,16 +2230,19 @@ def squeeze_(x, axis=None, name=None): return out -def unique_consecutive(x, - return_inverse=False, - return_counts=False, - axis=None, - dtype="int64", - name=None): +def unique_consecutive( + x, + return_inverse=False, + return_counts=False, + axis=None, + dtype="int64", + name=None, +): r""" Eliminates all but the first element from every consecutive group of equivalent elements. - .. note:: This function is different from :func:`paddle.unique` in the sense that this function + Note: + This function is different from :func:`paddle.unique` in the sense that this function only eliminates consecutive duplicate values. This semantics is similar to `std::unique` in C++. Args: @@ -2097,25 +2264,37 @@ def unique_consecutive(x, Example: .. 
code-block:: python - import paddle + import paddle x = paddle.to_tensor([1, 1, 2, 2, 3, 1, 1, 2]) - output = paddle.unique_consecutive(x) # - np_output = output.numpy() # [1 2 3 1 2] + output = paddle.unique_consecutive(x) # + print(output) + # Tensor(shape=[5], dtype=int64, place=Place(gpu:0), stop_gradient=True, + # [1, 2, 3, 1, 2]) + _, inverse, counts = paddle.unique_consecutive(x, return_inverse=True, return_counts=True) - np_inverse = inverse.numpy() # [0 0 1 1 2 3 3 4] - np_counts = inverse.numpy() # [2 2 1 2 1] + print(inverse) + # Tensor(shape=[8], dtype=int64, place=Place(gpu:0), stop_gradient=True, + # [0, 0, 1, 1, 2, 3, 3, 4]) + print(counts) + # Tensor(shape=[5], dtype=int64, place=Place(gpu:0), stop_gradient=True, + # [2, 2, 1, 2, 1]) x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3], [2, 1, 3]]) - output = paddle.unique_consecutive(x, axis=0) # - np_output = output.numpy() # [2 1 3 0 1 2 1 3 2 1 3] + output = paddle.unique_consecutive(x, axis=0) # + print(output) + # Tensor(shape=[3, 3], dtype=int64, place=Place(gpu:0), stop_gradient=True, + # [[2, 1, 3], + # [3, 0, 1], + # [2, 1, 3]]) x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3], [2, 1, 3]]) - output = paddle.unique_consecutive(x, axis=0) # - np_output = output.numpy() - # [[2 1 3] - # [3 0 1] - # [2 1 3]] + output = paddle.unique_consecutive(x, axis=0) # + print(output) + # Tensor(shape=[3, 3], dtype=int64, place=Place(gpu:0), stop_gradient=True, + # [[2, 1, 3], + # [3, 0, 1], + # [2, 1, 3]]) """ if axis is None: @@ -2125,7 +2304,8 @@ def unique_consecutive(x, attr_dtype = convert_np_dtype_to_dtype_(dtype) if in_dygraph_mode(): out, inverse, counts = _C_ops.unique_consecutive( - x, return_inverse, return_counts, axis, attr_dtype) + x, return_inverse, return_counts, axis, attr_dtype + ) outs = [out] if return_inverse: outs.append(inverse) @@ -2136,8 +2316,16 @@ def unique_consecutive(x, return tuple(outs) elif paddle.in_dynamic_mode(): out, inverse, counts = _legacy_C_ops.unique_consecutive( - x, 'dtype', attr_dtype, 'return_inverse', return_inverse, - 'return_counts', return_counts, 'axis', axis) + x, + 'dtype', + attr_dtype, + 'return_inverse', + return_inverse, + 'return_counts', + return_counts, + 'axis', + axis, + ) outs = [out] if return_inverse: outs.append(inverse) @@ -2146,9 +2334,12 @@ def unique_consecutive(x, if len(outs) == 1: return outs[0] return tuple(outs) - check_variable_and_dtype(x, "input", - ['float32', 'float64', 'int32', 'int64'], - 'unique_consecutive') + check_variable_and_dtype( + x, + "input", + ['float32', 'float64', 'int32', 'int64'], + 'unique_consecutive', + ) check_type(return_inverse, 'return_inverse', bool, 'unique_consecutive') check_type(return_counts, 'return_counts', bool, 'unique_consecutive') check_dtype(dtype, 'dtype', ['int32', 'int64'], 'unique_consecutive') @@ -2161,34 +2352,38 @@ def unique_consecutive(x, "return_counts": return_counts, "axis": axis, } - out = helper.create_variable_for_type_inference(dtype=x.dtype, - stop_gradient=True) - inverse = helper.create_variable_for_type_inference(dtype=attr_dtype, - stop_gradient=True) - counts = helper.create_variable_for_type_inference(dtype=attr_dtype, - stop_gradient=True) + out = helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) + inverse = helper.create_variable_for_type_inference( + dtype=attr_dtype, stop_gradient=True + ) + counts = helper.create_variable_for_type_inference( + dtype=attr_dtype, stop_gradient=True + ) outputs = {"Out": out, "Index": inverse, "Counts": counts} outs 
= [out] if return_inverse: outs.append(inverse) if return_counts: outs.append(counts) - helper.append_op(type="unique_consecutive", - inputs={"X": x}, - attrs=attrs, - outputs=outputs) + helper.append_op( + type="unique_consecutive", inputs={"X": x}, attrs=attrs, outputs=outputs + ) if len(outs) == 1: return outs[0] return tuple(outs) -def unique(x, - return_index=False, - return_inverse=False, - return_counts=False, - axis=None, - dtype="int64", - name=None): +def unique( + x, + return_index=False, + return_inverse=False, + return_counts=False, + axis=None, + dtype="int64", + name=None, +): r""" Returns the unique elements of `x` in ascending order. @@ -2220,18 +2415,27 @@ def unique(x, unique = paddle.unique(x) np_unique = unique.numpy() # [1 2 3 5] _, indices, inverse, counts = paddle.unique(x, return_index=True, return_inverse=True, return_counts=True) - np_indices = indices.numpy() # [3 0 1 4] - np_inverse = inverse.numpy() # [1 2 2 0 3 2] - np_counts = counts.numpy() # [1 1 3 1] + print(indices) + # Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True, + # [3, 0, 1, 4]) + print(inverse) + # Tensor(shape=[6], dtype=int64, place=Place(gpu:0), stop_gradient=True, + # [1, 2, 2, 0, 3, 2]) + print(counts) + # Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True, + # [1, 1, 3, 1]) x = paddle.to_tensor([[2, 1, 3], [3, 0, 1], [2, 1, 3]]) unique = paddle.unique(x) - np_unique = unique.numpy() # [0 1 2 3] + print(unique) + # Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True, + # [0, 1, 2, 3]) unique = paddle.unique(x, axis=0) - np_unique = unique.numpy() - # [[2 1 3] - # [3 0 1]] + print(unique) + # Tensor(shape=[2, 3], dtype=int64, place=Place(gpu:0), stop_gradient=True, + # [[2, 1, 3], + # [3, 0, 1]]) """ if axis is None: axis = [] @@ -2241,13 +2445,24 @@ def unique(x, if _non_static_mode(): if in_dygraph_mode(): out, indices, inverse, counts = _C_ops.unique( - x, return_index, return_inverse, return_counts, axis, - attr_dtype) + x, return_index, return_inverse, return_counts, axis, attr_dtype + ) if _in_legacy_dygraph(): out, inverse, indices, counts = _legacy_C_ops.unique( - x, 'dtype', attr_dtype, 'return_index', return_index, - 'return_inverse', return_inverse, 'return_counts', - return_counts, 'axis', axis, "is_sorted", True) + x, + 'dtype', + attr_dtype, + 'return_index', + return_index, + 'return_inverse', + return_inverse, + 'return_counts', + return_counts, + 'axis', + axis, + "is_sorted", + True, + ) outs = [out] if return_index: outs.append(indices) @@ -2261,8 +2476,9 @@ def unique(x, return tuple(outs) - check_variable_and_dtype(x, "input", - ['float32', 'float64', 'int32', 'int64'], 'unique') + check_variable_and_dtype( + x, "input", ['float32', 'float64', 'int32', 'int64'], 'unique' + ) check_type(return_index, 'return_index', bool, 'unique') check_type(return_inverse, 'return_inverse', bool, 'unique') check_type(return_counts, 'return_counts', bool, 'unique') @@ -2277,21 +2493,25 @@ def unique(x, "return_inverse": return_inverse, "return_counts": return_counts, "axis": axis, - "is_sorted": True + "is_sorted": True, } - out = helper.create_variable_for_type_inference(dtype=x.dtype, - stop_gradient=True) - indices = helper.create_variable_for_type_inference(dtype=attr_dtype, - stop_gradient=True) - inverse = helper.create_variable_for_type_inference(dtype=attr_dtype, - stop_gradient=True) - counts = helper.create_variable_for_type_inference(dtype=attr_dtype, - stop_gradient=True) + out = 
helper.create_variable_for_type_inference( + dtype=x.dtype, stop_gradient=True + ) + indices = helper.create_variable_for_type_inference( + dtype=attr_dtype, stop_gradient=True + ) + inverse = helper.create_variable_for_type_inference( + dtype=attr_dtype, stop_gradient=True + ) + counts = helper.create_variable_for_type_inference( + dtype=attr_dtype, stop_gradient=True + ) outputs = { "Out": out, "Indices": indices, "Index": inverse, - "Counts": counts + "Counts": counts, } outs = [out] if return_index: @@ -2301,10 +2521,9 @@ def unique(x, if return_counts: outs.append(counts) - helper.append_op(type="unique", - inputs={"X": x}, - attrs=attrs, - outputs=outputs) + helper.append_op( + type="unique", inputs={"X": x}, attrs=attrs, outputs=outputs + ) if len(outs) == 1: return outs[0] @@ -2318,14 +2537,14 @@ def unsqueeze(x, axis, name=None): required argument axis, a dimension or list of dimensions that will be inserted. Dimension indices in axis are as seen in the output tensor. - Note that the output Tensor will share data with origin Tensor and doesn't have a - Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, + Note that the output Tensor will share data with origin Tensor and doesn't have a + Tensor copy in ``dygraph`` mode. If you want to use the Tensor copy version, please use `Tensor.clone` like ``unsqueeze_clone_x = x.unsqueeze(-1).clone()``. Args: x (Tensor): The input Tensor to be unsqueezed. Supported data type: float32, float64, bool, int8, int32, int64. - axis (int|list|tuple|Tensor): Indicates the dimensions to be inserted. The data type is ``int32`` . - If ``axis`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. + axis (int|list|tuple|Tensor): Indicates the dimensions to be inserted. The data type is ``int32`` . + If ``axis`` is a list or tuple, the elements of it should be integers or Tensors with shape [1]. If ``axis`` is a Tensor, it should be an 1-D Tensor . If ``axis`` is negative, ``axis = axis + ndim(x) + 1``. name (str|None): Name for this layer. Please refer to :ref:`api_guide_Name`, Default None. @@ -2340,15 +2559,15 @@ def unsqueeze(x, axis, name=None): x = paddle.rand([5, 10]) print(x.shape) # [5, 10] - + out1 = paddle.unsqueeze(x, axis=0) print(out1.shape) # [1, 5, 10] - - out2 = paddle.unsqueeze(x, axis=[0, 2]) + + out2 = paddle.unsqueeze(x, axis=[0, 2]) print(out2.shape) # [1, 5, 1, 10] axis = paddle.to_tensor([0, 1, 2]) - out3 = paddle.unsqueeze(x, axis=axis) + out3 = paddle.unsqueeze(x, axis=axis) print(out3.shape) # [1, 1, 1, 5, 10] # out1, out2, out3 share data with x in dygraph mode @@ -2356,7 +2575,7 @@ def unsqueeze(x, axis, name=None): print(out1[0, 0, 0]) # [10.] print(out2[0, 0, 0, 0]) # [10.] print(out3[0, 0, 0, 0, 0]) # [10.] 
- + """ input = x axes = axis @@ -2376,18 +2595,23 @@ def unsqueeze(x, axis, name=None): return _C_ops.unsqueeze(input, axes) check_type(axes, 'axis/axes', (int, list, tuple, Variable), 'unsqueeze') - check_variable_and_dtype(input, 'input', [ - 'float16', - 'float32', - 'float64', - 'bool', - 'int8', - 'int16', - 'int32', - 'int64', - 'complex64', - 'complex128', - ], 'unsqueeze') + check_variable_and_dtype( + input, + 'input', + [ + 'float16', + 'float32', + 'float64', + 'bool', + 'int8', + 'int16', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + 'unsqueeze', + ) helper = LayerHelper("unsqueeze2", **locals()) inputs = {"X": input} attrs = {} @@ -2405,13 +2629,12 @@ def unsqueeze(x, axis, name=None): out = helper.create_variable_for_type_inference(dtype=input.dtype) x_shape = helper.create_variable_for_type_inference(dtype=input.dtype) - helper.append_op(type="unsqueeze2", - inputs=inputs, - attrs=attrs, - outputs={ - "Out": out, - "XShape": x_shape - }) + helper.append_op( + type="unsqueeze2", + inputs=inputs, + attrs=attrs, + outputs={"Out": out, "XShape": x_shape}, + ) return out @@ -2459,7 +2682,7 @@ def gather(x, index, axis=None, name=None): Then: out = [[3, 4], - [5, 6]] + [5, 6]] Args: x (Tensor): The source input tensor with rank>=1. Supported data type is @@ -2472,7 +2695,7 @@ def gather(x, index, axis=None, name=None): Returns: output (Tensor): The output is a tensor with the same rank as ``x``. - + Examples: .. code-block:: python @@ -2491,13 +2714,16 @@ def gather(x, index, axis=None, name=None): return _C_ops.gather(x, index, axis) if _in_legacy_dygraph(): axis = axis.item() if isinstance(axis, paddle.Tensor) else axis - return _legacy_C_ops.gather(x, index, None, "axis", axis, "overwrite", - False) + return _legacy_C_ops.gather( + x, index, None, "axis", axis, "overwrite", False + ) check_variable_and_dtype( - x, 'x', + x, + 'x', ['float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'uint8'], - 'gather') + 'gather', + ) check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather') if isinstance(axis, Variable): @@ -2507,25 +2733,19 @@ def gather(x, index, axis=None, name=None): dtype = helper.input_dtype('x') out = helper.create_variable_for_type_inference(dtype) if not isinstance(axis, Variable): - helper.append_op(type="gather", - inputs={ - "X": x, - "Index": index - }, - attrs={ - 'axis': axis, - 'overwrite': False - }, - outputs={"Out": out}) + helper.append_op( + type="gather", + inputs={"X": x, "Index": index}, + attrs={'axis': axis, 'overwrite': False}, + outputs={"Out": out}, + ) else: - helper.append_op(type="gather", - inputs={ - "X": x, - "Index": index, - "Axis": axis - }, - attrs={"overwrite": False}, - outputs={"Out": out}) + helper.append_op( + type="gather", + inputs={"X": x, "Index": index, "Axis": axis}, + attrs={"overwrite": False}, + outputs={"Out": out}, + ) return out @@ -2537,7 +2757,7 @@ def unbind(input, axis=0): Args: input (Tensor): The input variable which is an N-D Tensor, data type being float32, float64, int32 or int64. - axis (int32|int64, optional): A scalar with type ``int32|int64`` shape [1]. The dimension along which to unbind. + axis (int32|int64, optional): A scalar with type ``int32|int64`` shape [1]. The dimension along which to unbind. If :math:`axis < 0`, the dimension to unbind along is :math:`rank(input) + axis`. Default is 0. Returns: list(Tensor): The list of segmented Tensor variables. 
@@ -2549,7 +2769,7 @@ def unbind(input, axis=0): # input is a Tensor which shape is [3, 4, 5] input = paddle.rand([3, 4, 5]) - + [x0, x1, x2] = paddle.unbind(input, axis=0) # x0.shape [4, 5] # x1.shape [4, 5] @@ -2565,8 +2785,9 @@ def unbind(input, axis=0): return _C_ops.unbind(input, axis) if not isinstance(axis, (int)): - raise TypeError("The type of 'axis' must be int, but received %s." % - (type(axis))) + raise TypeError( + "The type of 'axis' must be int, but received %s." % (type(axis)) + ) if isinstance(axis, np.generic): axis = np.asscalar(axis) input_shape = input.shape @@ -2578,16 +2799,19 @@ def unbind(input, axis=0): helper = LayerHelper("unbind", **locals()) check_type(input, 'input', (Variable), 'unbind') dtype = helper.input_dtype() - check_dtype(dtype, 'unbind', ['float32', 'float64', 'int32', 'int64'], - 'unbind') + check_dtype( + dtype, 'unbind', ['float32', 'float64', 'int32', 'int64'], 'unbind' + ) outs = [ helper.create_variable_for_type_inference(dtype=helper.input_dtype()) for i in range(num) ] - helper.append_op(type="unbind", - inputs={"X": input}, - outputs={"Out": outs}, - attrs={"axis": axis}) + helper.append_op( + type="unbind", + inputs={"X": input}, + outputs={"Out": outs}, + attrs={"axis": axis}, + ) return outs @@ -2595,9 +2819,9 @@ def scatter(x, index, updates, overwrite=True, name=None): """ **Scatter Layer** Output is obtained by updating the input on selected indices based on updates. - + .. code-block:: python - + import numpy as np #input: x = np.array([[1, 1], [2, 2], [3, 3]]) @@ -2619,32 +2843,32 @@ def scatter(x, index, updates, overwrite=True, name=None): out = np.array([[3, 3], [6, 6], [1, 1]]) out.shape # [3, 2] - **NOTICE**: The order in which updates are applied is nondeterministic, + **NOTICE**: The order in which updates are applied is nondeterministic, so the output will be nondeterministic if index contains duplicates. Args: x (Tensor): The input N-D Tensor with ndim>=1. Data type can be float32, float64. index (Tensor): The index 1-D Tensor. Data type can be int32, int64. The length of index cannot exceed updates's length, and the value in index cannot exceed input's length. updates (Tensor): update input with updates parameter based on index. shape should be the same as input, and dim value with dim > 1 should be the same as input. - overwrite (bool): The mode that updating the output when there are same indices. - + overwrite (bool): The mode that updating the output when there are same indices. + If True, use the overwrite mode to update the output of the same index, - if False, use the accumulate mode to update the output of the same index.Default value is True. - + if False, use the accumulate mode to update the output of the same index.Default value is True. + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - + Returns: Tensor: The output is a Tensor with the same shape as x. Examples: .. 
code-block:: python - + import paddle x = paddle.to_tensor([[1, 1], [2, 2], [3, 3]], dtype='float32') index = paddle.to_tensor([2, 1, 0, 1], dtype='int64') updates = paddle.to_tensor([[1, 1], [2, 2], [3, 3], [4, 4]], dtype='float32') - + output1 = paddle.scatter(x, index, updates, overwrite=False) # [[3., 3.], # [6., 6.], @@ -2669,23 +2893,25 @@ def scatter(x, index, updates, overwrite=True, name=None): return _C_ops.scatter(x, index, updates, overwrite) else: if _in_legacy_dygraph(): - return _legacy_C_ops.scatter(x, index, updates, 'overwrite', - overwrite) + return _legacy_C_ops.scatter( + x, index, updates, 'overwrite', overwrite + ) else: check_variable_and_dtype( - x, 'dtype', ['float32', 'float64', 'float16', 'int32', 'int64'], - 'scatter') + x, + 'dtype', + ['float32', 'float64', 'float16', 'int32', 'int64'], + 'scatter', + ) check_type(overwrite, 'overwrite', bool, 'scatter') helper = LayerHelper('scatter', **locals()) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type="scatter", - inputs={ - "X": x, - "Ids": index, - "Updates": updates - }, - attrs={'overwrite': overwrite}, - outputs={"Out": out}) + helper.append_op( + type="scatter", + inputs={"X": x, "Ids": index, "Updates": updates}, + attrs={'overwrite': overwrite}, + outputs={"Out": out}, + ) return out @@ -2764,7 +2990,7 @@ def scatter_nd_add(x, index, updates, name=None): index = paddle.to_tensor([[1, 1], [0, 1], [1, 3]], dtype='int64') - + output = paddle.scatter_nd_add(x, index, updates) print(output.shape) # [3, 5, 9, 10] @@ -2782,13 +3008,11 @@ def scatter_nd_add(x, index, updates, name=None): helper = LayerHelper('scatter_nd_add', **locals()) dtype = helper.input_dtype(input_param_name='x') output = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="scatter_nd_add", - inputs={ - "X": x, - "Index": index, - "Updates": updates - }, - outputs={"Out": output}) + helper.append_op( + type="scatter_nd_add", + inputs={"X": x, "Index": index, "Updates": updates}, + outputs={"Out": output}, + ) return output @@ -2821,12 +3045,10 @@ def scatter_nd(index, updates, shape, name=None): .. code-block:: python import paddle - import numpy as np - index_data = np.array([[1, 1], - [0, 1], - [1, 3]]).astype(np.int64) - index = paddle.to_tensor(index_data) + index = paddle.to_tensor([[1, 1], + [0, 1], + [1, 3]], dtype="int64") updates = paddle.rand(shape=[3, 9, 10], dtype='float32') shape = [3, 5, 9, 10] @@ -2838,34 +3060,31 @@ def scatter_nd(index, updates, shape, name=None): def chunk(x, chunks, axis=0, name=None): """ Split the input tensor into multiple sub-Tensors. - + Args: x (Tensor): A N-D Tensor. The data type is bool, float16, float32, float64, int32 or int64. chunks(int): The number of tensor to be split along the certain axis. - axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type + axis (int|Tensor, optional): The axis along which to split, it can be a scalar with type ``int`` or a ``Tensor`` with shape [1] and data type ``int32`` or ``int64``. If :math::`axis < 0`, the axis to split along is :math:`rank(x) + axis`. Default is 0. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . Returns: list(Tensor): The list of segmented Tensors. - - Example: + + Examples: .. 
code-block:: python - - import numpy as np + import paddle - - # x is a Tensor which shape is [3, 9, 5] - x_np = np.random.random([3, 9, 5]).astype("int32") - x = paddle.to_tensor(x_np) + + x = paddle.rand([3, 9, 5]) out0, out1, out2 = paddle.chunk(x, chunks=3, axis=1) # out0.shape [3, 3, 5] # out1.shape [3, 3, 5] # out2.shape [3, 3, 5] - + # axis is negative, the real axis is (rank(x) + axis) which real # value is 1. out0, out1, out2 = paddle.chunk(x, chunks=3, axis=-2) @@ -2901,23 +3120,28 @@ def tile(x, repeat_times, name=None): data = paddle.to_tensor([1, 2, 3], dtype='int32') out = paddle.tile(data, repeat_times=[2, 1]) - np_out = out.numpy() - # [[1, 2, 3] - # [1, 2, 3]] + print(out) + # Tensor(shape=[2, 3], dtype=int32, place=Place(gpu:0), stop_gradient=True, + # [[1, 2, 3], + # [1, 2, 3]]) out = paddle.tile(data, repeat_times=(2, 2)) - np_out = out.numpy() - # [[1, 2, 3, 1, 2, 3] - # [1, 2, 3, 1, 2, 3]] + print(out) + # Tensor(shape=[2, 6], dtype=int32, place=Place(gpu:0), stop_gradient=True, + # [[1, 2, 3, 1, 2, 3], + # [1, 2, 3, 1, 2, 3]]) repeat_times = paddle.to_tensor([1, 2], dtype='int32') out = paddle.tile(data, repeat_times=repeat_times) - np_out = out.numpy() - # [[1, 2, 3, 1, 2, 3]] + print(out) + # Tensor(shape=[1, 6], dtype=int32, place=Place(gpu:0), stop_gradient=True, + # [[1, 2, 3, 1, 2, 3]]) """ if in_dygraph_mode(): if isinstance(repeat_times, core.eager.Tensor): - assert repeat_times.ndim == 1, "Only support ndim == 1 while repeat_times is a Tensor." + assert ( + repeat_times.ndim == 1 + ), "Only support ndim == 1 while repeat_times is a Tensor." repeat_times = repeat_times.numpy().tolist() return _C_ops.tile(x, repeat_times) @@ -2927,26 +3151,30 @@ def tile(x, repeat_times, name=None): check_type(repeat_times, 'repeat_times', (list, tuple, Variable), 'tile') if isinstance(repeat_times, Variable): - assert len( - repeat_times.shape) == 1, ('repeat_times must be an 1-D Tensor.') + assert ( + len(repeat_times.shape) == 1 + ), 'repeat_times must be an 1-D Tensor.' else: for elem in repeat_times: if isinstance(elem, Variable): - assert len(elem.shape) == 1, ( - 'Elements in repeat_times must be 1-D Tensors or integers.') + assert ( + len(elem.shape) == 1 + ), 'Elements in repeat_times must be 1-D Tensors or integers.' else: type_tuple = (int, np.int32, np.int64) - assert isinstance(elem, type_tuple), ( - 'Elements in repeat_times must be 1-D Tensors or integers.') + assert isinstance( + elem, type_tuple + ), 'Elements in repeat_times must be 1-D Tensors or integers.' - check_variable_and_dtype(x, 'x', - ['bool', 'float32', 'float64', 'int32', 'int64'], - 'tile') + check_variable_and_dtype( + x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'tile' + ) if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False: raise ValueError( "When the date type is bool for the input 'x' of tile op, you " "must set its stop_gradient to be True by " - "some_var.stop_gradient == True supporting some_var is the input.") + "some_var.stop_gradient == True supporting some_var is the input." + ) helper = LayerHelper('tile', **locals()) @@ -2960,8 +3188,9 @@ def get_attr_repeat_times(list_repeat_times): attrs_repeat_times.append(-1) else: attrs_repeat_times.append(times) - assert times > 0, ( - "All elements in repeat_times must be positive for tile.") + assert ( + times > 0 + ), "All elements in repeat_times must be positive for tile." 
return attrs_repeat_times if isinstance(repeat_times, Variable): @@ -2972,14 +3201,14 @@ def get_attr_repeat_times(list_repeat_times): attrs['repeat_times'] = get_attr_repeat_times(repeat_times) if utils._contain_var(repeat_times): inputs['repeat_times_tensor'] = utils._convert_to_tensor_list( - repeat_times) + repeat_times + ) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='tile', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='tile', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return out @@ -3006,8 +3235,10 @@ def expand_as(x, y, name=None): data_x = paddle.to_tensor([1, 2, 3], 'int32') data_y = paddle.to_tensor([[1, 2, 3], [4, 5, 6]], 'int32') out = paddle.expand_as(data_x, data_y) - np_out = out.numpy() - # [[1, 2, 3], [1, 2, 3]] + print(out) + # Tensor(shape=[2, 3], dtype=int32, place=Place(gpu:0), stop_gradient=True, + # [[1, 2, 3], + # [1, 2, 3]]) """ if in_dygraph_mode(): return _C_ops.expand_as(x, None, y.shape) @@ -3015,9 +3246,9 @@ def expand_as(x, y, name=None): if _non_static_mode(): return _legacy_C_ops.expand_as_v2(x, 'target_shape', y.shape) - check_variable_and_dtype(x, 'x', - ['bool', 'float32', 'float64', 'int32', 'int64'], - 'expand_as') + check_variable_and_dtype( + x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'expand_as' + ) check_type(y, 'y', Variable, 'expand_as') if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False: @@ -3025,16 +3256,19 @@ def expand_as(x, y, name=None): "When the data type of input 'x' for expand_as is bool, " "you must set its stop_gradient to be False by " "some_var.stop_gradient = True, supporting " - "some_var as the input 'x'.") + "some_var as the input 'x'." + ) inputs = {"X": [x], "Y": [y]} helper = LayerHelper('expand_as', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='expand_as_v2', - inputs=inputs, - attrs={'target_shape': y.shape}, - outputs={'Out': out}) + helper.append_op( + type='expand_as_v2', + inputs=inputs, + attrs={'target_shape': y.shape}, + outputs={'Out': out}, + ) return out @@ -3049,7 +3283,7 @@ def broadcast_to(x, shape, name=None): Args: x (Tensor): The input tensor, its data type is bool, float32, float64, int32 or int64. shape (list|tuple|Tensor): The result shape after broadcasting. The data type is int32. If shape is a list or tuple, all its elements - should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. + should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. The value -1 in shape means keeping the corresponding dimension unchanged. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -3071,27 +3305,30 @@ def broadcast_to(x, shape, name=None): return _legacy_C_ops.expand_v2(x, 'shape', shape) if isinstance(shape, Variable): - assert len(shape.shape) == 1, ('shape must be an 1-D Tensor.') + assert len(shape.shape) == 1, 'shape must be an 1-D Tensor.' else: for elem in shape: if isinstance(elem, Variable): - assert len(elem.shape) == 1, ( - 'Elements in shape must be 1-D Tensors or integers.') + assert ( + len(elem.shape) == 1 + ), 'Elements in shape must be 1-D Tensors or integers.' 
else: type_tuple = (int, np.int32, np.int64) - assert isinstance(elem, type_tuple), ( - 'Elements in shape must be 1-D Tensors or integers.') + assert isinstance( + elem, type_tuple + ), 'Elements in shape must be 1-D Tensors or integers.' - check_variable_and_dtype(x, 'x', - ['bool', 'float32', 'float64', 'int32', 'int64'], - 'broadcast_to') + check_variable_and_dtype( + x, 'x', ['bool', 'float32', 'float64', 'int32', 'int64'], 'broadcast_to' + ) check_type(shape, 'shape', (list, tuple, Variable), 'broadcast_to') if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False: raise ValueError( "When the data type of input 'x' for broadcast_to is bool, " "you must set its stop_gradient to be False by " "some_var.stop_gradient = True, supporting " - "some_var as the input.") + "some_var as the input." + ) inputs = {"X": [x]} attrs = {} @@ -3105,9 +3342,9 @@ def get_attr_expand_shape(list_expand_shape): attrs_expand_shape.append(-1) else: attrs_expand_shape.append(shape) - assert shape > 0 or shape == -1, ( - "All elements in shape of broadcast_to must be positive or -1." - ) + assert ( + shape > 0 or shape == -1 + ), "All elements in shape of broadcast_to must be positive or -1." return attrs_expand_shape if isinstance(shape, Variable): @@ -3117,14 +3354,14 @@ def get_attr_expand_shape(list_expand_shape): attrs['shape'] = get_attr_expand_shape(shape) if utils._contain_var(shape): inputs['expand_shapes_tensor'] = utils._convert_to_tensor_list( - shape) + shape + ) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='expand_v2', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='expand_v2', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return out @@ -3139,7 +3376,7 @@ def expand(x, shape, name=None): Args: x (Tensor): The input Tensor, its data type is bool, float32, float64, int32 or int64. shape (list|tuple|Tensor): The result shape after expanding. The data type is int32. If shape is a list or tuple, all its elements - should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. + should be integers or 1-D Tensors with the data type int32. If shape is a Tensor, it should be an 1-D Tensor with the data type int32. The value -1 in shape means keeping the corresponding dimension unchanged. name (str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . @@ -3163,26 +3400,33 @@ def expand(x, shape, name=None): return _legacy_C_ops.expand_v2(x, 'shape', shape) if isinstance(shape, Variable): - assert len(shape.shape) == 1, ('shape must be an 1-D Tensor.') + assert len(shape.shape) == 1, 'shape must be an 1-D Tensor.' else: for elem in shape: if isinstance(elem, Variable): - assert len(elem.shape) == 1, ( - 'Elements in shape must be 1-D Tensors or integers.') + assert ( + len(elem.shape) == 1 + ), 'Elements in shape must be 1-D Tensors or integers.' else: type_tuple = (int, np.int32, np.int64) - assert isinstance(elem, type_tuple), ( - 'Elements in shape must be 1-D Tensors or integers.') + assert isinstance( + elem, type_tuple + ), 'Elements in shape must be 1-D Tensors or integers.' 
check_variable_and_dtype( - x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'expand') + x, + 'x', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'expand', + ) check_type(shape, 'shape', (list, tuple, Variable), 'expand') if convert_dtype(x.dtype) == 'bool' and x.stop_gradient == False: - raise ValueError("When the data type of input 'x' for expand is bool, " - "you must set its stop_gradient to be False by " - "some_var.stop_gradient = True, supporting " - "some_var as the input.") + raise ValueError( + "When the data type of input 'x' for expand is bool, " + "you must set its stop_gradient to be False by " + "some_var.stop_gradient = True, supporting " + "some_var as the input." + ) inputs = {"X": [x]} attrs = {} @@ -3196,8 +3440,9 @@ def get_attr_expand_shape(list_expand_shape): attrs_expand_shape.append(-2) else: attrs_expand_shape.append(shape) - assert shape > 0 or shape == -1, ( - "All elements in shape of expand must be positive or -1.") + assert ( + shape > 0 or shape == -1 + ), "All elements in shape of expand must be positive or -1." return attrs_expand_shape if isinstance(shape, Variable): @@ -3207,14 +3452,14 @@ def get_attr_expand_shape(list_expand_shape): attrs['shape'] = get_attr_expand_shape(shape) if utils._contain_var(shape): inputs['expand_shapes_tensor'] = utils._convert_to_tensor_list( - shape) + shape + ) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='expand_v2', - inputs=inputs, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='expand_v2', inputs=inputs, outputs={'Out': out}, attrs=attrs + ) return out @@ -3223,8 +3468,8 @@ def reshape(x, shape, name=None): Changes the shape of ``x`` without changing its data. Note that the output Tensor will share data with origin Tensor and doesn't - have a Tensor copy in ``dygraph`` mode. - If you want to use the Tensor copy version, please use `Tensor.clone` like + have a Tensor copy in ``dygraph`` mode. + If you want to use the Tensor copy version, please use `Tensor.clone` like ``reshape_clone_x = x.reshape([-1]).clone()``. Some tricks exist when specifying the target shape. @@ -3283,7 +3528,7 @@ def reshape(x, shape, name=None): if in_dygraph_mode(): tmp_tensor_type = core.eager.Tensor - #TODO(zhiqiu): enable inplace in dygraph mode. + # TODO(zhiqiu): enable inplace in dygraph mode. if inplace: warnings.warn( "Inplace on reshape is not allowed and will be discarded in dygraph mode currently." 
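A minimal, illustrative sketch of the ``-1`` and ``0`` shape tricks mentioned in the ``reshape`` docstring above, assuming a recent Paddle release (the tensor names below are hypothetical and not part of the patch):

.. code-block:: python

    import paddle

    x = paddle.ones([2, 4, 6])
    # -1 lets the dimension be inferred from the remaining element count.
    y = paddle.reshape(x, [-1, 6])   # y.shape == [8, 6]
    # 0 copies the size of the corresponding dimension of x.
    z = paddle.reshape(x, [0, -1])   # z.shape == [2, 24]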
@@ -3291,7 +3536,9 @@ def reshape(x, shape, name=None): if isinstance(shape, (list, tuple)): shape = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item for item in shape + if isinstance(item, tmp_tensor_type) + else item + for item in shape ] out = _C_ops.reshape(x, shape) elif isinstance(shape, tmp_tensor_type): @@ -3300,7 +3547,8 @@ def reshape(x, shape, name=None): else: raise ValueError( "shape must be an instance of `list`, `tuple` or `Variable`," - " got '{}.'".format(type(shape))) + " got '{}.'".format(type(shape)) + ) return dygraph_utils._append_activation_in_dygraph(out, act) else: @@ -3322,14 +3570,26 @@ def reshape(x, shape, name=None): else: raise ValueError( "shape must be an instance of `list`, `tuple` or `Variable`," - " got '{}.'".format(type(shape))) + " got '{}.'".format(type(shape)) + ) return dygraph_utils._append_activation_in_dygraph(out, act) - check_variable_and_dtype(x, 'x', [ - 'float16', 'float32', 'float64', 'int16', 'int32', 'int64', 'bool', - 'uint16' - ], 'reshape') + check_variable_and_dtype( + x, + 'x', + [ + 'float16', + 'float32', + 'float64', + 'int16', + 'int32', + 'int64', + 'bool', + 'uint16', + ], + 'reshape', + ) check_type(shape, 'shape', (list, tuple, Variable), 'reshape') check_type(actual_shape, 'actual_shape', (Variable, type(None)), 'reshape') @@ -3353,20 +3613,23 @@ def get_attr_shape(list_shape): "\t# z.shape is [-1, -1, 4]\n\n" " If your target shape in Reshape represents dynamic shape, " "please turn it into a Tensor under @to_static. See above example for details." - % dim_idx) + % dim_idx + ) unk_dim_idx = dim_idx elif dim_size == 0: assert dim_idx < len(x.shape), ( "The index of 0 in `shape` must be less than " "the input tensor X's dimensions. " - "But received shape[%d] = 0, X's dimensions = %d." % - (dim_idx, len(x.shape))) + "But received shape[%d] = 0, X's dimensions = %d." + % (dim_idx, len(x.shape)) + ) else: assert dim_size > 0, ( "Each dimension value of 'shape' in reshape must not " "be negative except one unknown dimension. " - "But received shape[%d] = %s." % - (dim_idx, str(dim_size))) + "But received shape[%d] = %s." + % (dim_idx, str(dim_size)) + ) return attrs_shape inputs = {"X": x} @@ -3375,8 +3638,10 @@ def get_attr_shape(list_shape): shape.stop_gradient = True inputs["Shape"] = shape elif isinstance(shape, (list, tuple)): - assert len(shape) > 0, ("The size of 'shape' in reshape can't be zero, " - "but received %s." % len(shape)) + assert len(shape) > 0, ( + "The size of 'shape' in reshape can't be zero, " + "but received %s." 
% len(shape) + ) attrs["shape"] = get_attr_shape(shape) if utils._contain_var(shape): inputs['ShapeTensor'] = utils._convert_to_tensor_list(shape) @@ -3384,16 +3649,18 @@ def get_attr_shape(list_shape): actual_shape.stop_gradient = True inputs["Shape"] = actual_shape - out = x if inplace else helper.create_variable_for_type_inference( - dtype=x.dtype) + out = ( + x + if inplace + else helper.create_variable_for_type_inference(dtype=x.dtype) + ) x_shape = helper.create_variable_for_type_inference(dtype=x.dtype) - helper.append_op(type="reshape2", - inputs=inputs, - attrs=attrs, - outputs={ - "Out": out, - "XShape": x_shape - }) + helper.append_op( + type="reshape2", + inputs=inputs, + attrs=attrs, + outputs={"Out": out, "XShape": x_shape}, + ) return helper.append_activation(out) @@ -3409,7 +3676,9 @@ def reshape_(x, shape, name=None): if isinstance(shape, (list, tuple)): shape = [ item.numpy().item(0) - if isinstance(item, tmp_tensor_type) else item for item in shape + if isinstance(item, tmp_tensor_type) + else item + for item in shape ] out = _C_ops.reshape_(x, shape) elif isinstance(shape, tmp_tensor_type): @@ -3418,7 +3687,8 @@ def reshape_(x, shape, name=None): else: raise ValueError( "shape must be an instance of `list`, `tuple` or `Variable`," - " got '{}.'".format(type(shape))) + " got '{}.'".format(type(shape)) + ) return out else: @@ -3499,17 +3769,17 @@ def gather_nd(x, index, name=None): Returns: output (Tensor): A tensor with the shape index.shape[:-1] + input.shape[index.shape[-1]:] - + Examples: .. code-block:: python - + import paddle - + x = paddle.to_tensor([[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]) index = paddle.to_tensor([[0, 1]]) - + output = paddle.gather_nd(x, index) #[[3, 4]] """ @@ -3519,18 +3789,20 @@ def gather_nd(x, index, name=None): if _in_legacy_dygraph(): return _legacy_C_ops.gather_nd(x, index) check_variable_and_dtype( - x, 'x', ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'], - 'gather_np') + x, + 'x', + ['bool', 'float32', 'float64', 'int16', 'int32', 'int64'], + 'gather_np', + ) check_variable_and_dtype(index, 'index', ['int32', 'int64'], 'gather_np') helper = LayerHelper('gather_nd', **locals()) dtype = helper.input_dtype() output = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="gather_nd", - inputs={ - "X": x, - "Index": index - }, - outputs={"Out": output}) + helper.append_op( + type="gather_nd", + inputs={"X": x, "Index": index}, + outputs={"Out": output}, + ) return output @@ -3609,7 +3881,7 @@ def strided_slice(x, axes, starts, ends, strides, name=None): strides_1 = [1, 1, 1] strides_2 = [1, 1, 2] sliced_1 = paddle.strided_slice(x, axes=axes, starts=starts, ends=ends, strides=strides_1) - # sliced_1 is x[:, 1:3:1, 0:2:1, 2:4:1]. + # sliced_1 is x[:, 1:3:1, 0:2:1, 2:4:1]. # example 2: # attr starts is a list which contain tensor Tensor. 
minus_3 = paddle.full(shape=[1], fill_value=-3, dtype='int32') @@ -3622,8 +3894,11 @@ def strided_slice(x, axes, starts, ends, strides, name=None): helper = LayerHelper('strided_slice', **locals()) check_variable_and_dtype( - x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], - 'strided_slice') + x, + 'x', + ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], + 'strided_slice', + ) check_type(axes, 'axes', (list, tuple), 'strided_slice') check_type(starts, 'starts', (list, tuple, Variable), 'strided_slice') check_type(ends, 'ends', (list, tuple, Variable), 'strided_slice') @@ -3631,8 +3906,9 @@ def strided_slice(x, axes, starts, ends, strides, name=None): def check_list_elements_dtype(list_input, input_name): if isinstance(list_input, Variable): - check_dtype(list_input.dtype, input_name, ['int32'], - 'strided_slice') + check_dtype( + list_input.dtype, input_name, ['int32'], 'strided_slice' + ) else: for i, var in enumerate(list_input): var_name = input_name + '[' + str(i) + ']' @@ -3651,7 +3927,7 @@ def get_new_list_tensor(old_list): dim.stop_gradient = True new_list_tensor.append(dim) else: - assert (isinstance(dim, int)) + assert isinstance(dim, int) temp_out = helper.create_variable_for_type_inference('int32') fill_constant([1], 'int32', dim, force_cpu=True, out=temp_out) new_list_tensor.append(temp_out) @@ -3668,7 +3944,7 @@ def get_new_list_tensor(old_list): 'starts': starts, 'ends': ends, 'strides': strides, - 'infer_flags': infer_flags + 'infer_flags': infer_flags, } else: # starts @@ -3723,55 +3999,55 @@ def get_new_list_tensor(old_list): attrs['strides'] = strides attrs['infer_flags'] = infer_flags out = helper.create_variable_for_type_inference( - dtype=helper.input_dtype('x')) - helper.append_op(type='strided_slice', - inputs=inputs, - attrs=attrs, - outputs={'Out': out}) + dtype=helper.input_dtype('x') + ) + helper.append_op( + type='strided_slice', inputs=inputs, attrs=attrs, outputs={'Out': out} + ) return out def tensordot(x, y, axes=2, name=None): r""" - This function computes a contraction, which sum the product of elements from two tensors along the given axes. + This function computes a contraction, which sum the product of elements from two tensors along the given axes. Args: x (Tensor): The left tensor for contraction with data type ``float32`` or ``float64``. y (Tensor): The right tensor for contraction with the same data type as ``x``. axes (int|tuple|list|Tensor, optional): The axes to contract for ``x`` and ``y``, defaulted to integer ``2``. - 1. It could be a non-negative integer ``n``, + 1. It could be a non-negative integer ``n``, in which the function will sum over the last ``n`` axes of ``x`` and the first ``n`` axes of ``y`` in order. - - 2. It could be a 1-d tuple or list with data type ``int``, in which ``x`` and ``y`` will be contracted along the same given axes. + + 2. It could be a 1-d tuple or list with data type ``int``, in which ``x`` and ``y`` will be contracted along the same given axes. For example, ``axes`` =[0, 1] applies contraction along the first two axes for ``x`` and the first two axes for ``y``. - - 3. It could be a tuple or list containing one or two 1-d tuple|list|Tensor with data type ``int``. - When containing one tuple|list|Tensor, the data in tuple|list|Tensor specified the same axes for ``x`` and ``y`` to contract. - When containing two tuple|list|Tensor, the first will be applied to ``x`` and the second to ``y``. + + 3. 
It could be a tuple or list containing one or two 1-d tuple|list|Tensor with data type ``int``. + When containing one tuple|list|Tensor, the data in tuple|list|Tensor specified the same axes for ``x`` and ``y`` to contract. + When containing two tuple|list|Tensor, the first will be applied to ``x`` and the second to ``y``. When containing more than two tuple|list|Tensor, only the first two axis sequences will be used while the others will be ignored. - - 4. It could be a tensor, in which the ``axes`` tensor will be translated to a python list - and applied the same rules described above to determine the contraction axes. + + 4. It could be a tensor, in which the ``axes`` tensor will be translated to a python list + and applied the same rules described above to determine the contraction axes. Note that the ``axes`` with Tensor type is ONLY available in Dygraph mode. - name(str, optional): The default value is None. Normally there is no need for user to set this property. + name(str, optional): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` . - Return: - Output (Tensor): The contraction result with the same data type as ``x`` and ``y``. + Return: + Output (Tensor): The contraction result with the same data type as ``x`` and ``y``. In general, :math:`output.ndim = x.ndim + y.ndim - 2 \times n_{axes}`, where :math:`n_{axes}` denotes the number of axes to be contracted. - + NOTES: - 1. This function supports tensor broadcast, + 1. This function supports tensor broadcast, the size in the corresponding dimensions of ``x`` and ``y`` should be equal, or applies to the broadcast rules. - 2. This function also supports axes expansion, - when the two given axis sequences for ``x`` and ``y`` are of different lengths, - the shorter sequence will expand the same axes as the longer one at the end. - For example, if ``axes`` =[[0, 1, 2, 3], [1, 0]], - the axis sequence for ``x`` is [0, 1, 2, 3], + 2. This function also supports axes expansion, + when the two given axis sequences for ``x`` and ``y`` are of different lengths, + the shorter sequence will expand the same axes as the longer one at the end. + For example, if ``axes`` =[[0, 1, 2, 3], [1, 0]], + the axis sequence for ``x`` is [0, 1, 2, 3], while the corresponding axis sequences for ``y`` will be expanded from [1, 0] to [1, 0, 2, 3]. - + Examples: .. code-block:: python @@ -3780,7 +4056,7 @@ def tensordot(x, y, axes=2, name=None): data_type = 'float64' # For two 2-d tensor x and y, the case axes=0 is equivalent to outer product. - # Note that tensordot supports empty axis sequence, so all the axes=0, axes=[], axes=[[]], and axes=[[],[]] are equivalent cases. + # Note that tensordot supports empty axis sequence, so all the axes=0, axes=[], axes=[[]], and axes=[[],[]] are equivalent cases. 
x = paddle.arange(4, dtype=data_type).reshape([2, 2]) y = paddle.arange(4, dtype=data_type).reshape([2, 2]) z = paddle.tensordot(x, y, axes=0) @@ -3842,7 +4118,7 @@ def tensordot(x, y, axes=2, name=None): # z = [[23217330., 24915630., 26613930., 28312230.], # [24915630., 26775930., 28636230., 30496530.], # [26613930., 28636230., 30658530., 32680830.], - # [28312230., 30496530., 32680830., 34865130.]] + # [28312230., 30496530., 32680830., 34865130.]] """ op_type = 'tensordot' input_dtype = ['float32', 'float64'] @@ -3855,8 +4131,9 @@ def _var_to_list(var): if paddle.in_dynamic_mode(): return tolist(var) raise TypeError( - "The 'axes' with type 'Tensor' in " + op_type + - " is not available in static graph mode, " + "The 'axes' with type 'Tensor' in " + + op_type + + " is not available in static graph mode, " "please convert its type to int|Tuple|List, or use dynamic graph mode." ) @@ -3864,8 +4141,10 @@ def _var_to_list(var): axes_y = [] if np.issubdtype(type(axes), np.integer): assert axes >= 0, ( - "The 'axes' in " + op_type + - f" should not be negative, but received axes={axes}.") + "The 'axes' in " + + op_type + + f" should not be negative, but received axes={axes}." + ) axes_x = range(x.ndim - axes, x.ndim) axes_y = range(axes) else: @@ -3905,7 +4184,11 @@ def _var_to_list(var): shape_x[dim_x] = 1 x = x.sum(dim_x).reshape(shape_x) else: - assert sx == sy, "The dimensional size for 'x' and 'y' in " + op_type + f" should match each other, but 'x' has size {sx} in dim {dim_x} while 'y' has size {sy} in dim {dim_y}." + assert sx == sy, ( + "The dimensional size for 'x' and 'y' in " + + op_type + + f" should match each other, but 'x' has size {sx} in dim {dim_x} while 'y' has size {sy} in dim {dim_y}." + ) need_contracted_dim_x[dim_x] = True need_contracted_dim_y[dim_y] = True @@ -3933,20 +4216,22 @@ def _var_to_list(var): shape_out = [1] x = x.transpose(perm=perm_x).reshape( - [not_contraction_size_x, contraction_size]) + [not_contraction_size_x, contraction_size] + ) y = y.transpose(perm=perm_y).reshape( - [contraction_size, not_contraction_size_y]) + [contraction_size, not_contraction_size_y] + ) out = x.matmul(y).reshape(shape_out) return out def as_complex(x, name=None): - """Transform a real tensor to a complex tensor. - + """Transform a real tensor to a complex tensor. + The data type of the input tensor is 'float32' or 'float64', and the data type of the returned tensor is 'complex64' or 'complex128', respectively. - The shape of the input tensor is ``(* ,2)``, (``*`` means arbitary shape), i.e. + The shape of the input tensor is ``(* ,2)``, (``*`` means arbitary shape), i.e. the size of the last axis shoule be 2, which represent the real and imag part of a complex number. The shape of the returned tensor is ``(*,)``. @@ -3956,17 +4241,18 @@ def as_complex(x, name=None): Returns: Tensor: The output. Data type is 'complex64' or 'complex128', with the same precision as the input. - + Examples: .. code-block:: python import paddle x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2]) y = paddle.as_complex(x) - print(y.numpy()) + print(y) - # [[ 0. +1.j 2. +3.j 4. +5.j] - # [ 6. +7.j 8. 
+9.j 10.+11.j]] + # Tensor(shape=[2, 3], dtype=complex64, place=Place(gpu:0), stop_gradient=True, + # [[1j , (2+3j) , (4+5j) ], + # [(6+7j) , (8+9j) , (10+11j)]]) """ if in_dygraph_mode(): return _C_ops.as_complex(x) @@ -3978,7 +4264,8 @@ def as_complex(x, name=None): helper = LayerHelper(op_type, **locals()) inputs = {"X": x} out = helper.create_variable_for_type_inference( - dtype=_real_to_complex_dtype(x.dtype)) + dtype=_real_to_complex_dtype(x.dtype) + ) outputs = {"Out": out} attrs = {} helper.append_op(type=op_type, inputs=inputs, attrs=attrs, outputs=outputs) @@ -3986,9 +4273,9 @@ def as_complex(x, name=None): def as_real(x, name=None): - """Transform a complex tensor to a real tensor. - - The data type of the input tensor is 'complex64' or 'complex128', and the data + """Transform a complex tensor to a real tensor. + + The data type of the input tensor is 'complex64' or 'complex128', and the data type of the returned tensor is 'float32' or 'float64', respectively. When the shape of the input tensor is ``(*, )``, (``*`` means arbitary shape), @@ -4001,7 +4288,7 @@ def as_real(x, name=None): Returns: Tensor: The output. Data type is 'float32' or 'float64', with the same precision as the input. - + Examples: .. code-block:: python @@ -4009,15 +4296,16 @@ def as_real(x, name=None): x = paddle.arange(12, dtype=paddle.float32).reshape([2, 3, 2]) y = paddle.as_complex(x) z = paddle.as_real(y) - print(z.numpy()) + print(z) - # [[[ 0. 1.] - # [ 2. 3.] - # [ 4. 5.]] + # Tensor(shape=[2, 3, 2], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[[0. , 1. ], + # [2. , 3. ], + # [4. , 5. ]], - # [[ 6. 7.] - # [ 8. 9.] - # [10. 11.]]] + # [[6. , 7. ], + # [8. , 9. ], + # [10., 11.]]]) """ if in_dygraph_mode(): return _C_ops.as_real(x) @@ -4029,7 +4317,8 @@ def as_real(x, name=None): helper = LayerHelper(op_type, **locals()) inputs = {"X": x} out = helper.create_variable_for_type_inference( - dtype=_complex_to_real_dtype(x.dtype)) + dtype=_complex_to_real_dtype(x.dtype) + ) outputs = {"Out": out} helper.append_op(type=op_type, inputs=inputs, outputs=outputs) return out @@ -4081,23 +4370,27 @@ def repeat_interleave(x, repeats, axis=None, name=None): return _C_ops.repeat_interleave(x, repeats, axis) helper = LayerHelper("repeat_interleave", **locals()) - check_variable_and_dtype(x, 'x', ['float32', 'float64', 'int32', 'int64'], - 'paddle.tensor.manipulation.repeat_interleave') + check_variable_and_dtype( + x, + 'x', + ['float32', 'float64', 'int32', 'int64'], + 'paddle.tensor.manipulation.repeat_interleave', + ) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='repeat_interleave', - inputs={ - 'X': - x, - 'RepeatsTensor': - repeats if isinstance(repeats, Variable) else None - }, - outputs={'Out': out}, - attrs={ - 'dim': axis, - 'Repeats': repeats if isinstance(repeats, int) else 0 - }) + helper.append_op( + type='repeat_interleave', + inputs={ + 'X': x, + 'RepeatsTensor': repeats if isinstance(repeats, Variable) else None, + }, + outputs={'Out': out}, + attrs={ + 'dim': axis, + 'Repeats': repeats if isinstance(repeats, int) else 0, + }, + ) return out @@ -4119,7 +4412,7 @@ def moveaxis(x, source, destination, name=None): Examples: .. 
code-block:: python - + import paddle x = paddle.ones([3, 2, 4]) @@ -4128,13 +4421,14 @@ def moveaxis(x, source, destination, name=None): x = paddle.ones([2, 3]) paddle.moveaxis(x, 0, 1).shape # equivalent to paddle.t(x) - # [3, 2] + # [3, 2] """ src = [source] if isinstance(source, int) else source dst = [destination] if isinstance(destination, int) else destination assert len(src) == len( - dst), "'source' must have the same number with 'destination'" + dst + ), "'source' must have the same number with 'destination'" count = Counter(src).most_common(1) if count[0][1] > 1: @@ -4151,29 +4445,31 @@ def moveaxis(x, source, destination, name=None): dst_dims = list(range(ndim)) for i, axis in enumerate(zip(src, dst)): - assert isinstance(axis[0], - int), "Each elemment of 'source' must be integer." + assert isinstance( + axis[0], int + ), "Each elemment of 'source' must be integer." if axis[0] < 0: - assert axis[ - 0] >= -ndim, "'source' must be in the range of [-{0}, {0})".format( - ndim) + assert ( + axis[0] >= -ndim + ), "'source' must be in the range of [-{0}, {0})".format(ndim) src[i] += ndim else: - assert axis[ - 0] < ndim, "'source' must be in the range of [-{0}, {0})".format( - ndim) + assert ( + axis[0] < ndim + ), "'source' must be in the range of [-{0}, {0})".format(ndim) - assert isinstance(axis[1], - int), "Each elemment of 'source' must be integer." + assert isinstance( + axis[1], int + ), "Each elemment of 'source' must be integer." if axis[1] < 0: - assert axis[ - 1] >= -ndim, "'source' must be in the range of [-{0}, {0})".format( - ndim) + assert ( + axis[1] >= -ndim + ), "'source' must be in the range of [-{0}, {0})".format(ndim) dst[i] += ndim else: - assert axis[ - 1] < ndim, "'source' must be in the range of [-{0}, {0})".format( - ndim) + assert ( + axis[1] < ndim + ), "'source' must be in the range of [-{0}, {0})".format(ndim) perm[dst[i]] = src[i] src_dims.remove(src[i]) dst_dims.remove(dst[i]) @@ -4189,32 +4485,44 @@ def moveaxis(x, source, destination, name=None): out, _ = _legacy_C_ops.transpose2(x, 'axis', perm) return out - check_variable_and_dtype(x, 'x', [ - 'bool', 'float16', 'float32', 'float64', 'int32', 'int64', 'complex64', - 'complex128' - ], 'moveaxis') + check_variable_and_dtype( + x, + 'x', + [ + 'bool', + 'float16', + 'float32', + 'float64', + 'int32', + 'int64', + 'complex64', + 'complex128', + ], + 'moveaxis', + ) helper = LayerHelper('moveaxis', **locals()) out = helper.create_variable_for_type_inference(x.dtype) x_shape = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='transpose2', - inputs={'X': [x]}, - outputs={ - 'Out': [out], - 'XShape': [x_shape] - }, - attrs={'axis': perm}) + helper.append_op( + type='transpose2', + inputs={'X': [x]}, + outputs={'Out': [out], 'XShape': [x_shape]}, + attrs={'axis': perm}, + ) return out def non_negative_axis(arr, axis): ndim = len(arr.shape) if axis >= 0: - assert axis < ndim, "'axis' must be in the range of [-{0}, {0})".format( - ndim) + assert ( + axis < ndim + ), "'axis' must be in the range of [-{0}, {0})".format(ndim) else: - assert axis >= -ndim, "'axis' must be in the range of [-{0}, {0})".format( - ndim) + assert ( + axis >= -ndim + ), "'axis' must be in the range of [-{0}, {0})".format(ndim) axis += ndim return axis @@ -4240,11 +4548,11 @@ def take_along_axis(arr, indices, axis): arr (Tensor) : The input Tensor. Supported data types are float32 and float64. indices (Tensor) : Indices to take along each 1d slice of arr. 
This must match the dimension of arr, and need to broadcast against arr. Supported data type are int and int64. - axis (int) : The axis to take 1d slices along. + axis (int) : The axis to take 1d slices along. - Returns: + Returns: Tensor: The indexed element, same dtype with arr - + Examples: .. code-block:: python @@ -4257,9 +4565,10 @@ def take_along_axis(arr, indices, axis): print(result) # [[1, 2, 3]] """ - if (len(arr.shape) != len(indices.shape)): + if len(arr.shape) != len(indices.shape): raise ValueError( - "`indices` and `arr` must have the same number of dimensions!") + "`indices` and `arr` must have the same number of dimensions!" + ) axis = non_negative_axis(arr, axis) broadcast_shape = infer_broadcast_shape(arr, indices, axis) if not broadcast_shape: @@ -4275,10 +4584,14 @@ def take_along_axis(arr, indices, axis): return _C_ops.take_along_axis(arr, indices, axis) return _legacy_C_ops.take_along_axis(arr, indices, 'Axis', axis) check_variable_and_dtype( - arr, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], - 'take_along_axis') - check_variable_and_dtype(indices, 'index', ['int32', 'int64'], - 'take_along_axis') + arr, + 'x', + ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], + 'take_along_axis', + ) + check_variable_and_dtype( + indices, 'index', ['int32', 'int64'], 'take_along_axis' + ) indices = paddle.broadcast_to(indices, broadcast_shape) broadcast_shape_list = list(broadcast_shape) broadcast_shape_list[axis] = list(arr.shape)[axis] @@ -4287,13 +4600,12 @@ def take_along_axis(arr, indices, axis): helper = LayerHelper('take_along_axis', **locals()) dtype = helper.input_dtype() result = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="take_along_axis", - inputs={ - "Input": arr, - "Index": indices - }, - attrs={"Axis": axis}, - outputs={"Result": result}) + helper.append_op( + type="take_along_axis", + inputs={"Input": arr, "Index": indices}, + attrs={"Axis": axis}, + outputs={"Result": result}, + ) return result @@ -4305,11 +4617,12 @@ def put_along_axis(arr, indices, values, axis, reduce='assign'): arr (Tensor) : The Destination Tensor. Supported data types are float32 and float64. indices (Tensor) : Indices to put along each 1d slice of arr. This must match the dimension of arr, and need to broadcast against arr. Supported data type are int and int64. - axis (int) : The axis to put 1d slices along. - reduce (string | optinal) : The reduce operation, default is 'assign', support 'add', 'assign', 'mul' and 'multiply'. - Returns : + axis (int) : The axis to put 1d slices along. + reduce (str, optional): The reduce operation, default is 'assign', support 'add', 'assign', 'mul' and 'multiply'. + + Returns: Tensor: The indexed element, same dtype with arr - + Examples: .. code-block:: python @@ -4325,44 +4638,48 @@ def put_along_axis(arr, indices, values, axis, reduce='assign'): # [60, 40, 50]] """ - if (len(arr.shape) != len(indices.shape)): + if len(arr.shape) != len(indices.shape): raise ValueError( - "`indices` and `arr` must have the same number of dimensions!") + "`indices` and `arr` must have the same number of dimensions!" 
+ ) axis = non_negative_axis(arr, axis) broadcast_shape = infer_broadcast_shape(arr, indices, axis) if _non_static_mode(): - values = paddle.to_tensor(values) if not isinstance( - values, paddle.Tensor) else values + values = ( + paddle.to_tensor(values) + if not isinstance(values, paddle.Tensor) + else values + ) if broadcast_shape: indices = paddle.broadcast_to(indices, broadcast_shape) values = paddle.broadcast_to(values, indices.shape) if in_dygraph_mode(): return _C_ops.put_along_axis(arr, indices, values, axis, reduce) - return _legacy_C_ops.put_along_axis(arr, indices, values, "Axis", axis, - "Reduce", reduce) + return _legacy_C_ops.put_along_axis( + arr, indices, values, "Axis", axis, "Reduce", reduce + ) check_variable_and_dtype( - arr, 'x', ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], - 'put_along_axis') - check_variable_and_dtype(indices, 'index', ['int32', 'int64'], - 'put_along_axis') + arr, + 'x', + ['float16', 'float32', 'float64', 'int32', 'int64', 'uint8'], + 'put_along_axis', + ) + check_variable_and_dtype( + indices, 'index', ['int32', 'int64'], 'put_along_axis' + ) if broadcast_shape: indices = paddle.broadcast_to(indices, broadcast_shape) values = paddle.broadcast_to(values, indices.shape) helper = LayerHelper('put_along_axis', **locals()) dtype = helper.input_dtype() result = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="put_along_axis", - inputs={ - "Input": arr, - "Index": indices, - "Value": values - }, - attrs={ - "Axis": axis, - "Reduce": reduce - }, - outputs={"Result": result}) + helper.append_op( + type="put_along_axis", + inputs={"Input": arr, "Index": indices, "Value": values}, + attrs={"Axis": axis, "Reduce": reduce}, + outputs={"Result": result}, + ) return result @@ -4372,20 +4689,25 @@ def put_along_axis_(arr, indices, values, axis, reduce='assign'): Inplace version of ``put_along_axis`` API, the output Tensor will be inplaced with input ``arr``. Please refer to :ref:`api_tensor_put_along_axis`. """ - if (len(arr.shape) != len(indices.shape)): + if len(arr.shape) != len(indices.shape): raise ValueError( - "`indices` and `arr` must have the same number of dimensions!") + "`indices` and `arr` must have the same number of dimensions!" + ) axis = non_negative_axis(arr, axis) broadcast_shape = infer_broadcast_shape(arr, indices, axis) - values = paddle.to_tensor(values) if not isinstance( - values, paddle.Tensor) else values + values = ( + paddle.to_tensor(values) + if not isinstance(values, paddle.Tensor) + else values + ) if broadcast_shape: indices = paddle.broadcast_to(indices, broadcast_shape) values = paddle.broadcast_to(values, indices.shape) if in_dygraph_mode(): return _C_ops.put_along_axis_(arr, indices, values, axis, reduce) - return _legacy_C_ops.put_along_axis_(arr, indices, values, "Axis", axis, - "Reduce", reduce) + return _legacy_C_ops.put_along_axis_( + arr, indices, values, "Axis", axis, "Reduce", reduce + ) def _index_add_params_check(x, index, input_axis, add_value): @@ -4426,7 +4748,7 @@ def index_add(x, index, axis, value, name=None): x (Tensor) : The Destination Tensor. Supported data types are int32, int64, float16, float32, float64. index (Tensor): The 1-D Tensor containing the indices to index. The data type of ``index`` must be int32 or int64. - axis (int): The dimension in which we index. + axis (int): The dimension in which we index. value (Tensor): The tensor used to add the elements along the target axis. name(str, optional): For details, please refer to :ref:`api_guide_Name`. 
Generally, no setting is required. Default: None. @@ -4443,10 +4765,11 @@ def index_add(x, index, axis, value, name=None): index = paddle.to_tensor([0, 2], dtype="int32") value = paddle.to_tensor([[1, 1, 1], [1, 1, 1]], dtype="float32") outplace_res = paddle.index_add(input_tensor, index, 0, value) - print(outplace_res.numpy()) - # [[2 2 2] - # [1 1 1] - # [2 2 2]] + print(outplace_res) + # Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[2., 2., 2.], + # [1., 1., 1.], + # [2., 2., 2.]]) """ _index_add_params_check(x, index, axis, value) @@ -4455,24 +4778,36 @@ def index_add(x, index, axis, value, name=None): helper = LayerHelper("index_add", **locals()) check_variable_and_dtype( - x, 'x', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'paddle.tensor.manipulation.index_add') - check_variable_and_dtype(index, 'index', ['int32', 'int64'], - 'paddle.tensor.manipulation.index_add') + x, + 'x', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'paddle.tensor.manipulation.index_add', + ) + check_variable_and_dtype( + index, + 'index', + ['int32', 'int64'], + 'paddle.tensor.manipulation.index_add', + ) check_variable_and_dtype( - value, 'add_value', ['float16', 'float32', 'float64', 'int32', 'int64'], - 'paddle.tensor.manipulation.index_add') + value, + 'add_value', + ['float16', 'float32', 'float64', 'int32', 'int64'], + 'paddle.tensor.manipulation.index_add', + ) out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='index_add', - inputs={ - 'X': x, - 'Index': index, - 'AddValue': value, - }, - outputs={'Out': out}, - attrs={'axis': axis}) + helper.append_op( + type='index_add', + inputs={ + 'X': x, + 'Index': index, + 'AddValue': value, + }, + outputs={'Out': out}, + attrs={'axis': axis}, + ) return out @@ -4481,7 +4816,7 @@ def index_add_(x, index, axis, value, name=None): """ Inplace version of ``index_add`` API, the output Tensor will be inplaced with input ``x``. Please refer to :ref:`api_paddle_tensor_index_add`. - + Examples: .. 
code-block:: python @@ -4492,10 +4827,11 @@ def index_add_(x, index, axis, value, name=None): index = paddle.to_tensor([0, 2], dtype="int32") value = paddle.to_tensor([[1, 1], [1, 1], [1, 1]], dtype="float32") inplace_res = paddle.index_add_(input_tensor, index, 1, value) - print(inplace_res.numpy()) - # [[2, 1, 2] - # [2, 1, 2] - # [2, 1, 2]] + print(inplace_res) + # Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[2., 1., 2.], + # [2., 1., 2.], + # [2., 1., 2.]]) """ _index_add_params_check(x, index, axis, value) @@ -4509,7 +4845,7 @@ def index_add_(x, index, axis, value, name=None): 'fill_diagonal_': fill_diagonal_, 'fill_diagonal_tensor_': fill_diagonal_tensor_, "fill_diagonal_tensor": fill_diagonal_tensor, - 'tolist': tolist + 'tolist': tolist, } for name, func in __METHODS.items(): setattr(core.VarBase, name, func) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index c5b995454aeea8..2c5523c47a0337 100644 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -25,14 +25,29 @@ from .manipulation import cast from .creation import _complex_to_real_dtype -from .layer_function_generator import _generate_doc_string_, generate_activation_fn, generate_layer_fn +from .layer_function_generator import ( + _generate_doc_string_, + generate_activation_fn, + generate_layer_fn, +) import paddle from ..static import Variable -from ..framework import core, in_dygraph_mode, _non_static_mode, LayerHelper, _in_legacy_dygraph +from ..framework import ( + core, + in_dygraph_mode, + _non_static_mode, + LayerHelper, + _in_legacy_dygraph, +) from ..fluid.framework import _in_legacy_dygraph from ..framework import _varbase_creator, convert_np_dtype_to_dtype_ -from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype +from ..fluid.data_feeder import ( + check_variable_and_dtype, + check_type, + check_dtype, + convert_dtype, +) from ..fluid.dygraph.inplace_utils import inplace_apis_in_dygraph_only from ..fluid.layers import utils @@ -208,7 +223,8 @@ def scale(x, scale=1.0, bias=0.0, bias_after_scale=True, act=None, name=None): def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): - """ + r""" + stanh activation. .. math:: @@ -219,8 +235,7 @@ def stanh(x, scale_a=0.67, scale_b=1.7159, name=None): x (Tensor): The input Tensor with data type float32, float64. scale_a (float, optional): The scale factor a of the input. Default is 0.67. scale_b (float, optional): The scale factor b of the output. Default is 1.7159. - name (str, optional): Name for the operation (optional, default is None). - For more information, please refer to :ref:`api_guide_Name`. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: A Tensor with the same data type and shape as ``x`` . @@ -890,34 +905,37 @@ def maximum(x, y, name=None): .. 
code-block:: python - import numpy as np import paddle x = paddle.to_tensor([[1, 2], [7, 8]]) y = paddle.to_tensor([[3, 4], [5, 6]]) res = paddle.maximum(x, y) print(res) - # [[3, 4], - # [7, 8]] + # Tensor(shape=[2, 2], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[3, 4], + # [7, 8]]) x = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) y = paddle.to_tensor([3, 0, 4]) res = paddle.maximum(x, y) print(res) - # [[3, 2, 4], - # [3, 2, 4]] + # Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[3, 2, 4], + # [3, 2, 4]]) x = paddle.to_tensor([2, 3, 5], dtype='float32') - y = paddle.to_tensor([1, np.nan, np.nan], dtype='float32') + y = paddle.to_tensor([1, float("nan"), float("nan")], dtype='float32') res = paddle.maximum(x, y) print(res) - # [ 2., nan, nan] + # Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [2. , nan, nan]) - x = paddle.to_tensor([5, 3, np.inf], dtype='float32') - y = paddle.to_tensor([1, -np.inf, 5], dtype='float32') + x = paddle.to_tensor([5, 3, float("inf")], dtype='float32') + y = paddle.to_tensor([1, -float("inf"), 5], dtype='float32') res = paddle.maximum(x, y) print(res) - # [ 5., 3., inf.] + # Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [5. , 3. , inf.]) """ op_type = 'elementwise_max' axis = -1 @@ -951,34 +969,37 @@ def minimum(x, y, name=None): .. code-block:: python - import numpy as np import paddle x = paddle.to_tensor([[1, 2], [7, 8]]) y = paddle.to_tensor([[3, 4], [5, 6]]) res = paddle.minimum(x, y) print(res) - # [[1, 2], - # [5, 6]] + # Tensor(shape=[2, 2], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[1, 2], + # [5, 6]]) x = paddle.to_tensor([[[1, 2, 3], [1, 2, 3]]]) y = paddle.to_tensor([3, 0, 4]) res = paddle.minimum(x, y) print(res) - # [[[1, 0, 3], - # [1, 0, 3]]] + # Tensor(shape=[1, 2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[[1, 0, 3], + # [1, 0, 3]]]) x = paddle.to_tensor([2, 3, 5], dtype='float32') - y = paddle.to_tensor([1, np.nan, np.nan], dtype='float32') + y = paddle.to_tensor([1, float("nan"), float("nan")], dtype='float32') res = paddle.minimum(x, y) print(res) - # [ 1., nan, nan] + # Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [1. , nan, nan]) - x = paddle.to_tensor([5, 3, np.inf], dtype='float64') - y = paddle.to_tensor([1, -np.inf, 5], dtype='float64') + x = paddle.to_tensor([5, 3, float("inf")], dtype='float64') + y = paddle.to_tensor([1, -float("inf"), 5], dtype='float64') res = paddle.minimum(x, y) print(res) - # [ 1., -inf., 5.] + # Tensor(shape=[3], dtype=float64, place=Place(cpu), stop_gradient=True, + # [ 1. , -inf., 5. ]) """ op_type = 'elementwise_min' axis = -1 @@ -1014,34 +1035,37 @@ def fmax(x, y, name=None): .. code-block:: python - import numpy as np import paddle x = paddle.to_tensor([[1, 2], [7, 8]]) y = paddle.to_tensor([[3, 4], [5, 6]]) res = paddle.fmax(x, y) print(res) - # [[3, 4], - # [7, 8]] + # Tensor(shape=[2, 2], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[3, 4], + # [7, 8]]) x = paddle.to_tensor([[1, 2, 3], [1, 2, 3]]) y = paddle.to_tensor([3, 0, 4]) res = paddle.fmax(x, y) print(res) - # [[3, 2, 4], - # [3, 2, 4]] + # Tensor(shape=[2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[3, 2, 4], + # [3, 2, 4]]) x = paddle.to_tensor([2, 3, 5], dtype='float32') - y = paddle.to_tensor([1, np.nan, np.nan], dtype='float32') + y = paddle.to_tensor([1, float("nan"), float("nan")], dtype='float32') res = paddle.fmax(x, y) print(res) - # [ 2., 3., 5.] 
+ # Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [2., 3., 5.]) - x = paddle.to_tensor([5, 3, np.inf], dtype='float32') - y = paddle.to_tensor([1, -np.inf, 5], dtype='float32') + x = paddle.to_tensor([5, 3, float("inf")], dtype='float32') + y = paddle.to_tensor([1, -float("inf"), 5], dtype='float32') res = paddle.fmax(x, y) print(res) - # [ 5., 3., inf.] + # Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [5. , 3. , inf.]) """ op_type = 'elementwise_fmax' axis = -1 @@ -1077,34 +1101,37 @@ def fmin(x, y, name=None): .. code-block:: python - import numpy as np import paddle x = paddle.to_tensor([[1, 2], [7, 8]]) y = paddle.to_tensor([[3, 4], [5, 6]]) res = paddle.fmin(x, y) print(res) - # [[1, 2], - # [5, 6]] + # Tensor(shape=[2, 2], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[1, 2], + # [5, 6]]) x = paddle.to_tensor([[[1, 2, 3], [1, 2, 3]]]) y = paddle.to_tensor([3, 0, 4]) res = paddle.fmin(x, y) print(res) - # [[[1, 0, 3], - # [1, 0, 3]]] + # Tensor(shape=[1, 2, 3], dtype=int64, place=Place(cpu), stop_gradient=True, + # [[[1, 0, 3], + # [1, 0, 3]]]) x = paddle.to_tensor([2, 3, 5], dtype='float32') - y = paddle.to_tensor([1, np.nan, np.nan], dtype='float32') + y = paddle.to_tensor([1, float("nan"), float("nan")], dtype='float32') res = paddle.fmin(x, y) print(res) - # [ 1., 3., 5.] + # Tensor(shape=[3], dtype=float32, place=Place(cpu), stop_gradient=True, + # [1., 3., 5.]) - x = paddle.to_tensor([5, 3, np.inf], dtype='float64') - y = paddle.to_tensor([1, -np.inf, 5], dtype='float64') + x = paddle.to_tensor([5, 3, float("inf")], dtype='float64') + y = paddle.to_tensor([1, -float("inf"), 5], dtype='float64') res = paddle.fmin(x, y) print(res) - # [ 1., -inf., 5.] + # Tensor(shape=[3], dtype=float64, place=Place(cpu), stop_gradient=True, + # [ 1. , -inf., 5. ]) """ op_type = 'elementwise_fmin' axis = -1 @@ -1275,15 +1302,13 @@ def nansum(x, axis=None, dtype=None, keepdim=False, name=None): .. code-block:: python import paddle - import numpy as np # x is a Tensor with following elements: # [[nan, 0.3, 0.5, 0.9] # [0.1, 0.2, -nan, 0.7]] # Each example is followed by the corresponding output tensor. - x = np.array([[float('nan'), 0.3, 0.5, 0.9], - [0.1, 0.2, float('-nan'), 0.7]]).astype(np.float32) - x = paddle.to_tensor(x) + x = paddle.to_tensor([[float('nan'), 0.3, 0.5, 0.9], + [0.1, 0.2, float('-nan'), 0.7]],dtype="float32") out1 = paddle.nansum(x) # [2.7] out2 = paddle.nansum(x, axis=0) # [0.1, 0.5, 0.5, 1.6] out3 = paddle.nansum(x, axis=-1) # [1.7, 1.0] @@ -1293,9 +1318,8 @@ def nansum(x, axis=None, dtype=None, keepdim=False, name=None): # [[[1, nan], [3, 4]], # [[5, 6], [-nan, 8]]] # Each example is followed by the corresponding output tensor. - y = np.array([[[1, float('nan')], [3, 4]], + y = paddle.to_tensor([[[1, float('nan')], [3, 4]], [[5, 6], [float('-nan'), 8]]]) - y = paddle.to_tensor(y) out5 = paddle.nansum(y, axis=[1, 2]) # [8, 19] out6 = paddle.nansum(y, axis=[0, 1]) # [9, 18] """ @@ -1501,9 +1525,6 @@ def add_n(inputs, name=None): if in_dygraph_mode(): if isinstance(inputs, Variable): inputs = [inputs] - for x in inputs: - if not x.is_dense(): - return _legacy_C_ops.sum(inputs, 'use_mkldnn', False) return _C_ops.add_n(inputs) if _in_legacy_dygraph(): if isinstance(inputs, Variable): @@ -4122,9 +4143,8 @@ def lerp_(x, y, weight, name=None): def erfinv(x, name=None): r""" - The inverse error function of x. + The inverse error function of x. Please refer to :ref:`api_paddle_erf` - Equation: .. 
math:: erfinv(erf(x)) = x. @@ -4134,7 +4154,7 @@ def erfinv(x, name=None): name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - out (Tensor): An N-D Tensor, the shape and data type is the same with input. + out (Tensor), an N-D Tensor, the shape and data type is the same with input. Example: .. code-block:: python @@ -4190,8 +4210,8 @@ def rad2deg(x, name=None): .. code-block:: python import paddle - import numpy as np - + import math + x1 = paddle.to_tensor([3.142, -3.142, 6.283, -6.283, 1.570, -1.570]) result1 = paddle.rad2deg(x1) print(result1) @@ -4199,7 +4219,7 @@ def rad2deg(x, name=None): # [180.02334595, -180.02334595, 359.98937988, -359.98937988, # 9.95437622 , -89.95437622]) - x2 = paddle.to_tensor(np.pi/2) + x2 = paddle.to_tensor(math.pi/2) result2 = paddle.rad2deg(x2) print(result2) # Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True, @@ -4236,8 +4256,6 @@ def rad2deg(x, name=None): def deg2rad(x, name=None): r""" Convert each of the elements of input x from degrees to angles in radians. - - Equation: .. math:: deg2rad(x)=\pi * x / 180 @@ -4253,8 +4271,6 @@ def deg2rad(x, name=None): .. code-block:: python import paddle - import numpy as np - x1 = paddle.to_tensor([180.0, -180.0, 360.0, -360.0, 90.0, -90.0]) result1 = paddle.deg2rad(x1) print(result1) @@ -4648,18 +4664,20 @@ def angle(x, name=None): x = paddle.to_tensor([-2, -1, 0, 1]).unsqueeze(-1).astype('float32') y = paddle.to_tensor([-2, -1, 0, 1]).astype('float32') z = x + 1j * y - print(z.numpy()) - # [[-2.-2.j -2.-1.j -2.+0.j -2.+1.j] - # [-1.-2.j -1.-1.j -1.+0.j -1.+1.j] - # [ 0.-2.j 0.-1.j 0.+0.j 0.+1.j] - # [ 1.-2.j 1.-1.j 1.+0.j 1.+1.j]] + print(z) + # Tensor(shape=[4, 4], dtype=complex64, place=Place(cpu), stop_gradient=True, + # [[(-2-2j), (-2-1j), (-2+0j), (-2+1j)], + # [(-1-2j), (-1-1j), (-1+0j), (-1+1j)], + # [-2j , -1j , 0j , 1j ], + # [ (1-2j), (1-1j), (1+0j), (1+1j)]]) theta = paddle.angle(z) - print(theta.numpy()) - # [[-2.3561945 -2.6779451 3.1415927 2.6779451] - # [-2.0344439 -2.3561945 3.1415927 2.3561945] - # [-1.5707964 -1.5707964 0. 1.5707964] - # [-1.1071488 -0.7853982 0. 0.7853982]] + print(theta) + # Tensor(shape=[4, 4], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[-2.35619450, -2.67794514, 3.14159274, 2.67794514], + # [-2.03444386, -2.35619450, 3.14159274, 2.35619450], + # [-1.57079637, -1.57079637, 0. , 1.57079637], + # [-1.10714877, -0.78539819, 0. , 0.78539819]]) """ if in_dygraph_mode(): @@ -4679,18 +4697,18 @@ def angle(x, name=None): return out def heaviside(x, y, name=None): - """ + r""" Computes the Heaviside step function determined by corresponding element in y for each element in x. The equation is .. math:: heaviside(x, y)= \left\{ - \\begin{array}{lcl} - 0,& &\\text{if} \ x < 0, \\\\ - y,& &\\text{if} \ x = 0, \\\\ - 1,& &\\text{if} \ x > 0. + \begin{array}{lcl} + 0,& &\text{if} \ x < 0, \\ + y,& &\text{if} \ x = 0, \\ + 1,& &\text{if} \ x > 0. \end{array} - \\right. + \right. Note: ``paddle.heaviside`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting`. @@ -4716,7 +4734,7 @@ def heaviside(x, y, name=None): paddle.heaviside(x, y) # [[0. , 0.20000000, 1. ], # [0. , 1. , 0.30000001]] - """ + """ op_type = 'elementwise_heaviside' axis = -1 act = None @@ -4740,19 +4758,14 @@ def frac(x, name=None): .. 
code-block:: python import paddle - import numpy as np - - input = paddle.rand([3, 3], 'float32') - print(input.numpy()) - # [[ 1.2203873 -1.0035421 -0.35193074] - # [-0.00928353 0.58917075 -0.8407828 ] - # [-1.5131804 0.5850153 -0.17597814]] + input = paddle.to_tensor([[12.22000003, -1.02999997], + [-0.54999995, 0.66000003]]) output = paddle.frac(input) - print(output.numpy()) - # [[ 0.22038734 -0.00354207 -0.35193074] - # [-0.00928353 0.58917075 -0.8407828 ] - # [-0.5131804 0.5850153 -0.17597814]] + print(output) + # Tensor(shape=[2, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[ 0.22000003, -0.02999997], + # [-0.54999995, 0.66000003]]) """ op_type = 'elementwise_sub' axis = -1 diff --git a/python/paddle/tensor/ops.py b/python/paddle/tensor/ops.py index 4c3f7c55c494bf..f056bda6157f2f 100644 --- a/python/paddle/tensor/ops.py +++ b/python/paddle/tensor/ops.py @@ -14,17 +14,27 @@ from __future__ import print_function import os -from .layer_function_generator import generate_layer_fn, generate_activation_fn, generate_inplace_fn, add_sample_code +from .layer_function_generator import ( + generate_layer_fn, + generate_activation_fn, + generate_inplace_fn, + add_sample_code, +) from ..framework import core from ..framework import convert_np_dtype_to_dtype_ from ..static import Variable -from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype +from ..fluid.data_feeder import ( + convert_dtype, + check_variable_and_dtype, + check_type, + check_dtype, +) from ..fluid.framework import in_dygraph_mode from .. import _C_ops, _legacy_C_ops __deprecated_func_name__ = { 'tanh_shrink': 'tanhshrink', - 'logsigmoid': 'log_sigmoid' + 'logsigmoid': 'log_sigmoid', } __activations_noattr__ = [ @@ -73,9 +83,6 @@ __all__ = [] -for _OP in set(__all__): - globals()[_OP] = generate_layer_fn(_OP) - # It is a hot fix in some unittest using: # fluid.layers.scale(x=x, scale=10.0, out=out_var) # e.g.: test_program_code.py, test_dist_train.py @@ -83,10 +90,6 @@ globals()['_elementwise_div'] = generate_layer_fn('elementwise_div') -__all__ += __activations_noattr__ -__all__ += __unary_func__ -__all__ += __inplace_unary_func__ - for _OP in set(__activations_noattr__): _new_OP = _OP if _OP in __deprecated_func_name__: @@ -109,7 +112,8 @@ globals()[_OP] = _func add_sample_code( - globals()["sigmoid"], r""" + globals()["sigmoid"], + r""" Examples: .. code-block:: python @@ -121,10 +125,12 @@ print(out) # [0.40131234 0.450166 0.52497919 0.57444252] -""") +""", +) add_sample_code( - globals()["silu"], r""" + globals()["silu"], + r""" Examples: .. code-block:: python @@ -136,10 +142,12 @@ print(out) # [ 0.7310586 1.7615942 2.8577224, 3.9280552 ] -""") +""", +) add_sample_code( - globals()["logsigmoid"], r""" + globals()["logsigmoid"], + r""" Examples: .. code-block:: python @@ -151,10 +159,12 @@ print(out) # [-0.91301525 -0.79813887 -0.64439666 -0.55435524] -""") +""", +) add_sample_code( - globals()["exp"], r""" + globals()["exp"], + r""" Examples: .. code-block:: python @@ -165,10 +175,12 @@ print(out) # [0.67032005 0.81873075 1.10517092 1.34985881] -""") +""", +) add_sample_code( - globals()["expm1"], r""" + globals()["expm1"], + r""" Examples: .. code-block:: python @@ -179,10 +191,12 @@ print(out) # [-0.32967997, -0.18126924, 0.10517092, 0.34985882] -""") +""", +) add_sample_code( - globals()["tanh"], r""" + globals()["tanh"], + r""" Examples: .. 
code-block:: python @@ -193,10 +207,12 @@ print(out) # [-0.37994896 -0.19737532 0.09966799 0.29131261] -""") +""", +) add_sample_code( - globals()["atan"], r""" + globals()["atan"], + r""" Examples: .. code-block:: python @@ -207,10 +223,12 @@ print(out) # [-0.38050638 -0.19739556 0.09966865 0.29145679] -""") +""", +) add_sample_code( - globals()["tanh_shrink"], r""" + globals()["tanh_shrink"], + r""" Examples: .. code-block:: python @@ -222,10 +240,12 @@ print(out) # [-0.020051, -0.00262468, 0.000332005, 0.00868739] -""") +""", +) add_sample_code( - globals()["sqrt"], r""" + globals()["sqrt"], + r""" Examples: .. code-block:: python @@ -236,10 +256,12 @@ print(out) # [0.31622777 0.4472136 0.54772256 0.63245553] -""") +""", +) add_sample_code( - globals()["rsqrt"], r""" + globals()["rsqrt"], + r""" Examples: .. code-block:: python @@ -250,10 +272,12 @@ print(out) # [3.16227766 2.23606798 1.82574186 1.58113883] -""") +""", +) add_sample_code( - globals()["abs"], r""" + globals()["abs"], + r""" Examples: .. code-block:: python @@ -264,10 +288,12 @@ print(out) # [0.4 0.2 0.1 0.3] -""") +""", +) add_sample_code( - globals()["ceil"], r""" + globals()["ceil"], + r""" Examples: .. code-block:: python @@ -278,10 +304,12 @@ print(out) # [-0. -0. 1. 1.] -""") +""", +) add_sample_code( - globals()["floor"], r""" + globals()["floor"], + r""" Examples: .. code-block:: python @@ -292,10 +320,12 @@ print(out) # [-1. -1. 0. 0.] -""") +""", +) add_sample_code( - globals()["cos"], r""" + globals()["cos"], + r""" Examples: .. code-block:: python @@ -306,10 +336,12 @@ print(out) # [0.92106099 0.98006658 0.99500417 0.95533649] -""") +""", +) add_sample_code( - globals()["tan"], r""" + globals()["tan"], + r""" Examples: .. code-block:: python @@ -320,10 +352,12 @@ print(out) # [-0.42279324, -0.20271005, 0.10033467, 0.30933627] -""") +""", +) add_sample_code( - globals()["acos"], r""" + globals()["acos"], + r""" Examples: .. code-block:: python @@ -334,10 +368,12 @@ print(out) # [1.98231317 1.77215425 1.47062891 1.26610367] -""") +""", +) add_sample_code( - globals()["sin"], r""" + globals()["sin"], + r""" Examples: .. code-block:: python @@ -348,10 +384,12 @@ print(out) # [-0.38941834 -0.19866933 0.09983342 0.29552021] -""") +""", +) add_sample_code( - globals()["asin"], r""" + globals()["asin"], + r""" Examples: .. code-block:: python @@ -362,10 +400,12 @@ print(out) # [-0.41151685 -0.20135792 0.10016742 0.30469265] -""") +""", +) add_sample_code( - globals()["cosh"], r""" + globals()["cosh"], + r""" Examples: .. code-block:: python @@ -376,10 +416,12 @@ print(out) # [1.08107237 1.02006676 1.00500417 1.04533851] -""") +""", +) add_sample_code( - globals()["sinh"], r""" + globals()["sinh"], + r""" Examples: .. code-block:: python @@ -390,10 +432,12 @@ print(out) # [-0.41075233 -0.201336 0.10016675 0.30452029] -""") +""", +) add_sample_code( - globals()["asinh"], r""" + globals()["asinh"], + r""" Examples: .. code-block:: python @@ -404,10 +448,12 @@ print(out) # [-0.39003533, -0.19869010, 0.09983408, 0.29567307] -""") +""", +) add_sample_code( - globals()["acosh"], r""" + globals()["acosh"], + r""" Examples: .. code-block:: python @@ -418,10 +464,12 @@ print(out) # [0. , 1.76274729, 2.06343699, 2.29243159] -""") +""", +) add_sample_code( - globals()["atanh"], r""" + globals()["atanh"], + r""" Examples: .. code-block:: python @@ -432,10 +480,12 @@ print(out) # [-0.42364895, -0.20273256, 0.10033535, 0.30951962] -""") +""", +) add_sample_code( - globals()["round"], r""" + globals()["round"], + r""" Examples: .. 
code-block:: python @@ -446,10 +496,12 @@ print(out) # [-1. -0. 1. 2.] -""") +""", +) add_sample_code( - globals()["reciprocal"], r""" + globals()["reciprocal"], + r""" Examples: .. code-block:: python @@ -460,10 +512,12 @@ print(out) # [-2.5 -5. 10. 3.33333333] -""") +""", +) add_sample_code( - globals()["square"], r""" + globals()["square"], + r""" Examples: .. code-block:: python @@ -474,10 +528,12 @@ print(out) # [0.16 0.04 0.01 0.09] -""") +""", +) add_sample_code( - globals()["softplus"], r""" + globals()["softplus"], + r""" Examples: .. code-block:: python @@ -489,10 +545,12 @@ print(out) # [0.513015, 0.598139, 0.744397, 0.854355] -""") +""", +) add_sample_code( - globals()["softsign"], r""" + globals()["softsign"], + r""" Examples: .. code-block:: python @@ -504,9 +562,8 @@ print(out) # [-0.285714, -0.166667, 0.0909091, 0.230769] -""") - -__all__ += ['erf'] +""", +) _erf_ = generate_layer_fn('erf') diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index b3e14784c3d1ce..9bde154bfe08f1 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -20,7 +20,12 @@ from ..framework import core from paddle.fluid.framework import _in_legacy_dygraph, in_dygraph_mode from .search import where -from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype +from ..fluid.data_feeder import ( + convert_dtype, + check_variable_and_dtype, + check_type, + check_dtype, +) from ..fluid.layers import utils import paddle from paddle import _C_ops, _legacy_C_ops @@ -86,9 +91,11 @@ def mean(x, axis=None, keepdim=False, name=None): else: if isinstance(axis, int): axis = [axis] - reduce_all = True if axis is None \ - or len(axis)==0 \ - or len(axis) == len(x.shape) else False + reduce_all = ( + True + if axis is None or len(axis) == 0 or len(axis) == len(x.shape) + else False + ) if axis is None or len(axis) == 0: axis = [0] @@ -97,18 +104,27 @@ def mean(x, axis=None, keepdim=False, name=None): axis = list(range(len(x.shape))) return _C_ops.mean(x, axis, keepdim) if _in_legacy_dygraph(): - return _legacy_C_ops.reduce_mean(x, 'dim', axis, 'keep_dim', keepdim, - 'reduce_all', reduce_all) - - check_variable_and_dtype(x, 'x/input', - ['uint16', 'float16', 'float32', 'float64'], - 'mean/reduce_mean') - check_type(axis, 'axis/dim', (int, list, tuple, Variable), - 'mean/reduce_mean') + return _legacy_C_ops.reduce_mean( + x, 'dim', axis, 'keep_dim', keepdim, 'reduce_all', reduce_all + ) + + check_variable_and_dtype( + x, + 'x/input', + ['uint16', 'float16', 'float32', 'float64'], + 'mean/reduce_mean', + ) + check_type( + axis, 'axis/dim', (int, list, tuple, Variable), 'mean/reduce_mean' + ) if isinstance(axis, (list, tuple)): for item in axis: - check_type(item, 'elements of axis/dim', (int, Variable), - 'mean/reduce_mean') + check_type( + item, + 'elements of axis/dim', + (int, Variable), + 'mean/reduce_mean', + ) helper = LayerHelper('mean', **locals()) @@ -116,10 +132,9 @@ def mean(x, axis=None, keepdim=False, name=None): axis = utils._convert_to_tensor_list(axis) attrs = {'dim': axis, 'keep_dim': keepdim, 'reduce_all': reduce_all} out = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='reduce_mean', - inputs={'X': x}, - outputs={'Out': out}, - attrs=attrs) + helper.append_op( + type='reduce_mean', inputs={'X': x}, outputs={'Out': out}, attrs=attrs + ) return out @@ -129,10 +144,10 @@ def var(x, axis=None, unbiased=True, keepdim=False, name=None): Args: x (Tensor): The input Tensor with data type float32, 
float64. - axis (int|list|tuple, optional): The axis along which to perform variance calculations. ``axis`` should be int, list(int) or tuple(int). - - - If ``axis`` is a list/tuple of dimension(s), variance is calculated along all element(s) of ``axis`` . ``axis`` or element(s) of ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . - - If ``axis`` or element(s) of ``axis`` is less than 0, it works the same way as :math:`axis + D` . + axis (int|list|tuple, optional): The axis along which to perform variance calculations. ``axis`` should be int, list(int) or tuple(int). + + - If ``axis`` is a list/tuple of dimension(s), variance is calculated along all element(s) of ``axis`` . ``axis`` or element(s) of ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . + - If ``axis`` or element(s) of ``axis`` is less than 0, it works the same way as :math:`axis + D` . - If ``axis`` is None, variance is calculated over all elements of ``x``. Default is None. unbiased (bool, optional): Whether to use the unbiased estimation. If ``unbiased`` is True, the divisor used in the computation is :math:`N - 1`, where :math:`N` represents the number of elements along ``axis`` , otherwise the divisor is :math:`N`. Default is True. @@ -157,13 +172,17 @@ def var(x, axis=None, unbiased=True, keepdim=False, name=None): check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'var') u = mean(x, axis, True, name) - out = paddle.sum((x - u)**2, axis, keepdim=keepdim, name=name) + out = paddle.sum((x - u) ** 2, axis, keepdim=keepdim, name=name) - n = paddle.cast(paddle.numel(x), x.dtype) \ - / paddle.cast(paddle.numel(out), x.dtype) + dtype = x.dtype + n = paddle.cast(paddle.numel(x), paddle.int64) / paddle.cast( + paddle.numel(out), paddle.int64 + ) + n = n.astype(dtype) if unbiased: one_const = paddle.ones([1], x.dtype) - n = where(n > one_const, n - 1., one_const) + n = where(n > one_const, n - 1.0, one_const) + n.stop_gradient = True out /= n return out @@ -236,7 +255,7 @@ def numel(x, name=None): .. code-block:: python import paddle - + x = paddle.full(shape=[4, 5, 7], fill_value=0, dtype='int32') numel = paddle.numel(x) # 140 @@ -251,7 +270,8 @@ def numel(x, name=None): raise TypeError("x must be a Tensor in numel") helper = LayerHelper('numel', **locals()) out = helper.create_variable_for_type_inference( - dtype=core.VarDesc.VarType.INT64) + dtype=core.VarDesc.VarType.INT64 + ) helper.append_op(type='size', inputs={'Input': x}, outputs={'Out': out}) return out @@ -319,8 +339,9 @@ def nanmedian(x, axis=None, keepdim=True, name=None): ) for i in range(len(axis)): - if not isinstance(axis[i], int) or not (axis[i] < dims - and axis[i] >= -dims): + if not isinstance(axis[i], int) or not ( + axis[i] < dims and axis[i] >= -dims + ): raise ValueError( "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))." 
) @@ -331,25 +352,25 @@ def nanmedian(x, axis=None, keepdim=True, name=None): raise ValueError("Axis has duplicated elements.") if _in_legacy_dygraph(): - median_index, out = _legacy_C_ops.nanmedian(x, 'axis', axis, 'keepdim', - keepdim) + median_index, out = _legacy_C_ops.nanmedian( + x, 'axis', axis, 'keepdim', keepdim + ) return out check_variable_and_dtype( - x, 'X', ['int32', 'int64', 'float16', 'float32', 'float64'], - 'nanmedian') + x, 'X', ['int32', 'int64', 'float16', 'float32', 'float64'], 'nanmedian' + ) helper = LayerHelper('nanmedian', **locals()) attrs = {'axis': axis, 'keepdim': keepdim} out = helper.create_variable_for_type_inference(x.dtype) medians = helper.create_variable_for_type_inference(x.dtype) - helper.append_op(type='nanmedian', - inputs={'X': x}, - outputs={ - 'Out': out, - 'MedianIndex': medians - }, - attrs=attrs) + helper.append_op( + type='nanmedian', + inputs={'X': x}, + outputs={'Out': out, 'MedianIndex': medians}, + attrs=attrs, + ) return out @@ -422,21 +443,22 @@ def median(x, axis=None, keepdim=False, name=None): dtype = 'float64' if x.dtype == core.VarDesc.VarType.FP64 else 'float32' if sz & 1 == 0: out_tensor = paddle.slice( - tensor_topk, axes=[axis], starts=[kth - 1], - ends=[kth]) + paddle.slice( - tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1]) + tensor_topk, axes=[axis], starts=[kth - 1], ends=[kth] + ) + paddle.slice(tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1]) out_tensor = paddle.cast(out_tensor, dtype=dtype) / 2 else: - out_tensor = paddle.cast(paddle.slice(tensor_topk, - axes=[axis], - starts=[kth], - ends=[kth + 1]), - dtype=dtype) + out_tensor = paddle.cast( + paddle.slice( + tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1] + ), + dtype=dtype, + ) out_tensor = out_tensor + paddle.sum( - paddle.cast(paddle.isnan(x), dtype=dtype) * x, axis=axis, keepdim=True) + paddle.cast(paddle.isnan(x), dtype=dtype) * x, axis=axis, keepdim=True + ) if not keepdim or is_flatten: if not is_flatten: - newshape = x.shape[:axis] + x.shape[axis + 1:] + newshape = x.shape[:axis] + x.shape[axis + 1 :] elif not keepdim: newshape = [1] else: @@ -500,7 +522,8 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False): axis_src, axis_dst = [], [] for axis_single in axis: if not isinstance(axis_single, int) or not ( - axis_single < dims and axis_single >= -dims): + axis_single < dims and axis_single >= -dims + ): raise ValueError( "Axis should be None, int, or a list, element should in range [-rank(x), rank(x))." 
) @@ -522,9 +545,9 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False): out_shape[axis] = 1 mask = x.isnan() - valid_counts = mask.logical_not().sum(axis=axis, - keepdim=True, - dtype='float64') + valid_counts = mask.logical_not().sum( + axis=axis, keepdim=True, dtype='float64' + ) indices = [] @@ -551,15 +574,18 @@ def _compute_quantile(x, q, axis=None, keepdim=False, ignore_nan=False): for index in indices: indices_below = paddle.floor(index).astype(paddle.int32) indices_upper = paddle.ceil(index).astype(paddle.int32) - tensor_upper = paddle.take_along_axis(sorted_tensor, - indices_upper, - axis=axis) - tensor_below = paddle.take_along_axis(sorted_tensor, - indices_below, - axis=axis) - weights = (index - indices_below.astype('float64')) - out = paddle.lerp(tensor_below.astype('float64'), - tensor_upper.astype('float64'), weights) + tensor_upper = paddle.take_along_axis( + sorted_tensor, indices_upper, axis=axis + ) + tensor_below = paddle.take_along_axis( + sorted_tensor, indices_below, axis=axis + ) + weights = index - indices_below.astype('float64') + out = paddle.lerp( + tensor_below.astype('float64'), + tensor_upper.astype('float64'), + weights, + ) if not keepdim: out = paddle.squeeze(out, axis=axis) else: @@ -603,32 +629,35 @@ def quantile(x, q, axis=None, keepdim=False): Examples: .. code-block:: python - import numpy as np import paddle - x = np.arange(0, 8, dtype=np.float32).reshape(4, 2) - # [[0 1] - # [2 3] - # [4 5] - # [6 7]] - y = paddle.to_tensor(x) + y = paddle.arange(0, 8 ,dtype="float32").reshape([4, 2]) + # Tensor(shape=[4, 2], dtype=float32, place=Place(cpu), stop_gradient=True, + # [[0., 1.], + # [2., 3.], + # [4., 5.], + # [6., 7.]]) + y1 = paddle.quantile(y, q=0.5, axis=[0, 1]) - # 3.5 + # Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True, + # 3.50000000) y2 = paddle.quantile(y, q=0.5, axis=1) - # [0.5 2.5 4.5 6.5] + # Tensor(shape=[4], dtype=float64, place=Place(cpu), stop_gradient=True, + # [0.50000000, 2.50000000, 4.50000000, 6.50000000]) y3 = paddle.quantile(y, q=[0.3, 0.5], axis=0) - # [[1.8 2.8] - # [3. 4. ]] + # Tensor(shape=[2, 2], dtype=float64, place=Place(cpu), stop_gradient=True, + # [[1.80000000, 2.80000000], + # [3. , 4. ]]) - x[0][0] = np.nan - y = paddle.to_tensor(x) + y[0,0] = float("nan") y4 = paddle.quantile(y, q=0.8, axis=1, keepdim=True) - # [[nan] - # [2.8] - # [4.8] - # [6.8]] + # Tensor(shape=[4, 1], dtype=float64, place=Place(cpu), stop_gradient=True, + # [[nan ], + # [2.80000000], + # [4.80000000], + # [6.80000000]]) """ return _compute_quantile(x, q, axis=axis, keepdim=keepdim, ignore_nan=False) @@ -663,35 +692,37 @@ def nanquantile(x, q, axis=None, keepdim=False): Examples: .. code-block:: python - import numpy as np import paddle - x = np.array( + x = paddle.to_tensor( [[0, 1, 2, 3, 4], - [5, 6, 7, 8, 9]], - dtype=np.float32 - ) - x[0][0] = np.nan + [5, 6, 7, 8, 9]], + dtype="float32") + x[0,0] = float("nan") - x = paddle.to_tensor(x) y1 = paddle.nanquantile(x, q=0.5, axis=[0, 1]) - # 5.0 + # Tensor(shape=[], dtype=float64, place=Place(cpu), stop_gradient=True, + # 5.) y2 = paddle.nanquantile(x, q=0.5, axis=1) - # [2.5 7. ] + # Tensor(shape=[2], dtype=float64, place=Place(cpu), stop_gradient=True, + # [2.50000000, 7. ]) y3 = paddle.nanquantile(x, q=[0.3, 0.5], axis=0) - # [[5. 2.5 3.5 4.5 5.5] - # [5. 3.5 4.5 5.5 6.5] + # Tensor(shape=[2, 5], dtype=float64, place=Place(cpu), stop_gradient=True, + # [[5. , 2.50000000, 3.50000000, 4.50000000, 5.50000000], + # [5. 
, 3.50000000, 4.50000000, 5.50000000, 6.50000000]]) y4 = paddle.nanquantile(x, q=0.8, axis=1, keepdim=True) - # [[3.4] - # [8.2]] + # Tensor(shape=[2, 1], dtype=float64, place=Place(cpu), stop_gradient=True, + # [[3.40000000], + # [8.20000000]]) - nan = paddle.full(shape=[2, 3], fill_value=np.nan) + nan = paddle.full(shape=[2, 3], fill_value=float("nan")) y5 = paddle.nanquantile(nan, q=0.8, axis=1, keepdim=True) - # [[nan] - # [nan]] + # Tensor(shape=[2, 1], dtype=float64, place=Place(cpu), stop_gradient=True, + # [[nan], + # [nan]]) """ return _compute_quantile(x, q, axis=axis, keepdim=keepdim, ignore_nan=True) diff --git a/python/paddle/tests/test_audio_backend.py b/python/paddle/tests/test_audio_backend.py new file mode 100644 index 00000000000000..79e793e2dc8653 --- /dev/null +++ b/python/paddle/tests/test_audio_backend.py @@ -0,0 +1,153 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import soundfile +import numpy as np +import os +import paddle.audio + + +class TestAudioBackends(unittest.TestCase): + + def setUp(self): + self.initParmas() + + def initParmas(self): + + def get_wav_data(dtype: str, num_channels: int, num_frames: int): + dtype_ = getattr(paddle, dtype) + base = paddle.linspace(-1.0, 1.0, num_frames, dtype=dtype_) * 0.1 + data = base.tile([num_channels, 1]) + return data + + self.duration = 0.5 + self.num_channels = 1 + self.sr = 16000 + self.dtype = "float32" + self.window_size = 1024 + waveform_tensor = get_wav_data(self.dtype, + self.num_channels, + num_frames=self.duration * self.sr) + # shape (1, 8000) + self.waveform = waveform_tensor.numpy() + + def test_backend(self): + base_dir = os.getcwd() + wave_wav_path = os.path.join(base_dir, "wave_test.wav") + paddle.audio.save(wave_wav_path, + paddle.to_tensor(self.waveform), + self.sr, + channels_first=True) + + # test backends(wave)(wave_backend) info + wav_info = paddle.audio.info(wave_wav_path) + self.assertTrue(wav_info.sample_rate, self.sr) + self.assertTrue(wav_info.num_channels, self.num_channels) + self.assertTrue(wav_info.bits_per_sample, 16) + + with open(wave_wav_path, 'rb') as file_: + wav_info = paddle.audio.info(file_) + self.assertTrue(wav_info.sample_rate, self.sr) + self.assertTrue(wav_info.num_channels, self.num_channels) + self.assertTrue(wav_info.bits_per_sample, 16) + + # test backends(wave_backend) load & save + wav_data, sr = paddle.audio.load(wave_wav_path) + np.testing.assert_array_almost_equal(wav_data, self.waveform, decimal=4) + with soundfile.SoundFile(wave_wav_path, "r") as file_: + dtype = "float32" + frames = file_._prepare_read(0, None, -1) + waveform = file_.read(frames, dtype, always_2d=True) + waveform = waveform.T + np.testing.assert_array_almost_equal(wav_data, waveform) + + with open(wave_wav_path, 'rb') as file_: + wav_data, sr = paddle.audio.load(file_, + normalize=False, + num_frames=10000) + with soundfile.SoundFile(wave_wav_path, "r") as file_: + dtype = "int16" + frames = 
file_._prepare_read(0, None, -1) + waveform = file_.read(frames, dtype, always_2d=True) + waveform = waveform.T + np.testing.assert_array_almost_equal(wav_data, waveform) + + current_backend = paddle.audio.backends.get_current_backend() + self.assertTrue(current_backend in ["wave_backend", "soundfile"]) + + paddle.audio.backends.set_backend("wave_backend") + + backends = paddle.audio.backends.list_available_backends() + for backend in backends: + self.assertTrue(backend in ["wave_backend", "soundfile"]) + + # Test error + try: + paddle.audio.backends.set_backend("jfiji") + except NotImplementedError: + pass + + try: + import paddleaudio + backends = paddle.audio.backends.list_available_backends() + for backend in backends: + self.assertTrue(backend in ["wave_backend", "soundfile"]) + current_backend = paddle.audio.backends.get_current_backend() + self.assertTrue(current_backend, "wave_backend") + paddleaudio.backends.set_audio_backend("soundfile") + paddle.audio.backends.set_backend("soundfile") + current_backend = paddle.audio.backends.get_current_backend() + self.assertTrue(current_backend, "soundfile") + wav_info = paddle.audio.info(wave_wav_path) + self.assertTrue(wav_info.sample_rate, self.sr) + self.assertTrue(wav_info.num_channels, self.num_channels) + self.assertTrue(wav_info.bits_per_sample, 16) + paddle.audio.backends.set_backend("wave_backend") + except ImportError: + pass + + try: + paddle.audio.save(wave_wav_path, + paddle.to_tensor(self.waveform), + self.sr, + bits_per_sample=24, + channels_first=True) + except ValueError: + pass + + try: + paddle.audio.save(wave_wav_path, + paddle.to_tensor(self.waveform).unsqueeze(0), + self.sr) + except AssertionError: + pass + + fake_data = np.array([0, 1, 2, 3, 4, 6], np.float32) + soundfile.write(wave_wav_path, fake_data, 1, subtype="DOUBLE") + try: + wav_info = paddle.audio.info(wave_wav_path) + except NotImplementedError: + pass + try: + wav_data = paddle.audio.load(wave_wav_path) + except NotImplementedError: + pass + + if os.path.exists(wave_wav_path): + os.remove(wave_wav_path) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tests/test_audio_datasets.py b/python/paddle/tests/test_audio_datasets.py new file mode 100644 index 00000000000000..59ba1d543bda69 --- /dev/null +++ b/python/paddle/tests/test_audio_datasets.py @@ -0,0 +1,123 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
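(Editor's note, not part of the diff.) The new ``test_audio_backend.py`` above exercises the wave backend end to end; as a minimal sketch that reuses only the calls shown in that test, the save/info/load round trip looks roughly like this, with ``demo.wav`` as a placeholder path:

.. code-block:: python

    import paddle
    import paddle.audio

    # Inspect and pin the audio backend; "soundfile" is also listed when
    # the optional paddleaudio package is installed.
    print(paddle.audio.backends.list_available_backends())
    paddle.audio.backends.set_backend("wave_backend")

    # A 0.5 s single-channel ramp signal, shaped (channels, frames).
    sr = 16000
    base = paddle.linspace(-1.0, 1.0, sr // 2, dtype=paddle.float32) * 0.1
    waveform = base.unsqueeze(0)

    # Write the tensor, query the file metadata, and read it back.
    paddle.audio.save("demo.wav", waveform, sr, channels_first=True)
    meta = paddle.audio.info("demo.wav")
    print(meta.sample_rate, meta.num_channels, meta.bits_per_sample)
    data, loaded_sr = paddle.audio.load("demo.wav")  # normalized float32 by default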
+import unittest + +import numpy as np +import paddle +import itertools +from parameterized import parameterized + + +def parameterize(*params): + return parameterized.expand(list(itertools.product(*params))) + + +class TestAudioDatasets(unittest.TestCase): + + @parameterize(["dev", "train"], [40, 64]) + def test_tess_dataset(self, mode: str, params: int): + """ + TESS dataset + Reference: + Toronto emotional speech set (TESS) https://tspace.library.utoronto.ca/handle/1807/24487 + https://doi.org/10.5683/SP2/E8H2MF + """ + archive = { + 'url': + 'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set_lite.zip', + 'md5': '9ffb5e3adf28d4d6b787fa94bd59b975', + } # small part of TESS dataset for test. + tess_dataset = paddle.audio.datasets.TESS(mode=mode, + feat_type='mfcc', + n_mfcc=params, + archive=archive) + idx = np.random.randint(0, 30) + elem = tess_dataset[idx] + self.assertTrue(elem[0].shape[0] == params) + self.assertTrue(0 <= elem[1] <= 6) + + tess_dataset = paddle.audio.datasets.TESS(mode=mode, + feat_type='spectrogram', + n_fft=params) + elem = tess_dataset[idx] + self.assertTrue(elem[0].shape[0] == (params // 2 + 1)) + self.assertTrue(0 <= elem[1] <= 6) + + tess_dataset = paddle.audio.datasets.TESS(mode="dev", + feat_type='logmelspectrogram', + n_mels=params) + elem = tess_dataset[idx] + self.assertTrue(elem[0].shape[0] == params) + self.assertTrue(0 <= elem[1] <= 6) + + tess_dataset = paddle.audio.datasets.TESS(mode="dev", + feat_type='melspectrogram', + n_mels=params) + elem = tess_dataset[idx] + self.assertTrue(elem[0].shape[0] == params) + self.assertTrue(0 <= elem[1] <= 6) + + @parameterize(["dev", "train"], [40, 64]) + def test_esc50_dataset(self, mode: str, params: int): + """ + ESC50 dataset + Reference: + ESC: Dataset for Environmental Sound Classification + http://dx.doi.org/10.1145/2733373.2806390 + """ + archive = { + 'url': + 'https://bj.bcebos.com/paddleaudio/datasets/ESC-50-master-lite.zip', + 'md5': '1e9ba53265143df5b2804a743f2d1956', + } # small part of ESC50 dataset for test. + esc50_dataset = paddle.audio.datasets.ESC50(mode=mode, + feat_type='raw', + archive=archive) + idx = np.random.randint(0, 6) + elem = esc50_dataset[idx] + self.assertTrue(elem[0].shape[0] == 220500) + self.assertTrue(0 <= elem[1] <= 2) + + esc50_dataset = paddle.audio.datasets.ESC50(mode=mode, + feat_type='mfcc', + n_mfcc=params, + archive=archive) + idx = np.random.randint(0, 6) + elem = esc50_dataset[idx] + self.assertTrue(elem[0].shape[0] == params) + self.assertTrue(0 <= elem[1] <= 2) + + esc50_dataset = paddle.audio.datasets.ESC50(mode=mode, + feat_type='spectrogram', + n_fft=params) + elem = esc50_dataset[idx] + self.assertTrue(elem[0].shape[0] == (params // 2 + 1)) + self.assertTrue(0 <= elem[1] <= 2) + + esc50_dataset = paddle.audio.datasets.ESC50( + mode=mode, feat_type='logmelspectrogram', n_mels=params) + elem = esc50_dataset[idx] + self.assertTrue(elem[0].shape[0] == params) + self.assertTrue(0 <= elem[1] <= 2) + + esc50_dataset = paddle.audio.datasets.ESC50(mode=mode, + feat_type='melspectrogram', + n_mels=params) + elem = esc50_dataset[idx] + self.assertTrue(elem[0].shape[0] == params) + self.assertTrue(0 <= elem[1] <= 2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tests/test_audio_functions.py b/python/paddle/tests/test_audio_functions.py new file mode 100644 index 00000000000000..da1009558ff506 --- /dev/null +++ b/python/paddle/tests/test_audio_functions.py @@ -0,0 +1,320 @@ +# Copyright (c) 2022 PaddlePaddle Authors. 
All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import librosa +import numpy as np +import os +import paddle + +import paddle.audio +from scipy import signal +import itertools +from parameterized import parameterized + + +def parameterize(*params): + return parameterized.expand(list(itertools.product(*params))) + + +class TestAudioFuncitons(unittest.TestCase): + def setUp(self): + self.initParmas() + + def initParmas(self): + def get_wav_data(dtype: str, num_channels: int, num_frames: int): + dtype_ = getattr(paddle, dtype) + base = paddle.linspace(-1.0, 1.0, num_frames, dtype=dtype_) * 0.1 + data = base.tile([num_channels, 1]) + return data + + self.n_fft = 512 + self.hop_length = 128 + self.n_mels = 40 + self.n_mfcc = 20 + self.fmin = 0.0 + self.window_str = 'hann' + self.pad_mode = 'reflect' + self.top_db = 80.0 + self.duration = 0.5 + self.num_channels = 1 + self.sr = 16000 + self.dtype = "float32" + self.window_size = 1024 + waveform_tensor = get_wav_data( + self.dtype, self.num_channels, num_frames=self.duration * self.sr + ) + self.waveform = waveform_tensor.numpy() + + @parameterize([1.0, 3.0, 9.0, 25.0], [True, False]) + def test_audio_function(self, val: float, htk_flag: bool): + mel_paddle = paddle.audio.functional.hz_to_mel(val, htk_flag) + mel_paddle_tensor = paddle.audio.functional.hz_to_mel( + paddle.to_tensor(val), htk_flag + ) + mel_librosa = librosa.hz_to_mel(val, htk_flag) + np.testing.assert_almost_equal(mel_paddle, mel_librosa, decimal=5) + np.testing.assert_almost_equal( + mel_paddle_tensor.numpy(), mel_librosa, decimal=4 + ) + + hz_paddle = paddle.audio.functional.mel_to_hz(val, htk_flag) + hz_paddle_tensor = paddle.audio.functional.mel_to_hz( + paddle.to_tensor(val), htk_flag + ) + hz_librosa = librosa.mel_to_hz(val, htk_flag) + np.testing.assert_almost_equal(hz_paddle, hz_librosa, decimal=4) + np.testing.assert_almost_equal( + hz_paddle_tensor.numpy(), hz_librosa, decimal=4 + ) + + decibel_paddle = paddle.audio.functional.power_to_db( + paddle.to_tensor(val) + ) + decibel_librosa = librosa.power_to_db(val) + np.testing.assert_almost_equal( + decibel_paddle.numpy(), decibel_paddle, decimal=5 + ) + + @parameterize( + [64, 128, 256], [0.0, 0.5, 1.0], [10000, 11025], [False, True] + ) + def test_audio_function_mel( + self, n_mels: int, f_min: float, f_max: float, htk_flag: bool + ): + librosa_mel_freq = librosa.mel_frequencies( + n_mels, f_min, f_max, htk_flag + ) + paddle_mel_freq = paddle.audio.functional.mel_frequencies( + n_mels, f_min, f_max, htk_flag, 'float64' + ) + np.testing.assert_almost_equal( + paddle_mel_freq, librosa_mel_freq, decimal=3 + ) + + @parameterize([8000, 16000], [64, 128, 256]) + def test_audio_function_fft(self, sr: int, n_fft: int): + librosa_fft = librosa.fft_frequencies(sr, n_fft) + paddle_fft = paddle.audio.functional.fft_frequencies(sr, n_fft) + np.testing.assert_almost_equal(paddle_fft, librosa_fft, decimal=5) + + @parameterize([1.0, 3.0, 9.0]) + def test_audio_function_exception(self, spect: float): 
+ try: + paddle.audio.functional.power_to_db( + paddle.to_tensor([spect]), amin=0 + ) + except Exception: + pass + + try: + paddle.audio.functional.power_to_db( + paddle.to_tensor([spect]), ref_value=0 + ) + + except Exception: + pass + + try: + paddle.audio.functional.power_to_db( + paddle.to_tensor([spect]), top_db=-1 + ) + except Exception: + pass + + @parameterize( + [ + "hamming", + "hann", + "triang", + "bohman", + "blackman", + "cosine", + "tukey", + "taylor", + ], + [1, 512], + ) + def test_window(self, window_type: str, n_fft: int): + window_scipy = signal.get_window(window_type, n_fft) + window_paddle = paddle.audio.functional.get_window(window_type, n_fft) + np.testing.assert_array_almost_equal( + window_scipy, window_paddle.numpy(), decimal=5 + ) + + @parameterize([1, 512]) + def test_gaussian_window_and_exception(self, n_fft: int): + window_scipy_gaussain = signal.windows.gaussian(n_fft, std=7) + window_paddle_gaussian = paddle.audio.functional.get_window( + ('gaussian', 7), n_fft, False + ) + np.testing.assert_array_almost_equal( + window_scipy_gaussain, window_paddle_gaussian.numpy(), decimal=5 + ) + window_scipy_general_gaussain = signal.windows.general_gaussian( + n_fft, 1, 7 + ) + window_paddle_general_gaussian = paddle.audio.functional.get_window( + ('general_gaussian', 1, 7), n_fft, False + ) + np.testing.assert_array_almost_equal( + window_scipy_gaussain, window_paddle_gaussian.numpy(), decimal=5 + ) + + window_scipy_exp = signal.windows.exponential(n_fft) + window_paddle_exp = paddle.audio.functional.get_window( + ('exponential', None, 1), n_fft, False + ) + np.testing.assert_array_almost_equal( + window_scipy_exp, window_paddle_exp.numpy(), decimal=5 + ) + try: + window_paddle = paddle.audio.functional.get_window("hann", -1) + except ValueError: + pass + + try: + window_paddle = paddle.audio.functional.get_window( + "fake_window", self.n_fft + ) + except ValueError: + pass + + try: + window_paddle = paddle.audio.functional.get_window(1043, self.n_fft) + except ValueError: + pass + + @parameterize([5, 13, 23], [257, 513, 1025]) + def test_create_dct(self, n_mfcc: int, n_mels: int): + def dct(n_filters, n_input): + basis = np.empty((n_filters, n_input)) + basis[0, :] = 1.0 / np.sqrt(n_input) + samples = np.arange(1, 2 * n_input, 2) * np.pi / (2.0 * n_input) + + for i in range(1, n_filters): + basis[i, :] = np.cos(i * samples) * np.sqrt(2.0 / n_input) + return basis.T + + librosa_dct = dct(n_mfcc, n_mels) + paddle_dct = paddle.audio.functional.create_dct(n_mfcc, n_mels) + np.testing.assert_array_almost_equal(librosa_dct, paddle_dct, decimal=5) + + @parameterize( + [128, 256, 512], ["hamming", "hann", "triang", "bohman"], [True, False] + ) + def test_stft_and_spect( + self, n_fft: int, window_str: str, center_flag: bool + ): + hop_length = int(n_fft / 4) + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0 + ) # 1D input for librosa.feature.melspectrogram + feature_librosa = librosa.core.stft( + y=self.waveform, + n_fft=n_fft, + hop_length=hop_length, + win_length=None, + window=window_str, + center=center_flag, + dtype=None, + pad_mode=self.pad_mode, + ) + x = paddle.to_tensor(self.waveform).unsqueeze(0) + window = paddle.audio.functional.get_window( + window_str, n_fft, dtype=x.dtype + ) + feature_paddle = paddle.signal.stft( + x=x, + n_fft=n_fft, + hop_length=hop_length, + win_length=None, + window=window, + center=center_flag, + pad_mode=self.pad_mode, + normalized=False, + onesided=True, + ).squeeze(0) + 
np.testing.assert_array_almost_equal( + feature_librosa, feature_paddle, decimal=5 + ) + + feature_bg = np.power(np.abs(feature_librosa), 2.0) + feature_extractor = paddle.audio.features.Spectrogram( + n_fft=n_fft, + hop_length=hop_length, + win_length=None, + window=window_str, + power=2.0, + center=center_flag, + pad_mode=self.pad_mode, + ) + feature_layer = feature_extractor(x).squeeze(0) + np.testing.assert_array_almost_equal( + feature_layer, feature_bg, decimal=3 + ) + + @parameterize( + [128, 256, 512], [64, 82], ["hamming", "hann", "triang", "bohman"] + ) + def test_istft(self, n_fft: int, hop_length: int, window_str: str): + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0 + ) # 1D input for librosa.feature.melspectrogram + # librosa + # Get stft result from librosa. + stft_matrix = librosa.core.stft( + y=self.waveform, + n_fft=n_fft, + hop_length=hop_length, + win_length=None, + window=window_str, + center=True, + pad_mode=self.pad_mode, + ) + feature_librosa = librosa.core.istft( + stft_matrix=stft_matrix, + hop_length=hop_length, + win_length=None, + window=window_str, + center=True, + dtype=None, + length=None, + ) + x = paddle.to_tensor(stft_matrix).unsqueeze(0) + window = paddle.audio.functional.get_window( + window_str, n_fft, dtype=paddle.to_tensor(self.waveform).dtype + ) + feature_paddle = paddle.signal.istft( + x=x, + n_fft=n_fft, + hop_length=hop_length, + win_length=None, + window=window, + center=True, + normalized=False, + onesided=True, + length=None, + return_complex=False, + ).squeeze(0) + + np.testing.assert_array_almost_equal( + feature_librosa, feature_paddle, decimal=5 + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tests/test_audio_logmel_feature.py b/python/paddle/tests/test_audio_logmel_feature.py new file mode 100644 index 00000000000000..a89dc583c3d583 --- /dev/null +++ b/python/paddle/tests/test_audio_logmel_feature.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
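(Editor's note, not part of the diff.) ``test_audio_functions.py`` above cross-checks the window, STFT, and spectrogram utilities against scipy and librosa; this is a minimal sketch of how those pieces fit together, assuming a toy random input in place of the test's ramp signal:

.. code-block:: python

    import paddle
    import paddle.audio

    x = paddle.randn([1, 16000])  # (batch, samples), assumed toy input
    n_fft, hop_length = 512, 128

    # Window generation mirrors scipy.signal.get_window.
    window = paddle.audio.functional.get_window("hann", n_fft, dtype=x.dtype)

    # Complex framewise STFT, validated against librosa.core.stft in the test.
    spec = paddle.signal.stft(
        x, n_fft=n_fft, hop_length=hop_length, window=window,
        center=True, pad_mode="reflect", onesided=True,
    )

    # The layer API wraps the same transform and returns a power spectrogram
    # of shape (batch, n_fft // 2 + 1, frames).
    extractor = paddle.audio.features.Spectrogram(
        n_fft=n_fft, hop_length=hop_length, window="hann", power=2.0,
        center=True, pad_mode="reflect",
    )
    power_spec = extractor(x)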
+import unittest + +import librosa +import numpy as np +import os +import paddle + +import paddle.audio +import scipy +from scipy import signal +import itertools +from parameterized import parameterized + + +def parameterize(*params): + return parameterized.expand(list(itertools.product(*params))) + + +class TestFeatures(unittest.TestCase): + + def setUp(self): + self.initParmas() + + def initParmas(self): + + def get_wav_data(dtype: str, num_channels: int, num_frames: int): + dtype_ = getattr(paddle, dtype) + base = paddle.linspace(-1.0, 1.0, num_frames, dtype=dtype_) * 0.1 + data = base.tile([num_channels, 1]) + return data + + self.fmin = 0.0 + self.top_db = 80.0 + self.duration = 0.5 + self.num_channels = 1 + self.sr = 16000 + self.dtype = "float32" + waveform_tensor = get_wav_data(self.dtype, + self.num_channels, + num_frames=self.duration * self.sr) + self.waveform = waveform_tensor.numpy() + + @parameterize([16000], ["hamming", "bohman"], [128], [128, 64], [64, 32], + [0.0, 50.0]) + def test_log_melspect(self, sr: int, window_str: str, n_fft: int, + hop_length: int, n_mels: int, fmin: float): + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0) # 1D input for librosa.feature.melspectrogram + + # librosa: + feature_librosa = librosa.feature.melspectrogram(y=self.waveform, + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + window=window_str, + n_mels=n_mels, + center=True, + fmin=fmin, + pad_mode='reflect') + feature_librosa = librosa.power_to_db(feature_librosa, top_db=None) + x = paddle.to_tensor(self.waveform, dtype=paddle.float64).unsqueeze( + 0) # Add batch dim. + feature_extractor = paddle.audio.features.LogMelSpectrogram( + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + window=window_str, + center=True, + n_mels=n_mels, + f_min=fmin, + top_db=None, + dtype=x.dtype) + feature_layer = feature_extractor(x).squeeze(0).numpy() + np.testing.assert_array_almost_equal(feature_librosa, + feature_layer, + decimal=2) + # relative difference + np.testing.assert_allclose(feature_librosa, feature_layer, rtol=1e-4) + + @parameterize([16000], [256, 128], [40, 64], [64, 128], + ['float32', 'float64']) + def test_mfcc(self, sr: int, n_fft: int, n_mfcc: int, n_mels: int, + dtype: str): + if paddle.version.cuda() != 'False': + if float(paddle.version.cuda()) >= 11.0: + return + + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0) # 1D input for librosa.feature.melspectrogram + + # librosa: + np_dtype = getattr(np, dtype) + feature_librosa = librosa.feature.mfcc(y=self.waveform, + sr=sr, + S=None, + n_mfcc=n_mfcc, + dct_type=2, + lifter=0, + n_fft=n_fft, + hop_length=64, + n_mels=n_mels, + fmin=50.0, + dtype=np_dtype) + # paddlespeech.audio.features.layer + x = paddle.to_tensor(self.waveform, + dtype=dtype).unsqueeze(0) # Add batch dim. + feature_extractor = paddle.audio.features.MFCC(sr=sr, + n_mfcc=n_mfcc, + n_fft=n_fft, + hop_length=64, + n_mels=n_mels, + top_db=self.top_db, + dtype=x.dtype) + feature_layer = feature_extractor(x).squeeze(0).numpy() + + np.testing.assert_array_almost_equal(feature_librosa, + feature_layer, + decimal=3) + + np.testing.assert_allclose(feature_librosa, feature_layer, rtol=1e-1) + + # split mffcc: logmel-->dct --> mfcc, which prove the difference. + # the dct module is correct. 
+ feature_extractor = paddle.audio.features.LogMelSpectrogram( + sr=sr, + n_fft=n_fft, + hop_length=64, + n_mels=n_mels, + center=True, + pad_mode='reflect', + top_db=self.top_db, + dtype=x.dtype) + feature_layer_logmel = feature_extractor(x).squeeze(0).numpy() + + feature_layer_mfcc = scipy.fftpack.dct(feature_layer_logmel, + axis=0, + type=2, + norm="ortho")[:n_mfcc] + np.testing.assert_array_almost_equal(feature_layer_mfcc, + feature_librosa, + decimal=3) + np.testing.assert_allclose(feature_layer_mfcc, + feature_librosa, + rtol=1e-1) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tests/test_audio_mel_feature.py b/python/paddle/tests/test_audio_mel_feature.py new file mode 100644 index 00000000000000..427e9864117cd3 --- /dev/null +++ b/python/paddle/tests/test_audio_mel_feature.py @@ -0,0 +1,117 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import unittest + +import librosa +import numpy as np +import os +import paddle + +import paddle.audio +from scipy import signal +import itertools +from parameterized import parameterized + + +def parameterize(*params): + return parameterized.expand(list(itertools.product(*params))) + + +class TestFeatures(unittest.TestCase): + + def setUp(self): + self.initParmas() + + def initParmas(self): + + def get_wav_data(dtype: str, num_channels: int, num_frames: int): + dtype_ = getattr(paddle, dtype) + base = paddle.linspace(-1.0, 1.0, num_frames, dtype=dtype_) * 0.1 + data = base.tile([num_channels, 1]) + return data + + self.hop_length = 128 + self.duration = 0.5 + self.num_channels = 1 + self.sr = 16000 + self.dtype = "float32" + waveform_tensor = get_wav_data(self.dtype, + self.num_channels, + num_frames=self.duration * self.sr) + self.waveform = waveform_tensor.numpy() + + @parameterize([8000], [128, 256], [64, 32], [0.0, 1.0], + ['float32', 'float64']) + def test_mel(self, sr: int, n_fft: int, n_mels: int, fmin: float, + dtype: str): + feature_librosa = librosa.filters.mel( + sr=sr, + n_fft=n_fft, + n_mels=n_mels, + fmin=fmin, + fmax=None, + htk=False, + norm='slaney', + dtype=np.dtype(dtype), + ) + paddle_dtype = getattr(paddle, dtype) + feature_functional = paddle.audio.functional.compute_fbank_matrix( + sr=sr, + n_fft=n_fft, + n_mels=n_mels, + f_min=fmin, + f_max=None, + htk=False, + norm='slaney', + dtype=paddle_dtype, + ) + + np.testing.assert_array_almost_equal(feature_librosa, + feature_functional) + + @parameterize([8000, 16000], [128, 256], [64, 82], [40, 80], [False, True]) + def test_melspect(self, sr: int, n_fft: int, hop_length: int, n_mels: int, + htk: bool): + if len(self.waveform.shape) == 2: # (C, T) + self.waveform = self.waveform.squeeze( + 0) # 1D input for librosa.feature.melspectrogram + + # librosa: + feature_librosa = librosa.feature.melspectrogram(y=self.waveform, + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + n_mels=n_mels, + htk=htk, + fmin=50.0) + + # paddle.audio.features.layer + x = paddle.to_tensor(self.waveform, 
dtype=paddle.float64).unsqueeze( + 0) # Add batch dim. + feature_extractor = paddle.audio.features.MelSpectrogram( + sr=sr, + n_fft=n_fft, + hop_length=hop_length, + n_mels=n_mels, + htk=htk, + dtype=x.dtype) + feature_layer = feature_extractor(x).squeeze(0).numpy() + + np.testing.assert_array_almost_equal(feature_librosa, + feature_layer, + decimal=5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tests/test_dist_hapi_model.py b/python/paddle/tests/test_dist_hapi_model.py index 895d2bc0c478a3..b286e3906a9e11 100644 --- a/python/paddle/tests/test_dist_hapi_model.py +++ b/python/paddle/tests/test_dist_hapi_model.py @@ -21,7 +21,7 @@ import subprocess import paddle.fluid as fluid -from paddle.distributed.utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc +from paddle.distributed.utils.launch_utils import find_free_ports, watch_local_trainers, get_cluster, TrainerProc def get_cluster_from_args(selected_gpus): diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index 0367e9ed3e3544..a8821e7a03e0f8 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -22,14 +22,46 @@ from setuptools.command.build_ext import build_ext from distutils.command.build import build -from .extension_utils import find_cuda_home, find_rocm_home, normalize_extension_kwargs, add_compile_flag, run_cmd -from .extension_utils import is_cuda_file, prepare_unix_cudaflags, prepare_win_cudaflags -from .extension_utils import _import_module_from_library, _write_setup_file, _jit_compile -from .extension_utils import check_abi_compatibility, log_v, CustomOpInfo, parse_op_name_from -from .extension_utils import clean_object_if_change_cflags, _reset_so_rpath, _get_fluid_path -from .extension_utils import bootstrap_context, get_build_directory, add_std_without_repeat - -from .extension_utils import IS_WINDOWS, OS_NAME, MSVC_COMPILE_FLAGS, MSVC_COMPILE_FLAGS +from .extension_utils import ( + find_cuda_home, + find_rocm_home, + normalize_extension_kwargs, + add_compile_flag, + run_cmd, +) +from .extension_utils import ( + is_cuda_file, + prepare_unix_cudaflags, + prepare_win_cudaflags, +) +from .extension_utils import ( + _import_module_from_library, + _write_setup_file, + _jit_compile, +) +from .extension_utils import ( + check_abi_compatibility, + log_v, + CustomOpInfo, + parse_op_name_from, +) +from .extension_utils import ( + clean_object_if_change_cflags, + _reset_so_rpath, + _get_fluid_path, +) +from .extension_utils import ( + bootstrap_context, + get_build_directory, + add_std_without_repeat, +) + +from .extension_utils import ( + IS_WINDOWS, + OS_NAME, + MSVC_COMPILE_FLAGS, + MSVC_COMPILE_FLAGS, +) from .extension_utils import CLANG_COMPILE_FLAGS, CLANG_LINK_FLAGS from ...fluid import core @@ -40,6 +72,7 @@ if IS_WINDOWS and six.PY3: from distutils.command.build_ext import build_ext as _du_build_ext from unittest.mock import Mock + _du_build_ext.get_export_symbols = Mock(return_value=None) CUDA_HOME = find_cuda_home() @@ -51,33 +84,33 @@ def setup(**attr): """ The interface is used to config the process of compiling customized operators, - mainly includes how to compile shared library, automatically generate python API + mainly includes how to compile shared library, automatically generate python API and install it into site-package. It supports using customized operators directly with ``import`` statement. 
It encapsulates the python built-in ``setuptools.setup`` function and keeps arguments and usage same as the native interface. Meanwhile, it hiddens Paddle inner framework concepts, such as necessary compiling flags, included paths of head files, and linking - flags. It also will automatically search and valid local environment and versions of - ``cc(Linux)`` , ``cl.exe(Windows)`` and ``nvcc`` , then compiles customized operators + flags. It also will automatically search and valid local environment and versions of + ``cc(Linux)`` , ``cl.exe(Windows)`` and ``nvcc`` , then compiles customized operators supporting CPU or GPU device according to the specified Extension type. - Moreover, `ABI compatibility `_ + Moreover, `ABI compatibility `_ will be checked to ensure that compiler version from ``cc(Linux)`` , ``cl.exe(Windows)`` on local machine is compatible with pre-installed Paddle whl in python site-packages. - For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, - then the version of user's local machine should satisfy GCC >= 8.2. - For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of - PaddlePaddle (Visual Studio 2017). - If the above conditions are not met, the corresponding warning will be printed, and a fatal error may + For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, + then the version of user's local machine should satisfy GCC >= 8.2. + For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of + PaddlePaddle (Visual Studio 2017). + If the above conditions are not met, the corresponding warning will be printed, and a fatal error may occur because of ABI compatibility. - .. note:: - + Note: + 1. Currently we support Linux, MacOS and Windows platfrom. 2. On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` . - Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking + Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking GCC version. 3. On Windows platform, we recommend to install `` Visual Studio`` (>=2017). @@ -86,11 +119,11 @@ def setup(**attr): ``python setup.py install`` . Then customized operators API will be available everywhere after importing it. - A simple example of ``setup.py`` as followed: + A simple example of ``setup.py`` as followed: .. code-block:: text - # setup.py + # setup.py # Case 1: Compiling customized operators supporting CPU and GPU devices from paddle.utils.cpp_extension import CUDAExtension, setup @@ -124,11 +157,11 @@ def setup(**attr): x = paddle.randn([4, 10], dtype='float32') relu_out = relu(x) tanh_out = tanh(x) - + Args: name(str): Specify the name of shared library file and installed python package. - ext_modules(Extension): Specify the Extension instance including customized operator source files, compiling flags et.al. + ext_modules(Extension): Specify the Extension instance including customized operator source files, compiling flags et.al. If only compile operator supporting CPU device, please use ``CppExtension`` ; If compile operator supporting CPU and GPU devices, please use ``CUDAExtension`` . include_dirs(list[str], optional): Specify the extra include directories to search head files. The interface will automatically add @@ -139,7 +172,7 @@ def setup(**attr): compiler using dict type with ``{'cxx': [...], 'nvcc': [...]}`` . 
Default is None. **attr(dict, optional): Specify other arguments same as ``setuptools.setup`` . - Returns: + Returns: None """ @@ -148,7 +181,8 @@ def setup(**attr): # if not specific cmdclass in setup, add it automatically. if 'build_ext' not in cmdclass: cmdclass['build_ext'] = BuildExtension.with_options( - no_python_abi_suffix=True) + no_python_abi_suffix=True + ) attr['cmdclass'] = cmdclass error_msg = """ @@ -168,17 +202,19 @@ def setup(**attr): if 'name' not in attr: raise ValueError(error_msg) - assert not attr['name'].endswith('module'), \ - "Please don't use 'module' as suffix in `name` argument, " + assert not attr['name'].endswith( + 'module' + ), "Please don't use 'module' as suffix in `name` argument, " "it will be stripped in setuptools.bdist_egg and cause import error." ext_modules = attr.get('ext_modules', []) if not isinstance(ext_modules, list): ext_modules = [ext_modules] - assert len( - ext_modules - ) == 1, "Required only one Extension, but received {}. If you want to compile multi operators, you can include all necessary source files in one Extension.".format( - len(ext_modules)) + assert ( + len(ext_modules) == 1 + ), "Required only one Extension, but received {}. If you want to compile multi operators, you can include all necessary source files in one Extension.".format( + len(ext_modules) + ) # replace Extension.name with attr['name] to keep consistant with Package name. for ext_module in ext_modules: ext_module.name = attr['name'] @@ -219,7 +255,7 @@ def CppExtension(sources, *args, **kwargs): .. code-block:: text - # setup.py + # setup.py # Compiling customized operators supporting only CPU device from paddle.utils.cpp_extension import CppExtension, setup @@ -230,7 +266,7 @@ def CppExtension(sources, *args, **kwargs): ) - .. note:: + Note: It is mainly used in ``setup`` and the nama of built shared library keeps same as ``name`` argument specified in ``setup`` interface. @@ -269,7 +305,7 @@ def CUDAExtension(sources, *args, **kwargs): .. code-block:: text - # setup.py + # setup.py # Compiling customized operators supporting CPU and GPU devices from paddle.utils.cpp_extension import CUDAExtension, setup @@ -282,7 +318,7 @@ def CUDAExtension(sources, *args, **kwargs): ) - .. note:: + Note: It is mainly used in ``setup`` and the nama of built shared library keeps same as ``name`` argument specified in ``setup`` interface. @@ -293,7 +329,7 @@ def CUDAExtension(sources, *args, **kwargs): **kwargs(dict[option], optional): Specify other arguments same as ``setuptools.Extension`` . Returns: - setuptools.Extension: An instance of setuptools.Extension + setuptools.Extension: An instance of setuptools.Extension. """ kwargs = normalize_extension_kwargs(kwargs, use_cuda=True) # Note(Aurelius84): While using `setup` and `jit`, the Extension `name` will @@ -336,7 +372,6 @@ def with_options(cls, **options): """ class cls_with_options(cls): - def __init__(self, *args, **kwargs): kwargs.update(options) cls.__init__(self, *args, **kwargs) @@ -381,8 +416,9 @@ def build_extensions(self): # cflags have changed and delete the built shared library to re-compile the source # even though source file content keep unchanged. so_name = self.get_ext_fullpath(self.extensions[0].name) - clean_object_if_change_cflags(os.path.abspath(so_name), - self.extensions[0]) + clean_object_if_change_cflags( + os.path.abspath(so_name), self.extensions[0] + ) # Consider .cu, .cu.cc as valid source extensions. 
self.compiler.src_extensions += ['.cu', '.cu.cc'] @@ -394,8 +430,9 @@ def build_extensions(self): else: original_compile = self.compiler._compile - def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs, - pp_opts): + def unix_custom_single_compiler( + obj, src, ext, cc_args, extra_postargs, pp_opts + ): """ Monkey patch machanism to replace inner compiler to custom complie process on Unix platform. """ @@ -408,7 +445,9 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs, # nvcc or hipcc compile CUDA source if is_cuda_file(src): if core.is_compiled_with_rocm(): - assert ROCM_HOME is not None, "Not found ROCM runtime, \ + assert ( + ROCM_HOME is not None + ), "Not found ROCM runtime, \ please use `export ROCM_PATH= XXX` to specify it." hipcc_cmd = os.path.join(ROCM_HOME, 'bin', 'hipcc') @@ -417,7 +456,9 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs, if isinstance(cflags, dict): cflags = cflags['hipcc'] else: - assert CUDA_HOME is not None, "Not found CUDA runtime, \ + assert ( + CUDA_HOME is not None + ), "Not found CUDA runtime, \ please use `export CUDA_HOME= XXX` to specify it." nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc') @@ -436,11 +477,12 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs, cflags.append('-D__HIP_PLATFORM_HCC__') cflags.append('-D__HIP_NO_HALF_CONVERSIONS__=1') cflags.append( - '-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP') + '-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP' + ) # NOTE(Aurelius84): Since Paddle 2.0, we require gcc version > 5.x, # so we add this flag to ensure the symbol names from user compiled - # shared library have same ABI suffix with core_(no)avx.so. + # shared library have same ABI suffix with libpaddle.so. # See https://stackoverflow.com/questions/34571583/understanding-gcc-5s-glibcxx-use-cxx11-abi-or-the-new-abi add_compile_flag(cflags, ['-D_GLIBCXX_USE_CXX11_ABI=1']) # Append this macor only when jointly compiling .cc with .cu @@ -450,22 +492,24 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs, else: cflags.append('-DPADDLE_WITH_CUDA') - add_std_without_repeat(cflags, - self.compiler.compiler_type, - use_std14=True) + add_std_without_repeat( + cflags, self.compiler.compiler_type, use_std14=True + ) original_compile(obj, src, ext, cc_args, cflags, pp_opts) finally: # restore original_compiler self.compiler.set_executable('compiler_so', original_compiler) - def win_custom_single_compiler(sources, - output_dir=None, - macros=None, - include_dirs=None, - debug=0, - extra_preargs=None, - extra_postargs=None, - depends=None): + def win_custom_single_compiler( + sources, + output_dir=None, + macros=None, + include_dirs=None, + debug=0, + extra_preargs=None, + extra_postargs=None, + depends=None, + ): self.cflags = copy.deepcopy(extra_postargs) extra_postargs = None @@ -482,27 +526,32 @@ def win_custom_spawn(cmd): # Using regex to match src, obj and include files src_regex = re.compile('/T(p|c)(.*)') src_list = [ - m.group(2) for m in (src_regex.match(elem) for elem in cmd) + m.group(2) + for m in (src_regex.match(elem) for elem in cmd) if m ] obj_regex = re.compile('/Fo(.*)') obj_list = [ - m.group(1) for m in (obj_regex.match(elem) for elem in cmd) + m.group(1) + for m in (obj_regex.match(elem) for elem in cmd) if m ] include_regex = re.compile(r'((\-|\/)I.*)') include_list = [ m.group(1) - for m in (include_regex.match(elem) for elem in cmd) if m + for m in (include_regex.match(elem) for elem in cmd) + if m ] assert 
len(src_list) == 1 and len(obj_list) == 1 src = src_list[0] obj = obj_list[0] if is_cuda_file(src): - assert CUDA_HOME is not None, "Not found CUDA runtime, \ + assert ( + CUDA_HOME is not None + ), "Not found CUDA runtime, \ please use `export CUDA_HOME= XXX` to specify it." nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc') @@ -516,8 +565,9 @@ def win_custom_spawn(cmd): cflags = prepare_win_cudaflags(cflags) + ['--use-local-env'] for flag in MSVC_COMPILE_FLAGS: cflags = ['-Xcompiler', flag] + cflags - cmd = [nvcc_cmd, '-c', src, '-o', obj - ] + include_list + cflags + cmd = ( + [nvcc_cmd, '-c', src, '-o', obj] + include_list + cflags + ) elif isinstance(self.cflags, dict): cflags = MSVC_COMPILE_FLAGS + self.cflags['cxx'] cmd += cflags @@ -532,9 +582,16 @@ def win_custom_spawn(cmd): try: self.compiler.spawn = win_custom_spawn - return original_compile(sources, output_dir, macros, - include_dirs, debug, extra_preargs, - extra_postargs, depends) + return original_compile( + sources, + output_dir, + macros, + include_dirs, + debug, + extra_preargs, + extra_postargs, + depends, + ) finally: self.compiler.spawn = original_spawn @@ -547,8 +604,9 @@ def object_filenames_with_cuda(origina_func, build_directory): def wrapper(source_filenames, strip_dir=0, output_dir=''): try: - objects = origina_func(source_filenames, strip_dir, - output_dir) + objects = origina_func( + source_filenames, strip_dir, output_dir + ) for i, source in enumerate(source_filenames): # modify xx.o -> xx.cu.o/xx.cu.obj if is_cuda_file(source): @@ -579,7 +637,8 @@ def wrapper(source_filenames, strip_dir=0, output_dir=''): self.compiler._compile = unix_custom_single_compiler self.compiler.object_filenames = object_filenames_with_cuda( - self.compiler.object_filenames, self.build_lib) + self.compiler.object_filenames, self.build_lib + ) self._record_op_info() print("Compiling user custom op, it will cost a few seconds.....") @@ -595,10 +654,11 @@ def get_ext_filename(self, fullname): split_str = '.' name_items = ext_name.split(split_str) if self.no_python_abi_suffix and six.PY3: - assert len( - name_items - ) > 2, "Expected len(name_items) > 2, but received {}".format( - len(name_items)) + assert ( + len(name_items) > 2 + ), "Expected len(name_items) > 2, but received {}".format( + len(name_items) + ) name_items.pop(-2) ext_name = split_str.join(name_items) @@ -614,11 +674,13 @@ def _valid_clang_compiler(self): """ compiler_infos = ['clang'] + CLANG_COMPILE_FLAGS linker_infos = ['clang'] + CLANG_LINK_FLAGS - self.compiler.set_executables(compiler=compiler_infos, - compiler_so=compiler_infos, - compiler_cxx=['clang'], - linker_exe=['clang'], - linker_so=linker_infos) + self.compiler.set_executables( + compiler=compiler_infos, + compiler_so=compiler_infos, + compiler_cxx=['clang'], + linker_exe=['clang'], + linker_so=linker_infos, + ) def _check_abi(self): """ @@ -633,11 +695,16 @@ def _check_abi(self): check_abi_compatibility(compiler) # Warn user if VC env is activated but `DISTUTILS_USE_SDK` is not set. - if IS_WINDOWS and 'VSCMD_ARG_TGT_ARCH' in os.environ and 'DISTUTILS_USE_SDK' not in os.environ: + if ( + IS_WINDOWS + and 'VSCMD_ARG_TGT_ARCH' in os.environ + and 'DISTUTILS_USE_SDK' not in os.environ + ): msg = ( 'It seems that the VC environment is activated but DISTUTILS_USE_SDK is not set.' 'This may lead to multiple activations of the VC env.' - 'Please run `set DISTUTILS_USE_SDK=1` and try again.') + 'Please run `set DISTUTILS_USE_SDK=1` and try again.' 
+ ) raise UserWarning(msg) def _record_op_info(self): @@ -658,9 +725,9 @@ def _record_op_info(self): op_names = parse_op_name_from(sources) for op_name in op_names: - CustomOpInfo.instance().add(op_name, - so_name=so_name, - so_path=so_path) + CustomOpInfo.instance().add( + op_name, so_name=so_name, so_path=so_path + ) class EasyInstallCommand(easy_install, object): @@ -713,7 +780,6 @@ def with_options(cls, **options): """ class cls_with_options(cls): - def __init__(self, *args, **kwargs): kwargs.update(options) cls.__init__(self, *args, **kwargs) @@ -736,47 +802,49 @@ def initialize_options(self): self.build_base = self._specified_build_base -def load(name, - sources, - extra_cxx_cflags=None, - extra_cuda_cflags=None, - extra_ldflags=None, - extra_include_paths=None, - build_directory=None, - verbose=False): +def load( + name, + sources, + extra_cxx_cflags=None, + extra_cuda_cflags=None, + extra_ldflags=None, + extra_include_paths=None, + build_directory=None, + verbose=False, +): """ An Interface to automatically compile C++/CUDA source files Just-In-Time and return callable python function as other Paddle layers API. It will append user defined custom operators in background while building models. It will perform compiling, linking, Python API generation and module loading - processes under a individual subprocess. It does not require CMake or Ninja - environment. On Linux platform, it requires GCC compiler whose version is - greater than 5.4 and it should be soft linked to ``/usr/bin/cc`` . On Windows + processes under a individual subprocess. It does not require CMake or Ninja + environment. On Linux platform, it requires GCC compiler whose version is + greater than 5.4 and it should be soft linked to ``/usr/bin/cc`` . On Windows platform, it requires Visual Studio whose version is greater than 2017. - On MacOS, clang++ is requited. In addition, if compiling Operators supporting + On MacOS, clang++ is requited. In addition, if compiling Operators supporting GPU device, please make sure ``nvcc`` compiler is installed in local environment. - - Moreover, `ABI compatibility `_ + + Moreover, `ABI compatibility `_ will be checked to ensure that compiler version from ``cc(Linux)`` , ``cl.exe(Windows)`` on local machine is compatible with pre-installed Paddle whl in python site-packages. - For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, - then the version of user's local machine should satisfy GCC >= 8.2. - For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of - PaddlePaddle (Visual Studio 2017). - If the above conditions are not met, the corresponding warning will be printed, and a fatal error may + For Linux, GCC version will be checked . For example if Paddle with CUDA 10.1 is built with GCC 8.2, + then the version of user's local machine should satisfy GCC >= 8.2. + For Windows, Visual Studio version will be checked, and it should be greater than or equal to that of + PaddlePaddle (Visual Studio 2017). + If the above conditions are not met, the corresponding warning will be printed, and a fatal error may occur because of ABI compatibility. Compared with ``setup`` interface, it doesn't need extra ``setup.py`` and excute ``python setup.py install`` command. The interface contains all compiling and installing process underground. - .. note:: + Note: 1. Currently we support Linux, MacOS and Windows platfrom. 2. 
On Linux platform, we recommend to use GCC 8.2 as soft linking condidate of ``/usr/bin/cc`` . - Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking + Then, Use ``which cc`` to ensure location of ``cc`` and using ``cc --version`` to ensure linking GCC version. 3. On Windows platform, we recommend to install `` Visual Studio`` (>=2017). @@ -784,7 +852,7 @@ def load(name, **A simple example:** .. code-block:: text - + import paddle from paddle.utils.cpp_extension import load @@ -807,7 +875,7 @@ def load(name, extra_cxx_cflags(list[str], optional): Specify additional flags used to compile CPP files. By default all basic and framework related flags have been included. extra_cuda_cflags(list[str], optional): Specify additional flags used to compile CUDA files. By default - all basic and framework related flags have been included. + all basic and framework related flags have been included. See `Cuda Compiler Driver NVCC `_ for details. Default is None. extra_ldflags(list[str], optional): Specify additional flags used to link shared library. See @@ -837,27 +905,42 @@ def load(name, file_path = os.path.join(build_directory, "{}_setup.py".format(name)) sources = [os.path.abspath(source) for source in sources] - if extra_cxx_cflags is None: extra_cxx_cflags = [] - if extra_cuda_cflags is None: extra_cuda_cflags = [] + if extra_cxx_cflags is None: + extra_cxx_cflags = [] + if extra_cuda_cflags is None: + extra_cuda_cflags = [] assert isinstance( extra_cxx_cflags, list ), "Required type(extra_cxx_cflags) == list[str], but received {}".format( - extra_cxx_cflags) + extra_cxx_cflags + ) assert isinstance( extra_cuda_cflags, list ), "Required type(extra_cuda_cflags) == list[str], but received {}".format( - extra_cuda_cflags) + extra_cuda_cflags + ) log_v( "additional extra_cxx_cflags: [{}], extra_cuda_cflags: [{}]".format( - ' '.join(extra_cxx_cflags), ' '.join(extra_cuda_cflags)), verbose) + ' '.join(extra_cxx_cflags), ' '.join(extra_cuda_cflags) + ), + verbose, + ) # write setup.py file and compile it build_base_dir = os.path.join(build_directory, name) - _write_setup_file(name, sources, file_path, build_base_dir, - extra_include_paths, extra_cxx_cflags, extra_cuda_cflags, - extra_ldflags, verbose) + _write_setup_file( + name, + sources, + file_path, + build_base_dir, + extra_include_paths, + extra_cxx_cflags, + extra_cuda_cflags, + extra_ldflags, + verbose, + ) _jit_compile(file_path, verbose) # import as callable python api diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 62fce3360042bc..693a47f3f86da6 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -399,10 +399,7 @@ def _get_core_name(): """ import paddle ext_name = '.pyd' if IS_WINDOWS else '.so' - if not paddle.fluid.core.load_noavx: - return 'core_avx' + ext_name - else: - return 'core_noavx' + ext_name + return 'libpaddle' + ext_name def _get_lib_core_path(): @@ -419,13 +416,13 @@ def _get_dll_core_path(): Return real path of libcore_(no)avx.dylib on Windows. """ raw_core_name = _get_core_name() - dll_core_name = "paddle_pybind.dll" + dll_core_name = "libpaddle.dll" return os.path.join(_get_fluid_path(), dll_core_name) def _reset_so_rpath(so_path): """ - NOTE(Aurelius84): Runtime path of core_(no)avx.so is modified into `@loader_path/../libs` + NOTE(Aurelius84): Runtime path of libpaddle.so is modified into `@loader_path/../libs` in setup.py.in. 
While loading custom op, `@loader_path` is the dirname of custom op instead of `paddle/fluid`. So we modify `@loader_path` from custom dylib into `@rpath` to ensure dynamic loader find it correctly. @@ -524,7 +521,7 @@ def normalize_extension_kwargs(kwargs, use_cuda=False): # See _reset_so_rpath for details. extra_link_args.append('-Wl,-rpath,{}'.format(_get_fluid_path())) # On MacOS, ld don't support `-l:xx`, so we create a - # libcore_avx.dylib symbol link. + # liblibpaddle.dylib symbol link. lib_core_name = create_sym_link_if_not_exist() extra_link_args.append('-l{}'.format(lib_core_name)) ########################### -- END -- ########################### @@ -555,7 +552,7 @@ def normalize_extension_kwargs(kwargs, use_cuda=False): def create_sym_link_if_not_exist(): """ - Create soft symbol link of `core_avx.so` or `core_noavx.so` + Create soft symbol link of `libpaddle.so` """ assert OS_NAME.startswith('darwin') or IS_WINDOWS @@ -574,7 +571,7 @@ def create_sym_link_if_not_exist(): .format(raw_core_name, new_dll_core_path, core_path, raw_core_name)) run_cmd('mklink /H {} {}'.format(new_dll_core_path, core_path)) - # core_avx or core_noavx with lib suffix + # libpaddle with lib suffix assert os.path.exists(new_dll_core_path) return raw_core_name[:-4] + ".lib" @@ -590,7 +587,7 @@ def create_sym_link_if_not_exist(): "Failed to create soft symbol link for {}.\n Please execute the following command manually: `ln -s {} {}`" .format(raw_core_name, core_path, new_lib_core_path)) - # core_avx or core_noavx without suffix + # libpaddle without suffix return raw_core_name[:-3] @@ -779,7 +776,7 @@ def find_paddle_libraries(use_cuda=False): cuda_lib_dir = find_cuda_libraries() paddle_lib_dirs.extend(cuda_lib_dir) - # add `paddle/fluid` to search `core_avx.so` or `core_noavx.so` + # add `paddle/fluid` to search `libpaddle.so` paddle_lib_dirs.append(_get_fluid_path()) return paddle_lib_dirs diff --git a/python/paddle/vision/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py index f083d01c5a8cc2..b274ab42a23f0a 100644 --- a/python/paddle/vision/datasets/cifar.py +++ b/python/paddle/vision/datasets/cifar.py @@ -36,7 +36,7 @@ 'train10': 'data_batch', 'test10': 'test_batch', 'train100': 'train', - 'test100': 'test' + 'test100': 'test', } @@ -105,14 +105,18 @@ class Cifar10(Dataset): # [3, 64, 64] 3 """ - def __init__(self, - data_file=None, - mode='train', - transform=None, - download=True, - backend=None): - assert mode.lower() in ['train', 'test', 'train', 'test'], \ - "mode should be 'train10', 'test10', 'train100' or 'test100', but got {}".format(mode) + def __init__( + self, + data_file=None, + mode='train', + transform=None, + download=True, + backend=None, + ): + assert mode.lower() in [ + 'train', + 'test', + ], "mode.lower() should be 'train' or 'test', but got {}".format(mode) self.mode = mode.lower() if backend is None: @@ -120,18 +124,21 @@ def __init__(self, if backend not in ['pil', 'cv2']: raise ValueError( "Expected backend are one of ['pil', 'cv2'], but got {}".format( - backend)) + backend + ) + ) self.backend = backend self._init_url_md5_flag() self.data_file = data_file if self.data_file is None: - assert download, "data_file is not set and downloading automatically is disabled" - self.data_file = _check_exists_and_download(data_file, - self.data_url, - self.data_md5, 'cifar', - download) + assert ( + download + ), "data_file is not set and downloading automatically is disabled" + self.data_file = _check_exists_and_download( + data_file, self.data_url, self.data_md5, 'cifar', 
download + ) self.transform = transform @@ -148,8 +155,9 @@ def _init_url_md5_flag(self): def _load_data(self): self.data = [] with tarfile.open(self.data_file, mode='r') as f: - names = (each_item.name for each_item in f - if self.flag in each_item.name) + names = ( + each_item.name for each_item in f if self.flag in each_item.name + ) names = sorted(list(names)) @@ -157,8 +165,9 @@ def _load_data(self): batch = pickle.load(f.extractfile(name), encoding='bytes') data = batch[six.b('data')] - labels = batch.get(six.b('labels'), - batch.get(six.b('fine_labels'), None)) + labels = batch.get( + six.b('labels'), batch.get(six.b('fine_labels'), None) + ) assert labels is not None for sample, label in six.moves.zip(data, labels): self.data.append((sample, label)) @@ -247,14 +256,17 @@ class Cifar100(Cifar10): # [3, 64, 64] 49 """ - def __init__(self, - data_file=None, - mode='train', - transform=None, - download=True, - backend=None): - super(Cifar100, self).__init__(data_file, mode, transform, download, - backend) + def __init__( + self, + data_file=None, + mode='train', + transform=None, + download=True, + backend=None, + ): + super(Cifar100, self).__init__( + data_file, mode, transform, download, backend + ) def _init_url_md5_flag(self): self.data_url = CIFAR100_URL diff --git a/python/paddle/vision/ops.py b/python/paddle/vision/ops.py index 032fe4bd356072..9adda41eebfa92 100755 --- a/python/paddle/vision/ops.py +++ b/python/paddle/vision/ops.py @@ -14,47 +14,72 @@ import numpy as np from ..fluid.layer_helper import LayerHelper -from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype +from ..fluid.data_feeder import ( + check_variable_and_dtype, + check_type, + check_dtype, +) from ..fluid import core, layers from ..fluid.layers import nn, utils from ..nn import Layer, Conv2D, Sequential, ReLU, BatchNorm2D from ..fluid.initializer import Normal -from ..fluid.framework import _non_static_mode, in_dygraph_mode, _in_legacy_dygraph +from ..fluid.framework import ( + Variable, + _non_static_mode, + in_dygraph_mode, + _in_legacy_dygraph, +) from paddle.common_ops_import import * from paddle import _C_ops, _legacy_C_ops -__all__ = [ #noqa - 'yolo_loss', 'yolo_box', 'deform_conv2d', 'DeformConv2D', - 'distribute_fpn_proposals', 'generate_proposals', 'read_file', - 'decode_jpeg', 'roi_pool', 'RoIPool', 'psroi_pool', 'PSRoIPool', - 'roi_align', 'RoIAlign', 'nms', 'matrix_nms' +__all__ = [ # noqa + 'yolo_loss', + 'yolo_box', + 'prior_box', + 'box_coder', + 'deform_conv2d', + 'DeformConv2D', + 'distribute_fpn_proposals', + 'generate_proposals', + 'read_file', + 'decode_jpeg', + 'roi_pool', + 'RoIPool', + 'psroi_pool', + 'PSRoIPool', + 'roi_align', + 'RoIAlign', + 'nms', + 'matrix_nms', ] -def yolo_loss(x, - gt_box, - gt_label, - anchors, - anchor_mask, - class_num, - ignore_thresh, - downsample_ratio, - gt_score=None, - use_label_smooth=True, - name=None, - scale_x_y=1.): +def yolo_loss( + x, + gt_box, + gt_label, + anchors, + anchor_mask, + class_num, + ignore_thresh, + downsample_ratio, + gt_score=None, + use_label_smooth=True, + name=None, + scale_x_y=1.0, +): r""" This operator generates YOLOv3 loss based on given predict result and ground truth boxes. 
- + The output of previous network is in shape [N, C, H, W], while H and W - should be the same, H and W specify the grid size, each grid point predict + should be the same, H and W specify the grid size, each grid point predict given number bounding boxes, this given number, which following will be represented as S, is specified by the number of anchor clusters in each scale. In the second dimension(the channel - dimension), C should be equal to S * (class_num + 5), class_num is the object - category number of source dataset(such as 80 in coco dataset), so in the - second(channel) dimension, apart from 4 box location coordinates x, y, w, h, + dimension), C should be equal to S * (class_num + 5), class_num is the object + category number of source dataset(such as 80 in coco dataset), so in the + second(channel) dimension, apart from 4 box location coordinates x, y, w, h, also includes confidence score of the box and class one-hot key of each anchor box. Assume the 4 location coordinates are :math:`t_x, t_y, t_w, t_h`, the box predictions @@ -77,21 +102,21 @@ def yolo_loss(x, and :math:`p_w, p_h` is specified by anchors. As for confidence score, it is the logistic regression value of IoU between - anchor boxes and ground truth boxes, the score of the anchor box which has - the max IoU should be 1, and if the anchor box has IoU bigger than ignore + anchor boxes and ground truth boxes, the score of the anchor box which has + the max IoU should be 1, and if the anchor box has IoU bigger than ignore thresh, the confidence score loss of this anchor box will be ignored. Therefore, the YOLOv3 loss consists of three major parts: box location loss, - objectness loss and classification loss. The L1 loss is used for - box coordinates (w, h), sigmoid cross entropy loss is used for box + objectness loss and classification loss. The L1 loss is used for + box coordinates (w, h), sigmoid cross entropy loss is used for box coordinates (x, y), objectness loss and classification loss. - Each groud truth box finds a best matching anchor box in all anchors. + Each groud truth box finds a best matching anchor box in all anchors. Prediction of this anchor box will incur all three parts of losses, and prediction of anchor boxes with no GT box matched will only incur objectness loss. - In order to trade off box coordinate losses between big boxes and small + In order to trade off box coordinate losses between big boxes and small boxes, box coordinate losses will be mutiplied by scale weight, which is calculated as follows. @@ -106,12 +131,12 @@ def yolo_loss(x, $$ While :attr:`use_label_smooth` is set to be :attr:`True`, the classification - target will be smoothed when calculating classification loss, target of + target will be smoothed when calculating classification loss, target of positive samples will be smoothed to :math:`1.0 - 1.0 / class\_num` and target of negetive samples will be smoothed to :math:`1.0 / class\_num`. - While :attr:`gt_score` is given, which means the mixup score of ground truth - boxes, all losses incured by a ground truth box will be multiplied by its + While :attr:`gt_score` is given, which means the mixup score of ground truth + boxes, all losses incured by a ground truth box will be multiplied by its mixup score. Args: @@ -119,16 +144,16 @@ def yolo_loss(x, tensor with shape of [N, C, H, W]. H and W should be same, and the second dimension(C) stores box locations, confidence score and classification one-hot keys of each anchor box. - The data type is float32 or float64. 
+ The data type is float32 or float64. gt_box (Tensor): groud truth boxes, should be in shape of [N, B, 4], - in the third dimension, x, y, w, h should be stored. + in the third dimension, x, y, w, h should be stored. x,y is the center coordinate of boxes, w, h are the - width and height, x, y, w, h should be divided by + width and height, x, y, w, h should be divided by input image height to scale to [0, 1]. - N is the batch number and B is the max box number in - an image.The data type is float32 or float64. + N is the batch number and B is the max box number in + an image.The data type is float32 or float64. gt_label (Tensor): class id of ground truth boxes, should be in shape - of [N, B].The data type is int32. + of [N, B].The data type is int32. anchors (list|tuple): The anchor width and height, it will be parsed pair by pair. anchor_mask (list|tuple): The mask index of anchors used in current @@ -137,42 +162,28 @@ def yolo_loss(x, ignore_thresh (float): The ignore threshold to ignore confidence loss. downsample_ratio (int): The downsample ratio from network input to YOLOv3 loss input, so 32, 16, 8 should be set for the - first, second, and thrid YOLOv3 loss operators. - name (string): The default value is None. Normally there is no need - for user to set this property. For more information, + first, second, and thrid YOLOv3 loss operators. + name (string): The default value is None. Normally there is no need + for user to set this property. For more information, please refer to :ref:`api_guide_Name` gt_score (Tensor): mixup score of ground truth boxes, should be in shape of [N, B]. Default None. - use_label_smooth (bool): Whether to use label smooth. Default True. + use_label_smooth (bool): Whether to use label smooth. Default True. scale_x_y (float): Scale the center point of decoded bounding box. Default 1.0 Returns: Tensor: A 1-D tensor with shape [N], the value of yolov3 loss - Raises: - TypeError: Input x of yolov3_loss must be Tensor - TypeError: Input gtbox of yolov3_loss must be Tensor - TypeError: Input gtlabel of yolov3_loss must be Tensor - TypeError: Input gtscore of yolov3_loss must be None or Tensor - TypeError: Attr anchors of yolov3_loss must be list or tuple - TypeError: Attr class_num of yolov3_loss must be an integer - TypeError: Attr ignore_thresh of yolov3_loss must be a float number - TypeError: Attr use_label_smooth of yolov3_loss must be a bool value - Examples: .. 
code-block:: python import paddle - import numpy as np - x = np.random.random([2, 14, 8, 8]).astype('float32') - gt_box = np.random.random([2, 10, 4]).astype('float32') - gt_label = np.random.random([2, 10]).astype('int32') + x = paddle.rand([2, 14, 8, 8]).astype('float32') + gt_box = paddle.rand([2, 10, 4]).astype('float32') + gt_label = paddle.rand([2, 10]).astype('int32') - x = paddle.to_tensor(x) - gt_box = paddle.to_tensor(gt_box) - gt_label = paddle.to_tensor(gt_label) loss = paddle.vision.ops.yolo_loss(x, gt_box=gt_box, @@ -187,25 +198,50 @@ def yolo_loss(x, """ if in_dygraph_mode(): - loss, _, _ = _C_ops.yolov3_loss(x, gt_box, gt_label, gt_score, anchors, - anchor_mask, class_num, ignore_thresh, - downsample_ratio, use_label_smooth, - scale_x_y) + loss, _, _ = _C_ops.yolov3_loss( + x, + gt_box, + gt_label, + gt_score, + anchors, + anchor_mask, + class_num, + ignore_thresh, + downsample_ratio, + use_label_smooth, + scale_x_y, + ) return loss if _non_static_mode(): loss, _, _ = _legacy_C_ops.yolov3_loss( - x, gt_box, gt_label, gt_score, 'anchors', anchors, 'anchor_mask', - anchor_mask, 'class_num', class_num, 'ignore_thresh', ignore_thresh, - 'downsample_ratio', downsample_ratio, 'use_label_smooth', - use_label_smooth, 'scale_x_y', scale_x_y) + x, + gt_box, + gt_label, + gt_score, + 'anchors', + anchors, + 'anchor_mask', + anchor_mask, + 'class_num', + class_num, + 'ignore_thresh', + ignore_thresh, + 'downsample_ratio', + downsample_ratio, + 'use_label_smooth', + use_label_smooth, + 'scale_x_y', + scale_x_y, + ) return loss helper = LayerHelper('yolov3_loss', **locals()) check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'yolo_loss') - check_variable_and_dtype(gt_box, 'gt_box', ['float32', 'float64'], - 'yolo_loss') + check_variable_and_dtype( + gt_box, 'gt_box', ['float32', 'float64'], 'yolo_loss' + ) check_variable_and_dtype(gt_label, 'gt_label', 'int32', 'yolo_loss') check_type(anchors, 'anchors', (list, tuple), 'yolo_loss') check_type(anchor_mask, 'anchor_mask', (list, tuple), 'yolo_loss') @@ -236,28 +272,32 @@ def yolo_loss(x, "scale_x_y": scale_x_y, } - helper.append_op(type='yolov3_loss', - inputs=inputs, - outputs={ - 'Loss': loss, - 'ObjectnessMask': objectness_mask, - 'GTMatchMask': gt_match_mask - }, - attrs=attrs) + helper.append_op( + type='yolov3_loss', + inputs=inputs, + outputs={ + 'Loss': loss, + 'ObjectnessMask': objectness_mask, + 'GTMatchMask': gt_match_mask, + }, + attrs=attrs, + ) return loss -def yolo_box(x, - img_size, - anchors, - class_num, - conf_thresh, - downsample_ratio, - clip_bbox=True, - name=None, - scale_x_y=1., - iou_aware=False, - iou_aware_factor=0.5): +def yolo_box( + x, + img_size, + anchors, + class_num, + conf_thresh, + downsample_ratio, + clip_bbox=True, + name=None, + scale_x_y=1.0, + iou_aware=False, + iou_aware_factor=0.5, +): r""" This operator generates YOLO detection boxes from output of YOLOv3 network. @@ -347,24 +387,14 @@ def yolo_box(x, and a 3-D tensor with shape [N, M, :attr:`class_num`], the classification scores of boxes. - Raises: - TypeError: Input x of yolov_box must be Tensor - TypeError: Attr anchors of yolo box must be list or tuple - TypeError: Attr class_num of yolo box must be an integer - TypeError: Attr conf_thresh of yolo box must be a float number - Examples: .. 
code-block:: python import paddle - import numpy as np - - x = np.random.random([2, 14, 8, 8]).astype('float32') - img_size = np.ones((2, 2)).astype('int32') - x = paddle.to_tensor(x) - img_size = paddle.to_tensor(img_size) + x = paddle.rand([2, 14, 8, 8]).astype('float32') + img_size = paddle.ones((2, 2)).astype('int32') boxes, scores = paddle.vision.ops.yolo_box(x, img_size=img_size, @@ -376,18 +406,41 @@ def yolo_box(x, scale_x_y=1.) """ if in_dygraph_mode(): - boxes, scores = _C_ops.yolo_box(x, img_size, anchors, class_num, - conf_thresh, downsample_ratio, - clip_bbox, scale_x_y, iou_aware, - iou_aware_factor) + boxes, scores = _C_ops.yolo_box( + x, + img_size, + anchors, + class_num, + conf_thresh, + downsample_ratio, + clip_bbox, + scale_x_y, + iou_aware, + iou_aware_factor, + ) return boxes, scores if _non_static_mode(): boxes, scores = _legacy_C_ops.yolo_box( - x, img_size, 'anchors', anchors, 'class_num', class_num, - 'conf_thresh', conf_thresh, 'downsample_ratio', downsample_ratio, - 'clip_bbox', clip_bbox, 'scale_x_y', scale_x_y, 'iou_aware', - iou_aware, 'iou_aware_factor', iou_aware_factor) + x, + img_size, + 'anchors', + anchors, + 'class_num', + class_num, + 'conf_thresh', + conf_thresh, + 'downsample_ratio', + downsample_ratio, + 'clip_bbox', + clip_bbox, + 'scale_x_y', + scale_x_y, + 'iou_aware', + iou_aware, + 'iou_aware_factor', + iou_aware_factor, + ) return boxes, scores helper = LayerHelper('yolo_box', **locals()) @@ -408,33 +461,410 @@ def yolo_box(x, "clip_bbox": clip_bbox, "scale_x_y": scale_x_y, "iou_aware": iou_aware, - "iou_aware_factor": iou_aware_factor + "iou_aware_factor": iou_aware_factor, } - helper.append_op(type='yolo_box', - inputs={ - "X": x, - "ImgSize": img_size, - }, - outputs={ - 'Boxes': boxes, - 'Scores': scores, - }, - attrs=attrs) + helper.append_op( + type='yolo_box', + inputs={ + "X": x, + "ImgSize": img_size, + }, + outputs={ + 'Boxes': boxes, + 'Scores': scores, + }, + attrs=attrs, + ) return boxes, scores -def deform_conv2d(x, - offset, - weight, - bias=None, - stride=1, - padding=0, - dilation=1, - deformable_groups=1, - groups=1, - mask=None, - name=None): +def prior_box( + input, + image, + min_sizes, + max_sizes=None, + aspect_ratios=[1.0], + variance=[0.1, 0.1, 0.2, 0.2], + flip=False, + clip=False, + steps=[0.0, 0.0], + offset=0.5, + min_max_aspect_ratios_order=False, + name=None, +): + r""" + + This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm. + + Each position of the input produce N prior boxes, N is determined by + the count of min_sizes, max_sizes and aspect_ratios, The size of the + box is in range(min_size, max_size) interval, which is generated in + sequence according to the aspect_ratios. + + Args: + input (Tensor): 4-D tensor(NCHW), the data type should be float32 or float64. + image (Tensor): 4-D tensor(NCHW), the input image data of PriorBoxOp, + the data type should be float32 or float64. + min_sizes (list|tuple|float): the min sizes of generated prior boxes. + max_sizes (list|tuple|None, optional): the max sizes of generated prior boxes. + Default: None. + aspect_ratios (list|tuple|float, optional): the aspect ratios of generated + prior boxes. Default: [1.]. + variance (list|tuple, optional): the variances to be encoded in prior boxes. + Default:[0.1, 0.1, 0.2, 0.2]. + flip (bool): Whether to flip aspect ratios. Default:False. + clip (bool): Whether to clip out-of-boundary boxes. Default: False. 
+ steps (list|tuple, optional): Prior boxes steps across width and height, If + steps[0] equals to 0.0 or steps[1] equals to 0.0, the prior boxes steps across + height or weight of the input will be automatically calculated. + Default: [0., 0.] + offset (float, optional)): Prior boxes center offset. Default: 0.5 + min_max_aspect_ratios_order (bool, optional): If set True, the output prior box is + in order of [min, max, aspect_ratios], which is consistent with + Caffe. Please note, this order affects the weights order of + convolution layer followed by and does not affect the final + detection results. Default: False. + name (str, optional): The default value is None. Normally there is no need for + user to set this property. For more information, please refer to :ref:`api_guide_Name` + + Returns: + Tensor: the output prior boxes and the expanded variances of PriorBox. + The prior boxes is a 4-D tensor, the layout is [H, W, num_priors, 4], + num_priors is the total box count of each position of input. + The expanded variances is a 4-D tensor, same shape as the prior boxes. + + Examples: + .. code-block:: python + + import paddle + + input = paddle.rand((1, 3, 6, 9), dtype=paddle.float32) + image = paddle.rand((1, 3, 9, 12), dtype=paddle.float32) + + box, var = paddle.vision.ops.prior_box( + input=input, + image=image, + min_sizes=[2.0, 4.0], + clip=True, + flip=True) + + """ + helper = LayerHelper("prior_box", **locals()) + dtype = helper.input_dtype() + check_variable_and_dtype( + input, 'input', ['uint8', 'int8', 'float32', 'float64'], 'prior_box' + ) + + def _is_list_or_tuple_(data): + return isinstance(data, list) or isinstance(data, tuple) + + if not _is_list_or_tuple_(min_sizes): + min_sizes = [min_sizes] + if not _is_list_or_tuple_(aspect_ratios): + aspect_ratios = [aspect_ratios] + if not _is_list_or_tuple_(steps): + steps = [steps] + if not len(steps) == 2: + raise ValueError('steps should be (step_w, step_h)') + + min_sizes = list(map(float, min_sizes)) + aspect_ratios = list(map(float, aspect_ratios)) + steps = list(map(float, steps)) + + cur_max_sizes = None + if max_sizes is not None and len(max_sizes) > 0 and max_sizes[0] > 0: + if not _is_list_or_tuple_(max_sizes): + max_sizes = [max_sizes] + cur_max_sizes = max_sizes + + if in_dygraph_mode(): + step_w, step_h = steps + if max_sizes == None: + max_sizes = [] + box, var = _C_ops.prior_box( + input, + image, + min_sizes, + aspect_ratios, + variance, + max_sizes, + flip, + clip, + step_w, + step_h, + offset, + min_max_aspect_ratios_order, + ) + return box, var + + if _in_legacy_dygraph(): + attrs = ( + 'min_sizes', + min_sizes, + 'aspect_ratios', + aspect_ratios, + 'variances', + variance, + 'flip', + flip, + 'clip', + clip, + 'step_w', + steps[0], + 'step_h', + steps[1], + 'offset', + offset, + 'min_max_aspect_ratios_order', + min_max_aspect_ratios_order, + ) + if cur_max_sizes is not None: + attrs += ('max_sizes', cur_max_sizes) + box, var = _legacy_C_ops.prior_box(input, image, *attrs) + return box, var + else: + attrs = { + 'min_sizes': min_sizes, + 'aspect_ratios': aspect_ratios, + 'variances': variance, + 'flip': flip, + 'clip': clip, + 'step_w': steps[0], + 'step_h': steps[1], + 'offset': offset, + 'min_max_aspect_ratios_order': min_max_aspect_ratios_order, + } + if cur_max_sizes is not None: + attrs['max_sizes'] = cur_max_sizes + + box = helper.create_variable_for_type_inference(dtype) + var = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type="prior_box", + inputs={"Input": input, "Image": 
image}, + outputs={"Boxes": box, "Variances": var}, + attrs=attrs, + ) + box.stop_gradient = True + var.stop_gradient = True + return box, var + + +def box_coder( + prior_box, + prior_box_var, + target_box, + code_type="encode_center_size", + box_normalized=True, + axis=0, + name=None, +): + r""" + Encode/Decode the target bounding box with the priorbox information. + + The Encoding schema described below: + + .. math:: + + ox &= (tx - px) / pw / pxv + + oy &= (ty - py) / ph / pyv + + ow &= log(abs(tw / pw)) / pwv + + oh &= log(abs(th / ph)) / phv + + The Decoding schema described below: + + .. math:: + + ox &= (pw * pxv * tx * + px) - tw / 2 + + oy &= (ph * pyv * ty * + py) - th / 2 + + ow &= exp(pwv * tw) * pw + tw / 2 + + oh &= exp(phv * th) * ph + th / 2 + + where `tx`, `ty`, `tw`, `th` denote the target box's center coordinates, + width and height respectively. Similarly, `px`, `py`, `pw`, `ph` denote + the priorbox's (anchor) center coordinates, width and height. `pxv`, + `pyv`, `pwv`, `phv` denote the variance of the priorbox and `ox`, `oy`, + `ow`, `oh` denote the encoded/decoded coordinates, width and height. + During Box Decoding, two modes for broadcast are supported. Say target + box has shape [N, M, 4], and the shape of prior box can be [N, 4] or + [M, 4]. Then prior box will broadcast to target box along the + assigned axis. + + Args: + prior_box (Tensor): Box list prior_box is a 2-D Tensor with shape + [M, 4] holds M boxes and data type is float32 or float64. Each box + is represented as [xmin, ymin, xmax, ymax], [xmin, ymin] is the + left top coordinate of the anchor box, if the input is image feature + map, they are close to the origin of the coordinate system. + [xmax, ymax] is the right bottom coordinate of the anchor box. + prior_box_var (List|Tensor|None): prior_box_var supports three types + of input. One is Tensor with shape [M, 4] which holds M group and + data type is float32 or float64. The second is list consist of + 4 elements shared by all boxes and data type is float32 or float64. + Other is None and not involved in calculation. + target_box (Tensor): This input can be a 2-D LoDTensor with shape + [N, 4] when code_type is 'encode_center_size'. This input also can + be a 3-D Tensor with shape [N, M, 4] when code_type is + 'decode_center_size'. Each box is represented as + [xmin, ymin, xmax, ymax]. The data type is float32 or float64. + code_type (str, optional): The code type used with the target box. It can be + `encode_center_size` or `decode_center_size`. `encode_center_size` + by default. + box_normalized (bool, optional): Whether treat the priorbox as a normalized box. + Set true by default. + axis (int, optional): Which axis in PriorBox to broadcast for box decode, + for example, if axis is 0 and TargetBox has shape [N, M, 4] and + PriorBox has shape [M, 4], then PriorBox will broadcast to [N, M, 4] + for decoding. It is only valid when code type is + `decode_center_size`. Set 0 by default. + name (str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. + + Returns: + Tensor: output boxes, when code_type is 'encode_center_size', the + output tensor of box_coder_op with shape [N, M, 4] representing the + result of N target boxes encoded with M Prior boxes and variances. + When code_type is 'decode_center_size', N represents the batch size + and M represents the number of decoded boxes. + + Examples: + .. 
code-block:: python + + import paddle + + # For encode + prior_box_encode = paddle.rand((80, 4), dtype=paddle.float32) + prior_box_var_encode = paddle.rand((80, 4), dtype=paddle.float32) + target_box_encode = paddle.rand((20, 4), dtype=paddle.float32) + output_encode = paddle.vision.ops.box_coder( + prior_box=prior_box_encode, + prior_box_var=prior_box_var_encode, + target_box=target_box_encode, + code_type="encode_center_size") + + # For decode + prior_box_decode = paddle.rand((80, 4), dtype=paddle.float32) + prior_box_var_decode = paddle.rand((80, 4), dtype=paddle.float32) + target_box_decode = paddle.rand((20, 80, 4), dtype=paddle.float32) + output_decode = paddle.vision.ops.box_coder( + prior_box=prior_box_decode, + prior_box_var=prior_box_var_decode, + target_box=target_box_decode, + code_type="decode_center_size", + box_normalized=False) + + """ + check_variable_and_dtype( + prior_box, 'prior_box', ['float32', 'float64'], 'box_coder' + ) + check_variable_and_dtype( + target_box, 'target_box', ['float32', 'float64'], 'box_coder' + ) + + if in_dygraph_mode(): + if isinstance(prior_box_var, Variable): + output_box = _C_ops.box_coder( + prior_box, + prior_box_var, + target_box, + code_type, + box_normalized, + axis, + [], + ) + elif isinstance(prior_box_var, list): + output_box = _C_ops.box_coder( + prior_box, + None, + target_box, + code_type, + box_normalized, + axis, + prior_box_var, + ) + else: + raise TypeError("Input prior_box_var must be Variable or list") + return output_box + + if _in_legacy_dygraph(): + if isinstance(prior_box_var, Variable): + output_box = _legacy_C_ops.box_coder( + prior_box, + prior_box_var, + target_box, + "code_type", + code_type, + "box_normalized", + box_normalized, + "axis", + axis, + ) + elif isinstance(prior_box_var, list): + output_box = _legacy_C_ops.box_coder( + prior_box, + None, + target_box, + "code_type", + code_type, + "box_normalized", + box_normalized, + "axis", + axis, + "variance", + prior_box_var, + ) + else: + raise TypeError("Input prior_box_var must be Variable or list") + return output_box + else: + helper = LayerHelper("box_coder", **locals()) + + output_box = helper.create_variable_for_type_inference( + dtype=prior_box.dtype + ) + + inputs = {"PriorBox": prior_box, "TargetBox": target_box} + attrs = { + "code_type": code_type, + "box_normalized": box_normalized, + "axis": axis, + } + if isinstance(prior_box_var, Variable): + inputs['PriorBoxVar'] = prior_box_var + elif isinstance(prior_box_var, list): + attrs['variance'] = prior_box_var + else: + raise TypeError("Input prior_box_var must be Variable or list") + helper.append_op( + type="box_coder", + inputs=inputs, + attrs=attrs, + outputs={"OutputBox": output_box}, + ) + return output_box + + +def deform_conv2d( + x, + offset, + weight, + bias=None, + stride=1, + padding=0, + dilation=1, + deformable_groups=1, + groups=1, + mask=None, + name=None, +): r""" Compute 2-D deformable convolution on 4-D input. Given input image x, output feature map y, the deformable convolution operation can be expressed as follow: @@ -511,9 +941,7 @@ def deform_conv2d(x, Returns: Tensor: The tensor variable storing the deformable convolution \ result. A Tensor with type float32, float64. - Raises: - ValueError: If the shapes of input, filter_size, stride, padding and - groups mismatch. + Examples: .. 
code-block:: python @@ -556,34 +984,58 @@ def deform_conv2d(x, use_deform_conv2d_v1 = True if mask is None else False if in_dygraph_mode(): - pre_bias = _C_ops.deformable_conv(x, offset, weight, mask, stride, - padding, dilation, deformable_groups, - groups, 1) + pre_bias = _C_ops.deformable_conv( + x, + offset, + weight, + mask, + stride, + padding, + dilation, + deformable_groups, + groups, + 1, + ) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=1) else: out = pre_bias elif _in_legacy_dygraph(): - attrs = ('strides', stride, 'paddings', padding, 'dilations', dilation, - 'deformable_groups', deformable_groups, 'groups', groups, - 'im2col_step', 1) + attrs = ( + 'strides', + stride, + 'paddings', + padding, + 'dilations', + dilation, + 'deformable_groups', + deformable_groups, + 'groups', + groups, + 'im2col_step', + 1, + ) if use_deform_conv2d_v1: op_type = 'deformable_conv_v1' - pre_bias = getattr(_legacy_C_ops, op_type)(x, offset, weight, - *attrs) + pre_bias = getattr(_legacy_C_ops, op_type)( + x, offset, weight, *attrs + ) else: op_type = 'deformable_conv' - pre_bias = getattr(_legacy_C_ops, op_type)(x, offset, mask, weight, - *attrs) + pre_bias = getattr(_legacy_C_ops, op_type)( + x, offset, mask, weight, *attrs + ) if bias is not None: out = nn.elementwise_add(pre_bias, bias, axis=1) else: out = pre_bias else: - check_variable_and_dtype(x, "x", ['float32', 'float64'], - 'deform_conv2d') - check_variable_and_dtype(offset, "offset", ['float32', 'float64'], - 'deform_conv2d') + check_variable_and_dtype( + x, "x", ['float32', 'float64'], 'deform_conv2d' + ) + check_variable_and_dtype( + offset, "offset", ['float32', 'float64'], 'deform_conv2d' + ) num_channels = x.shape[1] @@ -621,20 +1073,18 @@ def deform_conv2d(x, 'deformable_groups': deformable_groups, 'im2col_step': 1, } - helper.append_op(type=op_type, - inputs=inputs, - outputs=outputs, - attrs=attrs) + helper.append_op( + type=op_type, inputs=inputs, outputs=outputs, attrs=attrs + ) if bias is not None: out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='elementwise_add', - inputs={ - 'X': [pre_bias], - 'Y': [bias] - }, - outputs={'Out': [out]}, - attrs={'axis': 1}) + helper.append_op( + type='elementwise_add', + inputs={'X': [pre_bias], 'Y': [bias]}, + outputs={'Out': [out]}, + attrs={'axis': 1}, + ) else: out = pre_bias return out @@ -774,19 +1224,23 @@ class DeformConv2D(Layer): [8, 16, 26, 26] """ - def __init__(self, - in_channels, - out_channels, - kernel_size, - stride=1, - padding=0, - dilation=1, - deformable_groups=1, - groups=1, - weight_attr=None, - bias_attr=None): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + stride=1, + padding=0, + dilation=1, + deformable_groups=1, + groups=1, + weight_attr=None, + bias_attr=None, + ): super(DeformConv2D, self).__init__() - assert weight_attr is not False, "weight_attr should not be False in Conv." + assert ( + weight_attr is not False + ), "weight_attr should not be False in Conv." 
self._weight_attr = weight_attr self._bias_attr = bias_attr self._deformable_groups = deformable_groups @@ -808,79 +1262,86 @@ def __init__(self, def _get_default_param_initializer(): filter_elem_num = np.prod(self._kernel_size) * self._in_channels - std = (2.0 / filter_elem_num)**0.5 + std = (2.0 / filter_elem_num) ** 0.5 return Normal(0.0, std, 0) self.weight = self.create_parameter( shape=filter_shape, attr=self._weight_attr, - default_initializer=_get_default_param_initializer()) - self.bias = self.create_parameter(attr=self._bias_attr, - shape=[self._out_channels], - is_bias=True) + default_initializer=_get_default_param_initializer(), + ) + self.bias = self.create_parameter( + attr=self._bias_attr, shape=[self._out_channels], is_bias=True + ) def forward(self, x, offset, mask=None): - out = deform_conv2d(x=x, - offset=offset, - weight=self.weight, - bias=self.bias, - stride=self._stride, - padding=self._padding, - dilation=self._dilation, - deformable_groups=self._deformable_groups, - groups=self._groups, - mask=mask) + out = deform_conv2d( + x=x, + offset=offset, + weight=self.weight, + bias=self.bias, + stride=self._stride, + padding=self._padding, + dilation=self._dilation, + deformable_groups=self._deformable_groups, + groups=self._groups, + mask=mask, + ) return out -def distribute_fpn_proposals(fpn_rois, - min_level, - max_level, - refer_level, - refer_scale, - pixel_offset=False, - rois_num=None, - name=None): +def distribute_fpn_proposals( + fpn_rois, + min_level, + max_level, + refer_level, + refer_scale, + pixel_offset=False, + rois_num=None, + name=None, +): r""" - In Feature Pyramid Networks (FPN) models, it is needed to distribute - all proposals into different FPN level, with respect to scale of the proposals, - the referring scale and the referring level. Besides, to restore the order of - proposals, we return an array which indicates the original index of rois + + In Feature Pyramid Networks (FPN) models, it is needed to distribute + all proposals into different FPN level, with respect to scale of the proposals, + the referring scale and the referring level. Besides, to restore the order of + proposals, we return an array which indicates the original index of rois in current proposals. To compute FPN level for each roi, the formula is given as follows: - + .. math:: - roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} - level = floor(&\log(\\frac{roi\_scale}{refer\_scale}) + refer\_level) + roi\_scale &= \sqrt{BBoxArea(fpn\_roi)} \\ + level &= floor(\log(\frac{roi\_scale}{refer\_scale}) + refer\_level) + where BBoxArea is a function to compute the area of each roi. Args: fpn_rois (Tensor): The input fpn_rois. 2-D Tensor with shape [N, 4] and data type can be float32 or float64. - min_level (int): The lowest level of FPN layer where the proposals come + min_level (int): The lowest level of FPN layer where the proposals come from. max_level (int): The highest level of FPN layer where the proposals come from. refer_level (int): The referring level of FPN layer with specified scale. refer_scale (int): The referring scale of FPN layer with specified level. - pixel_offset (bool, optional): Whether there is pixel offset. If True, the offset of + pixel_offset (bool, optional): Whether there is pixel offset. If True, the offset of image shape will be 1. 'False' by default. - rois_num (Tensor, optional): 1-D Tensor contains the number of RoIs in each image. + rois_num (Tensor, optional): 1-D Tensor contains the number of RoIs in each image. The shape is [B] and data type is int32. 
B is the number of images. - If rois_num not None, it will return a list of 1-D Tensor. Each element + If rois_num not None, it will return a list of 1-D Tensor. Each element is the output RoIs' number of each image on the corresponding level and the shape is [B]. None by default. - name (str, optional): For detailed information, please refer - to :ref:`api_guide_Name`. Usually name is no need to set and - None by default. + name (str, optional): For detailed information, please refer + to :ref:`api_guide_Name`. Usually name is no need to set and + None by default. Returns: - multi_rois (List) : The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is - and data type is same as `fpn_rois` . The length is max_level-min_level+1. - restore_ind (Tensor): The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1] - , where N is the number of total rois. The data type is int32. - rois_num_per_level (List): A list of 1-D Tensor and each Tensor is - the RoIs' number in each image on the corresponding level. The shape - is [B] and data type of int32, where B is the number of images. + - multi_rois (List), The proposals in each FPN level. It is a list of 2-D Tensor with shape [M, 4], where M is + and data type is same as `fpn_rois` . The length is max_level-min_level+1. + - restore_ind (Tensor), The index used to restore the order of fpn_rois. It is a 2-D Tensor with shape [N, 1] + , where N is the number of total rois. The data type is int32. + - rois_num_per_level (List), A list of 1-D Tensor and each Tensor is + the RoIs' number in each image on the corresponding level. The shape + is [B] and data type of int32, where B is the number of images. Examples: .. code-block:: python @@ -897,28 +1358,61 @@ def distribute_fpn_proposals(fpn_rois, refer_level=4, refer_scale=224, rois_num=rois_num) + """ num_lvl = max_level - min_level + 1 if in_dygraph_mode(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." - multi_rois, rois_num_per_level, restore_ind = _C_ops.distribute_fpn_proposals( - fpn_rois, rois_num, min_level, max_level, refer_level, refer_scale, - pixel_offset) + assert ( + rois_num is not None + ), "rois_num should not be None in dygraph mode." + ( + multi_rois, + rois_num_per_level, + restore_ind, + ) = _C_ops.distribute_fpn_proposals( + fpn_rois, + rois_num, + min_level, + max_level, + refer_level, + refer_scale, + pixel_offset, + ) return multi_rois, restore_ind, rois_num_per_level if _non_static_mode(): - assert rois_num is not None, "rois_num should not be None in dygraph mode." - attrs = ('min_level', min_level, 'max_level', max_level, 'refer_level', - refer_level, 'refer_scale', refer_scale, 'pixel_offset', - pixel_offset) - multi_rois, restore_ind, rois_num_per_level = _legacy_C_ops.distribute_fpn_proposals( - fpn_rois, rois_num, num_lvl, num_lvl, *attrs) + assert ( + rois_num is not None + ), "rois_num should not be None in dygraph mode." 
+ attrs = ( + 'min_level', + min_level, + 'max_level', + max_level, + 'refer_level', + refer_level, + 'refer_scale', + refer_scale, + 'pixel_offset', + pixel_offset, + ) + ( + multi_rois, + restore_ind, + rois_num_per_level, + ) = _legacy_C_ops.distribute_fpn_proposals( + fpn_rois, rois_num, num_lvl, num_lvl, *attrs + ) return multi_rois, restore_ind, rois_num_per_level else: - check_variable_and_dtype(fpn_rois, 'fpn_rois', ['float32', 'float64'], - 'distribute_fpn_proposals') + check_variable_and_dtype( + fpn_rois, + 'fpn_rois', + ['float32', 'float64'], + 'distribute_fpn_proposals', + ) helper = LayerHelper('distribute_fpn_proposals', **locals()) dtype = helper.input_dtype('fpn_rois') multi_rois = [ @@ -944,16 +1438,18 @@ def distribute_fpn_proposals(fpn_rois, else: rois_num_per_level = None - helper.append_op(type='distribute_fpn_proposals', - inputs=inputs, - outputs=outputs, - attrs={ - 'min_level': min_level, - 'max_level': max_level, - 'refer_level': refer_level, - 'refer_scale': refer_scale, - 'pixel_offset': pixel_offset - }) + helper.append_op( + type='distribute_fpn_proposals', + inputs=inputs, + outputs=outputs, + attrs={ + 'min_level': min_level, + 'max_level': max_level, + 'refer_level': refer_level, + 'refer_scale': refer_scale, + 'pixel_offset': pixel_offset, + }, + ) return multi_rois, restore_ind, rois_num_per_level @@ -977,12 +1473,12 @@ def read_file(filename, name=None): import cv2 import paddle - fake_img = (paddle.rand((400, 300, 3)).numpy() * 255).astype('uint8') + fake_img = (paddle.rand((400, 300, 3)).numpy() * 255).astype('uint8') cv2.imwrite('fake.jpg', fake_img) img_bytes = paddle.vision.ops.read_file('fake.jpg') - + print(img_bytes.shape) # [142915] """ @@ -995,24 +1491,23 @@ def read_file(filename, name=None): helper = LayerHelper("read_file", **locals()) out = helper.create_variable_for_type_inference('uint8') - helper.append_op(type="read_file", - inputs=inputs, - attrs=attrs, - outputs={"Out": out}) + helper.append_op( + type="read_file", inputs=inputs, attrs=attrs, outputs={"Out": out} + ) return out def decode_jpeg(x, mode='unchanged', name=None): """ - Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. - Optionally converts the image to the desired format. + Decodes a JPEG image into a 3 dimensional RGB Tensor or 1 dimensional Gray Tensor. + Optionally converts the image to the desired format. The values of the output tensor are uint8 between 0 and 255. Args: - x (Tensor): A one dimensional uint8 tensor containing the raw bytes + x (Tensor): A one dimensional uint8 tensor containing the raw bytes of the JPEG image. - mode (str): The read mode used for optionally converting the image. + mode (str): The read mode used for optionally converting the image. Default: 'unchanged'. name (str, optional): The default value is None. Normally there is no need for user to set this property. 
For more information, please @@ -1046,10 +1541,9 @@ def decode_jpeg(x, mode='unchanged', name=None): helper = LayerHelper("decode_jpeg", **locals()) out = helper.create_variable_for_type_inference('uint8') - helper.append_op(type="decode_jpeg", - inputs=inputs, - attrs=attrs, - outputs={"Out": out}) + helper.append_op( + type="decode_jpeg", inputs=inputs, attrs=attrs, outputs={"Out": out} + ) return out @@ -1057,7 +1551,7 @@ def decode_jpeg(x, mode='unchanged', name=None): def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): """ Position sensitive region of interest pooling (also known as PSROIPooling) is to perform - position-sensitive average pooling on regions of interest specified by input. It performs + position-sensitive average pooling on regions of interest specified by input. It performs on inputs of nonuniform sizes to obtain fixed-size feature maps. PSROIPooling is proposed by R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details. @@ -1065,13 +1559,13 @@ def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): Args: x (Tensor): Input features with shape (N, C, H, W). The data type can be float32 or float64. boxes (Tensor): Box coordinates of ROIs (Regions of Interest) to pool over. It should be - a 2-D Tensor with shape (num_rois, 4). Given as [[x1, y1, x2, y2], ...], + a 2-D Tensor with shape (num_rois, 4). Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates. boxes_num (Tensor): The number of boxes contained in each picture in the batch. - output_size (int|Tuple(int, int)) The pooled output size(H, W), data type + output_size (int|Tuple(int, int)) The pooled output size(H, W), data type is int32. If int, H and W are both equal to output_size. - spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their + spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0 name(str, optional): The default value is None. Normally there is no need for user to set this property. 
@@ -1097,34 +1591,47 @@ def psroi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): if isinstance(output_size, int): output_size = (output_size, output_size) pooled_height, pooled_width = output_size - assert len(x.shape) == 4, \ - "Input features with shape should be (N, C, H, W)" + assert len(x.shape) == 4, "Input features with shape should be (N, C, H, W)" output_channels = int(x.shape[1] / (pooled_height * pooled_width)) if in_dygraph_mode(): - return _C_ops.psroi_pool(x, boxes, boxes_num, pooled_height, - pooled_width, output_channels, spatial_scale) + return _C_ops.psroi_pool( + x, + boxes, + boxes_num, + pooled_height, + pooled_width, + output_channels, + spatial_scale, + ) if _in_legacy_dygraph(): - return _legacy_C_ops.psroi_pool(x, boxes, boxes_num, "output_channels", - output_channels, "spatial_scale", - spatial_scale, "pooled_height", - pooled_height, "pooled_width", - pooled_width) + return _legacy_C_ops.psroi_pool( + x, + boxes, + boxes_num, + "output_channels", + output_channels, + "spatial_scale", + spatial_scale, + "pooled_height", + pooled_height, + "pooled_width", + pooled_width, + ) helper = LayerHelper('psroi_pool', **locals()) dtype = helper.input_dtype() out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type='psroi_pool', - inputs={ - 'X': x, - 'ROIs': boxes - }, - outputs={'Out': out}, - attrs={ - 'output_channels': output_channels, - 'spatial_scale': spatial_scale, - 'pooled_height': pooled_height, - 'pooled_width': pooled_width - }) + helper.append_op( + type='psroi_pool', + inputs={'X': x, 'ROIs': boxes}, + outputs={'Out': out}, + attrs={ + 'output_channels': output_channels, + 'spatial_scale': spatial_scale, + 'pooled_height': pooled_height, + 'pooled_width': pooled_width, + }, + ) return out @@ -1134,9 +1641,9 @@ class PSRoIPool(Layer): refer to :ref:`api_paddle_vision_ops_psroi_pool`. Args: - output_size (int|Tuple(int, int)) The pooled output size(H, W), data type + output_size (int|Tuple(int, int)) The pooled output size(H, W), data type is int32. If int, H and W are both equal to output_size. - spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their + spatial_scale (float, optional): Multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0. Shape: @@ -1153,7 +1660,7 @@ class PSRoIPool(Layer): .. code-block:: python import paddle - + psroi_module = paddle.vision.ops.PSRoIPool(7, 1.0) x = paddle.uniform([2, 490, 28, 28], dtype='float32') boxes = paddle.to_tensor([[1, 5, 8, 10], [4, 2, 6, 7], [12, 12, 19, 21]], dtype='float32') @@ -1168,24 +1675,25 @@ def __init__(self, output_size, spatial_scale=1.0): self.spatial_scale = spatial_scale def forward(self, x, boxes, boxes_num): - return psroi_pool(x, boxes, boxes_num, self.output_size, - self.spatial_scale) + return psroi_pool( + x, boxes, boxes_num, self.output_size, self.spatial_scale + ) def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): """ This operator implements the roi_pooling layer. Region of interest pooling (also known as RoI pooling) is to perform max pooling on inputs of nonuniform sizes to obtain fixed-size feature maps (e.g. 7*7). - The operator has three steps: 1. Dividing each region proposal into equal-sized sections with output_size(h, w) 2. Finding the largest value in each section 3. Copying these max values to the output buffer + The operator has three steps: 1. 
Dividing each region proposal into equal-sized sections with output_size(h, w) 2. Finding the largest value in each section 3. Copying these max values to the output buffer For more information, please refer to https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn. Args: - x (Tensor): input feature, 4D-Tensor with the shape of [N,C,H,W], - where N is the batch size, C is the input channel, H is Height, W is weight. + x (Tensor): input feature, 4D-Tensor with the shape of [N,C,H,W], + where N is the batch size, C is the input channel, H is Height, W is weight. The data type is float32 or float64. - boxes (Tensor): boxes (Regions of Interest) to pool over. - 2D-Tensor with the shape of [num_boxes,4]. - Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, + boxes (Tensor): boxes (Regions of Interest) to pool over. + 2D-Tensor with the shape of [num_boxes,4]. + Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates. boxes_num (Tensor): the number of RoIs in each image, data type is int32. Default: None output_size (int or tuple[int, int]): the pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size. @@ -1193,7 +1701,7 @@ def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): name(str, optional): for detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. Returns: - pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]]. + pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]]. Examples: .. code-block:: python @@ -1216,14 +1724,27 @@ def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): pooled_height, pooled_width = output_size if in_dygraph_mode(): - assert boxes_num is not None, "boxes_num should not be None in dygraph mode." - return _C_ops.roi_pool(x, boxes, boxes_num, pooled_height, pooled_width, - spatial_scale) + assert ( + boxes_num is not None + ), "boxes_num should not be None in dygraph mode." + return _C_ops.roi_pool( + x, boxes, boxes_num, pooled_height, pooled_width, spatial_scale + ) if _in_legacy_dygraph(): - assert boxes_num is not None, "boxes_num should not be None in dygraph mode." + assert ( + boxes_num is not None + ), "boxes_num should not be None in dygraph mode." pool_out, argmaxes = _legacy_C_ops.roi_pool( - x, boxes, boxes_num, "pooled_height", pooled_height, "pooled_width", - pooled_width, "spatial_scale", spatial_scale) + x, + boxes, + boxes_num, + "pooled_height", + pooled_height, + "pooled_width", + pooled_width, + "spatial_scale", + spatial_scale, + ) return pool_out else: @@ -1240,38 +1761,37 @@ def roi_pool(x, boxes, boxes_num, output_size, spatial_scale=1.0, name=None): } if boxes_num is not None: inputs['RoisNum'] = boxes_num - helper.append_op(type="roi_pool", - inputs=inputs, - outputs={ - "Out": pool_out, - "Argmax": argmaxes - }, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale - }) + helper.append_op( + type="roi_pool", + inputs=inputs, + outputs={"Out": pool_out, "Argmax": argmaxes}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale, + }, + ) return pool_out class RoIPool(Layer): """ This interface is used to construct a callable object of the `RoIPool` class. 
Please - refer to :ref:`api_paddle_vision_ops_roi_pool`. + refer to :ref:`api_paddle_vision_ops_roi_pool`. Args: output_size (int or tuple[int, int]): the pooled output size(h, w), data type is int32. If int, h and w are both equal to output_size. spatial_scale (float, optional): multiplicative spatial scale factor to translate ROI coords from their input scale to the scale used when pooling. Default: 1.0. Returns: - pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]]. + pool_out (Tensor): the pooled feature, 4D-Tensor with the shape of [num_boxes, C, output_size[0], output_size[1]]. Examples: .. code-block:: python import paddle from paddle.vision.ops import RoIPool - + data = paddle.rand([1, 256, 32, 32]) boxes = paddle.rand([3, 4]) boxes[:, 2] += boxes[:, 0] + 3 @@ -1288,25 +1808,29 @@ def __init__(self, output_size, spatial_scale=1.0): self._spatial_scale = spatial_scale def forward(self, x, boxes, boxes_num): - return roi_pool(x=x, - boxes=boxes, - boxes_num=boxes_num, - output_size=self._output_size, - spatial_scale=self._spatial_scale) + return roi_pool( + x=x, + boxes=boxes, + boxes_num=boxes_num, + output_size=self._output_size, + spatial_scale=self._spatial_scale, + ) def extra_repr(self): main_str = 'output_size={_output_size}, spatial_scale={_spatial_scale}' return main_str.format(**self.__dict__) -def roi_align(x, - boxes, - boxes_num, - output_size, - spatial_scale=1.0, - sampling_ratio=-1, - aligned=True, - name=None): +def roi_align( + x, + boxes, + boxes_num, + output_size, + spatial_scale=1.0, + sampling_ratio=-1, + aligned=True, + name=None, +): """ Implementing the roi_align layer. Region of Interest (RoI) Align operator (also known as RoI Align) is to @@ -1318,13 +1842,13 @@ def roi_align(x, In each ROI bin, the value of the four regularly sampled locations are computed directly through bilinear interpolation. The output is the mean of - four locations. Thus avoid the misaligned problem. + four locations. Thus avoid the misaligned problem. Args: - x (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W], + x (Tensor): Input feature, 4D-Tensor with the shape of [N,C,H,W], where N is the batch size, C is the input channel, H is Height, W is weight. The data type is float32 or float64. - boxes (Tensor): Boxes (RoIs, Regions of Interest) to pool over. It + boxes (Tensor): Boxes (RoIs, Regions of Interest) to pool over. It should be a 2-D Tensor of shape (num_boxes, 4). The data type is float32 or float64. Given as [[x1, y1, x2, y2], ...], (x1, y1) is the top left coordinates, and (x2, y2) is the bottom right coordinates. @@ -1375,24 +1899,45 @@ def roi_align(x, pooled_height, pooled_width = output_size if in_dygraph_mode(): - assert boxes_num is not None, "boxes_num should not be None in dygraph mode." - return _C_ops.roi_align(x, boxes, boxes_num, pooled_height, - pooled_width, spatial_scale, sampling_ratio, - aligned) + assert ( + boxes_num is not None + ), "boxes_num should not be None in dygraph mode." + return _C_ops.roi_align( + x, + boxes, + boxes_num, + pooled_height, + pooled_width, + spatial_scale, + sampling_ratio, + aligned, + ) if _in_legacy_dygraph(): - assert boxes_num is not None, "boxes_num should not be None in dygraph mode." 
- align_out = _legacy_C_ops.roi_align(x, boxes, boxes_num, - "pooled_height", pooled_height, - "pooled_width", pooled_width, - "spatial_scale", spatial_scale, - "sampling_ratio", sampling_ratio, - "aligned", aligned) + assert ( + boxes_num is not None + ), "boxes_num should not be None in dygraph mode." + align_out = _legacy_C_ops.roi_align( + x, + boxes, + boxes_num, + "pooled_height", + pooled_height, + "pooled_width", + pooled_width, + "spatial_scale", + spatial_scale, + "sampling_ratio", + sampling_ratio, + "aligned", + aligned, + ) return align_out else: check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'roi_align') - check_variable_and_dtype(boxes, 'boxes', ['float32', 'float64'], - 'roi_align') + check_variable_and_dtype( + boxes, 'boxes', ['float32', 'float64'], 'roi_align' + ) helper = LayerHelper('roi_align', **locals()) dtype = helper.input_dtype() align_out = helper.create_variable_for_type_inference(dtype) @@ -1402,16 +1947,18 @@ def roi_align(x, } if boxes_num is not None: inputs['RoisNum'] = boxes_num - helper.append_op(type="roi_align", - inputs=inputs, - outputs={"Out": align_out}, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale, - "sampling_ratio": sampling_ratio, - "aligned": aligned, - }) + helper.append_op( + type="roi_align", + inputs=inputs, + outputs={"Out": align_out}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale, + "sampling_ratio": sampling_ratio, + "aligned": aligned, + }, + ) return align_out @@ -1453,12 +2000,14 @@ def __init__(self, output_size, spatial_scale=1.0): self._spatial_scale = spatial_scale def forward(self, x, boxes, boxes_num, aligned=True): - return roi_align(x=x, - boxes=boxes, - boxes_num=boxes_num, - output_size=self._output_size, - spatial_scale=self._spatial_scale, - aligned=aligned) + return roi_align( + x=x, + boxes=boxes, + boxes_num=boxes_num, + output_size=self._output_size, + spatial_scale=self._spatial_scale, + aligned=aligned, + ) class ConvNormActivation(Sequential): @@ -1482,30 +2031,34 @@ class ConvNormActivation(Sequential): bias (bool, optional): Whether to use bias in the convolution layer. By default, biases are included if ``norm_layer is None``. """ - def __init__(self, - in_channels, - out_channels, - kernel_size=3, - stride=1, - padding=None, - groups=1, - norm_layer=BatchNorm2D, - activation_layer=ReLU, - dilation=1, - bias=None): + def __init__( + self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=None, + groups=1, + norm_layer=BatchNorm2D, + activation_layer=ReLU, + dilation=1, + bias=None, + ): if padding is None: padding = (kernel_size - 1) // 2 * dilation if bias is None: bias = norm_layer is None layers = [ - Conv2D(in_channels, - out_channels, - kernel_size, - stride, - padding, - dilation=dilation, - groups=groups, - bias_attr=bias) + Conv2D( + in_channels, + out_channels, + kernel_size, + stride, + padding, + dilation=dilation, + groups=groups, + bias_attr=bias, + ) ] if norm_layer is not None: layers.append(norm_layer(out_channels)) @@ -1514,17 +2067,19 @@ def __init__(self, super().__init__(*layers) -def nms(boxes, - iou_threshold=0.3, - scores=None, - category_idxs=None, - categories=None, - top_k=None): +def nms( + boxes, + iou_threshold=0.3, + scores=None, + category_idxs=None, + categories=None, + top_k=None, +): r""" This operator implements non-maximum suppression. 
Non-maximum suppression (NMS) - is used to select one bounding box out of many overlapping bounding boxes in object detection. - Boxes with IoU > iou_threshold will be considered as overlapping boxes, - just one with highest score can be kept. Here IoU is Intersection Over Union, + is used to select one bounding box out of many overlapping bounding boxes in object detection. + Boxes with IoU > iou_threshold will be considered as overlapping boxes, + just one with highest score can be kept. Here IoU is Intersection Over Union, which can be computed by: .. math:: @@ -1533,25 +2088,25 @@ def nms(boxes, If scores are provided, input boxes will be sorted by their scores firstly. - If category_idxs and categories are provided, NMS will be performed with a batched style, + If category_idxs and categories are provided, NMS will be performed with a batched style, which means NMS will be applied to each category respectively and results of each category will be concated and sorted by scores. - + If K is provided, only the first k elements will be returned. Otherwise, all box indices sorted by scores will be returned. Args: - boxes(Tensor): The input boxes data to be computed, it's a 2D-Tensor with - the shape of [num_boxes, 4]. The data type is float32 or float64. - Given as [[x1, y1, x2, y2], …], (x1, y1) is the top left coordinates, - and (x2, y2) is the bottom right coordinates. + boxes(Tensor): The input boxes data to be computed, it's a 2D-Tensor with + the shape of [num_boxes, 4]. The data type is float32 or float64. + Given as [[x1, y1, x2, y2], …], (x1, y1) is the top left coordinates, + and (x2, y2) is the bottom right coordinates. Their relation should be ``0 <= x1 < x2 && 0 <= y1 < y2``. iou_threshold(float32, optional): IoU threshold for determine overlapping boxes. Default value: 0.3. - scores(Tensor, optional): Scores corresponding to boxes, it's a 1D-Tensor with + scores(Tensor, optional): Scores corresponding to boxes, it's a 1D-Tensor with shape of [num_boxes]. The data type is float32 or float64. Default: None. - category_idxs(Tensor, optional): Category indices corresponding to boxes. + category_idxs(Tensor, optional): Category indices corresponding to boxes. it's a 1D-Tensor with shape of [num_boxes]. The data type is int64. Default: None. categories(List, optional): A list of unique id of all categories. The data type is int64. Default: None. - top_k(int64, optional): The top K boxes who has higher score and kept by NMS preds to + top_k(int64, optional): The top K boxes who has higher score and kept by NMS preds to consider. top_k should be smaller equal than num_boxes. Default: None. Returns: @@ -1559,35 +2114,38 @@ def nms(boxes, Examples: .. 
code-block:: python - + import paddle - import numpy as np - boxes = np.random.rand(4, 4).astype('float32') + boxes = paddle.rand([4, 4]).astype('float32') boxes[:, 2] = boxes[:, 0] + boxes[:, 2] boxes[:, 3] = boxes[:, 1] + boxes[:, 3] - # [[0.06287421 0.5809351 0.3443958 0.8713329 ] - # [0.0749094 0.9713205 0.99241287 1.2799143 ] - # [0.46246734 0.6753201 1.346266 1.3821303 ] - # [0.8984796 0.5619834 1.1254641 1.0201943 ]] + print(boxes) + # Tensor(shape=[4, 4], dtype=float32, place=Place(gpu:0), stop_gradient=True, + # [[0.64811575, 0.89756244, 0.86473107, 1.48552322], + # [0.48085716, 0.84799081, 0.54517937, 0.86396021], + # [0.62646860, 0.72901905, 1.17392159, 1.69691563], + # [0.89729202, 0.46281594, 1.88733089, 0.98588502]]) - out = paddle.vision.ops.nms(paddle.to_tensor(boxes), 0.1) - # [0, 1, 3, 0] + out = paddle.vision.ops.nms(boxes, 0.1) + print(out) + # Tensor(shape=[3], dtype=int64, place=Place(gpu:0), stop_gradient=True, + # [0, 1, 3]) - scores = np.random.rand(4).astype('float32') - # [0.98015213 0.3156527 0.8199343 0.874901 ] + scores = paddle.to_tensor([0.6, 0.7, 0.4, 0.233]) categories = [0, 1, 2, 3] - category_idxs = np.random.choice(categories, 4) - # [2 0 0 3] - - out = paddle.vision.ops.nms(paddle.to_tensor(boxes), - 0.1, - paddle.to_tensor(scores), - paddle.to_tensor(category_idxs), - categories, - 4) - # [0, 3, 2] + category_idxs = paddle.to_tensor([2, 0, 0, 3], dtype="int64") + + out = paddle.vision.ops.nms(boxes, + 0.1, + paddle.to_tensor(scores), + paddle.to_tensor(category_idxs), + categories, + 4) + print(out) + # Tensor(shape=[4], dtype=int64, place=Place(gpu:0), stop_gradient=True, + # [1, 0, 2, 3]) """ def _nms(boxes, iou_threshold): @@ -1599,32 +2157,42 @@ def _nms(boxes, iou_threshold): helper = LayerHelper('nms', **locals()) out = helper.create_variable_for_type_inference('int64') - helper.append_op(type='nms', - inputs={'Boxes': boxes}, - outputs={'KeepBoxesIdxs': out}, - attrs={'iou_threshold': iou_threshold}) + helper.append_op( + type='nms', + inputs={'Boxes': boxes}, + outputs={'KeepBoxesIdxs': out}, + attrs={'iou_threshold': iou_threshold}, + ) return out if scores is None: return _nms(boxes, iou_threshold) import paddle + if category_idxs is None: sorted_global_indices = paddle.argsort(scores, descending=True) - return _nms(boxes[sorted_global_indices], iou_threshold) + sorted_keep_boxes_indices = _nms( + boxes[sorted_global_indices], iou_threshold + ) + return sorted_global_indices[sorted_keep_boxes_indices] if top_k is not None: - assert top_k <= scores.shape[ - 0], "top_k should be smaller equal than the number of boxes" - assert categories is not None, "if category_idxs is given, categories which is a list of unique id of all categories is necessary" + assert ( + top_k <= scores.shape[0] + ), "top_k should be smaller equal than the number of boxes" + assert ( + categories is not None + ), "if category_idxs is given, categories which is a list of unique id of all categories is necessary" mask = paddle.zeros_like(scores, dtype=paddle.int32) for category_id in categories: cur_category_boxes_idxs = paddle.where(category_idxs == category_id)[0] shape = cur_category_boxes_idxs.shape[0] - cur_category_boxes_idxs = paddle.reshape(cur_category_boxes_idxs, - [shape]) + cur_category_boxes_idxs = paddle.reshape( + cur_category_boxes_idxs, [shape] + ) if shape == 0: continue elif shape == 1: @@ -1632,27 +2200,33 @@ def _nms(boxes, iou_threshold): continue cur_category_boxes = boxes[cur_category_boxes_idxs] cur_category_scores = 
scores[cur_category_boxes_idxs] - cur_category_sorted_indices = paddle.argsort(cur_category_scores, - descending=True) + cur_category_sorted_indices = paddle.argsort( + cur_category_scores, descending=True + ) cur_category_sorted_boxes = cur_category_boxes[ - cur_category_sorted_indices] + cur_category_sorted_indices + ] - cur_category_keep_boxes_sub_idxs = cur_category_sorted_indices[_nms( - cur_category_sorted_boxes, iou_threshold)] + cur_category_keep_boxes_sub_idxs = cur_category_sorted_indices[ + _nms(cur_category_sorted_boxes, iou_threshold) + ] updates = paddle.ones_like( cur_category_boxes_idxs[cur_category_keep_boxes_sub_idxs], - dtype=paddle.int32) + dtype=paddle.int32, + ) mask = paddle.scatter( mask, cur_category_boxes_idxs[cur_category_keep_boxes_sub_idxs], updates, - overwrite=True) + overwrite=True, + ) keep_boxes_idxs = paddle.where(mask)[0] shape = keep_boxes_idxs.shape[0] keep_boxes_idxs = paddle.reshape(keep_boxes_idxs, [shape]) - sorted_sub_indices = paddle.argsort(scores[keep_boxes_idxs], - descending=True) + sorted_sub_indices = paddle.argsort( + scores[keep_boxes_idxs], descending=True + ) if top_k is None: return keep_boxes_idxs[sorted_sub_indices] @@ -1665,32 +2239,34 @@ def _nms(boxes, iou_threshold): return keep_boxes_idxs[sorted_sub_indices][:top_k] -def generate_proposals(scores, - bbox_deltas, - img_size, - anchors, - variances, - pre_nms_top_n=6000, - post_nms_top_n=1000, - nms_thresh=0.5, - min_size=0.1, - eta=1.0, - pixel_offset=False, - return_rois_num=False, - name=None): +def generate_proposals( + scores, + bbox_deltas, + img_size, + anchors, + variances, + pre_nms_top_n=6000, + post_nms_top_n=1000, + nms_thresh=0.5, + min_size=0.1, + eta=1.0, + pixel_offset=False, + return_rois_num=False, + name=None, +): """ This operation proposes RoIs according to each box with their - probability to be a foreground object. And - the proposals of RPN output are calculated by anchors, bbox_deltas and scores. Final proposals + probability to be a foreground object. And + the proposals of RPN output are calculated by anchors, bbox_deltas and scores. Final proposals could be used to train detection net. For generating proposals, this operation performs following steps: 1. Transpose and resize scores and bbox_deltas in size of (H * W * A, 1) and (H * W * A, 4) - 2. Calculate box locations as proposals candidates. + 2. Calculate box locations as proposals candidates. 3. Clip boxes to image - 4. Remove predicted boxes with small area. + 4. Remove predicted boxes with small area. 5. Apply non-maximum suppression (NMS) to get final proposals as output. Args: @@ -1748,40 +2324,74 @@ def generate_proposals(scores, """ if in_dygraph_mode(): - assert return_rois_num, "return_rois_num should be True in dygraph mode." - attrs = (pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta, - pixel_offset) + assert ( + return_rois_num + ), "return_rois_num should be True in dygraph mode." + attrs = ( + pre_nms_top_n, + post_nms_top_n, + nms_thresh, + min_size, + eta, + pixel_offset, + ) rpn_rois, rpn_roi_probs, rpn_rois_num = _C_ops.generate_proposals_v2( - scores, bbox_deltas, img_size, anchors, variances, *attrs) + scores, bbox_deltas, img_size, anchors, variances, *attrs + ) return rpn_rois, rpn_roi_probs, rpn_rois_num elif _non_static_mode(): - assert return_rois_num, "return_rois_num should be True in dygraph mode." 
- attrs = ('pre_nms_topN', pre_nms_top_n, 'post_nms_topN', post_nms_top_n, - 'nms_thresh', nms_thresh, 'min_size', min_size, 'eta', eta, - 'pixel_offset', pixel_offset) - rpn_rois, rpn_roi_probs, rpn_rois_num = _legacy_C_ops.generate_proposals_v2( - scores, bbox_deltas, img_size, anchors, variances, *attrs) + assert ( + return_rois_num + ), "return_rois_num should be True in dygraph mode." + attrs = ( + 'pre_nms_topN', + pre_nms_top_n, + 'post_nms_topN', + post_nms_top_n, + 'nms_thresh', + nms_thresh, + 'min_size', + min_size, + 'eta', + eta, + 'pixel_offset', + pixel_offset, + ) + ( + rpn_rois, + rpn_roi_probs, + rpn_rois_num, + ) = _legacy_C_ops.generate_proposals_v2( + scores, bbox_deltas, img_size, anchors, variances, *attrs + ) return rpn_rois, rpn_roi_probs, rpn_rois_num helper = LayerHelper('generate_proposals_v2', **locals()) - check_variable_and_dtype(scores, 'scores', ['float32'], - 'generate_proposals_v2') - check_variable_and_dtype(bbox_deltas, 'bbox_deltas', ['float32'], - 'generate_proposals_v2') - check_variable_and_dtype(img_size, 'img_size', ['float32', 'float64'], - 'generate_proposals_v2') - check_variable_and_dtype(anchors, 'anchors', ['float32'], - 'generate_proposals_v2') - check_variable_and_dtype(variances, 'variances', ['float32'], - 'generate_proposals_v2') + check_variable_and_dtype( + scores, 'scores', ['float32'], 'generate_proposals_v2' + ) + check_variable_and_dtype( + bbox_deltas, 'bbox_deltas', ['float32'], 'generate_proposals_v2' + ) + check_variable_and_dtype( + img_size, 'img_size', ['float32', 'float64'], 'generate_proposals_v2' + ) + check_variable_and_dtype( + anchors, 'anchors', ['float32'], 'generate_proposals_v2' + ) + check_variable_and_dtype( + variances, 'variances', ['float32'], 'generate_proposals_v2' + ) rpn_rois = helper.create_variable_for_type_inference( - dtype=bbox_deltas.dtype) + dtype=bbox_deltas.dtype + ) rpn_roi_probs = helper.create_variable_for_type_inference( - dtype=scores.dtype) + dtype=scores.dtype + ) outputs = { 'RpnRois': rpn_rois, 'RpnRoiProbs': rpn_roi_probs, @@ -1791,23 +2401,25 @@ def generate_proposals(scores, rpn_rois_num.stop_gradient = True outputs['RpnRoisNum'] = rpn_rois_num - helper.append_op(type="generate_proposals_v2", - inputs={ - 'Scores': scores, - 'BboxDeltas': bbox_deltas, - 'ImShape': img_size, - 'Anchors': anchors, - 'Variances': variances - }, - attrs={ - 'pre_nms_topN': pre_nms_top_n, - 'post_nms_topN': post_nms_top_n, - 'nms_thresh': nms_thresh, - 'min_size': min_size, - 'eta': eta, - 'pixel_offset': pixel_offset - }, - outputs=outputs) + helper.append_op( + type="generate_proposals_v2", + inputs={ + 'Scores': scores, + 'BboxDeltas': bbox_deltas, + 'ImShape': img_size, + 'Anchors': anchors, + 'Variances': variances, + }, + attrs={ + 'pre_nms_topN': pre_nms_top_n, + 'post_nms_topN': post_nms_top_n, + 'nms_thresh': nms_thresh, + 'min_size': min_size, + 'eta': eta, + 'pixel_offset': pixel_offset, + }, + outputs=outputs, + ) rpn_rois.stop_gradient = True rpn_roi_probs.stop_gradient = True if not return_rois_num: @@ -1816,20 +2428,23 @@ def generate_proposals(scores, return rpn_rois, rpn_roi_probs, rpn_rois_num -def matrix_nms(bboxes, - scores, - score_threshold, - post_threshold, - nms_top_k, - keep_top_k, - use_gaussian=False, - gaussian_sigma=2., - background_label=0, - normalized=True, - return_index=False, - return_rois_num=True, - name=None): +def matrix_nms( + bboxes, + scores, + score_threshold, + post_threshold, + nms_top_k, + keep_top_k, + use_gaussian=False, + gaussian_sigma=2.0, + 
background_label=0, + normalized=True, + return_index=False, + return_rois_num=True, + name=None, +): """ + This operator does matrix non maximum suppression (NMS). First selects a subset of candidate bounding boxes that have higher scores than score_threshold (if provided), then the top k candidate is selected if @@ -1837,6 +2452,7 @@ def matrix_nms(bboxes, decayed according to the Matrix NMS scheme. Aftern NMS step, at most keep_top_k number of total bboxes are to be kept per image if keep_top_k is larger than -1. + Args: bboxes (Tensor): A 3-D Tensor with shape [N, M, 4] represents the predicted locations of M bounding bboxes, @@ -1860,29 +2476,32 @@ def matrix_nms(bboxes, on score_threshold. keep_top_k (int): Number of total bboxes to be kept per image after NMS step. -1 means keeping all bboxes after NMS step. - use_gaussian (bool): Use Gaussian as the decay function. Default: False - gaussian_sigma (float): Sigma for Gaussian decay function. Default: 2.0 - background_label (int): The index of background label, the background + use_gaussian (bool, optional): Use Gaussian as the decay function. Default: False + gaussian_sigma (float, optional): Sigma for Gaussian decay function. Default: 2.0 + background_label (int, optional): The index of background label, the background label will be ignored. If set to -1, then all categories will be considered. Default: 0 - normalized (bool): Whether detections are normalized. Default: True - return_index(bool): Whether return selected index. Default: False - return_rois_num(bool): whether return rois_num. Default: True - name(str): Name of the matrix nms op. Default: None. + normalized (bool, optional): Whether detections are normalized. Default: True + return_index(bool, optional): Whether return selected index. Default: False + return_rois_num(bool, optional): whether return rois_num. Default: True + name(str, optional): Name of the matrix nms op. Default: None. Returns: - A tuple with three Tensor: (Out, Index, RoisNum) if return_index is True, - otherwise, a tuple with two Tensor (Out, RoisNum) is returned. - Out (Tensor): A 2-D Tensor with shape [No, 6] containing the - detection results. - Each row has 6 values: [label, confidence, xmin, ymin, xmax, ymax] - Index (Tensor): A 2-D Tensor with shape [No, 1] containing the - selected indices, which are absolute values cross batches. - rois_num (Tensor): A 1-D Tensor with shape [N] containing - the number of detected boxes in each image. + - A tuple with three Tensor, (Out, Index, RoisNum) if return_index is True, + otherwise, a tuple with two Tensor (Out, RoisNum) is returned. + - Out (Tensor), A 2-D Tensor with shape [No, 6] containing the + detection results. + Each row has 6 values, [label, confidence, xmin, ymin, xmax, ymax] + - Index (Tensor), A 2-D Tensor with shape [No, 1] containing the + selected indices, which are absolute values cross batches. + - rois_num (Tensor), A 1-D Tensor with shape [N] containing + the number of detected boxes in each image. + Examples: .. 
code-block:: python + import paddle from paddle.vision.ops import matrix_nms + boxes = paddle.rand([4, 1, 4]) boxes[..., 2] = boxes[..., 0] + boxes[..., 2] boxes[..., 3] = boxes[..., 1] + boxes[..., 3] @@ -1890,11 +2509,14 @@ def matrix_nms(bboxes, out = matrix_nms(bboxes=boxes, scores=scores, background_label=0, score_threshold=0.5, post_threshold=0.1, nms_top_k=400, keep_top_k=200, normalized=False) + """ - check_variable_and_dtype(bboxes, 'BBoxes', ['float32', 'float64'], - 'matrix_nms') - check_variable_and_dtype(scores, 'Scores', ['float32', 'float64'], - 'matrix_nms') + check_variable_and_dtype( + bboxes, 'BBoxes', ['float32', 'float64'], 'matrix_nms' + ) + check_variable_and_dtype( + scores, 'Scores', ['float32', 'float64'], 'matrix_nms' + ) check_type(score_threshold, 'score_threshold', float, 'matrix_nms') check_type(post_threshold, 'post_threshold', float, 'matrix_nms') check_type(nms_top_k, 'nums_top_k', int, 'matrix_nms') @@ -1905,22 +2527,42 @@ def matrix_nms(bboxes, check_type(background_label, 'background_label', int, 'matrix_nms') if in_dygraph_mode(): - out, index, rois_num = _C_ops.matrix_nms(bboxes, scores, - score_threshold, nms_top_k, - keep_top_k, post_threshold, - use_gaussian, gaussian_sigma, - background_label, normalized) + out, index, rois_num = _C_ops.matrix_nms( + bboxes, + scores, + score_threshold, + nms_top_k, + keep_top_k, + post_threshold, + use_gaussian, + gaussian_sigma, + background_label, + normalized, + ) if not return_index: index = None if not return_rois_num: rois_num = None return out, rois_num, index elif _in_legacy_dygraph(): - attrs = ('background_label', background_label, 'score_threshold', - score_threshold, 'post_threshold', post_threshold, 'nms_top_k', - nms_top_k, 'gaussian_sigma', gaussian_sigma, 'use_gaussian', - use_gaussian, 'keep_top_k', keep_top_k, 'normalized', - normalized) + attrs = ( + 'background_label', + background_label, + 'score_threshold', + score_threshold, + 'post_threshold', + post_threshold, + 'nms_top_k', + nms_top_k, + 'gaussian_sigma', + gaussian_sigma, + 'use_gaussian', + use_gaussian, + 'keep_top_k', + keep_top_k, + 'normalized', + normalized, + ) out, index, rois_num = _legacy_C_ops.matrix_nms(bboxes, scores, *attrs) if not return_index: index = None @@ -1936,22 +2578,21 @@ def matrix_nms(bboxes, rois_num = helper.create_variable_for_type_inference(dtype='int32') outputs['RoisNum'] = rois_num - helper.append_op(type="matrix_nms", - inputs={ - 'BBoxes': bboxes, - 'Scores': scores - }, - attrs={ - 'background_label': background_label, - 'score_threshold': score_threshold, - 'post_threshold': post_threshold, - 'nms_top_k': nms_top_k, - 'gaussian_sigma': gaussian_sigma, - 'use_gaussian': use_gaussian, - 'keep_top_k': keep_top_k, - 'normalized': normalized - }, - outputs=outputs) + helper.append_op( + type="matrix_nms", + inputs={'BBoxes': bboxes, 'Scores': scores}, + attrs={ + 'background_label': background_label, + 'score_threshold': score_threshold, + 'post_threshold': post_threshold, + 'nms_top_k': nms_top_k, + 'gaussian_sigma': gaussian_sigma, + 'use_gaussian': use_gaussian, + 'keep_top_k': keep_top_k, + 'normalized': normalized, + }, + outputs=outputs, + ) output.stop_gradient = True if not return_index: diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 301252a048b7a7..25cd2600ea0bbc 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -51,33 +51,38 @@ def _get_image_size(img): return 
img.shape[2:][::-1] # nchw -> wh else: raise ValueError( - "The dim for input Tensor should be 3-D or 4-D, but received {}" - .format(len(img.shape))) + "The dim for input Tensor should be 3-D or 4-D, but received {}".format( + len(img.shape) + ) + ) else: raise TypeError("Unexpected type {}".format(type(img))) -def _check_input(value, - name, - center=1, - bound=(0, float('inf')), - clip_first_on_zero=True): +def _check_input( + value, name, center=1, bound=(0, float('inf')), clip_first_on_zero=True +): if isinstance(value, numbers.Number): if value < 0: raise ValueError( "If {} is a single number, it must be non negative.".format( - name)) + name + ) + ) value = [center - value, center + value] if clip_first_on_zero: value[0] = max(value[0], 0) elif isinstance(value, (tuple, list)) and len(value) == 2: if not bound[0] <= value[0] <= value[1] <= bound[1]: - raise ValueError("{} values should be between {}".format( - name, bound)) + raise ValueError( + "{} values should be between {}".format(name, bound) + ) else: raise TypeError( - "{} should be a single number or a list/tuple with lenght 2.". - format(name)) + "{} should be a single number or a list/tuple with lenght 2.".format( + name + ) + ) if value[0] == value[1] == center: value = None @@ -97,7 +102,7 @@ class Compose(object): object will call each given :attr:`transforms` sequencely. Examples: - + .. code-block:: python from paddle.vision.datasets import Flowers @@ -121,8 +126,10 @@ def __call__(self, data): data = f(data) except Exception as e: stack_info = traceback.format_exc() - print("fail to perform transform [{}] with error: " - "{} and stack:\n{}".format(f, e, str(stack_info))) + print( + "fail to perform transform [{}] with error: " + "{} and stack:\n{}".format(f, e, str(stack_info)) + ) raise e return data @@ -139,12 +146,12 @@ class BaseTransform(object): """ Base class of all transforms used in computer vision. - calling logic: + calling logic: if keys is None: _get_params -> _apply_image() else: - _get_params -> _apply_*() for * in keys + _get_params -> _apply_*() for * in keys If you want to implement a self-defined transform method for image, rewrite _apply_* method in subclass. @@ -153,25 +160,25 @@ class BaseTransform(object): keys (list[str]|tuple[str], optional): Input type. Input is a tuple contains different structures, key is used to specify the type of input. For example, if your input is image type, then the key can be None or ("image"). if your input - is (image, image) type, then the keys should be ("image", "image"). + is (image, image) type, then the keys should be ("image", "image"). if your input is (image, boxes), then the keys should be ("image", "boxes"). Current available strings & data type are describe below: - - "image": input image, with shape of (H, W, C) - - "coords": coordinates, with shape of (N, 2) - - "boxes": bounding boxes, with shape of (N, 4), "xyxy" format, - - the 1st "xy" represents top left point of a box, + - "image": input image, with shape of (H, W, C) + - "coords": coordinates, with shape of (N, 2) + - "boxes": bounding boxes, with shape of (N, 4), "xyxy" format, + + the 1st "xy" represents top left point of a box, the 2nd "xy" represents right bottom point. - "mask": map used for segmentation, with shape of (H, W, 1) - + You can also customize your data types only if you implement the corresponding _apply_*() methods, otherwise ``NotImplementedError`` will be raised. - + Examples: - + .. 
code-block:: python import numpy as np @@ -220,7 +227,7 @@ def _apply_boxes(self, boxes): maxxy = coords.max(axis=1) trans_boxes = np.concatenate((minxy, maxxy), axis=1) return trans_boxes - + # if you only want to transform image, do not need to rewrite this function def _apply_mask(self, mask): if self.params['flip']: @@ -245,14 +252,16 @@ def _apply_mask(self, mask): def __init__(self, keys=None): if keys is None: - keys = ("image", ) + keys = ("image",) elif not isinstance(keys, Sequence): raise ValueError( - "keys should be a sequence, but got keys={}".format(keys)) + "keys should be a sequence, but got keys={}".format(keys) + ) for k in keys: if self._get_apply(k) is None: raise NotImplementedError( - "{} is unsupported data structure".format(k)) + "{} is unsupported data structure".format(k) + ) self.keys = keys # storage some params get from function get_params() @@ -264,7 +273,7 @@ def _get_params(self, inputs): def __call__(self, inputs): """Apply transform on single input data""" if not isinstance(inputs, tuple): - inputs = (inputs, ) + inputs = (inputs,) self.params = self._get_params(inputs) @@ -276,7 +285,7 @@ def __call__(self, inputs): else: outputs.append(apply_func(inputs[i])) if len(inputs) > len(self.keys): - outputs.extend(inputs[len(self.keys):]) + outputs.extend(inputs[len(self.keys) :]) if len(outputs) == 1: outputs = outputs[0] @@ -302,22 +311,22 @@ class ToTensor(BaseTransform): Converts a PIL.Image or numpy.ndarray (H x W x C) to a paddle.Tensor of shape (C x H x W). - If input is a grayscale image (H x W), it will be converted to a image of shape (H x W x 1). + If input is a grayscale image (H x W), it will be converted to an image of shape (H x W x 1). And the shape of output tensor will be (1 x H x W). If you want to keep the shape of output tensor as (H x W x C), you can set data_format = ``HWC`` . - Converts a PIL.Image or numpy.ndarray in the range [0, 255] to a paddle.Tensor in the - range [0.0, 1.0] if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, - RGBA, CMYK, 1) or if the numpy.ndarray has dtype = np.uint8. + Converts a PIL.Image or numpy.ndarray in the range [0, 255] to a paddle.Tensor in the + range [0.0, 1.0] if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, + RGBA, CMYK, 1) or if the numpy.ndarray has dtype = np.uint8. In the other cases, tensors are returned without scaling. Args: - data_format (str, optional): Data format of output tensor, should be 'HWC' or + data_format (str, optional): Data format of output tensor, should be 'HWC' or 'CHW'. Default: 'CHW'. keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - + Shape: - img(PIL.Image|np.ndarray): The input image with shape (H x W x C). - output(np.ndarray): A tensor with shape (C x H x W) or (H x W x C) according option data_format. @@ -326,7 +335,7 @@ class ToTensor(BaseTransform): A callable object of ToTensor. Examples: - + .. code-block:: python import numpy as np @@ -340,10 +349,10 @@ class ToTensor(BaseTransform): transform = T.ToTensor() tensor = transform(fake_img) - + print(tensor.shape) # [3, 4, 5] - + print(tensor.dtype) # paddle.float32 """ @@ -372,19 +381,19 @@ class Resize(BaseTransform): smaller edge of the image will be matched to this number. i.e, if height > width, then image will be rescaled to (size * height / width, size) - interpolation (int|str, optional): Interpolation method. Default: 'bilinear'. 
- when use pil backend, support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, - - "bicubic": Image.BICUBIC, - - "box": Image.BOX, - - "lanczos": Image.LANCZOS, + interpolation (int|str, optional): Interpolation method. Default: 'bilinear'. + when use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC, + - "box": Image.BOX, + - "lanczos": Image.LANCZOS, - "hamming": Image.HAMMING - when use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, - - "area": cv2.INTER_AREA, - - "bicubic": cv2.INTER_CUBIC, + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "area": cv2.INTER_AREA, + - "bicubic": cv2.INTER_CUBIC, - "lanczos": cv2.INTER_LANCZOS4 keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. @@ -396,7 +405,7 @@ class Resize(BaseTransform): A callable object of Resize. Examples: - + .. code-block:: python import numpy as np @@ -418,8 +427,9 @@ class Resize(BaseTransform): def __init__(self, size, interpolation='bilinear', keys=None): super(Resize, self).__init__(keys) - assert isinstance(size, int) or (isinstance(size, Iterable) - and len(size) == 2) + assert isinstance(size, int) or ( + isinstance(size, Iterable) and len(size) == 2 + ) self.size = size self.interpolation = interpolation @@ -435,22 +445,22 @@ class RandomResizedCrop(BaseTransform): Args: size (int|list|tuple): Target size of output image, with (height, width) shape. - scale (list|tuple): Scale range of the cropped image before resizing, relatively to the origin + scale (list|tuple): Scale range of the cropped image before resizing, relatively to the origin image. Default: (0.08, 1.0) ratio (list|tuple): Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33) - interpolation (int|str, optional): Interpolation method. Default: 'bilinear'. when use pil backend, - support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, - - "bicubic": Image.BICUBIC, - - "box": Image.BOX, - - "lanczos": Image.LANCZOS, + interpolation (int|str, optional): Interpolation method. Default: 'bilinear'. when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC, + - "box": Image.BOX, + - "lanczos": Image.LANCZOS, - "hamming": Image.HAMMING - when use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, - - "area": cv2.INTER_AREA, - - "bicubic": cv2.INTER_CUBIC, + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "area": cv2.INTER_AREA, + - "bicubic": cv2.INTER_CUBIC, - "lanczos": cv2.INTER_LANCZOS4 keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. @@ -462,7 +472,7 @@ class RandomResizedCrop(BaseTransform): A callable object of RandomResizedCrop. Examples: - + .. code-block:: python import numpy as np @@ -478,19 +488,21 @@ class RandomResizedCrop(BaseTransform): """ - def __init__(self, - size, - scale=(0.08, 1.0), - ratio=(3. / 4, 4. 
/ 3), - interpolation='bilinear', - keys=None): + def __init__( + self, + size, + scale=(0.08, 1.0), + ratio=(3.0 / 4, 4.0 / 3), + interpolation='bilinear', + keys=None, + ): super(RandomResizedCrop, self).__init__(keys) if isinstance(size, int): self.size = (size, size) else: self.size = size - assert (scale[0] <= scale[1]), "scale should be of kind (min, max)" - assert (ratio[0] <= ratio[1]), "ratio should be of kind (min, max)" + assert scale[0] <= scale[1], "scale should be of kind (min, max)" + assert ratio[0] <= ratio[1], "ratio should be of kind (min, max)" self.scale = scale self.ratio = ratio self.interpolation = interpolation @@ -550,7 +562,7 @@ class CenterCrop(BaseTransform): A callable object of CenterCrop. Examples: - + .. code-block:: python import numpy as np @@ -591,7 +603,7 @@ class RandomHorizontalFlip(BaseTransform): A callable object of RandomHorizontalFlip. Examples: - + .. code-block:: python import numpy as np @@ -632,7 +644,7 @@ class RandomVerticalFlip(BaseTransform): A callable object of RandomVerticalFlip. Examples: - + .. code-block:: python import numpy as np @@ -668,7 +680,7 @@ class Normalize(BaseTransform): Args: mean (int|float|list|tuple, optional): Sequence of means for each channel. std (int|float|list|tuple, optional): Sequence of standard deviations for each channel. - data_format (str, optional): Data format of img, should be 'HWC' or + data_format (str, optional): Data format of img, should be 'HWC' or 'CHW'. Default: 'CHW'. to_rgb (bool, optional): Whether to convert to rgb. Default: False. keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. @@ -681,7 +693,7 @@ class Normalize(BaseTransform): A callable object of Normalize. Examples: - + .. code-block:: python :name: code-example import paddle @@ -698,15 +710,12 @@ class Normalize(BaseTransform): # (300, 320, 3) print(fake_img.max(), fake_img.min()) # 0.99999905 -0.999974 - + """ - def __init__(self, - mean=0.0, - std=1.0, - data_format='CHW', - to_rgb=False, - keys=None): + def __init__( + self, mean=0.0, std=1.0, data_format='CHW', to_rgb=False, keys=None + ): super(Normalize, self).__init__(keys) if isinstance(mean, numbers.Number): mean = [mean, mean, mean] @@ -720,30 +729,31 @@ def __init__(self, self.to_rgb = to_rgb def _apply_image(self, img): - return F.normalize(img, self.mean, self.std, self.data_format, - self.to_rgb) + return F.normalize( + img, self.mean, self.std, self.data_format, self.to_rgb + ) class Transpose(BaseTransform): """Transpose input data to a target format. For example, most transforms use HWC mode image, while the Neural Network might use CHW mode input tensor. - output image will be an instance of numpy.ndarray. + output image will be an instance of numpy.ndarray. Args: order (list|tuple, optional): Target order of input data. Default: (2, 0, 1). keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - + Shape: - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). - - output(np.ndarray|Paddle.Tensor): A transposed array or tensor. If input + - output(np.ndarray|Paddle.Tensor): A transposed array or tensor. If input is a PIL.Image, output will be converted to np.ndarray automatically. Returns: A callable object of Transpose. Examples: - + .. 
code-block:: python import numpy as np @@ -756,7 +766,7 @@ class Transpose(BaseTransform): fake_img = transform(fake_img) print(fake_img.shape) - + """ def __init__(self, order=(2, 0, 1), keys=None): @@ -791,7 +801,7 @@ class BrightnessTransform(BaseTransform): A callable object of BrightnessTransform. Examples: - + .. code-block:: python import numpy as np @@ -803,7 +813,7 @@ class BrightnessTransform(BaseTransform): fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) - + """ def __init__(self, value, keys=None): @@ -834,7 +844,7 @@ class ContrastTransform(BaseTransform): A callable object of ContrastTransform. Examples: - + .. code-block:: python import numpy as np @@ -879,7 +889,7 @@ class SaturationTransform(BaseTransform): A callable object of SaturationTransform. Examples: - + .. code-block:: python import numpy as np @@ -889,7 +899,7 @@ class SaturationTransform(BaseTransform): transform = SaturationTransform(0.4) fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8)) - + fake_img = transform(fake_img) """ @@ -922,7 +932,7 @@ class HueTransform(BaseTransform): A callable object of HueTransform. Examples: - + .. code-block:: python import numpy as np @@ -939,11 +949,9 @@ class HueTransform(BaseTransform): def __init__(self, value, keys=None): super(HueTransform, self).__init__(keys) - self.value = _check_input(value, - 'hue', - center=0, - bound=(-0.5, 0.5), - clip_first_on_zero=False) + self.value = _check_input( + value, 'hue', center=0, bound=(-0.5, 0.5), clip_first_on_zero=False + ) def _apply_image(self, img): if self.value is None: @@ -975,7 +983,7 @@ class ColorJitter(BaseTransform): A callable object of ColorJitter. Examples: - + .. code-block:: python import numpy as np @@ -990,12 +998,9 @@ class ColorJitter(BaseTransform): """ - def __init__(self, - brightness=0, - contrast=0, - saturation=0, - hue=0, - keys=None): + def __init__( + self, brightness=0, contrast=0, saturation=0, hue=0, keys=None + ): super(ColorJitter, self).__init__(keys) self.brightness = brightness self.contrast = contrast @@ -1038,8 +1043,9 @@ def _apply_image(self, img): Returns: PIL Image: Color jittered image. """ - transform = self._get_param(self.brightness, self.contrast, - self.saturation, self.hue) + transform = self._get_param( + self.brightness, self.contrast, self.saturation, self.hue + ) return transform(img) @@ -1051,7 +1057,7 @@ class RandomCrop(BaseTransform): int instead of sequence like (h, w), a square crop (size, size) is made. padding (int|sequence, optional): Optional padding on each border - of the image. If a sequence of length 4 is provided, it is used to pad left, + of the image. If a sequence of length 4 is provided, it is used to pad left, top, right, bottom borders respectively. Default: None, without padding. pad_if_needed (boolean, optional): It will pad the image if smaller than the desired size to avoid raising an exception. Default: False. @@ -1074,7 +1080,7 @@ class RandomCrop(BaseTransform): padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode will result in [2, 1, 1, 2, 3, 4, 4, 3] keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - + Shape - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). - output(PIL.Image|np.ndarray|Paddle.Tensor): A random cropped image. @@ -1083,7 +1089,7 @@ class RandomCrop(BaseTransform): A callable object of RandomCrop. Examples: - + .. 
code-block:: python :name: code-example1 @@ -1098,13 +1104,15 @@ class RandomCrop(BaseTransform): print(crop_img.shape) # [3, 224, 224] """ - def __init__(self, - size, - padding=None, - pad_if_needed=False, - fill=0, - padding_mode='constant', - keys=None): + def __init__( + self, + size, + padding=None, + pad_if_needed=False, + fill=0, + padding_mode='constant', + keys=None, + ): super(RandomCrop, self).__init__(keys) if isinstance(size, numbers.Number): self.size = (int(size), int(size)) @@ -1149,12 +1157,14 @@ def _apply_image(self, img): # pad the width if needed if self.pad_if_needed and w < self.size[1]: - img = F.pad(img, (self.size[1] - w, 0), self.fill, - self.padding_mode) + img = F.pad( + img, (self.size[1] - w, 0), self.fill, self.padding_mode + ) # pad the height if needed if self.pad_if_needed and h < self.size[0]: - img = F.pad(img, (0, self.size[0] - h), self.fill, - self.padding_mode) + img = F.pad( + img, (0, self.size[0] - h), self.fill, self.padding_mode + ) i, j, h, w = self._get_param(img, self.size) @@ -1174,16 +1184,16 @@ class Pad(BaseTransform): length 3, it is used to fill R, G, B channels respectively. This value is only used when the padding_mode is constant padding_mode (str): Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant. - ``constant`` means pads with a constant value, this value is specified with fill. - ``edge`` means pads with the last value at the edge of the image. - ``reflect`` means pads with reflection of image (without repeating the last value on the edge) - padding ``[1, 2, 3, 4]`` with 2 elements on both sides in reflect mode + ``constant`` means pads with a constant value, this value is specified with fill. + ``edge`` means pads with the last value at the edge of the image. + ``reflect`` means pads with reflection of image (without repeating the last value on the edge) + padding ``[1, 2, 3, 4]`` with 2 elements on both sides in reflect mode will result in ``[3, 2, 1, 2, 3, 4, 3, 2]``. ``symmetric`` menas pads with reflection of image (repeating the last value on the edge) - padding ``[1, 2, 3, 4]`` with 2 elements on both sides in symmetric mode + padding ``[1, 2, 3, 4]`` with 2 elements on both sides in symmetric mode will result in ``[2, 1, 1, 2, 3, 4, 4, 3]``. keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - + Shape: - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). - output(PIL.Image|np.ndarray|Paddle.Tensor): A paded image. @@ -1192,7 +1202,7 @@ class Pad(BaseTransform): A callable object of Pad. Examples: - + .. 
code-block:: python import numpy as np @@ -1219,8 +1229,9 @@ def __init__(self, padding, fill=0, padding_mode='constant', keys=None): if isinstance(padding, Sequence) and len(padding) not in [2, 4]: raise ValueError( - "Padding must be an int or a 2, or 4 element tuple, not a " + - "{} element tuple".format(len(padding))) + "Padding must be an int or a 2, or 4 element tuple, not a " + + "{} element tuple".format(len(padding)) + ) super(Pad, self).__init__(keys) self.padding = padding @@ -1239,19 +1250,23 @@ def _apply_image(self, img): def _check_sequence_input(x, name, req_sizes): - msg = req_sizes[0] if len(req_sizes) < 2 else " or ".join( - [str(s) for s in req_sizes]) + msg = ( + req_sizes[0] + if len(req_sizes) < 2 + else " or ".join([str(s) for s in req_sizes]) + ) if not isinstance(x, Sequence): raise TypeError(f"{name} should be a sequence of length {msg}.") if len(x) not in req_sizes: raise ValueError(f"{name} should be sequence of length {msg}.") -def _setup_angle(x, name, req_sizes=(2, )): +def _setup_angle(x, name, req_sizes=(2,)): if isinstance(x, numbers.Number): if x < 0: raise ValueError( - f"If {name} is a single number, it must be positive.") + f"If {name} is a single number, it must be positive." + ) x = [-x, x] else: _check_sequence_input(x, name, req_sizes) @@ -1268,25 +1283,25 @@ class RandomAffine(BaseTransform): will be (-degrees, +degrees) in clockwise order. If set 0, will not rotate. translate (tuple, optional): Maximum absolute fraction for horizontal and vertical translations. For example translate=(a, b), then horizontal shift is randomly sampled in the range -img_width * a < dx < img_width * a - and vertical shift is randomly sampled in the range -img_height * b < dy < img_height * b. + and vertical shift is randomly sampled in the range -img_height * b < dy < img_height * b. Default is None, will not translate. - scale (tuple, optional): Scaling factor interval, e.g (a, b), then scale is randomly sampled from the range a <= scale <= b. + scale (tuple, optional): Scaling factor interval, e.g (a, b), then scale is randomly sampled from the range a <= scale <= b. Default is None, will keep original scale and not scale. shear (sequence or number, optional): Range of degrees to shear, ranges from -180 to 180 in clockwise order. - If set as a number, a shear parallel to the x axis in the range (-shear, +shear) will be applied. - Else if set as a sequence of 2 values a shear parallel to the x axis in the range (shear[0], shear[1]) will be applied. + If set as a number, a shear parallel to the x axis in the range (-shear, +shear) will be applied. + Else if set as a sequence of 2 values a shear parallel to the x axis in the range (shear[0], shear[1]) will be applied. Else if set as a sequence of 4 values, a x-axis shear in (shear[0], shear[1]) and y-axis shear in (shear[2], shear[3]) will be applied. Default is None, will not apply shear. - interpolation (str, optional): Interpolation method. If omitted, or if the - image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST - according the backend. - When use pil backend, support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. 
+ When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, - "bicubic": Image.BICUBIC - When use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, - "bicubic": cv2.INTER_CUBIC fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed image. If given a number, the value is used for all bands respectively. @@ -1303,7 +1318,7 @@ class RandomAffine(BaseTransform): A callable object of RandomAffine. Examples: - + .. code-block:: python import paddle @@ -1317,31 +1332,34 @@ class RandomAffine(BaseTransform): print(fake_img.shape) """ - def __init__(self, - degrees, - translate=None, - scale=None, - shear=None, - interpolation='nearest', - fill=0, - center=None, - keys=None): - self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2, )) + def __init__( + self, + degrees, + translate=None, + scale=None, + shear=None, + interpolation='nearest', + fill=0, + center=None, + keys=None, + ): + self.degrees = _setup_angle(degrees, name="degrees", req_sizes=(2,)) super(RandomAffine, self).__init__(keys) assert interpolation in ['nearest', 'bilinear', 'bicubic'] self.interpolation = interpolation if translate is not None: - _check_sequence_input(translate, "translate", req_sizes=(2, )) + _check_sequence_input(translate, "translate", req_sizes=(2,)) for t in translate: if not (0.0 <= t <= 1.0): raise ValueError( - "translation values should be between 0 and 1") + "translation values should be between 0 and 1" + ) self.translate = translate if scale is not None: - _check_sequence_input(scale, "scale", req_sizes=(2, )) + _check_sequence_input(scale, "scale", req_sizes=(2,)) for s in scale: if s <= 0: raise ValueError("scale values should be positive") @@ -1359,15 +1377,12 @@ def __init__(self, self.fill = fill if center is not None: - _check_sequence_input(center, "center", req_sizes=(2, )) + _check_sequence_input(center, "center", req_sizes=(2,)) self.center = center - def _get_param(self, - img_size, - degrees, - translate=None, - scale_ranges=None, - shears=None): + def _get_param( + self, img_size, degrees, translate=None, scale_ranges=None, shears=None + ): """Get parameters for affine transformation Returns: @@ -1410,14 +1425,17 @@ def _apply_image(self, img): w, h = _get_image_size(img) img_size = [w, h] - ret = self._get_param(img_size, self.degrees, self.translate, - self.scale, self.shear) + ret = self._get_param( + img_size, self.degrees, self.translate, self.scale, self.shear + ) - return F.affine(img, - *ret, - interpolation=self.interpolation, - fill=self.fill, - center=self.center) + return F.affine( + img, + *ret, + interpolation=self.interpolation, + fill=self.fill, + center=self.center, + ) class RandomRotation(BaseTransform): @@ -1427,15 +1445,15 @@ class RandomRotation(BaseTransform): degrees (sequence or float or int): Range of degrees to select from. If degrees is a number instead of sequence like (min, max), the range of degrees will be (-degrees, +degrees) clockwise order. - interpolation (str, optional): Interpolation method. If omitted, or if the - image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST - according the backend. 
when use pil backend, support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, + interpolation (str, optional): Interpolation method. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. when use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, - "bicubic": Image.BICUBIC - when use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, - "bicubic": cv2.INTER_CUBIC expand (bool|optional): Optional expansion flag. Default: False. If true, expands the output to make it large enough to hold the entire rotated image. @@ -1445,7 +1463,7 @@ class RandomRotation(BaseTransform): Origin is the upper left corner. Default is the center of the image. keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - + Shape: - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). - output(PIL.Image|np.ndarray|Paddle.Tensor): A rotated image. @@ -1454,7 +1472,7 @@ class RandomRotation(BaseTransform): A callable object of RandomRotation. Examples: - + .. code-block:: python import numpy as np @@ -1469,22 +1487,26 @@ class RandomRotation(BaseTransform): print(fake_img.size) """ - def __init__(self, - degrees, - interpolation='nearest', - expand=False, - center=None, - fill=0, - keys=None): + def __init__( + self, + degrees, + interpolation='nearest', + expand=False, + center=None, + fill=0, + keys=None, + ): if isinstance(degrees, numbers.Number): if degrees < 0: raise ValueError( - "If degrees is a single number, it must be positive.") + "If degrees is a single number, it must be positive." + ) self.degrees = (-degrees, degrees) else: if len(degrees) != 2: raise ValueError( - "If degrees is a sequence, it must be of len 2.") + "If degrees is a sequence, it must be of len 2." + ) self.degrees = degrees super(RandomRotation, self).__init__(keys) @@ -1509,8 +1531,9 @@ def _apply_image(self, img): angle = self._get_param(self.degrees) - return F.rotate(img, angle, self.interpolation, self.expand, - self.center, self.fill) + return F.rotate( + img, angle, self.interpolation, self.expand, self.center, self.fill + ) class RandomPerspective(BaseTransform): @@ -1524,13 +1547,13 @@ class RandomPerspective(BaseTransform): interpolation (str, optional): Interpolation method. If omitted, or if the image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST. - When use pil backend, support method are as following: - - "nearest": Image.NEAREST, - - "bilinear": Image.BILINEAR, + When use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, - "bicubic": Image.BICUBIC - When use cv2 backend, support method are as following: - - "nearest": cv2.INTER_NEAREST, - - "bilinear": cv2.INTER_LINEAR, + When use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, - "bicubic": cv2.INTER_CUBIC fill (int|list|tuple, optional): Pixel fill value for the area outside the transformed image. If given a number, the value is used for all bands respectively. @@ -1544,7 +1567,7 @@ class RandomPerspective(BaseTransform): A callable object of RandomPerspective. Examples: - + .. 
code-block:: python import paddle @@ -1558,15 +1581,19 @@ class RandomPerspective(BaseTransform): print(fake_img.shape) """ - def __init__(self, - prob=0.5, - distortion_scale=0.5, - interpolation='nearest', - fill=0, - keys=None): + def __init__( + self, + prob=0.5, + distortion_scale=0.5, + interpolation='nearest', + fill=0, + keys=None, + ): super(RandomPerspective, self).__init__(keys) assert 0 <= prob <= 1, "probability must be between 0 and 1" - assert 0 <= distortion_scale <= 1, "distortion_scale must be between 0 and 1" + assert ( + 0 <= distortion_scale <= 1 + ), "distortion_scale must be between 0 and 1" assert interpolation in ['nearest', 'bilinear', 'bicubic'] assert isinstance(fill, (numbers.Number, str, list, tuple)) @@ -1584,35 +1611,43 @@ def get_params(self, width, height, distortion_scale): half_height = height // 2 half_width = width // 2 topleft = [ - int(random.uniform(0, - int(distortion_scale * half_width) + 1)), - int(random.uniform(0, - int(distortion_scale * half_height) + 1)), + int(random.uniform(0, int(distortion_scale * half_width) + 1)), + int(random.uniform(0, int(distortion_scale * half_height) + 1)), ] topright = [ int( - random.uniform(width - int(distortion_scale * half_width) - 1, - width)), - int(random.uniform(0, - int(distortion_scale * half_height) + 1)), + random.uniform( + width - int(distortion_scale * half_width) - 1, width + ) + ), + int(random.uniform(0, int(distortion_scale * half_height) + 1)), ] botright = [ int( - random.uniform(width - int(distortion_scale * half_width) - 1, - width)), + random.uniform( + width - int(distortion_scale * half_width) - 1, width + ) + ), int( - random.uniform(height - int(distortion_scale * half_height) - 1, - height)), + random.uniform( + height - int(distortion_scale * half_height) - 1, height + ) + ), ] botleft = [ - int(random.uniform(0, - int(distortion_scale * half_width) + 1)), + int(random.uniform(0, int(distortion_scale * half_width) + 1)), int( - random.uniform(height - int(distortion_scale * half_height) - 1, - height)), + random.uniform( + height - int(distortion_scale * half_height) - 1, height + ) + ), + ] + startpoints = [ + [0, 0], + [width - 1, 0], + [width - 1, height - 1], + [0, height - 1], ] - startpoints = [[0, 0], [width - 1, 0], [width - 1, height - 1], - [0, height - 1]] endpoints = [topleft, topright, botright, botleft] return startpoints, endpoints @@ -1629,10 +1664,12 @@ def _apply_image(self, img): width, height = _get_image_size(img) if random.random() < self.prob: - startpoints, endpoints = self.get_params(width, height, - self.distortion_scale) - return F.perspective(img, startpoints, endpoints, - self.interpolation, self.fill) + startpoints, endpoints = self.get_params( + width, height, self.distortion_scale + ) + return F.perspective( + img, startpoints, endpoints, self.interpolation, self.fill + ) return img @@ -1645,7 +1682,7 @@ class Grayscale(BaseTransform): Shape: - img(PIL.Image|np.ndarray|Paddle.Tensor): The input image with shape (H x W x C). - - output(PIL.Image|np.ndarray|Paddle.Tensor): Grayscale version of the input image. + - output(PIL.Image|np.ndarray|Paddle.Tensor): Grayscale version of the input image. - If output_channels == 1 : returned image is single channel - If output_channels == 3 : returned image is 3 channel with r == g == b @@ -1653,7 +1690,7 @@ class Grayscale(BaseTransform): A callable object of Grayscale. Examples: - + .. 
code-block:: python import numpy as np @@ -1688,19 +1725,19 @@ class RandomErasing(BaseTransform): Args: prob (float, optional): Probability of the input data being erased. Default: 0.5. - scale (sequence, optional): The proportional range of the erased area to the input image. + scale (sequence, optional): The proportional range of the erased area to the input image. Default: (0.02, 0.33). ratio (sequence, optional): Aspect ratio range of the erased area. Default: (0.3, 3.3). value (int|float|sequence|str, optional): The value each pixel in erased area will be replaced with. - If value is a single number, all pixels will be erased with this value. - If value is a sequence with length 3, the R, G, B channels will be ereased - respectively. If value is set to "random", each pixel will be erased with + If value is a single number, all pixels will be erased with this value. + If value is a sequence with length 3, the R, G, B channels will be ereased + respectively. If value is set to "random", each pixel will be erased with random values. Default: 0. inplace (bool, optional): Whether this transform is inplace. Default: False. keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. - + Shape: - - img(paddle.Tensor | np.array | PIL.Image): The input image. For Tensor input, the shape should be (C, H, W). + - img(paddle.Tensor | np.array | PIL.Image): The input image. For Tensor input, the shape should be (C, H, W). For np.array input, the shape should be (H, W, C). - output(paddle.Tensor | np.array | PIL.Image): A random erased image. @@ -1708,11 +1745,11 @@ class RandomErasing(BaseTransform): A callable object of RandomErasing. Examples: - + .. code-block:: python import paddle - + fake_img = paddle.randn((3, 10, 10)).astype(paddle.float32) transform = paddle.vision.transforms.RandomErasing() result = transform(fake_img) @@ -1720,27 +1757,34 @@ class RandomErasing(BaseTransform): print(result) """ - def __init__(self, - prob=0.5, - scale=(0.02, 0.33), - ratio=(0.3, 3.3), - value=0, - inplace=False, - keys=None): + def __init__( + self, + prob=0.5, + scale=(0.02, 0.33), + ratio=(0.3, 3.3), + value=0, + inplace=False, + keys=None, + ): super(RandomErasing, self).__init__(keys) - assert isinstance(scale, - (tuple, list)), "scale should be a tuple or list" - assert (scale[0] >= 0 and scale[1] <= 1 and scale[0] <= scale[1] - ), "scale should be of kind (min, max) and in range [0, 1]" - assert isinstance(ratio, - (tuple, list)), "ratio should be a tuple or list" - assert (ratio[0] >= 0 - and ratio[0] <= ratio[1]), "ratio should be of kind (min, max)" - assert (prob >= 0 - and prob <= 1), "The probability should be in range [0, 1]" assert isinstance( - value, (numbers.Number, str, tuple, - list)), "value should be a number, tuple, list or str" + scale, (tuple, list) + ), "scale should be a tuple or list" + assert ( + scale[0] >= 0 and scale[1] <= 1 and scale[0] <= scale[1] + ), "scale should be of kind (min, max) and in range [0, 1]" + assert isinstance( + ratio, (tuple, list) + ), "ratio should be a tuple or list" + assert ( + ratio[0] >= 0 and ratio[0] <= ratio[1] + ), "ratio should be of kind (min, max)" + assert ( + prob >= 0 and prob <= 1 + ), "The probability should be in range [0, 1]" + assert isinstance( + value, (numbers.Number, str, tuple, list) + ), "value should be a number, tuple, list or str" if isinstance(value, str) and value != "random": raise ValueError("value must be 'random' when type is str") @@ -1755,10 +1799,10 @@ def _get_param(self, img, scale, ratio, 
value): Args: img (paddle.Tensor | np.array | PIL.Image): Image to be erased. - scale (sequence, optional): The proportional range of the erased area to the input image. + scale (sequence, optional): The proportional range of the erased area to the input image. ratio (sequence, optional): Aspect ratio range of the erased area. value (sequence | None): The value each pixel in erased area will be replaced with. - If value is a sequence with length 3, the R, G, B channels will be ereased + If value is a sequence with length 3, the R, G, B channels will be ereased respectively. If value is None, each pixel will be erased with random values. Returns: @@ -1784,7 +1828,8 @@ def _get_param(self, img, scale, ratio, value): if F._is_tensor_image(img): if value is None: v = paddle.normal(shape=[c, erase_h, erase_w]).astype( - img.dtype) + img.dtype + ) else: v = paddle.to_tensor(value, dtype=img.dtype)[:, None, None] else: @@ -1820,6 +1865,7 @@ def _apply_image(self, img): "Value should be a single number or a sequence with length equals to image's channel." ) top, left, erase_h, erase_w, v = self._get_param( - img, self.scale, self.ratio, value) + img, self.scale, self.ratio, value + ) return F.erase(img, top, left, erase_h, erase_w, v, self.inplace) return img diff --git a/python/setup.py.in b/python/setup.py.in old mode 100755 new mode 100644 index d919227450fe4b..92ff4a8e5407e7 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -283,10 +283,10 @@ packages=['paddle', 'paddle.incubate.tensor', 'paddle.incubate.multiprocessing', 'paddle.incubate.nn', - 'paddle.incubate.sparse', 'paddle.incubate.asp', 'paddle.incubate.passes', 'paddle.distribution', + 'paddle.distributed.utils', 'paddle.distributed.sharding', 'paddle.distributed.fleet', 'paddle.distributed.launch', @@ -296,6 +296,7 @@ packages=['paddle', 'paddle.distributed.launch.plugins', 'paddle.distributed.launch.utils', 'paddle.distributed.fleet.base', + 'paddle.distributed.fleet.recompute', 'paddle.distributed.fleet.elastic', 'paddle.distributed.fleet.meta_optimizers', 'paddle.distributed.fleet.meta_optimizers.sharding', @@ -307,6 +308,8 @@ packages=['paddle', 'paddle.distributed.fleet.metrics', 'paddle.distributed.fleet.proto', 'paddle.distributed.fleet.utils', + 'paddle.distributed.fleet.layers', + 'paddle.distributed.fleet.layers.mpu', 'paddle.distributed.fleet.meta_parallel', 'paddle.distributed.fleet.meta_parallel.pp_utils', 'paddle.distributed.fleet.meta_parallel.sharding', @@ -365,6 +368,11 @@ packages=['paddle', 'paddle.vision.models', 'paddle.vision.transforms', 'paddle.vision.datasets', + 'paddle.audio', + 'paddle.audio.functional', + 'paddle.audio.features', + 'paddle.audio.datasets', + 'paddle.audio.backends', 'paddle.text', 'paddle.text.datasets', 'paddle.incubate', @@ -374,13 +382,14 @@ packages=['paddle', 'paddle.incubate.optimizer.functional', 'paddle.incubate.autograd', 'paddle.incubate.distributed', + 'paddle.incubate.distributed.fleet', 'paddle.incubate.distributed.models', 'paddle.incubate.distributed.models.moe', 'paddle.incubate.distributed.models.moe.gate', - 'paddle.incubate.sparse', - 'paddle.incubate.sparse.nn', - 'paddle.incubate.sparse.nn.layer', - 'paddle.incubate.sparse.nn.functional', + 'paddle.sparse', + 'paddle.sparse.nn', + 'paddle.sparse.nn.layer', + 'paddle.sparse.nn.functional', 'paddle.incubate.xpu', 'paddle.io', 'paddle.optimizer', @@ -446,8 +455,6 @@ else: package_data={'paddle.fluid': ['${FLUID_CORE_NAME}' + '.pyd', '${FLUID_CORE_NAME}' + '.lib']} package_data['paddle.fluid'] += 
['${PADDLE_BINARY_DIR}/python/paddle/cost_model/static_op_benchmark.json'] -if '${HAS_NOAVX_CORE}' == 'ON': - package_data['paddle.fluid'] += ['core_noavx' + ('.so' if os.name != 'nt' else '.pyd')] package_dir={ '': '${PADDLE_BINARY_DIR}/python', @@ -489,7 +496,7 @@ else: package_data['paddle.libs'] += ['openblas' + ext_name] elif os.name == 'posix' and platform.machine() == 'aarch64' and '${OPENBLAS_LIB}'.endswith('so'): # copy the libopenblas.so on linux+aarch64 - # special: core_noavx.so depends on 'libopenblas.so.0', not 'libopenblas.so' + # special: libpaddle.so without avx depends on 'libopenblas.so.0', not 'libopenblas.so' if os.path.exists('${OPENBLAS_LIB}' + '.0'): shutil.copy('${OPENBLAS_LIB}' + '.0', libs_path) package_data['paddle.libs'] += ['libopenblas.so.0'] @@ -584,8 +591,7 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': commands = ["install_name_tool -id '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] commands.append("install_name_tool -add_rpath '@loader_path/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so') else: - commands = ["patchelf --set-soname '${FLUID_CORE_NAME}.so' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] - commands.append("patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so') + commands = ["patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so'] # The sw_64 not suppot patchelf, so we just disable that. if platform.machine() != 'sw_64' and platform.machine() != 'mips64': for command in commands: diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index f70037e71611fb..9ff175d2017a81 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -3,7 +3,7 @@ PyGithub coverage==5.5 pycrypto ; platform_system != "Windows" mock -gym +gym==0.25.2 pygame==2.1.0 hypothesis opencv-python<=4.2.0.32 @@ -15,3 +15,5 @@ prettytable distro numpy>=1.20,<1.22; python_version >= "3.7" autograd==1.4 +librosa==0.8.1 +parameterized diff --git a/security/README.md b/security/README.md index 506bfbb91184ab..eefde5344eb930 100644 --- a/security/README.md +++ b/security/README.md @@ -4,9 +4,10 @@ We regularly publish security advisories about using PaddlePaddle. -*Note*: In conjunction with these security advisories, we strongly encourage PaddlePaddle users to read and understand PaddlePaddle's security model as outlined in [SECURITY.md](https://github.com/PaddlePaddle/Paddle/blob/develop/SECURITY.md). +*Note*: In conjunction with these security advisories, we strongly encourage PaddlePaddle users to read and understand PaddlePaddle's security model as outlined in [SECURITY.md](../SECURITY.md). 
-| Advisory Number | Type | Versions affected | Reported by | Additional Information| -| --------------- | ---- | :---------------: | ----------- | ----------------------| -| | | | | | +| Advisory Number | Type | Versions affected | Reported by | Additional Information | +|----------------------------------------------|-------------------------|:-----------------:|---------------------------------------|------------------------| +| [PDSA-2022-001](./advisory/pdsa-2022-001.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | +| [PDSA-2022-002](./advisory/pdsa-2022-002.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | diff --git a/security/README_cn.md b/security/README_cn.md index 49f486b0f7878e..1beba5c1fa729d 100644 --- a/security/README_cn.md +++ b/security/README_cn.md @@ -4,9 +4,10 @@ -注:我们非常建议飞桨用户阅读和理解[SECURITY_cn.md](https://github.com/PaddlePaddle/Paddle/blob/develop/SECURITY_cn.md)所介绍的飞桨安全模型,以便更好地了解此安全公告。 +注:我们非常建议飞桨用户阅读和理解[SECURITY_cn.md](../SECURITY_cn.md)所介绍的飞桨安全模型,以便更好地了解此安全公告。 -| 安全公告编号 | 类型 | 受影响版本 | 报告者 | 备注 | -| --------------- | ---- | :---------------: | ----------- | ----------------------| -| | | | | | +| 安全公告编号 | 类型 | 受影响版本 | 报告者 | 备注 | +|-------------------------------------------------|-------------------------|:-----:|---------------------------------------|-----| +| [PDSA-2022-001](./advisory/pdsa-2022-001_cn.md) | OOB read in gather_tree | < 2.4 | Wang Xuan(王旋) of Qihoo 360 AIVul Team | | +| [PDSA-2022-002](./advisory/pdsa-2022-002_cn.md) | Code injection in paddle.audio.functional.get_window | = 2.4.0-rc0 | Tong Liu of ShanghaiTech University | | diff --git a/security/advisory/pdsa-2022-001.md b/security/advisory/pdsa-2022-001.md new file mode 100644 index 00000000000000..5b77efb9a36320 --- /dev/null +++ b/security/advisory/pdsa-2022-001.md @@ -0,0 +1,49 @@ +## PDSA-2022-001: OOB read in gather_tree + +### Impact + +The PoC is as follows: + +```python +import paddle +import paddle.fluid as fluid +import numpy as np + +ids = paddle.to_tensor([[2,2],[6,1]]) +parents = paddle.to_tensor([[2,2],[6,1]]) + +out = paddle.nn.functional.gather_tree(ids,parents) +``` + +The [implementation](https://github.com/PaddlePaddle/Paddle/blob/release/2.3/paddle/phi/kernels/cpu/gather_tree_kernel.cc#L31-L33) of GatherTreeKernel does not validate the ids_dims size which would result in a memory out-of-bounds read if the ids shape is invalid. + +```c++ +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out) { + const auto *ids_data = ids.data(); + const auto *parents_data = parents.data(); + + T *out_data = dev_ctx.template Alloc(out); + + auto &ids_dims = ids.dims(); + auto max_length = ids_dims[0]; + auto batch_size = ids_dims[1]; + auto beam_size = ids_dims[2]; //[1] +``` + +### Patches + +We have patched the issue in commit [6712e262fc6734873cc6d5ca4f45973339a88697](https://github.com/PaddlePaddle/Paddle/commit/6712e262fc6734873cc6d5ca4f45973339a88697). + +The fix will be included in PaddlePaddle 2.4. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Wang Xuan(王旋) of Qihoo 360 AIVul Team. 
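As an illustration of the shape check the advisory above is concerned with, the sketch below shows a caller-side guard that rejects the malformed rank-2 input from the PoC before it reaches the kernel. This is a hypothetical helper for clarity only, not the actual patch and not a PaddlePaddle API; the expected rank-3 layout `[max_time, batch_size, beam_size]` is inferred from the `ids_dims[0..2]` reads in the kernel snippet above.

```python
# Hypothetical caller-side guard (illustration only, not the upstream fix).
# The kernel indexes ids_dims[0], ids_dims[1] and ids_dims[2], so gather_tree
# needs rank-3 inputs of shape [max_time, batch_size, beam_size].
import paddle


def safe_gather_tree(ids, parents):
    if len(ids.shape) != 3 or ids.shape != parents.shape:
        raise ValueError(
            "ids and parents must be rank-3 tensors of identical shape "
            "[max_time, batch_size, beam_size]"
        )
    return paddle.nn.functional.gather_tree(ids, parents)


# The rank-2 PoC tensors above would be rejected here instead of triggering
# the out-of-bounds read on unpatched releases.
```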
diff --git a/security/advisory/pdsa-2022-001_cn.md b/security/advisory/pdsa-2022-001_cn.md new file mode 100644 index 00000000000000..fba4d98e632451 --- /dev/null +++ b/security/advisory/pdsa-2022-001_cn.md @@ -0,0 +1,49 @@ +## PDSA-2022-001: OOB read in gather_tree + +### 影响 + +PoC如下: + +```python +import paddle +import paddle.fluid as fluid +import numpy as np + +ids = paddle.to_tensor([[2,2],[6,1]]) +parents = paddle.to_tensor([[2,2],[6,1]]) + +out = paddle.nn.functional.gather_tree(ids,parents) +``` + +在GatherTreeKernel的[实现代码中](https://github.com/PaddlePaddle/Paddle/blob/release/2.3/paddle/phi/kernels/cpu/gather_tree_kernel.cc#L31-L33),并没有检查ids_dims的大小,当输入非预期的ids,其shape不正确时可能造成越界读ids_dims。 + +```c++ +template +void GatherTreeKernel(const Context &dev_ctx, + const DenseTensor &ids, + const DenseTensor &parents, + DenseTensor *out) { + const auto *ids_data = ids.data(); + const auto *parents_data = parents.data(); + + T *out_data = dev_ctx.template Alloc(out); + + auto &ids_dims = ids.dims(); + auto max_length = ids_dims[0]; + auto batch_size = ids_dims[1]; + auto beam_size = ids_dims[2]; //[1] +``` + +### 补丁 + +我们在commit [6712e262fc6734873cc6d5ca4f45973339a88697](https://github.com/PaddlePaddle/Paddle/commit/6712e262fc6734873cc6d5ca4f45973339a88697)中对此问题进行了补丁。 + +修复将包含在飞桨2.4版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Wang Xuan(王旋) of Qihoo 360 AIVul Team 提交。 diff --git a/security/advisory/pdsa-2022-002.md b/security/advisory/pdsa-2022-002.md new file mode 100644 index 00000000000000..efb8e931722bbf --- /dev/null +++ b/security/advisory/pdsa-2022-002.md @@ -0,0 +1,33 @@ +## PDSA-2022-002: Code injection in paddle.audio.functional.get_window + +### Impact + +`paddle.audio.functional.get_window` is vulnerable to a code injection as it calls `eval` on user supplied `winstr`. This may lead to arbitrary code execution. + +```python +def get_window( + window: Union[str, Tuple[str, float]], + win_length: int, + fftbins: bool = True, + dtype: str = 'float64', +) -> Tensor: + ... + try: + winfunc = eval('_' + winstr) + except NameError as e: + raise ValueError("Unknown window type.") from e +``` + +### Patches + +We have patched the issue in commit [26c419ca386aeae3c461faf2b828d00b48e908eb](https://github.com/PaddlePaddle/Paddle/commit/26c419ca386aeae3c461faf2b828d00b48e908eb). + +The fix will be included in PaddlePaddle 2.4. + +### For more information + +Please consult [our security guide](../../SECURITY.md) for more information regarding the security model and how to contact us with issues and questions. + +### Attribution + +This vulnerability has been reported by Tong Liu of ShanghaiTech University. diff --git a/security/advisory/pdsa-2022-002_cn.md b/security/advisory/pdsa-2022-002_cn.md new file mode 100644 index 00000000000000..84fc365fbbcd89 --- /dev/null +++ b/security/advisory/pdsa-2022-002_cn.md @@ -0,0 +1,33 @@ +## PDSA-2022-002: Code injection in paddle.audio.functional.get_window + +### 影响 + +`paddle.audio.functional.get_window`由于对用户提供的参数`winstr`使用`eval`而存在代码注入漏洞,将导致任意代码执行。 + +```python +def get_window( + window: Union[str, Tuple[str, float]], + win_length: int, + fftbins: bool = True, + dtype: str = 'float64', +) -> Tensor: + ... 
+ try: + winfunc = eval('_' + winstr) + except NameError as e: + raise ValueError("Unknown window type.") from e +``` + +### 补丁 + +我们在commit [26c419ca386aeae3c461faf2b828d00b48e908eb](https://github.com/PaddlePaddle/Paddle/commit/26c419ca386aeae3c461faf2b828d00b48e908eb)中对此问题进行了补丁。 + +修复将包含在飞桨2.4版本当中。 + +### 更多信息 + +请参考我们的[安全指南](../../SECURITY_cn.md)以获得更多关于安全的信息,以及如何与我们联系问题。 + +### 贡献者 + +此漏洞由 Tong Liu of ShanghaiTech University 提交。 diff --git a/tools/check_op_benchmark_result.py b/tools/check_op_benchmark_result.py index 73075125ac46b7..aaf194ff95ec58 100644 --- a/tools/check_op_benchmark_result.py +++ b/tools/check_op_benchmark_result.py @@ -72,15 +72,20 @@ def check_speed_result(case_name, develop_data, pr_data, pr_result): """ pr_gpu_time = pr_data.get("gpu_time") develop_gpu_time = develop_data.get("gpu_time") - gpu_time_diff = (pr_gpu_time - develop_gpu_time) / develop_gpu_time + if develop_gpu_time != 0.0: + gpu_time_diff = (pr_gpu_time - develop_gpu_time) / develop_gpu_time + gpu_time_diff_str = "{:.5f}".format(gpu_time_diff * 100) + else: + gpu_time_diff = None + gpu_time_diff_str = "" pr_total_time = pr_data.get("total") develop_total_time = develop_data.get("total") total_time_diff = (pr_total_time - develop_total_time) / develop_total_time logging.info("------ OP: %s ------" % case_name) - logging.info("GPU time change: %.5f%% (develop: %.7f -> PR: %.7f)" % - (gpu_time_diff * 100, develop_gpu_time, pr_gpu_time)) + logging.info("GPU time change: %s (develop: %.7f -> PR: %.7f)" % + (gpu_time_diff_str, develop_gpu_time, pr_gpu_time)) logging.info("Total time change: %.5f%% (develop: %.7f -> PR: %.7f)" % (total_time_diff * 100, develop_total_time, pr_total_time)) logging.info("backward: %s" % pr_result.get("backward")) @@ -196,7 +201,8 @@ def summary_results(check_results, api_info_file): args.develop_logs_dir) check_path_exists(args.pr_logs_dir) - for log_file in os.listdir(args.pr_logs_dir): + pr_log_files = os.listdir(args.pr_logs_dir) + for log_file in sorted(pr_log_files): develop_result = develop_result_dict.get(log_file) pr_result = parse_log_file(os.path.join(args.pr_logs_dir, log_file)) if develop_result is None or pr_result is None: diff --git a/tools/dockerfile/Dockerfile.ipu b/tools/dockerfile/Dockerfile.ipu index b79a113332a3e2..18ed1006bf2bc4 100644 --- a/tools/dockerfile/Dockerfile.ipu +++ b/tools/dockerfile/Dockerfile.ipu @@ -6,7 +6,7 @@ # run a container # docker run --ulimit memlock=-1:-1 --net=host --cap-add=IPC_LOCK --device=/dev/infiniband/ --ipc=host --rm -it paddlepaddle/paddle:latest-dev-ipu bash -FROM graphcore/poplar:2.6.0 +FROM graphcore/poplar:3.0.0 MAINTAINER PaddlePaddle Authors # ENV variables diff --git a/tools/dockerfile/Dockerfile.mlu b/tools/dockerfile/Dockerfile.mlu index b3edb25fd54da6..65ab49dd775981 100644 --- a/tools/dockerfile/Dockerfile.mlu +++ b/tools/dockerfile/Dockerfile.mlu @@ -1,15 +1,17 @@ # A image for building paddle binaries -# Update CNTOOLKIT_VERSION, CNNL_VERSION and CNCL_VERSION if using other versions +# Update CNTOOLKIT_VERSION, CNNL_VERSION, CNCL_VERSION and MLUOPS_VERSION if using other versions # # Build: -# - CNTOOLKIT_VERSION 2.8.5 -# - CNNL_VERSION 1.10.5 -# - CNCL_VERSION 1.1.2 +# - CNTOOLKIT_VERSION 3.0.2-1 +# - CNNL_VERSION 1.13.0-1 +# - CNCL_VERSION 1.2.1-1 +# - MLUOPS_VERSION 0.2.0-1 # # Download three packages from FTP (need to connect cambricon AE to get FTP url) -# - cntoolkit_2.8.5.ubuntu18.04_amd64.deb -# - cnnl_1.10.5.ubuntu18.04_amd64.deb -# - cncl_1.1.2.ubuntu18.04_amd64.deb +# - 
cntoolkit_3.0.2-1.ubuntu18.04_amd64.deb +# - cnnl_1.13.0-1.ubuntu18.04_amd64.deb +# - cncl_1.2.1-1.ubuntu18.04_amd64.deb +# - mluops_0.2.0-1.ubuntu18.04_amd64.deb # copy them to current directory first, then run build commands # # For example: @@ -19,11 +21,13 @@ # (get cntoolkit pkg) # (get cnnl pkg) # (get cncl pkg) +# (get mluops pkg) # # docker build -f Dockerfile.mlu \ -# --build-arg CNTOOLKIT_VERSION=2.8.5 \ -# --build-arg CNNL_VERSION=1.10.5 \ -# --build-arg CNCL_VERSION=1.1.2 \ +# --build-arg CNTOOLKIT_VERSION=3.0.2-1 \ +# --build-arg CNNL_VERSION=1.13.0-1 \ +# --build-arg CNCL_VERSION=1.2.1-1 \ +# --build-arg MLUOPS_VERSION=0.2.0-1 \ # -t paddlepaddle/paddle:latest-dev-mlu . # # without mlu device: @@ -40,12 +44,14 @@ MAINTAINER PaddlePaddle Authors ENV WITH_GPU=OFF -ARG CNTOOLKIT_VERSION=2.8.5 -ARG CNNL_VERSION=1.10.5 -ARG CNCL_VERSION=1.1.2 +ARG CNTOOLKIT_VERSION=3.0.2-1 +ARG CNNL_VERSION=1.13.0-1 +ARG CNCL_VERSION=1.2.1-1 +ARG MLUOPS_VERSION=0.2.0-1 ARG CNTOOLKIT_PKG=cntoolkit_$CNTOOLKIT_VERSION.ubuntu18.04_amd64.deb ARG CNNL_PKG=cnnl_$CNNL_VERSION.ubuntu18.04_amd64.deb ARG CNCL_PKG=cncl_$CNCL_VERSION.ubuntu18.04_amd64.deb +ARG MLUOPS_PKG=mluops_$MLUOPS_VERSION.ubuntu18.04_amd64.deb # install cntoolkit COPY $CNTOOLKIT_PKG ./ @@ -67,6 +73,11 @@ COPY $CNCL_PKG ./ RUN dpkg -i $CNCL_PKG && \ rm -f $CNCL_PKG +# install mluops +COPY $MLUOPS_PKG ./ +RUN dpkg -i $MLUOPS_PKG && \ + rm -f $MLUOPS_PKG + # Clean RUN apt-get clean -y diff --git a/tools/dockerfile/Dockerfile.release16 b/tools/dockerfile/Dockerfile.release16 index 66974f46d91e49..482518bf283052 100644 --- a/tools/dockerfile/Dockerfile.release16 +++ b/tools/dockerfile/Dockerfile.release16 @@ -101,8 +101,13 @@ RUN curl -s -q https://glide.sh/get | sh # Downgrade TensorRT COPY tools/dockerfile/build_scripts /build_scripts RUN bash /build_scripts/install_nccl2.sh -RUN rm -rf /build_scripts +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# # So install a newer version here. +RUN bash /build_scripts/install_patchelf.sh + +RUN rm -rf /build_scripts # git credential to skip password typing RUN git config --global credential.helper store @@ -143,13 +148,6 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/ RUN apt-get install libprotobuf-dev -y -# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. -# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa -# So install a newer version here. -RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb && \ - rm -rf patchelf_0.10-2_amd64.deb - # Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config CMD source ~/.bashrc diff --git a/tools/dockerfile/Dockerfile.release18 b/tools/dockerfile/Dockerfile.release18 index d646f41b00d0b9..fe8513d662badd 100644 --- a/tools/dockerfile/Dockerfile.release18 +++ b/tools/dockerfile/Dockerfile.release18 @@ -28,6 +28,10 @@ RUN apt-get update && \ # Downgrade gcc&&g++ WORKDIR /usr/bin COPY tools/dockerfile/build_scripts /build_scripts +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# So install a newer version here. +RUN bash /build_scripts/install_patchelf.sh RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc @@ -99,14 +103,6 @@ RUN pip3.7 --no-cache-dir install pylint pytest astroid isort COPY ./python/requirements.txt /root/ RUN pip3.7 --no-cache-dir install -r /root/requirements.txt - -# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. -# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa -# So install a newer version here. -RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb && \ - rm -rf patchelf_0.10-2_amd64.deb - # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service #RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config #CMD source ~/.bashrc diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index 7e0c3a62b1d501..b165f757325b4e 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -143,9 +143,14 @@ RUN curl -s -q https://glide.sh/get | sh # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. # Downgrade TensorRT + +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# So install a newer version here. COPY tools/dockerfile/build_scripts /build_scripts RUN bash /build_scripts/install_trt.sh && \ - bash /build_scripts/install_nccl2.sh + bash /build_scripts/install_nccl2.sh && \ + bash /build_scripts/install_patchelf.sh RUN rm -rf /build_scripts # git credential to skip password typing @@ -236,13 +241,6 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/ RUN apt-get install libprotobuf-dev -y -# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. -# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa -# So install a newer version here. -RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb && \ - rm -rf patchelf_0.10-2_amd64.deb - # Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config CMD source ~/.bashrc diff --git a/tools/dockerfile/Dockerfile.ubuntu18 b/tools/dockerfile/Dockerfile.ubuntu18 index a5dba053b98b2e..8ebfd9b8371c20 100644 --- a/tools/dockerfile/Dockerfile.ubuntu18 +++ b/tools/dockerfile/Dockerfile.ubuntu18 @@ -35,6 +35,10 @@ RUN apt-get update --allow-unauthenticated && \ WORKDIR /usr/bin COPY tools/dockerfile/build_scripts /build_scripts RUN bash /build_scripts/install_trt.sh +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# # So install a newer version here. +RUN bash /build_scripts/install_patchelf.sh RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc @@ -151,14 +155,6 @@ RUN pip3.6 --no-cache-dir install -r /root/requirements.txt && \ pip3.8 --no-cache-dir install -r /root/requirements.txt && \ pip3.9 --no-cache-dir install -r /root/requirements.txt - -# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. -# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa -# So install a newer version here. -RUN wget -q https://paddle-ci.cdn.bcebos.com/patchelf_0.10-2_amd64.deb && \ - dpkg -i patchelf_0.10-2_amd64.deb && \ - rm -rf patchelf_0.10-2_amd64.deb - # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service #RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config #CMD source ~/.bashrc diff --git a/tools/dockerfile/build_scripts/build.sh b/tools/dockerfile/build_scripts/build.sh index 92d1c12d2bc412..61bcc1f1035630 100644 --- a/tools/dockerfile/build_scripts/build.sh +++ b/tools/dockerfile/build_scripts/build.sh @@ -106,7 +106,7 @@ export SSL_CERT_FILE=/opt/_internal/certs.pem # tar -xzf patchelf-0.9njs2.tar.gz # (cd patchelf-0.9njs2 && ./configure && make && make install) # rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2 -yum install -y patchelf +sh "$MY_DIR/install_patchelf.sh" # Install latest pypi release of auditwheel #LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel diff --git a/tools/dockerfile/build_scripts/install_patchelf.sh b/tools/dockerfile/build_scripts/install_patchelf.sh new file mode 100644 index 00000000000000..9fda46e5b6f865 --- /dev/null +++ b/tools/dockerfile/build_scripts/install_patchelf.sh @@ -0,0 +1,29 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +set -e + +TMP_DIR=patchelf_tmp + +rm -rf "$TMP_DIR" +git clone -b 0.15.0 https://github.com/NixOS/patchelf "$TMP_DIR" + +cd "$TMP_DIR" +./bootstrap.sh +./configure +make +make install + +cd .. +rm -rf "$TMP_DIR" diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh index e5a6c240fe5f95..fbc21ec955d586 100644 --- a/tools/dockerfile/ci_dockerfile.sh +++ b/tools/dockerfile/ci_dockerfile.sh @@ -38,7 +38,7 @@ function make_ubuntu_dockerfile(){ ENV PATH=/usr/local/gcc-8.2/bin:\$PATH #g" ${dockerfile_name} sed -i "s#bash /build_scripts/install_nccl2.sh#wget -q --no-proxy https://nccl2-deb.cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ RUN dpkg -i nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ - RUN apt remove -y libnccl* --allow-change-held-packages \&\& apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 zstd pigz --allow-change-held-packages #g" ${dockerfile_name} + RUN apt remove -y libnccl* --allow-change-held-packages \&\& apt-get install -y libsndfile1 libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 zstd pigz --allow-change-held-packages #g" ${dockerfile_name} } function make_ubuntu_trt7_dockerfile(){ @@ -47,7 +47,7 @@ function make_ubuntu_trt7_dockerfile(){ sed -i "s#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g" ${dockerfile_name} dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') sed -i "${dockerfile_line}i RUN apt remove -y libcudnn* --allow-change-held-packages \&\& \ - apt-get install -y --allow-unauthenticated libcudnn8=8.1.0.77-1+cuda10.2 libcudnn8-dev=8.1.0.77-1+cuda10.2 --allow-change-held-packages" ${dockerfile_name} + apt-get install -y --allow-unauthenticated libsndfile1 libcudnn8=8.1.0.77-1+cuda10.2 libcudnn8-dev=8.1.0.77-1+cuda10.2 --allow-change-held-packages" ${dockerfile_name} sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q \ https://developer.download.nvidia.com/compute/cuda/10.2/Prod/patches/2/cuda_10.2.2_linux.run \&\& \ bash cuda_10.2.2_linux.run --silent --toolkit \&\& ldconfig" ${dockerfile_name} @@ -73,7 +73,7 @@ function make_ubuntu_trt7_dockerfile(){ RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \\ ENV PATH=/usr/local/gcc-8.2/bin:\$PATH #g" ${dockerfile_name} sed -i "s#bash /build_scripts/install_nccl2.sh#wget -q --no-proxy https://nccl2-deb.cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ - RUN apt remove -y libnccl* --allow-change-held-packages \&\& apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 zstd pigz --allow-change-held-packages #g" ${dockerfile_name} + RUN apt remove -y libnccl* --allow-change-held-packages \&\& apt-get install -y libsndfile1 libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 zstd pigz --allow-change-held-packages #g" ${dockerfile_name} } @@ -82,7 +82,7 @@ function make_centos_dockerfile(){ sed "s//11.0-cudnn8-devel-centos7/g" Dockerfile.centos >${dockerfile_name} sed -i "s#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g" ${dockerfile_name} dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') - sed -i "${dockerfile_line}i RUN yum install -y pigz graphviz zstd" ${dockerfile_name} + sed -i "${dockerfile_line}i RUN yum install -y pigz graphviz zstd libsndfile" ${dockerfile_name} sed -i "${dockerfile_line}i RUN pip3.7 install distro" ${dockerfile_name} sed -i "${dockerfile_line}i ENV 
LD_LIBRARY_PATH /opt/_internal/cpython-3.7.0/lib:/usr/local/ssl/lib:/opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 " ${dockerfile_name} sed -i "${dockerfile_line}i ENV PATH /opt/_internal/cpython-3.7.0/bin:/usr/local/ssl:/usr/local/gcc-8.2/bin:/usr/local/go/bin:/root/gopath/bin:/opt/rh/devtoolset-2/root/usr/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/java/jdk1.8.0_192/bin " ${dockerfile_name} @@ -104,7 +104,7 @@ function make_cinn_dockerfile(){ sed -i 's###g' ${dockerfile_name} sed -i "7i ENV TZ=Asia/Beijing" ${dockerfile_name} sed -i "8i RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone" ${dockerfile_name} - sed -i "9i RUN apt-get update && apt-get install -y liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev" ${dockerfile_name} + sed -i "27i RUN apt-get update && apt-get install -y liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev libsndfile1" ${dockerfile_name} dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \&\& \ tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} diff --git a/tools/get_op_list.py b/tools/get_op_list.py new file mode 100644 index 00000000000000..05e666d2270f5e --- /dev/null +++ b/tools/get_op_list.py @@ -0,0 +1,71 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle +import argparse +import numpy as np +import os +import re +from paddle.inference import _get_phi_kernel_name + +paddle.enable_static() + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument('--model_dir', + type=str, + default="", + help='Directory of the inference models.') + return parser.parse_args() + + +def get_model_ops(model_file): + model_bytes = paddle.static.load_from_file(model_file) + pg = paddle.static.deserialize_program(model_bytes) + ops_set = set() + + for i in range(0, pg.desc.num_blocks()): + block = pg.desc.block(i) + size = block.op_size() + + for j in range(0, size): + ops_set.add(block.op(j).type()) + + return ops_set + + +def get_model_phi_kernels(ops_set): + phi_set = set() + for op in ops_set: + phi_set.add(_get_phi_kernel_name(op)) + + return phi_set + + +if __name__ == '__main__': + args = parse_args() + for root, dirs, files in os.walk(args.model_dir, topdown=True): + for name in files: + if (re.match(r'.*pdmodel', name)): + ops_set = get_model_ops(os.path.join(root, name)) + phi_set = get_model_phi_kernels(ops_set) + ops = ";".join(ops_set) + kernels = ";".join(phi_set) + print("op_list: ", ops) + print("kernel_list: ", kernels) + ops = np.array([ops]) + kernels = np.array([kernels]) + np.savetxt("op_list.txt", ops, fmt='%s') + np.savetxt("kernel_list.txt", kernels, fmt='%s') diff --git a/tools/infrt/generate_phi_kernel_dialect.py b/tools/infrt/generate_phi_kernel_dialect.py index 39b0d5484a8ff8..379525c1a7ad7c 100644 --- a/tools/infrt/generate_phi_kernel_dialect.py +++ b/tools/infrt/generate_phi_kernel_dialect.py @@ -64,7 +64,7 @@ def get_skipped_kernel_list(): for api in infer_meta_data: if "kernel" not in api or "infer_meta" not in api: continue - if api["api"] in skiped_api_list["phi_apis"]: + if api["op"] in skiped_api_list["phi_apis"]: skiped_kernel_list.append(api["kernel"]["func"]) skiped_kernel_list += skiped_api_list["phi_kernels"] return skiped_kernel_list diff --git a/tools/infrt/get_phi_kernel_info.py b/tools/infrt/get_phi_kernel_info.py index 4837ca582135c4..6309197a10f4c7 100644 --- a/tools/infrt/get_phi_kernel_info.py +++ b/tools/infrt/get_phi_kernel_info.py @@ -39,7 +39,7 @@ def get_skipped_kernel_list(): for api in infer_meta_data: if "kernel" not in api or "infer_meta" not in api: continue - if api["api"] in skiped_api_list["phi_apis"]: + if api["op"] in skiped_api_list["phi_apis"]: skiped_kernel_list.append(api["kernel"]["func"]) skiped_kernel_list += skiped_api_list["phi_kernels"] return skiped_kernel_list diff --git a/tools/nvcc_lazy b/tools/nvcc_lazy deleted file mode 100755 index a553c6f5dcdfd0..00000000000000 --- a/tools/nvcc_lazy +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env bash -unset GREP_OPTIONS -set -e - -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -## CUDA_MODULE_LOADING=EAGER,DEFAULT,LAZY - -# set cicc PATH for Centos -export PATH=$PATH:/usr/local/cuda/nvvm/bin - -# check nvcc version, if nvcc >= 11.7, just run nvcc itself -CUDA_VERSION=$(nvcc --version | grep -oP '(?<=V)\d*\.\d*') -CUDA_VERSION_MAJOR=${CUDA_VERSION%.*} -CUDA_VERSION_MINOR=${CUDA_VERSION#*.} -if (( CUDA_VERSION_MAJOR > 11 || (CUDA_VERSION_MAJOR == 11 && CUDA_VERSION_MINOR >= 7) )); then - nvcc "$@" - exit -fi - -BUILDDIR=$(mktemp -d /tmp/nvcc-lazy-build.XXXXXXXX) -echo "$@" > ${BUILDDIR}/args -BUILDSH=${BUILDDIR}/build.sh -/usr/local/cuda/bin/nvcc --dryrun --keep --keep-dir=${BUILDDIR} "$@" 2>&1 | sed -e 's/#\$ //;/^rm/d' > $BUILDSH -sed -i -e '/^\s*--/d' $BUILDSH -sed -ne '1,/^cicc.*cudafe1.stub.c/p' ${BUILDSH} > ${BUILDSH}.pre -sed -e '1,/^cicc.*cudafe1.stub.c/d' ${BUILDSH} > ${BUILDSH}.post - -sed -i -e '/LIBRARIES=/{s/\s//g;s/""/ /g}' ${BUILDSH}.pre - -/usr/bin/env bash ${BUILDSH}.pre -STUBF=$(find $BUILDDIR -name *.cudafe1.stub.c) -CUFILE=$(basename -s '.cudafe1.stub.c' $STUBF) -sed -i -e '/__sti____cudaRegisterAll.*__attribute__/a static void __try____cudaRegisterAll(int);' $STUBF -sed -i -e 's/__sti____cudaRegisterAll\(.*{\)/__do____cudaRegisterAll\1/' $STUBF -# sed -i -e "/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\"CUDA_MODULE_LOADING\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; fprintf(stderr,\"===> ${CUFILE} lazy-load? %d\\\\n\", l); __do____cudaRegisterAll();}" $STUBF -sed -i -e "/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\"CUDA_MODULE_LOADING\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; __do____cudaRegisterAll();}" $STUBF -sed -i -e '/__try____cudaRegisterAll\(.*{\)/a static void __sti____cudaRegisterAll(void){__try____cudaRegisterAll(0);}' $STUBF -sed -i -e 's/{\(__device_stub__\)/{__try____cudaRegisterAll(1);\1/' $STUBF -/usr/bin/env bash ${BUILDSH}.post -rm -rf $BUILDDIR diff --git a/tools/nvcc_lazy.sh b/tools/nvcc_lazy.sh new file mode 100755 index 00000000000000..011ac564cf91ba --- /dev/null +++ b/tools/nvcc_lazy.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash + +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +echo "#!/usr/bin/env bash" >> $1 +echo "unset GREP_OPTIONS" >> $1 +echo "set -e" >> $1 +echo -e >> $1 +echo "if [[ \$# -le 8 ]]; then" >> $1 +echo " nvcc \"\$@\"" >> $1 +echo " exit 0" >> $1 +echo "fi" >> $1 +echo -e >> $1 +echo "# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved." >> $1 +echo "#" >> $1 +echo "# Licensed under the Apache License, Version 2.0 (the \"License\");" >> $1 +echo "# you may not use this file except in compliance with the License." 
>> $1 +echo "# You may obtain a copy of the License at" >> $1 +echo "#" >> $1 +echo "# http://www.apache.org/licenses/LICENSE-2.0" >> $1 +echo "#" >> $1 +echo "# Unless required by applicable law or agreed to in writing, software" >> $1 +echo "# distributed under the License is distributed on an \"AS IS\" BASIS," >> $1 +echo "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." >> $1 +echo "# See the License for the specific language governing permissions and" >> $1 +echo "# limitations under the License." >> $1 +echo -e >> $1 +echo -e >> $1 +echo "## CUDA_MODULE_LOADING=EAGER,DEFAULT,LAZY" >> $1 +echo -e >> $1 +echo "# set cicc PATH for Centos" >> $1 +echo "export PATH=\$PATH:$2/bin" >> $1 +echo "export PATH=\$PATH:$2/nvvm/bin" >> $1 +echo -e >> $1 +echo "# check nvcc version, if nvcc >= 11.7, just run nvcc itself" >> $1 +echo "CUDA_VERSION=\$(nvcc --version | grep -oP '(?<=V)\d*\.\d*')" >> $1 +echo "CUDA_VERSION_MAJOR=\${CUDA_VERSION%.*}" >> $1 +echo "CUDA_VERSION_MINOR=\${CUDA_VERSION#*.}" >> $1 +echo "if (( CUDA_VERSION_MAJOR > 11 || (CUDA_VERSION_MAJOR == 11 && CUDA_VERSION_MINOR >= 7) )); then" >> $1 +echo " nvcc \"\$@\"" >> $1 +echo " exit" >> $1 +echo "fi" >> $1 +echo -e >> $1 +echo "BUILDDIR=\$(mktemp -d /tmp/nvcc-lazy-build.XXXXXXXX)" >> $1 +echo "echo \"\$@\" > \${BUILDDIR}/args" >> $1 +echo "BUILDSH=\${BUILDDIR}/build.sh" >> $1 +echo "$2/bin/nvcc --dryrun --keep --keep-dir=\${BUILDDIR} \"\$@\" 2>&1 | sed -e 's/#\\$ //;/^rm/d' > \$BUILDSH" >> $1 +echo "sed -i -e '/^\s*--/d' \$BUILDSH" >> $1 +echo "sed -ne '1,/^cicc.*cudafe1.stub.c/p' \${BUILDSH} > \${BUILDSH}.pre" >> $1 +echo "sed -e '1,/^cicc.*cudafe1.stub.c/d' \${BUILDSH} > \${BUILDSH}.post" >> $1 +echo -e >> $1 +echo "sed -i -e '/LIBRARIES=/{s/\s//g;s/\"\"/ /g}' \${BUILDSH}.pre" >> $1 +echo -e >> $1 +echo "/usr/bin/env bash \${BUILDSH}.pre" >> $1 +echo "STUBF=\$(find \$BUILDDIR -name *.cudafe1.stub.c)" >> $1 +echo "CUFILE=\$(basename -s '.cudafe1.stub.c' \$STUBF)" >> $1 +echo "sed -i -e '/__sti____cudaRegisterAll.*__attribute__/a static void __try____cudaRegisterAll(int);' \$STUBF" >> $1 +echo "sed -i -e 's/__sti____cudaRegisterAll\(.*{\)/__do____cudaRegisterAll\1/' \$STUBF" >> $1 +echo "# sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; fprintf(stderr,\\\"===> \${CUFILE} lazy-load? 
%d\\\\\\\\n\\\", l); __do____cudaRegisterAll();}\" \$STUBF" >> $1 +echo "sed -i -e \"/__do____cudaRegisterAll\(.*{\)/a static void __try____cudaRegisterAll(int l){static int _ls = 0; if (_ls) return; const char* lm = getenv(\\\"CUDA_MODULE_LOADING\\\"); if (lm&&(lm[0]=='L')&&(lm[1]=='A')&&(lm[2]=='Z')&&(lm[3]=='Y')&&(l!=1)) return; _ls = 1; __do____cudaRegisterAll();}\" \$STUBF" >> $1 +echo "sed -i -e '/__try____cudaRegisterAll\(.*{\)/a static void __sti____cudaRegisterAll(void){__try____cudaRegisterAll(0);}' \$STUBF" >> $1 +echo "sed -i -e 's/{\(__device_stub__\)/{__try____cudaRegisterAll(1);\1/' \$STUBF" >> $1 +echo "/usr/bin/env bash \${BUILDSH}.post" >> $1 +echo "rm -rf \$BUILDDIR" >> $1 diff --git a/tools/print_signatures.py b/tools/print_signatures.py index f751709a767a50..cb50c8637f30e5 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -224,17 +224,50 @@ def process_module(m, attr="__all__"): def check_public_api(): import paddle modulelist = [ #npqa - paddle, paddle.amp, paddle.nn, paddle.nn.functional, - paddle.nn.initializer, paddle.nn.utils, paddle.static, paddle.static.nn, - paddle.io, paddle.jit, paddle.metric, paddle.distribution, - paddle.optimizer, paddle.optimizer.lr, paddle.regularizer, paddle.text, - paddle.utils, paddle.utils.download, paddle.utils.profiler, - paddle.utils.cpp_extension, paddle.sysconfig, paddle.vision, - paddle.vision.datasets, paddle.vision.models, paddle.vision.transforms, - paddle.vision.ops, paddle.distributed, paddle.distributed.fleet, - paddle.distributed.fleet.utils, paddle.distributed.parallel, - paddle.distributed.utils, paddle.callbacks, paddle.hub, paddle.autograd, - paddle.incubate, paddle.inference, paddle.onnx, paddle.device + paddle, + paddle.amp, + paddle.nn, + paddle.nn.functional, + paddle.nn.initializer, + paddle.nn.utils, + paddle.static, + paddle.static.nn, + paddle.io, + paddle.jit, + paddle.metric, + paddle.distribution, + paddle.optimizer, + paddle.optimizer.lr, + paddle.regularizer, + paddle.text, + paddle.utils, + paddle.utils.download, + paddle.utils.profiler, + paddle.utils.cpp_extension, + paddle.sysconfig, + paddle.vision, + paddle.vision.datasets, + paddle.vision.models, + paddle.vision.transforms, + paddle.vision.ops, + paddle.distributed, + paddle.distributed.fleet, + paddle.distributed.fleet.utils, + paddle.distributed.parallel, + paddle.distributed.utils, + paddle.callbacks, + paddle.hub, + paddle.autograd, + paddle.incubate, + paddle.inference, + paddle.onnx, + paddle.device, + paddle.audio, + paddle.audio.backends, + paddle.audio.datasets, + paddle.sparse, + paddle.sparse.nn, + paddle.sparse.nn.functional, ] apinum = 0 @@ -296,7 +329,7 @@ def parse_args(): dest='skipped', type=str, help='Skip Checking submodules', - default='paddle.fluid.core_avx.eager.ops') + default='paddle.fluid.libpaddle.eager.ops') if len(sys.argv) == 1: args = parser.parse_args(['paddle'])